From ad9c62bd8946621ed02ac94131a921222508a8bc Mon Sep 17 00:00:00 2001
From: Jiaqi Yan
Date: Mon, 13 Oct 2025 18:59:01 +0000
Subject: [PATCH 01/82] KVM: arm64: VM exit to userspace to handle SEA

When APEI fails to handle a stage-2 synchronous external abort (SEA),
today KVM injects an asynchronous SError into the VCPU and then resumes
it, which usually results in an unpleasant guest kernel panic.

One major source of guest SEAs is a vCPU consuming a recoverable
uncorrected memory error (UER). Although the SError and the guest
kernel panic effectively stop the propagation of corrupted memory, the
guest may re-use the corrupted memory if it auto-reboots; in the worst
case, the guest boot may run into the poisoned memory. So there is room
to recover from a UER in a more graceful manner.

Alternatively, KVM can redirect the synchronous SEA event to the VMM to

- Reduce the blast radius if possible. The VMM can inject a SEA into
  the VCPU via KVM's existing KVM_SET_VCPU_EVENTS API. If the memory
  poison consumption or fault did not come from the guest kernel, the
  blast radius can be limited to the triggering thread in guest
  userspace, so the VM can keep running.

- Allow the VMM to protect against future memory poison consumption by
  unmapping the page from stage-2, or to notify the guest about the
  poisoned page so the guest kernel can unmap it from its stage-1 page
  table.

- Allow the VMM to track SEA events that VM customers care about, to
  restart the VM when a certain number of distinct poison events have
  happened, and to provide observability to customers in a log
  management UI.

Introduce a userspace-visible feature to let the VMM handle SEAs:

- KVM_CAP_ARM_SEA_TO_USER. As an alternative to the fallback behavior
  when host APEI fails to claim a SEA, userspace can opt in to this new
  capability to make KVM exit to userspace during a SEA if it is not
  owned by the host.

- KVM_EXIT_ARM_SEA. A new exit reason is introduced for this. KVM fills
  kvm_run.arm_sea with as much information about the SEA as possible,
  enabling the VMM to emulate the SEA to the guest by itself (see the
  sketch below).

  - Sanitized ESR_EL2. The general rule is to keep only the bits useful
    to userspace and relevant to guest memory.
  - A flag indicating whether the faulting guest physical address is
    valid.
  - The faulting guest physical and virtual addresses, if valid.
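For illustration only, a minimal sketch of the VMM side of this flow,
assuming the UAPI definitions from a kernel with this series applied;
the fd names and the error handling here are hypothetical, not part of
this series:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Opt the VM in to SEA exits (requires KVM_CAP_ARM_SEA_TO_USER). */
static int enable_sea_to_user(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_SEA_TO_USER };

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* Inspect a KVM_EXIT_ARM_SEA exit; run is the mmap'd vcpu kvm_run. */
static void handle_sea_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_ARM_SEA)
		return;

	fprintf(stderr, "SEA: esr=%#llx gva=%#llx\n",
		run->arm_sea.esr, run->arm_sea.gva);

	if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) {
		/* e.g. record the poisoned GPA, or unmap it from stage-2 */
		fprintf(stderr, "SEA: gpa=%#llx\n", run->arm_sea.gpa);
	}
}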
Signed-off-by: Jiaqi Yan
Co-developed-by: Oliver Upton
Signed-off-by: Oliver Upton
Link: https://msgid.link/20251013185903.1372553-2-jiaqiyan@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/include/asm/kvm_host.h |  2 +
 arch/arm64/kvm/arm.c              |  5 +++
 arch/arm64/kvm/mmu.c              | 68 ++++++++++++++++++++++++++++++-
 include/uapi/linux/kvm.h          | 10 +++++
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 64302c438355..366bf337ef64 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -350,6 +350,8 @@ struct kvm_arch {
 #define KVM_ARCH_FLAG_GUEST_HAS_SVE			9
 	/* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
 #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS		10
+	/* Unhandled SEAs are taken to userspace */
+#define KVM_ARCH_FLAG_EXIT_SEA				11
 	unsigned long flags;

 	/* VM-wide vCPU feature set */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 870953b4a8a7..511d2e8ef6c7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -132,6 +132,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_ARM_SEA_TO_USER:
+		r = 0;
+		set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags);
+		break;
 	default:
 		break;
 	}
@@ -327,6 +331,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IRQFD_RESAMPLE:
 	case KVM_CAP_COUNTER_OFFSET:
 	case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+	case KVM_CAP_ARM_SEA_TO_USER:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7cc964af8d30..58cb169727a6 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1899,8 +1899,48 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	read_unlock(&vcpu->kvm->mmu_lock);
 }

+/*
+ * Returns true if the SEA should be handled locally within KVM, i.e. when the
+ * abort is caused by a kernel memory allocation (e.g. stage-2 table memory).
+ */
+static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
+{
+	/*
+	 * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
+	 * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
+	 * stage-2 PTW).
+	 */
+	if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+		return true;
+
+	/* KVM owns the VNCR when the vCPU isn't in a nested context. */
+	if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
+		return true;
+
+	/*
+	 * Determining if an external abort during a table walk happened at
+	 * stage-2 is only possible when S1PTW is set. Otherwise, since KVM
+	 * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
+	 * PA of the stage-1 descriptor) can reach here and are reported
+	 * with a TTW ESR value.
+	 */
+	return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
+}
+
 int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
 {
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_run *run = vcpu->run;
+	u64 esr = kvm_vcpu_get_esr(vcpu);
+	u64 esr_mask = ESR_ELx_EC_MASK |
+		       ESR_ELx_IL |
+		       ESR_ELx_FnV |
+		       ESR_ELx_EA |
+		       ESR_ELx_CM |
+		       ESR_ELx_WNR |
+		       ESR_ELx_FSC;
+	u64 ipa;
+
 	/*
 	 * Give APEI the opportunity to claim the abort before handling it
 	 * within KVM. apei_claim_sea() expects to be called with IRQs enabled.
@@ -1909,7 +1949,33 @@ int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
 	if (apei_claim_sea(NULL) == 0)
 		return 1;

-	return kvm_inject_serror(vcpu);
+	if (host_owns_sea(vcpu, esr) ||
+	    !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
+		return kvm_inject_serror(vcpu);
+
+	/* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
+	if (kvm_has_ras(kvm))
+		esr_mask |= ESR_ELx_SET_MASK;
+
+	/*
+	 * Exit to userspace, and provide the faulting guest virtual and
+	 * physical addresses in case userspace wants to emulate the SEA for
+	 * the guest by writing to the FAR_ELx and HPFAR_ELx registers.
+	 */
+	memset(&run->arm_sea, 0, sizeof(run->arm_sea));
+	run->exit_reason = KVM_EXIT_ARM_SEA;
+	run->arm_sea.esr = esr & esr_mask;
+
+	if (!(esr & ESR_ELx_FnV))
+		run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);
+
+	ipa = kvm_vcpu_get_fault_ipa(vcpu);
+	if (ipa != INVALID_GPA) {
+		run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
+		run->arm_sea.gpa = ipa;
+	}
+
+	return 0;
 }

 /**
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab020..1e541193e98d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -179,6 +179,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_LOONGARCH_IOCSR  38
 #define KVM_EXIT_MEMORY_FAULT     39
 #define KVM_EXIT_TDX              40
+#define KVM_EXIT_ARM_SEA          41

 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -473,6 +474,14 @@ struct kvm_run {
 				} setup_event_notify;
 			};
 		} tdx;
+		/* KVM_EXIT_ARM_SEA */
+		struct {
+#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID	(1ULL << 0)
+			__u64 flags;
+			__u64 esr;
+			__u64 gva;
+			__u64 gpa;
+		} arm_sea;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -963,6 +972,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_RISCV_MP_STATE_RESET 242
 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
 #define KVM_CAP_GUEST_MEMFD_FLAGS 244
+#define KVM_CAP_ARM_SEA_TO_USER 245

 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;

From feee9ef7ac1648835c3b1e35ba4833cf7c8b3669 Mon Sep 17 00:00:00 2001
From: Jiaqi Yan
Date: Mon, 13 Oct 2025 18:59:02 +0000
Subject: [PATCH 02/82] KVM: selftests: Test for KVM_EXIT_ARM_SEA

Test how KVM handles guest SEA when APEI is unable to claim it and
KVM_CAP_ARM_SEA_TO_USER is enabled. The behavior is triggered by
consuming a recoverable memory error (UER) injected via EINJ.

The test asserts two major things:

1. KVM returns to userspace with the KVM_EXIT_ARM_SEA exit reason and
   has provided the expected fault information, e.g. esr, flags, gva,
   gpa.

2. Userspace is able to handle KVM_EXIT_ARM_SEA by injecting a SEA into
   the guest, and KVM injects the expected SEA into the VCPU (see the
   sketch below).

Tested on a data center server running a Siryn AmpereOne processor with
RAS support.

Several things to notice before attempting to run this selftest:

- The test relies on EINJ support in both firmware and kernel to inject
  a UER. Otherwise the test will be skipped.

- The platform under test's APEI should be unable to claim the SEA.
  Otherwise the test will be skipped.

- Some platforms don't support notrigger in EINJ, which may cause APEI
  and GHES to offline the memory before the guest can consume the
  injected UER, making the test unable to trigger a SEA.
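For reference, the injection step boils down to the existing
KVM_SET_VCPU_EVENTS API, mirroring the vcpu_inject_sea() helper in the
test below; a minimal sketch with a hypothetical vcpu_fd:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Queue a synchronous external data abort for the vCPU to take. */
static int inject_sea(int vcpu_fd)
{
	struct kvm_vcpu_events events = {};

	events.exception.ext_dabt_pending = 1;
	return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}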
Signed-off-by: Jiaqi Yan
Link: https://msgid.link/20251013185903.1372553-3-jiaqiyan@google.com
Signed-off-by: Oliver Upton
---
 tools/arch/arm64/include/asm/esr.h            |   2 +
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../testing/selftests/kvm/arm64/sea_to_user.c | 331 ++++++++++++++++++
 tools/testing/selftests/kvm/lib/kvm_util.c    |   1 +
 4 files changed, 335 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/arm64/sea_to_user.c

diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h
index bd592ca81571..0fa17b3af1f7 100644
--- a/tools/arch/arm64/include/asm/esr.h
+++ b/tools/arch/arm64/include/asm/esr.h
@@ -141,6 +141,8 @@
 #define ESR_ELx_SF		(UL(1) << ESR_ELx_SF_SHIFT)
 #define ESR_ELx_AR_SHIFT	(14)
 #define ESR_ELx_AR		(UL(1) << ESR_ELx_AR_SHIFT)
+#define ESR_ELx_VNCR_SHIFT	(13)
+#define ESR_ELx_VNCR		(UL(1) << ESR_ELx_VNCR_SHIFT)
 #define ESR_ELx_CM_SHIFT	(8)
 #define ESR_ELx_CM		(UL(1) << ESR_ELx_CM_SHIFT)

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 148d427ff24b..02a7663c097b 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -163,6 +163,7 @@ TEST_GEN_PROGS_arm64 += arm64/hypercalls
 TEST_GEN_PROGS_arm64 += arm64/external_aborts
 TEST_GEN_PROGS_arm64 += arm64/page_fault_test
 TEST_GEN_PROGS_arm64 += arm64/psci_test
+TEST_GEN_PROGS_arm64 += arm64/sea_to_user
 TEST_GEN_PROGS_arm64 += arm64/set_id_regs
 TEST_GEN_PROGS_arm64 += arm64/smccc_filter
 TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config
diff --git a/tools/testing/selftests/kvm/arm64/sea_to_user.c b/tools/testing/selftests/kvm/arm64/sea_to_user.c
new file mode 100644
index 000000000000..573dd790aeb8
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/sea_to_user.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test that KVM returns to userspace with KVM_EXIT_ARM_SEA if host APEI
+ * fails to handle a SEA and userspace has opted in to
+ * KVM_CAP_ARM_SEA_TO_USER.
+ *
+ * After reaching userspace with the expected arm_sea info, also test
+ * userspace injecting a synchronous external data abort into the guest.
+ *
+ * This test utilizes EINJ to generate a REAL synchronous external data
+ * abort by consuming a recoverable uncorrectable memory error. Therefore
+ * the device under test must support EINJ in both firmware and host
+ * kernel, including the notrigger feature. Otherwise the test will be
+ * skipped. The platform under test's APEI should be unable to claim the
+ * SEA. Otherwise the test will also be skipped.
+ */
+
+#include
+#include
+#include
+#include
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define PAGE_PRESENT		(1ULL << 63)
+#define PAGE_PHYSICAL		0x007fffffffffffffULL
+#define PAGE_ADDR_MASK		(~(0xfffULL))
+
+/* Group ISV and ISS[23:14]. */
+#define ESR_ELx_INST_SYNDROME	((ESR_ELx_ISV) | (ESR_ELx_SAS) | \
+				 (ESR_ELx_SSE) | (ESR_ELx_SRT_MASK) | \
+				 (ESR_ELx_SF) | (ESR_ELx_AR))
+
+#define EINJ_ETYPE	"/sys/kernel/debug/apei/einj/error_type"
+#define EINJ_ADDR	"/sys/kernel/debug/apei/einj/param1"
+#define EINJ_MASK	"/sys/kernel/debug/apei/einj/param2"
+#define EINJ_FLAGS	"/sys/kernel/debug/apei/einj/flags"
+#define EINJ_NOTRIGGER	"/sys/kernel/debug/apei/einj/notrigger"
+#define EINJ_DOIT	"/sys/kernel/debug/apei/einj/error_inject"
+/* Memory Uncorrectable non-fatal. */
+#define ERROR_TYPE_MEMORY_UER	0x10
+/* Memory address and mask valid (param1 and param2). */
+#define MASK_MEMORY_UER	0b10
+
+/* Guest virtual address region = [2G, 3G). */
+#define START_GVA	0x80000000UL
+#define VM_MEM_SIZE	0x40000000UL
+/* Note: EINJ_OFFSET must be < VM_MEM_SIZE. */
+#define EINJ_OFFSET	0x01234badUL
+#define EINJ_GVA	((START_GVA) + (EINJ_OFFSET))
+
+static vm_paddr_t einj_gpa;
+static void *einj_hva;
+static uint64_t einj_hpa;
+static bool far_invalid;
+
+static uint64_t translate_to_host_paddr(unsigned long vaddr)
+{
+	uint64_t pinfo;
+	int64_t offset = vaddr / getpagesize() * sizeof(pinfo);
+	int fd;
+	uint64_t page_addr;
+	uint64_t paddr;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0)
+		ksft_exit_fail_perror("Failed to open /proc/self/pagemap");
+	if (pread(fd, &pinfo, sizeof(pinfo), offset) != sizeof(pinfo)) {
+		close(fd);
+		ksft_exit_fail_perror("Failed to read /proc/self/pagemap");
+	}
+
+	close(fd);
+
+	if ((pinfo & PAGE_PRESENT) == 0)
+		ksft_exit_fail_perror("Page not present");
+
+	page_addr = (pinfo & PAGE_PHYSICAL) << MIN_PAGE_SHIFT;
+	paddr = page_addr + (vaddr & (getpagesize() - 1));
+	return paddr;
+}
+
+static void write_einj_entry(const char *einj_path, uint64_t val)
+{
+	char cmd[256] = {0};
+	FILE *cmdfile = NULL;
+
+	sprintf(cmd, "echo %#lx > %s", val, einj_path);
+	cmdfile = popen(cmd, "r");
+
+	if (pclose(cmdfile) == 0)
+		ksft_print_msg("echo %#lx > %s - done\n", val, einj_path);
+	else
+		ksft_exit_fail_perror("Failed to write EINJ entry");
+}
+
+static void inject_uer(uint64_t paddr)
+{
+	if (access("/sys/firmware/acpi/tables/EINJ", R_OK) == -1)
+		ksft_test_result_skip("EINJ table not available in firmware");
+
+	if (access(EINJ_ETYPE, R_OK | W_OK) == -1)
+		ksft_test_result_skip("EINJ module probably not loaded?");
+
+	write_einj_entry(EINJ_ETYPE, ERROR_TYPE_MEMORY_UER);
+	write_einj_entry(EINJ_FLAGS, MASK_MEMORY_UER);
+	write_einj_entry(EINJ_ADDR, paddr);
+	write_einj_entry(EINJ_MASK, ~0x0UL);
+	write_einj_entry(EINJ_NOTRIGGER, 1);
+	write_einj_entry(EINJ_DOIT, 1);
+}
+
+/*
+ * When host APEI successfully claims the SEA caused by guest_code, the
+ * kernel will send a SIGBUS signal with BUS_MCEERR_AR to the test thread.
+ *
+ * We set up this SIGBUS handler to skip the test for that case.
+ */
+static void sigbus_signal_handler(int sig, siginfo_t *si, void *v)
+{
+	ksft_print_msg("SIGBUS (%d) received, dumping siginfo...\n", sig);
+	ksft_print_msg("si_signo=%d, si_errno=%d, si_code=%d, si_addr=%p\n",
+		       si->si_signo, si->si_errno, si->si_code, si->si_addr);
+	if (si->si_code == BUS_MCEERR_AR)
+		ksft_test_result_skip("SEA is claimed by host APEI\n");
+	else
+		ksft_test_result_fail("Exit with signal unhandled\n");
+
+	exit(0);
+}
+
+static void setup_sigbus_handler(void)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_sigaction = sigbus_signal_handler;
+	act.sa_flags = SA_SIGINFO;
+	TEST_ASSERT(sigaction(SIGBUS, &act, NULL) == 0,
+		    "Failed to setup SIGBUS handler");
+}
+
+static void guest_code(void)
+{
+	uint64_t guest_data;
+
+	/* Consuming the injected error will cause a SEA. */
+	guest_data = *(uint64_t *)EINJ_GVA;
+
+	GUEST_FAIL("Poison not protected by SEA: gva=%#lx, guest_data=%#lx\n",
+		   EINJ_GVA, guest_data);
+}
+
+static void expect_sea_handler(struct ex_regs *regs)
+{
+	u64 esr = read_sysreg(esr_el1);
+	u64 far = read_sysreg(far_el1);
+	bool expect_far_invalid = far_invalid;
+
+	GUEST_PRINTF("Handling Guest SEA\n");
+	GUEST_PRINTF("ESR_EL1=%#lx, FAR_EL1=%#lx\n", esr, far);
+
+	GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR);
+	GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT);
+
+	if (expect_far_invalid) {
+		GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, ESR_ELx_FnV);
+		GUEST_PRINTF("Guest observed garbage value in FAR\n");
+	} else {
+		GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, 0);
+		GUEST_ASSERT_EQ(far, EINJ_GVA);
+	}
+
+	GUEST_DONE();
+}
+
+static void vcpu_inject_sea(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_events events = {};
+
+	events.exception.ext_dabt_pending = true;
+	vcpu_events_set(vcpu, &events);
+}
+
+static void run_vm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+	bool guest_done = false;
+	struct kvm_run *run = vcpu->run;
+	u64 esr;
+
+	/* Resume the vCPU after error injection to consume the error. */
+	vcpu_run(vcpu);
+
+	ksft_print_msg("Dump kvm_run info about KVM_EXIT_%s\n",
+		       exit_reason_str(run->exit_reason));
+	ksft_print_msg("kvm_run.arm_sea: esr=%#llx, flags=%#llx\n",
+		       run->arm_sea.esr, run->arm_sea.flags);
+	ksft_print_msg("kvm_run.arm_sea: gva=%#llx, gpa=%#llx\n",
+		       run->arm_sea.gva, run->arm_sea.gpa);
+
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_SEA);
+
+	esr = run->arm_sea.esr;
+	TEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_LOW);
+	TEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT);
+	TEST_ASSERT_EQ(ESR_ELx_ISS2(esr), 0);
+	TEST_ASSERT_EQ((esr & ESR_ELx_INST_SYNDROME), 0);
+	TEST_ASSERT_EQ(esr & ESR_ELx_VNCR, 0);
+
+	if (!(esr & ESR_ELx_FnV)) {
+		ksft_print_msg("Expect gva to match given FnV bit is 0\n");
+		TEST_ASSERT_EQ(run->arm_sea.gva, EINJ_GVA);
+	}
+
+	if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) {
+		ksft_print_msg("Expect gpa to match given KVM_EXIT_ARM_SEA_FLAG_GPA_VALID is set\n");
+		TEST_ASSERT_EQ(run->arm_sea.gpa, einj_gpa & PAGE_ADDR_MASK);
+	}
+
+	far_invalid = esr & ESR_ELx_FnV;
+
+	/*
+	 * Inject a SEA into the guest and expect it to be handled by the
+	 * guest's SEA handler.
+	 */
+	vcpu_inject_sea(vcpu);
+
+	/* Expect the guest to reach GUEST_DONE gracefully. */
+	do {
+		vcpu_run(vcpu);
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_PRINTF:
+			ksft_print_msg("From guest: %s", uc.buffer);
+			break;
+		case UCALL_DONE:
+			ksft_print_msg("Guest done gracefully!\n");
+			guest_done = true;
+			break;
+		case UCALL_ABORT:
+			ksft_print_msg("Guest aborted!\n");
+			guest_done = true;
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		default:
+			TEST_FAIL("Unexpected ucall: %lu\n", uc.cmd);
+		}
+	} while (!guest_done);
+}
+
+static struct kvm_vm *vm_create_with_sea_handler(struct kvm_vcpu **vcpu)
+{
+	size_t backing_page_size;
+	size_t guest_page_size;
+	size_t alignment;
+	uint64_t num_guest_pages;
+	vm_paddr_t start_gpa;
+	enum vm_mem_backing_src_type src_type = VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB;
+	struct kvm_vm *vm;
+
+	backing_page_size = get_backing_src_pagesz(src_type);
+	guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+	alignment = max(backing_page_size, guest_page_size);
+	num_guest_pages = VM_MEM_SIZE / guest_page_size;
+
+	vm = __vm_create_with_one_vcpu(vcpu, num_guest_pages, guest_code);
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(*vcpu);
+
+	vm_install_sync_handler(vm,
+				/*vector=*/VECTOR_SYNC_CURRENT,
+				/*ec=*/ESR_ELx_EC_DABT_CUR,
+				/*handler=*/expect_sea_handler);
+
+	start_gpa = (vm->max_gfn - num_guest_pages) * guest_page_size;
+	start_gpa = align_down(start_gpa, alignment);
+
+	vm_userspace_mem_region_add(
+		/*vm=*/vm,
+		/*src_type=*/src_type,
+		/*guest_paddr=*/start_gpa,
+		/*slot=*/1,
+		/*npages=*/num_guest_pages,
+		/*flags=*/0);
+
+	virt_map(vm, START_GVA, start_gpa, num_guest_pages);
+
+	ksft_print_msg("Mapped %#lx pages: gva=%#lx to gpa=%#lx\n",
+		       num_guest_pages, START_GVA, start_gpa);
+	return vm;
+}
+
+static void vm_inject_memory_uer(struct kvm_vm *vm)
+{
+	uint64_t guest_data;
+
+	einj_gpa = addr_gva2gpa(vm, EINJ_GVA);
+	einj_hva = addr_gva2hva(vm, EINJ_GVA);
+
+	/* Populate certain data before injecting UER. */
+	*(uint64_t *)einj_hva = 0xBAADCAFE;
+	guest_data = *(uint64_t *)einj_hva;
+	ksft_print_msg("Before EINJect: data=%#lx\n",
+		       guest_data);
+
+	einj_hpa = translate_to_host_paddr((unsigned long)einj_hva);
+
+	ksft_print_msg("EINJ_GVA=%#lx, einj_gpa=%#lx, einj_hva=%p, einj_hpa=%#lx\n",
+		       EINJ_GVA, einj_gpa, einj_hva, einj_hpa);
+
+	inject_uer(einj_hpa);
+	ksft_print_msg("Memory UER EINJected\n");
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	struct kvm_vcpu *vcpu;
+
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SEA_TO_USER));
+
+	setup_sigbus_handler();
+
+	vm = vm_create_with_sea_handler(&vcpu);
+	vm_enable_cap(vm, KVM_CAP_ARM_SEA_TO_USER, 0);
+	vm_inject_memory_uer(vm);
+	run_vm(vm, vcpu);
+	kvm_vm_free(vm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1a93d6361671..234b0b7bf6e8 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2025,6 +2025,7 @@ static struct exit_reason {
 	KVM_EXIT_STRING(NOTIFY),
 	KVM_EXIT_STRING(LOONGARCH_IOCSR),
 	KVM_EXIT_STRING(MEMORY_FAULT),
+	KVM_EXIT_STRING(ARM_SEA),
 };

 /*

From 4debb5e8952e43c06c183a2efe2dd7820c55f196 Mon Sep 17 00:00:00 2001
From: Jiaqi Yan
Date: Mon, 13 Oct 2025 18:59:03 +0000
Subject: [PATCH 03/82] Documentation: kvm: new UAPI for handling SEA

Document the new userspace-visible features and APIs for handling a
synchronous external abort (SEA):

- KVM_CAP_ARM_SEA_TO_USER: how userspace enables the new feature.

- KVM_EXIT_ARM_SEA: the exit userspace gets when it needs to handle a
  SEA, and the information userspace receives while taking the SEA
  (see the sketch below).
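As a rough illustration, userspace can pull the documented fields out
of arm_sea.esr with plain shifts and masks (bit positions per the Arm
ARM ESR_ELx layout; the macro names below are local to this sketch and
not provided by any UAPI header):

#include <stdint.h>

#define SEA_ESR_EC(esr)		(((esr) >> 26) & 0x3f)	/* exception class */
#define SEA_ESR_FNV(esr)	(((esr) >> 10) & 0x1)	/* FAR not valid */
#define SEA_ESR_WNR(esr)	(((esr) >> 6) & 0x1)	/* write, not read */
#define SEA_ESR_FSC(esr)	((esr) & 0x3f)		/* fault status code */

/* Decide whether arm_sea.gva can be trusted for reporting. */
static inline int sea_gva_valid(uint64_t esr)
{
	return !SEA_ESR_FNV(esr);
}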
Signed-off-by: Jiaqi Yan
Link: https://msgid.link/20251013185903.1372553-4-jiaqiyan@google.com
[ oliver: make documentation concise, remove implementation detail ]
Signed-off-by: Oliver Upton
---
 Documentation/virt/kvm/api.rst | 47 ++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 57061fa29e6a..27f726ff8fe0 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7286,6 +7286,41 @@ exit, even without calls to ``KVM_ENABLE_CAP`` or similar.  In this case, it
 will enter with output fields already valid; in the common case, the
 ``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``.
 Userspace need not do anything if it does not wish to support a TDVMCALL.
+
+::
+
+		/* KVM_EXIT_ARM_SEA */
+		struct {
+  #define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID	(1ULL << 0)
+			__u64 flags;
+			__u64 esr;
+			__u64 gva;
+			__u64 gpa;
+		} arm_sea;
+
+Used on arm64 systems. When the VM capability ``KVM_CAP_ARM_SEA_TO_USER`` is
+enabled, KVM exits to userspace if a guest access causes a synchronous
+external abort (SEA) and the host APEI fails to handle the SEA.
+
+``esr`` is set to a sanitized value of ESR_EL2 from the exception taken to KVM,
+consisting of the following fields:
+
+  - ``ESR_EL2.EC``
+  - ``ESR_EL2.IL``
+  - ``ESR_EL2.FnV``
+  - ``ESR_EL2.EA``
+  - ``ESR_EL2.CM``
+  - ``ESR_EL2.WNR``
+  - ``ESR_EL2.FSC``
+  - ``ESR_EL2.SET`` (when FEAT_RAS is implemented for the VM)
+
+``gva`` is set to the value of FAR_EL2 from the exception taken to KVM when
+``ESR_EL2.FnV == 0``. Otherwise, the value of ``gva`` is unknown.
+
+``gpa`` is set to the faulting IPA from the exception taken to KVM when
+the ``KVM_EXIT_ARM_SEA_FLAG_GPA_VALID`` flag is set. Otherwise, the value of
+``gpa`` is unknown.
+
 ::

 		/* Fix the size of the union. */
@@ -8703,6 +8738,18 @@ This capability indicate to the userspace whether a PFNMAP memory region
 can be safely mapped as cacheable. This relies on the presence of
 force write back (FWB) feature support on the hardware.

+7.45 KVM_CAP_ARM_SEA_TO_USER
+----------------------------
+
+:Architecture: arm64
+:Target: VM
+:Parameters: none
+:Returns: 0 on success, -EINVAL if unsupported.
+
+When this capability is enabled, KVM may exit to userspace for SEAs taken to
+EL2 resulting from a guest access. See ``KVM_EXIT_ARM_SEA`` for more
+information.
+
 8. Other capabilities.
 ======================

From 297877069bc2fa079fb2a60ae91ca9abb481074a Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Wed, 19 Nov 2025 01:38:21 -0800
Subject: [PATCH 04/82] KVM: arm64: Drop useless __GFP_HIGHMEM from kvm struct
 allocation

A recent change on the receiving end of vmalloc() started warning about
unsupported GFP flags passed by the caller. Nathan reports that this
warning fires in kvm_arch_alloc_vm(), owing to the fact that KVM is
passing a meaningless __GFP_HIGHMEM.

Do as the warning says and fix the code.
Cc: Vishal Moola (Oracle) Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/kvmarm/20251118224448.GA998046@ax162/ Acked-by: Vishal Moola (Oracle) Reviewed-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://msgid.link/20251119093822.2513142-2-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..e791fa52f874 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -440,7 +440,7 @@ struct kvm *kvm_arch_alloc_vm(void) if (!has_vhe()) return kzalloc(sz, GFP_KERNEL_ACCOUNT); - return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); + return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) From cb17d79ff51d41f656bcf7928330b2e9c0003583 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Nov 2025 01:38:22 -0800 Subject: [PATCH 05/82] KVM: arm64: Use kvzalloc() for kvm struct allocation Physically-allocated KVM structs aren't necessary when in VHE mode as there's no need to share with the hyp's address space. Of course, there can still be a performance benefit from physical allocations. Use kvzalloc() for opportunistic physical allocations. Acked-by: Vishal Moola (Oracle) Reviewed-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://msgid.link/20251119093822.2513142-3-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e791fa52f874..ecbe2c8dc00c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -440,7 +440,7 @@ struct kvm *kvm_arch_alloc_vm(void) if (!has_vhe()) return kzalloc(sz, GFP_KERNEL_ACCOUNT); - return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + return kvzalloc(sz, GFP_KERNEL_ACCOUNT); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) From 31df012da496968d8d4368f693ad45dfcbfba40b Mon Sep 17 00:00:00 2001 From: Maximilian Dittgen Date: Wed, 19 Nov 2025 14:57:43 +0100 Subject: [PATCH 06/82] KVM: selftests: Assert GICR_TYPER.Processor_Number matches selftest CPU number The selftests GIC library and tests assume that the GICR_TYPER.Processor_number associated with a given CPU is the same as the CPU's selftest index. Since this assumption is not guaranteed by specification, add an assert in gicv3_cpu_init() that validates this is true. 
Signed-off-by: Maximilian Dittgen Link: https://msgid.link/20251119135744.68552-1-mdittgen@amazon.de Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 66d05506f78b..f81025cd32e2 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -298,12 +298,17 @@ static void gicv3_cpu_init(unsigned int cpu) volatile void *sgi_base; unsigned int i; volatile void *redist_base_cpu; + u64 typer; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); + /* Verify assumption that GICR_TYPER.Processor_number == cpu */ + typer = readq_relaxed(redist_base_cpu + GICR_TYPER); + GUEST_ASSERT_EQ(GICR_TYPER_CPU_NUMBER(typer), cpu); + gicv3_enable_redist(redist_base_cpu); /* From 85f329df293119d6ba1a26453d109556631081a4 Mon Sep 17 00:00:00 2001 From: Maximilian Dittgen Date: Wed, 19 Nov 2025 14:57:44 +0100 Subject: [PATCH 07/82] KVM: selftests: SYNC after guest ITS setup in vgic_lpi_stress vgic_lpi_stress sends MAPTI and MAPC commands during guest GIC setup to map interrupt events to ITT entries and collection IDs to redistributors, respectively. We have no guarantee that the ITS will finish handling these mapping commands before the selftest calls KVM_SIGNAL_MSI to inject LPIs to the guest. If LPIs are injected before ITS mapping completes, the ITS cannot properly pass the interrupt on to the redistributor. Fix by adding a SYNC command to the selftests ITS library, then calling SYNC after ITS mapping to ensure mapping completes before signal_lpi() writes to GITS_TRANSLATER. 
Signed-off-by: Maximilian Dittgen Link: https://msgid.link/20251119135744.68552-2-mdittgen@amazon.de Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c | 4 ++++ tools/testing/selftests/kvm/include/arm64/gic_v3_its.h | 1 + tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c | 10 ++++++++++ 3 files changed, 15 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index 687d04463983..e857a605f577 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -118,6 +118,10 @@ static void guest_setup_gic(void) guest_setup_its_mappings(); guest_invalidate_all_rdists(); + + /* SYNC to ensure ITS setup is complete */ + for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++) + its_send_sync_cmd(test_data.cmdq_base_va, cpuid); } static void guest_code(size_t nr_lpis) diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h index 3722ed9c8f96..58feef3eb386 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h @@ -15,5 +15,6 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, u32 collection_id, u32 intid); void its_send_invall_cmd(void *cmdq_base, u32 collection_id); +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id); #endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 09f270545646..aec1b69a4de3 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -246,3 +246,13 @@ void its_send_invall_cmd(void *cmdq_base, u32 collection_id) its_send_cmd(cmdq_base, &cmd); } + +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_SYNC); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); + + its_send_cmd(cmdq_base, &cmd); +} From 156f70afcfecfc45be5fdc2e4adebc5ea70a93b0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Nov 2025 14:11:50 -0800 Subject: [PATCH 08/82] KVM: arm64: Only drop references on empty tables in stage2_free_walker A subsequent change to the way KVM frees stage-2s will invoke the free walker on sub-ranges of the VM's IPA space, meaning there's potential for only partially visiting a table's PTEs. Split the leaf and table visitors and only drop references on a table when the page count reaches 1, implying there are no valid PTEs that need to be visited. Invalidate the table PTE to avoid traversing the stale reference. 
Link: https://msgid.link/20251113052452.975081-2-rananta@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/hyp/pgtable.c | 42 +++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c351b4abd5db..6d6a23f7dedb 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1535,20 +1535,46 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
 	return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
 }

-static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
-			      enum kvm_pgtable_walk_flags visit)
+static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx)
 {
 	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

+	mm_ops->put_page(ctx->ptep);
+	return 0;
+}
+
+static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
+
+	if (mm_ops->page_count(childp) != 1)
+		return 0;
+
+	/*
+	 * Drop references and clear the now stale PTE to avoid rewalking the
+	 * freed page table.
+	 */
+	mm_ops->put_page(ctx->ptep);
+	mm_ops->put_page(childp);
+	kvm_clear_pte(ctx->ptep);
+	return 0;
+}
+
+static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
+			      enum kvm_pgtable_walk_flags visit)
+{
 	if (!stage2_pte_is_counted(ctx->old))
 		return 0;

-	mm_ops->put_page(ctx->ptep);
-
-	if (kvm_pte_table(ctx->old, ctx->level))
-		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
-
-	return 0;
+	switch (visit) {
+	case KVM_PGTABLE_WALK_LEAF:
+		return stage2_free_leaf(ctx);
+	case KVM_PGTABLE_WALK_TABLE_POST:
+		return stage2_free_table_post(ctx);
+	default:
+		return -EINVAL;
+	}
 }

 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)

From d68d66e57e2be9541fa67bafabdfe8826bb73799 Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta
Date: Thu, 13 Nov 2025 05:24:51 +0000
Subject: [PATCH 09/82] KVM: arm64: Split kvm_pgtable_stage2_destroy()

Split kvm_pgtable_stage2_destroy() into two:

- kvm_pgtable_stage2_destroy_range(), which performs the page-table
  walk and frees the entries over a range of addresses.

- kvm_pgtable_stage2_destroy_pgd(), which frees the PGD.

This refactoring enables subsequent patches to free large page-tables
in chunks, calling cond_resched() between each chunk, to yield the CPU
as necessary. Existing callers of kvm_pgtable_stage2_destroy() that
probably cannot take advantage of this (such as nVHE) will continue to
function as is.
Signed-off-by: Raghavendra Rao Ananta
Suggested-by: Oliver Upton
Link: https://msgid.link/20251113052452.975081-3-rananta@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/include/asm/kvm_pgtable.h | 30 ++++++++++++++++++++++++++++
 arch/arm64/include/asm/kvm_pkvm.h    |  4 +++-
 arch/arm64/kvm/hyp/pgtable.c         | 25 +++++++++++++++++++----
 arch/arm64/kvm/mmu.c                 | 12 +++++++++--
 arch/arm64/kvm/pkvm.c                | 11 ++++++++--
 5 files changed, 73 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 2888b5d03757..1246216616b5 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -355,6 +355,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
 	return pteref;
 }

+static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
+{
+	return pteref;
+}
+
 static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
 {
 	/*
@@ -384,6 +389,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
 	return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED));
 }

+static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
+{
+	return rcu_dereference_raw(pteref);
+}
+
 static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
 {
 	if (walker->flags & KVM_PGTABLE_WALK_SHARED)
@@ -551,6 +561,26 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2
  */
 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);

+/**
+ * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr:	Base intermediate physical address of the range to destroy.
+ * @size:	Size of the range to destroy.
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+				      u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init*().
+ *
+ * It is assumed that the rest of the page-table is freed before this operation.
+ */
+void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
+
 /**
  * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
  * @mm_ops:	Memory management callbacks.
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 08be89c95466..0aecd4ac5f45 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -180,7 +180,9 @@ struct pkvm_mapping {
 int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
 			     struct kvm_pgtable_mm_ops *mm_ops);
-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+				       u64 addr, u64 size);
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
 			    u64 phys, enum kvm_pgtable_prot prot,
 			    void *mc, enum kvm_pgtable_walk_flags flags);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 6d6a23f7dedb..0882896dbf8f 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -1577,21 +1577,38 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
 	}
 }

-void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+				      u64 addr, u64 size)
 {
-	size_t pgd_sz;
 	struct kvm_pgtable_walker walker = {
 		.cb	= stage2_free_walker,
 		.flags	= KVM_PGTABLE_WALK_LEAF |
 			  KVM_PGTABLE_WALK_TABLE_POST,
 	};

-	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+}
+
+void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+	size_t pgd_sz;
+
 	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
-	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
+
+	/*
+	 * Since the pgtable is unlinked at this point, and not shared with
+	 * other walkers, it is safe to dereference the pgd with
+	 * kvm_dereference_pteref_raw().
+	 */
+	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz);
 	pgt->pgd = NULL;
 }

+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+	kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
+	kvm_pgtable_stage2_destroy_pgd(pgt);
+}
+
 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
 {
 	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7cc964af8d30..c2bc1eba032c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -904,6 +904,14 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
 	return 0;
 }

+static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
+{
+	unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
+
+	KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits));
+	KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+}
+
 /**
  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
  * @kvm:	The pointer to the KVM structure
@@ -980,7 +988,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	return 0;

 out_destroy_pgtable:
-	KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+	kvm_stage2_destroy(pgt);
 out_free_pgtable:
 	kfree(pgt);
 	return err;
@@ -1081,7 +1089,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	write_unlock(&kvm->mmu_lock);

 	if (pgt) {
-		KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+		kvm_stage2_destroy(pgt);
 		kfree(pgt);
 	}
 }
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 24f0f8a8c943..d7a0f69a9982 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -344,9 +344,16 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
 	return 0;
 }

-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+				       u64 addr, u64 size)
 {
-	__pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
+	__pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+	/* Expected to be called after all pKVM mappings have been released. */
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
 }

 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,

From 4ddfab5436b6918ce4e28f610e670e040a304152 Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta
Date: Thu, 13 Nov 2025 05:24:52 +0000
Subject: [PATCH 10/82] KVM: arm64: Reschedule as needed when destroying the
 stage-2 page-tables

When a large VM, specifically one that holds a significant number of
PTEs, gets abruptly destroyed, the following warning is seen during the
page-table walk:

 sched: CPU 0 need_resched set for > 100018840 ns (100 ticks) without schedule
 CPU: 0 UID: 0 PID: 9617 Comm: kvm_page_table_ Tainted: G O 6.16.0-smp-DEV #3 NONE
 Tainted: [O]=OOT_MODULE
 Call trace:
  show_stack+0x20/0x38 (C)
  dump_stack_lvl+0x3c/0xb8
  dump_stack+0x18/0x30
  resched_latency_warn+0x7c/0x88
  sched_tick+0x1c4/0x268
  update_process_times+0xa8/0xd8
  tick_nohz_handler+0xc8/0x168
  __hrtimer_run_queues+0x11c/0x338
  hrtimer_interrupt+0x104/0x308
  arch_timer_handler_phys+0x40/0x58
  handle_percpu_devid_irq+0x8c/0x1b0
  generic_handle_domain_irq+0x48/0x78
  gic_handle_irq+0x1b8/0x408
  call_on_irq_stack+0x24/0x30
  do_interrupt_handler+0x54/0x78
  el1_interrupt+0x44/0x88
  el1h_64_irq_handler+0x18/0x28
  el1h_64_irq+0x84/0x88
  stage2_free_walker+0x30/0xa0 (P)
  __kvm_pgtable_walk+0x11c/0x258
  __kvm_pgtable_walk+0x180/0x258
  __kvm_pgtable_walk+0x180/0x258
  __kvm_pgtable_walk+0x180/0x258
  kvm_pgtable_walk+0xc4/0x140
  kvm_pgtable_stage2_destroy+0x5c/0xf0
  kvm_free_stage2_pgd+0x6c/0xe8
  kvm_uninit_stage2_mmu+0x24/0x48
  kvm_arch_flush_shadow_all+0x80/0xa0
  kvm_mmu_notifier_release+0x38/0x78
  __mmu_notifier_release+0x15c/0x250
  exit_mmap+0x68/0x400
  __mmput+0x38/0x1c8
  mmput+0x30/0x68
  exit_mm+0xd4/0x198
  do_exit+0x1a4/0xb00
  do_group_exit+0x8c/0x120
  get_signal+0x6d4/0x778
  do_signal+0x90/0x718
  do_notify_resume+0x70/0x170
  el0_svc+0x74/0xd8
  el0t_64_sync_handler+0x60/0xc8
  el0t_64_sync+0x1b0/0x1b8

The warning is mostly seen on host kernels that are configured not to
force-preempt, such as CONFIG_PREEMPT_NONE=y. To avoid this, instead of
walking the entire page-table in one go, split it into smaller ranges
and call cond_resched() between each range. Since the path is executed
during VM destruction, after the page-table structure is unlinked from
the KVM MMU, relying on cond_resched_rwlock_write() isn't necessary.

Signed-off-by: Raghavendra Rao Ananta
Link: https://msgid.link/20251113052452.975081-4-rananta@google.com
Signed-off-by: Oliver Upton
---
 arch/arm64/kvm/mmu.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c2bc1eba032c..f86d17ad50a7 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -904,11 +904,35 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
 	return 0;
 }

+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels.
Hence, invoke + * cond_resched() periodically to prevent hogging the CPU for a long time + * and schedule something else, if required. + */ +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, + phys_addr_t end) +{ + u64 next; + + do { + next = stage2_range_addr_end(addr, end); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, + next - addr); + if (next != end) + cond_resched(); + } while (addr = next, addr != end); +} + static void kvm_stage2_destroy(struct kvm_pgtable *pgt) { unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); - KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits)); + stage2_destroy_range(pgt, 0, BIT(ia_bits)); KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); } From dc31124379b69a758af740bbd981e9e9f04a61d5 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:43 -0800 Subject: [PATCH 11/82] arm64: Detect FEAT_XNX Detect the feature in anticipation of using it in KVM. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-2-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/cpufeature.c | 7 +++++++ arch/arm64/tools/cpucaps | 1 + 2 files changed, 8 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5ed401ff79e3..aa3ecae252d3 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3088,6 +3088,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_GICV5_LEGACY, .matches = test_has_gicv5_legacy, }, + { + .desc = "XNX", + .capability = ARM64_HAS_XNX, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, XNX, IMP) + }, {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 1b32c1232d28..ee74199107d3 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -64,6 +64,7 @@ HAS_TLB_RANGE HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT +HAS_XNX HAFT HW_DBM KVM_HVHE From 2608563b466b9192a9356b18463005da6e138bf9 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:44 -0800 Subject: [PATCH 12/82] KVM: arm64: Add support for FEAT_XNX stage-2 permissions FEAT_XNX adds support for encoding separate execute permissions for EL0 and EL1 at stage-2. Add support for this to the page table library, hiding the unintuitive encoding scheme behind generic pX and uX permission flags. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-3-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 15 ++++--- arch/arm64/kvm/hyp/pgtable.c | 58 ++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 2888b5d03757..c72149a607d6 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -89,7 +89,7 @@ typedef u64 kvm_pte_t; #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S2_XN GENMASK(54, 53) #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) @@ -251,12 +251,15 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_SW3: Software bit 3. 
 */
 enum kvm_pgtable_prot {
-	KVM_PGTABLE_PROT_X			= BIT(0),
-	KVM_PGTABLE_PROT_W			= BIT(1),
-	KVM_PGTABLE_PROT_R			= BIT(2),
+	KVM_PGTABLE_PROT_PX			= BIT(0),
+	KVM_PGTABLE_PROT_UX			= BIT(1),
+	KVM_PGTABLE_PROT_X			= KVM_PGTABLE_PROT_PX |
+						  KVM_PGTABLE_PROT_UX,
+	KVM_PGTABLE_PROT_W			= BIT(2),
+	KVM_PGTABLE_PROT_R			= BIT(3),

-	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
-	KVM_PGTABLE_PROT_NORMAL_NC		= BIT(4),
+	KVM_PGTABLE_PROT_DEVICE			= BIT(4),
+	KVM_PGTABLE_PROT_NORMAL_NC		= BIT(5),

 	KVM_PGTABLE_PROT_SW0			= BIT(55),
 	KVM_PGTABLE_PROT_SW1			= BIT(56),
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c351b4abd5db..e1d75f965027 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -661,11 +661,37 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,

 #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))

+static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr)
+{
+	bool px, ux;
+	u8 xn;
+
+	px = prot & KVM_PGTABLE_PROT_PX;
+	ux = prot & KVM_PGTABLE_PROT_UX;
+
+	if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux)
+		return -EINVAL;
+
+	if (px && ux)
+		xn = 0b00;
+	else if (!px && ux)
+		xn = 0b01;
+	else if (!px && !ux)
+		xn = 0b10;
+	else
+		xn = 0b11;
+
+	*attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN;
+	*attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn);
+	return 0;
+}
+
 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
 				kvm_pte_t *ptep)
 {
 	kvm_pte_t attr;
 	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+	int r;

 	switch (prot & (KVM_PGTABLE_PROT_DEVICE |
 			KVM_PGTABLE_PROT_NORMAL_NC)) {
@@ -685,8 +711,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
 		attr = KVM_S2_MEMATTR(pgt, NORMAL);
 	}

-	if (!(prot & KVM_PGTABLE_PROT_X))
-		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+	r = stage2_set_xn_attr(prot, &attr);
+	if (r)
+		return r;

 	if (prot & KVM_PGTABLE_PROT_R)
 		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
@@ -715,8 +742,19 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
 		prot |= KVM_PGTABLE_PROT_R;
 	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
 		prot |= KVM_PGTABLE_PROT_W;
-	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
-		prot |= KVM_PGTABLE_PROT_X;
+
+	switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) {
+	case 0b00:
+		prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX;
+		break;
+	case 0b01:
+		prot |= KVM_PGTABLE_PROT_UX;
+		break;
+	case 0b11:
+		prot |= KVM_PGTABLE_PROT_PX;
+		break;
+	default:
+		break;
+	}

 	return prot;
 }
@@ -1290,9 +1328,9 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
 int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 				   enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
 {
-	int ret;
+	kvm_pte_t xn = 0, set = 0, clr = 0;
 	s8 level;
-	kvm_pte_t set = 0, clr = 0;
+	int ret;

 	if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
 		return -EINVAL;
@@ -1303,8 +1341,12 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 	if (prot & KVM_PGTABLE_PROT_W)
 		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

-	if (prot & KVM_PGTABLE_PROT_X)
-		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+	ret = stage2_set_xn_attr(prot, &xn);
+	if (ret)
+		return ret;
+
+	set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;
+	clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;

 	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
 	if (!ret || ret == -EAGAIN)

From d93febe2ed2e0491af9d47f0ee6d4b01918877f4 Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Mon, 24 Nov 2025 11:01:45 -0800
Subject: [PATCH 13/82] KVM: arm64: nv: Forward FEAT_XNX permissions to the
 shadow stage-2
Add support for FEAT_XNX to shadow stage-2 MMUs, being careful to only evaluate XN[0] when the feature is actually exposed to the VM. Restructure the layering of permissions in the fault handler to assume pX and uX then restricting based on the guest's stage-2 afterwards. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-4-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_nested.h | 37 +++++++++++++++++++++++++++-- arch/arm64/kvm/mmu.c | 23 ++++++++++++++---- arch/arm64/kvm/nested.c | 5 +++- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index f7c06a840963..5d967b60414c 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -120,9 +120,42 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) return trans->writable; } -static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) +static inline bool kvm_has_xnx(struct kvm *kvm) { - return !(trans->desc & BIT(54)); + return cpus_have_final_cap(ARM64_HAS_XNX) && + kvm_has_feat(kvm, ID_AA64MMFR1_EL1, XNX, IMP); +} + +static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *trans) +{ + u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); + + if (!kvm_has_xnx(kvm)) + xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + + switch (xn) { + case 0b00: + case 0b01: + return true; + default: + return false; + } +} + +static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *trans) +{ + u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); + + if (!kvm_has_xnx(kvm)) + xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + + switch (xn) { + case 0b00: + case 0b11: + return true; + default: + return false; + } } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7cc964af8d30..96f1786c72fe 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1521,6 +1521,16 @@ static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, *prot |= kvm_encode_nested_level(nested); } +static void adjust_nested_exec_perms(struct kvm *kvm, + struct kvm_s2_trans *nested, + enum kvm_pgtable_prot *prot) +{ + if (!kvm_s2_trans_exec_el0(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_UX; + if (!kvm_s2_trans_exec_el1(kvm, nested)) + *prot &= ~KVM_PGTABLE_PROT_PX; +} + #define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED) static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1572,11 +1582,12 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (writable) prot |= KVM_PGTABLE_PROT_W; - if (exec_fault || - (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && - (!nested || kvm_s2_trans_executable(nested)))) + if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) prot |= KVM_PGTABLE_PROT_X; + if (nested) + adjust_nested_exec_perms(kvm, nested, &prot); + kvm_fault_lock(kvm); if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; @@ -1851,11 +1862,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= KVM_PGTABLE_PROT_NORMAL_NC; else prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && - (!nested || kvm_s2_trans_executable(nested))) { + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { prot |= KVM_PGTABLE_PROT_X; } + if (nested) + adjust_nested_exec_perms(kvm, 
nested, &prot); + /* * Under the premise of getting a FSC_PERM fault, we just need to relax * permissions only if vma_pagesize equals fault_granule. Otherwise, diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f04cda40545b..92b2a69f0b89 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -788,7 +788,10 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans) return 0; if (kvm_vcpu_trap_is_iabt(vcpu)) { - forward_fault = !kvm_s2_trans_executable(trans); + if (vcpu_mode_priv(vcpu)) + forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans); + else + forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans); } else { bool write_fault = kvm_is_write_fault(vcpu); From 8cb4ecec5e366b7dbbf200629a22624ad2340af5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:51 +0000 Subject: [PATCH 14/82] irqchip/gic: Add missing GICH_HCR control bits The GICH_HCR description is missing a bunch of control bits that control the maintenance interrupt. Add them. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-2-maz@kernel.org Signed-off-by: Oliver Upton --- include/linux/irqchip/arm-gic.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 2223f95079ce..d45fa19f9e47 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -86,7 +86,13 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_LRENPIE (1 << 2) #define GICH_HCR_NPIE (1 << 3) +#define GICH_HCR_VGrp0EIE (1 << 4) +#define GICH_HCR_VGrp0DIE (1 << 5) +#define GICH_HCR_VGrp1EIE (1 << 6) +#define GICH_HCR_VGrp1DIE (1 << 7) +#define GICH_HCR_EOICOUNT GENMASK(31, 27) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) From fa8f11e8e18383d234c77ba08d347aed7883d39a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:52 +0000 Subject: [PATCH 15/82] irqchip/gic: Expose CPU interface VA to KVM Future changes will require KVM to be able to perform deactivations by writing to the physical CPU interface. Add the corresponding VA to the kvm_info structure, and let KVM stash it. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-3-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 1 + drivers/irqchip/irq-gic.c | 3 +++ include/kvm/arm_vgic.h | 3 +++ include/linux/irqchip/arm-vgic-info.h | 2 ++ 4 files changed, 9 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 381673f03c39..441efef80d60 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -385,6 +385,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info) kvm_vgic_global_state.can_emulate_gicv2 = true; kvm_vgic_global_state.vcpu_base = info->vcpu.start; + kvm_vgic_global_state.gicc_base = info->gicc_base; kvm_vgic_global_state.type = VGIC_V2; kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 1269ab8eb726..ec70c84e9f91 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1459,6 +1459,8 @@ static void __init gic_of_setup_kvm_info(struct device_node *node) if (ret) return; + gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base; + if (static_branch_likely(&supports_deactivate_key)) vgic_set_kvm_info(&gic_v2_kvm_info); } @@ -1620,6 +1622,7 @@ static void __init gic_acpi_setup_kvm_info(void) return; gic_v2_kvm_info.maint_irq = irq; + gic_v2_kvm_info.gicc_base = gic_data[0].cpu_base.common_base; vgic_set_kvm_info(&gic_v2_kvm_info); } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 7a0b972eb1b1..577723f5599b 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -59,6 +59,9 @@ struct vgic_global { /* virtual control interface mapping, HYP VA */ void __iomem *vctrl_hyp; + /* Physical CPU interface, kernel VA */ + void __iomem *gicc_base; + /* Number of implemented list registers */ int nr_lr; diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index a470a73a805a..67d9d960273b 100644 --- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -24,6 +24,8 @@ struct gic_kvm_info { enum gic_type type; /* Virtual CPU interface */ struct resource vcpu; + /* GICv2 GICC VA */ + void __iomem *gicc_base; /* Interrupt number */ unsigned int maint_irq; /* No interrupt mask, no need to use the above field */ From 08f4f41c1e95ffb1ce525a07d25daa577110d748 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:53 +0000 Subject: [PATCH 16/82] irqchip/apple-aic: Spit out ICH_MISR_EL2 value on spurious vGIC MI It is all good and well to scream about spurious vGIC maintenance interrupts. It would be even better to output the reason why, which is already checked, but not printed out. The unsuspecting kernel tinkerer thanks you. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-4-maz@kernel.org Signed-off-by: Oliver Upton --- drivers/irqchip/irq-apple-aic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 032d66dceb8e..4607f4943b19 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -411,12 +411,15 @@ static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs) if (is_kernel_in_hyp_mode() && (read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && read_sysreg_s(SYS_ICH_MISR_EL2) != 0) { + u64 val; + generic_handle_domain_irq(aic_irqc->hw_domain, AIC_FIQ_HWIRQ(AIC_VGIC_MI)); if (unlikely((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_En) && - read_sysreg_s(SYS_ICH_MISR_EL2))) { - pr_err_ratelimited("vGIC IRQ fired and not handled by KVM, disabling.\n"); + (val = read_sysreg_s(SYS_ICH_MISR_EL2)))) { + pr_err_ratelimited("vGIC IRQ fired and not handled by KVM (MISR=%llx), disabling.\n", + val); sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0); } } From 8d3dfab1d305d61359454d9c09b736f077a9fce4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:54 +0000 Subject: [PATCH 17/82] KVM: arm64: Turn vgic-v3 errata traps into a patched-in constant The trap bits are currently only set to manage CPU errata. However, we are about to make use of them for purposes beyond beating broken CPUs into submission. For this purpose, turn these errata-driven bits into a patched-in constant that is merged with the KVM-driven value at the point of programming the ICH_HCR_EL2 register, rather than being directly stored with the shadow value. This allows the KVM code to distinguish between a trap being handled for the purpose of an erratum workaround and one handled for KVM's own need. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-5-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kvm/hyp/vgic-v3-sr.c | 21 ++++--- arch/arm64/kvm/vgic/vgic-v3-nested.c | 12 +--- arch/arm64/kvm/vgic/vgic-v3.c | 83 +++++++++++++++++----------- arch/arm64/kvm/vgic/vgic.h | 16 ++++++ 5 files changed, 84 insertions(+), 49 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5369763606e7..85bc629270bd 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -91,6 +91,7 @@ KVM_NVHE_ALIAS(spectre_bhb_patch_loop_mitigation_enable); KVM_NVHE_ALIAS(spectre_bhb_patch_wa3); KVM_NVHE_ALIAS(spectre_bhb_patch_clearbhb); KVM_NVHE_ALIAS(alt_cb_patch_nops); +KVM_NVHE_ALIAS(kvm_compute_ich_hcr_trap_bits); /* Global kernel state accessed by nVHE hyp code.
*/ KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index acd909b7f225..e72d436dd6a3 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -14,6 +14,8 @@ #include #include +#include "../../vgic/vgic.h" + #define vtr_to_max_lr_idx(v) ((v) & 0xf) #define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) #define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5)) @@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n) return val; } +static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if) +{ + return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits(); +} + void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { u64 used_lrs = cpu_if->used_lrs; @@ -218,7 +225,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) elrsr = read_gicreg(ICH_ELRSR_EL2); - write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2); + write_gicreg(0, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) @@ -237,7 +244,7 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) int i; if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); @@ -307,14 +314,14 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * If we need to trap system registers, we must write - * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected. Note that this also applies if we don't expect - * any system register access (no vgic at all). + * If we need to trap system registers, we must write ICH_HCR_EL2 + * anyway, even if no interrupts are being injected. Note that this + * also applies if we don't expect any system register access (no + * vgic at all). In any case, no need to provide MI configuration. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); + write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2); } void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 7f1259b49c50..1fc9e0780abe 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -298,19 +298,9 @@ static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val = 0; int i; - /* - * If we're on a system with a broken vgic that requires - * trapping, propagate the trapping requirements. - * - * Ah, the smell of rotten fruits... 
- */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap)) - val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | - ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR); - s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val; + s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2); s_cpu_if->vgic_sre = host_if->vgic_sre; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 6fbb4b099855..8c1494508682 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -301,20 +301,9 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) return; /* Hide GICv3 sysreg if necessary */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); - return; - } - - if (group0_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0; - if (group1_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1; - if (common_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC; - if (dir_trap) - vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -635,8 +624,50 @@ static const struct midr_range broken_seis[] = { static bool vgic_v3_broken_seis(void) { - return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) && - is_midr_in_range_list(broken_seis)); + return (is_kernel_in_hyp_mode() && + is_midr_in_range_list(broken_seis) && + (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS)); +} + +void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, + int nr_inst) +{ + u32 insn, oinsn, rd; + u64 hcr = 0; + + if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) { + group0_trap = true; + group1_trap = true; + } + + if (vgic_v3_broken_seis()) { + /* We know that these machines have ICH_HCR_EL2.TDIR */ + group0_trap = true; + group1_trap = true; + dir_trap = true; + } + + if (group0_trap) + hcr |= ICH_HCR_EL2_TALL0; + if (group1_trap) + hcr |= ICH_HCR_EL2_TALL1; + if (common_trap) + hcr |= ICH_HCR_EL2_TC; + if (dir_trap) + hcr |= ICH_HCR_EL2_TDIR; + + /* Compute target register */ + oinsn = le32_to_cpu(*origptr); + rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn); + + /* movz rd, #(val & 0xffff) */ + insn = aarch64_insn_gen_movewide(rd, + (u16)hcr, + 0, + AARCH64_INSN_VARIANT_64BIT, + AARCH64_INSN_MOVEWIDE_ZERO); + *updptr = cpu_to_le32(insn); } /** @@ -650,6 +681,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; + u64 traps; int ret; has_v2 = ich_vtr_el2 >> 63; @@ -708,29 +740,18 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (has_v2) static_branch_enable(&vgic_v3_has_v2_compat); - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { - group0_trap = true; - group1_trap = true; - } - if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); - kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; - group0_trap = true; - group1_trap = true; - if (ich_vtr_el2 & ICH_VTR_EL2_TDS) - dir_trap = true; - else - common_trap = true; } - if (group0_trap || group1_trap || common_trap | dir_trap) { + traps = vgic_ich_hcr_trap_bits(); + if (traps) { kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", - group0_trap ? "G0" : "", - group1_trap ? "G1" : "", - common_trap ? "C" : "", - dir_trap ? "D" : ""); + (traps & ICH_HCR_EL2_TALL0) ? 
"G0" : "", + (traps & ICH_HCR_EL2_TALL1) ? "G1" : "", + (traps & ICH_HCR_EL2_TC) ? "C" : "", + (traps & ICH_HCR_EL2_TDIR) ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index ac5f9c5d2b98..0ecadfa00397 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -164,6 +164,22 @@ static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } +void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst); + +static inline u64 vgic_ich_hcr_trap_bits(void) +{ + u64 hcr; + + /* All the traps are in the bottom 16bits */ + asm volatile(ALTERNATIVE_CB("movz %0, #0\n", + ARM64_ALWAYS_SYSTEM, + kvm_compute_ich_hcr_trap_bits) + : "=r" (hcr)); + + return hcr; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC From 567ebfedb5bd204a8ce6a11695f02730f1bf57f4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:55 +0000 Subject: [PATCH 18/82] KVM: arm64: vgic-v3: Fix GICv3 trapping in protected mode As we are about to start trapping a bunch of extra things, augment the pKVM trap description with all the registers trapped by ICH_HCR_EL2.TC, making them legal instead of resulting in a UNDEF injection in the guest. While we're at it, ensure that pKVM captures the vgic model so that it can be checked by the emulation code. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-6-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ arch/arm64/kvm/hyp/nvhe/sys_regs.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 43bde061b65d..8911338961c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -337,6 +337,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* CTR_EL0 is always under host control, even for protected VMs. */ hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0; + /* Preserve the vgic model so that GICv3 emulation works */ + hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model; + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 82da9b03692d..3108b5185c20 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -444,6 +444,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Scalable Vector Registers are restricted. */ + HOST_HANDLED(SYS_ICC_PMR_EL1), + RAZ_WI(SYS_ERRIDR_EL1), RAZ_WI(SYS_ERRSELR_EL1), RAZ_WI(SYS_ERXFR_EL1), @@ -457,9 +459,12 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Limited Ordering Regions Registers are restricted. 
*/ + HOST_HANDLED(SYS_ICC_DIR_EL1), + HOST_HANDLED(SYS_ICC_RPR_EL1), HOST_HANDLED(SYS_ICC_SGI1R_EL1), HOST_HANDLED(SYS_ICC_ASGI1R_EL1), HOST_HANDLED(SYS_ICC_SGI0R_EL1), + HOST_HANDLED(SYS_ICC_CTLR_EL1), { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, HOST_HANDLED(SYS_CCSIDR_EL1), From 2a28810cbb8b21a4016182617cc1fd72eddf4a36 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:56 +0000 Subject: [PATCH 19/82] KVM: arm64: GICv3: Detect and work around the lack of ICV_DIR_EL1 trapping A long time ago, an unsuspecting architect forgot to add a trap bit for ICV_DIR_EL1 in ICH_HCR_EL2. Which was unfortunate, but what's a bit of spec between friends? Thankfully, this was fixed in a later revision, and ARM "deprecates" the lack of trapping ability. Unfortunately, a few (billion) CPUs went out with that defect, anything ARMv8.0 from ARM, give or take. And on these CPUs, you can't trap DIR on its own, full stop. As the next best thing, we can trap everything in the common group, which is a tad expensive, but hey ho, that's what you get. You can otherwise recycle the HW in the nearby bin. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-7-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/virt.h | 7 ++++- arch/arm64/kernel/cpufeature.c | 52 ++++++++++++++++++++++++++++++++++ arch/arm64/kernel/hyp-stub.S | 5 ++++ arch/arm64/kvm/vgic/vgic-v3.c | 3 ++ arch/arm64/tools/cpucaps | 1 + 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index aa280f356b96..8eb63d329497 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -40,8 +40,13 @@ */ #define HVC_FINALISE_EL2 3 +/* + * HVC_GET_ICH_VTR_EL2 - Retrieve the ICH_VTR_EL2 value + */ +#define HVC_GET_ICH_VTR_EL2 4 + /* Max number of HYP stub hypercalls */ -#define HVC_STUB_HCALL_NR 4 +#define HVC_STUB_HCALL_NR 5 /* Error returned when an invalid stub number is passed into x0 */ #define HVC_STUB_ERR 0xbadca11 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5ed401ff79e3..5de51cb1b8fe 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2303,6 +2303,49 @@ static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry } #endif +static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, + int scope) +{ + static const struct midr_range has_vgic_v3[] = { + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), + {}, + }; + struct arm_smccc_res res = {}; + + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); + BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); + if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF) && + !is_midr_in_range_list(has_vgic_v3)) + return false; + + if (!is_hyp_mode_available()) + return false; + + if (cpus_have_cap(ARM64_HAS_GICV5_LEGACY)) + return true; + + if (is_kernel_in_hyp_mode()) +
res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2); + else + arm_smccc_1_1_hvc(HVC_GET_ICH_VTR_EL2, &res); + + if (res.a0 == HVC_STUB_ERR) + return false; + + return res.a1 & ICH_VTR_EL2_TDS; +} + #ifdef CONFIG_ARM64_BTI static void bti_enable(const struct arm64_cpu_capabilities *__unused) { @@ -2814,6 +2857,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_gic_prio_relaxed_sync, }, #endif + { + /* + * Depends on having GICv3 + */ + .desc = "ICV_DIR_EL1 trapping", + .capability = ARM64_HAS_ICH_HCR_EL2_TDIR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = can_trap_icv_dir_el1, + }, #ifdef CONFIG_ARM64_E0PD { .desc = "E0PD", diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 36e2d26b54f5..085bc9972f6b 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -54,6 +54,11 @@ SYM_CODE_START_LOCAL(elx_sync) 1: cmp x0, #HVC_FINALISE_EL2 b.eq __finalise_el2 + cmp x0, #HVC_GET_ICH_VTR_EL2 + b.ne 2f + mrs_s x1, SYS_ICH_VTR_EL2 + b 9f + 2: cmp x0, #HVC_SOFT_RESTART b.ne 3f mov x0, x2 diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 8c1494508682..1b6c3071ec80 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -648,6 +648,9 @@ void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt, dir_trap = true; } + if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR)) + common_trap = true; + if (group0_trap) hcr |= ICH_HCR_EL2_TALL0; if (group1_trap) diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 1b32c1232d28..116d1a7b688c 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -40,6 +40,7 @@ HAS_GICV5_CPUIF HAS_GICV5_LEGACY HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC +HAS_ICH_HCR_EL2_TDIR HAS_HCR_NV1 HAS_HCX HAS_LDAPR From a4413a7c31cfca49d3f4830cf8a45edf4a713f63 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:57 +0000 Subject: [PATCH 20/82] KVM: arm64: Repack struct vgic_irq fields struct vgic_irq has grown over the years, in a rather bad way. Repack it, using bitfields for the individual flags, and move things around a bit so that it is a bit smaller.
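One knock-on effect of the bitfield conversion shows up in the vgic-v4.c hunk below: a 1-bit field has no address, so irq_get_irqchip_state() can no longer be pointed at pending_latch directly and has to go through a local bool. A minimal sketch of the pattern:

	bool pending;

	/* &irq->pending_latch no longer compiles once it is a bitfield */
	ret = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING,
				    &pending);
	irq->pending_latch = pending;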
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-8-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v4.c | 5 ++++- include/kvm/arm_vgic.h | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 548aec9d5a72..09c3e9eb23f8 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -163,6 +163,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); struct irq_desc *desc; unsigned long flags; + bool pending; int ret; raw_spin_lock_irqsave(&irq->irq_lock, flags); @@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu) irq->hw = false; ret = irq_get_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, - &irq->pending_latch); + &pending); WARN_ON(ret); + irq->pending_latch = pending; + desc = irq_to_desc(irq->host_irq); irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); unlock: diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 577723f5599b..e84a1bc5cf17 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -123,6 +123,7 @@ struct irq_ops { struct vgic_irq { raw_spinlock_t irq_lock; /* Protects the content of the struct */ + u32 intid; /* Guest visible INTID */ struct rcu_head rcu; struct list_head ap_list; @@ -137,17 +138,17 @@ struct vgic_irq { * affinity reg (v3). */ - u32 intid; /* Guest visible INTID */ - bool line_level; /* Level only */ - bool pending_latch; /* The pending latch state used to calculate - * the pending state for both level - * and edge triggered IRQs. */ - bool active; - bool pending_release; /* Used for LPIs only, unreferenced IRQ + bool pending_release:1; /* Used for LPIs only, unreferenced IRQ * pending a release */ - bool enabled; - bool hw; /* Tied to HW IRQ */ + bool pending_latch:1; /* The pending latch state used to calculate + * the pending state for both level + * and edge triggered IRQs. */ + enum vgic_irq_config config:1; /* Level or edge */ + bool line_level:1; /* Level only */ + bool enabled:1; + bool active:1; + bool hw:1; /* Tied to HW IRQ */ refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ @@ -159,7 +160,6 @@ struct vgic_irq { u8 active_source; /* GICv2 SGIs only */ u8 priority; u8 group; /* 0 == group 0, 1 == group 1 */ - enum vgic_irq_config config; /* Level or edge */ struct irq_ops *ops; From 879a7fd4fd64656d953f887e6a18e13e0b9a9f8f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:58 +0000 Subject: [PATCH 21/82] KVM: arm64: Add tracking of vgic_irq being present in a LR We currently cannot identify whether an interrupt is queued into a LR. It wasn't needed until now, but that's about to change. Add yet another flag to track that state. 
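The life cycle is simple: the populate paths warn if the flag is already set and then set it when the interrupt is copied into a shadow LR, and the fold paths clear it when the LR content is written back. A sketch of a (hypothetical) query under the irq_lock:

	bool queued;

	lockdep_assert_held(&irq->irq_lock);
	/* true iff the interrupt currently sits in a shadow LR */
	queued = irq->on_lr;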
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-9-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 6 ++++++ arch/arm64/kvm/vgic/vgic-v3.c | 6 ++++++ include/kvm/arm_vgic.h | 1 + 3 files changed, 13 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 441efef80d60..74efacba38d4 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -101,6 +101,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); + irq->on_lr = false; + raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } @@ -124,6 +126,8 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) u32 val = irq->intid; bool allow_pending = true; + WARN_ON(irq->on_lr); + if (irq->active) { val |= GICH_LR_ACTIVE_BIT; if (vgic_irq_is_sgi(irq->intid)) @@ -194,6 +198,8 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) /* The GICv2 LR only holds five bits of priority. */ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + irq->on_lr = true; + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 1b6c3071ec80..e3f4b27e0225 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -97,6 +97,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); + irq->on_lr = false; + raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } @@ -111,6 +113,8 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) u64 val = irq->intid; bool allow_pending = true, is_v2_sgi; + WARN_ON(irq->on_lr); + is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2); @@ -185,6 +189,8 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + + irq->on_lr = true; } void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index e84a1bc5cf17..ec349c5a4a8b 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -149,6 +149,7 @@ struct vgic_irq { bool enabled:1; bool active:1; bool hw:1; /* Tied to HW IRQ */ + bool on_lr:1; /* Present in a CPU LR */ refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ From 0dc433e79ad031801842e3d8bc5d9729e14f5067 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:59 +0000 Subject: [PATCH 22/82] KVM: arm64: Add LR overflow handling documentation Add a bit of documentation describing how we are dealing with LR overflow. This is mostly a braindump of how things are expected to work. For now anyway. 
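As a worked example of the sort of situation the comment covers (numbers picked purely for illustration): with 4 LRs and 6 pending interrupts on the ap_list, the 4 highest-priority ones are loaded into the LRs, NPIE is set so that a maintenance interrupt fires once none of the loaded ones is still pending, and the remaining 2 get their turn at that point; if some of those interrupts are active rather than pending, LRENPIE and a non-zero EOIcount are what signal their deactivation outside of the LRs.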
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-10-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 81 +++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 6dd5a10081e2..7ee253a9fb77 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -825,7 +825,86 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu, return count; } -/* Requires the VCPU's ap_list_lock to be held. */ +/* + * Dealing with LR overflow is close to black magic -- dress accordingly. + * + * We have to present an almost infinite number of interrupts through a very + * limited number of registers. Therefore crucial decisions must be made to + * ensure we feed the most relevant interrupts into the LRs, and yet have + * some facilities to let the guest interact with those that are not there. + * + * All considerations below are in the context of interrupts targeting a + * single vcpu with non-idle state (either pending, active, or both), + * colloquially called the ap_list: + * + * - Pending interrupts must have priority over active interrupts. This also + * excludes pending+active interrupts. This ensures that a guest can + * perform priority drops on any number of interrupts, and yet be + * presented the next pending one. + * + * - Deactivation of interrupts outside of the LRs must be tracked by using + * the EOIcount-driven maintenance interrupt, and sometimes by + * trapping the DIR register. + * + * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the + * point that made it into the LRs, and deactivating interrupts that would + * have made it onto the LRs if we had the space. + * + * - The MI-generation bits must be used to try and force an exit when the + * guest has done enough changes to the LRs that we want to reevaluate the + * situation: + * + * - if the total number of pending interrupts exceeds the number of + * LRs, NPIE must be set in order to exit once no pending interrupts + * are present in the LRs, allowing us to populate the next batch. + * + * - if there are active interrupts outside of the LRs, then LRENPIE + * must be set so that we exit on deactivation of one of these, and + * work out which one is to be deactivated. Note that this is not + * enough to deal with EOImode=1, see below. + * + * - if the overall number of interrupts exceeds the number of LRs, + * then UIE must be set to allow refilling of the LRs once the + * majority of them has been processed. + * + * - as usual, MI triggers are only an optimisation, since we cannot + * rely on the MI being delivered in a timely manner... + * + * - EOImode=1 creates some additional problems: + * + * - deactivation can happen in any order, and we cannot rely on + * EOImode=0's coupling of priority-drop and deactivation which + * imposes strict reverse Ack order. This means that DIR must + * trap if we have active interrupts outside of the LRs. + * + * - deactivation of SPIs can occur on any CPU, while the SPI is only + * present in the ap_list of the CPU that actually ack-ed it. In that + * case, EOIcount doesn't provide enough information, and we must + * resort to trapping DIR even if we don't overflow the LRs. Bonus + * point for not trapping DIR when no SPIs are pending or active in + * the whole VM.
+ * + * - LPIs do not suffer the same problem as SPIs on deactivation, as we + * have to essentially discard the active state, see below. + * + * - Virtual LPIs have an active state (surprise!), which gets removed on + * priority drop (EOI). However, EOIcount doesn't get bumped when the LPI + * is not present in the LR (surprise again!). Special care must therefore + * be taken to remove the active state from any activated LPI when exiting + * from the guest. This is in a way no different from what happens on the + * physical side. We still rely on the running priority to have been + * removed from the APRs, irrespective of the LPI being present in the LRs + * or not. + * + * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as + * they are not managed in SW and don't have a true active state. So only + * set vSGIEOICount when no SGIs are in the ap_list. + * + * - GICv2 SGIs with multiple sources are injected one source at a time, as + * if they were made pending sequentially. This may mean that we don't + * always present the HPPI if other interrupts with lower priority are + * pending in the LRs. Big deal. + */ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; From 73c9726975af1c2bf8d062017c67bcf4fb8821d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:00 +0000 Subject: [PATCH 23/82] KVM: arm64: GICv3: Drop LPI active state when folding LRs Despite LPIs not having an active state, *virtual* LPIs do have one, which gets cleared on EOI. So far, so good. However, this leads to a small problem: when an active LPI is not in the LRs, EOImode==0, and the guest EOIs it, EOIcount doesn't get bumped up. Which means that in these conditions, the LPI would stay active forever. Clearly, we can't have that. So if we spot an active LPI, we drop that state. It's pretty pointless anyway, and only serves as a way to trip SW over. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-11-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index e3f4b27e0225..81d22f615fa6 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -72,7 +72,9 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit, note deactivation */ + /* Always preserve the active bit for !LPIs, note deactivation */ + if (irq->intid >= VGIC_MIN_LPI) + val &= ~ICH_LR_ACTIVE_BIT; deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); From f4ded7b0848e6fcc9c882a1fdaa925d921c932f1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:01 +0000 Subject: [PATCH 24/82] KVM: arm64: GICv3: Preserve EOIcount on exit EOIcount is how the virtual CPU interface signals that the guest is deactivating interrupts outside of the LRs when EOImode==0. We therefore need to preserve that information so that we can find out what actually needs deactivating.
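A sketch of how the preserved count can then be consumed on the sync path (the helper below is hypothetical; only the field extraction mirrors this patch):

	u8 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpu_if->vgic_hcr);

	/* EOImode=0: each unit is one priority-drop-plus-deactivate */
	while (eoicount--)
		vgic_deactivate_hppi_outside_lrs(vcpu);	/* hypothetical */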
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-12-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index e72d436dd6a3..9bfcbfd91118 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -225,6 +225,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) elrsr = read_gicreg(ICH_ELRSR_EL2); + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + write_gicreg(0, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) { From 00c6d0d4a80582a43578380f5283940c2e16eec8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:02 +0000 Subject: [PATCH 25/82] KVM: arm64: GICv3: Decouple ICH_HCR_EL2 programming from LRs Not programming ICH_HCR_EL2 while no LRs are populated is a bit of an issue, as we otherwise don't see any maintenance interrupt when the guest interacts with the LRs. Decouple the two and always program the control register, even when we don't have to touch the LRs. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-13-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 9bfcbfd91118..2509b52bbd62 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -219,20 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } } - if (used_lrs || cpu_if->its_vpe.its_vm) { + if (used_lrs) { int i; u32 elrsr; elrsr = read_gicreg(ICH_ELRSR_EL2); - if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { - u64 val = read_gicreg(ICH_HCR_EL2); - cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; - cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; - } - - write_gicreg(0, ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) { if (elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; @@ -242,6 +234,14 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) __gic_v3_set_lr(0, i); } } + + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { + u64 val = read_gicreg(ICH_HCR_EL2); + cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; + cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount; + } + + write_gicreg(0, ICH_HCR_EL2); } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) @@ -249,12 +249,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) u64 used_lrs = cpu_if->used_lrs; int i; - if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); + write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } + for (i = 0; i < used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Ensure that writes to the LRs, and on non-VHE systems ensure that From 438e47b697f7913bbb9f44e48b7b6e98389c9e0e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:03 +0000 Subject: [PATCH 26/82] KVM: arm64: GICv3: Extract LR folding primitive As we are going to need to handle deactivation for interrupts that are not in the LRs, split vgic_v3_fold_lr_state() into a helper that deals with a single interrupt, and the function that 
loops over the used LRs. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-14-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 88 +++++++++++++++++------------------ 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 81d22f615fa6..6b7d7b4048f0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -33,78 +33,76 @@ static bool lr_signals_eoi_mi(u64 lr_val) !(lr_val & ICH_LR_HW); } -void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - int lr; + struct vgic_irq *irq; + bool is_v2_sgi = false; + bool deactivated; + u32 intid; - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + intid = val & ICH_LR_VIRTUAL_ID_MASK; + } else { + intid = val & GICH_LR_VIRTUALID; + is_v2_sgi = vgic_irq_is_sgi(intid); + } - cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE; + irq = vgic_get_vcpu_irq(vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + return; - for (lr = 0; lr < cpuif->used_lrs; lr++) { - u64 val = cpuif->vgic_lr[lr]; - u32 intid, cpuid; - struct vgic_irq *irq; - bool is_v2_sgi = false; - bool deactivated; - - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { - intid = val & ICH_LR_VIRTUAL_ID_MASK; - } else { - intid = val & GICH_LR_VIRTUALID; - is_v2_sgi = vgic_irq_is_sgi(intid); - } - - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_vcpu_irq(vcpu, intid); - if (!irq) /* An LPI could have been unmapped. */ - continue; - - raw_spin_lock(&irq->irq_lock); + /* Notify fds when the guest EOI'ed a level-triggered IRQ */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit for !LPIs, note deactivation */ if (irq->intid >= VGIC_MIN_LPI) val &= ~ICH_LR_ACTIVE_BIT; deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); - if (irq->active && is_v2_sgi) - irq->active_source = cpuid; - /* Edge is the only case where we preserve the pending bit */ if (irq->config == VGIC_CONFIG_EDGE && - (val & ICH_LR_PENDING_BIT)) { + (val & ICH_LR_PENDING_BIT)) irq->pending_latch = true; - if (is_v2_sgi) - irq->source |= (1 << cpuid); - } - /* * Clear soft pending state when level irqs have been acked. 
*/ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; + if (is_v2_sgi) { + u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val); + + if (irq->active) + irq->active_source = cpuid; + + if (val & ICH_LR_PENDING_BIT) + irq->source |= BIT(cpuid); + } + /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); irq->on_lr = false; - - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); } + vgic_put_irq(vcpu->kvm, irq); +} + +void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < cpuif->used_lrs; lr++) + vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]); + cpuif->used_lrs = 0; } From 1ae0448ca7976281e7ec1d2cd1c861fbc8f8631e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:04 +0000 Subject: [PATCH 27/82] KVM: arm64: GICv3: Extract LR computing primitive Split vgic_v3_populate_lr() into two, so that we have another primitive that computes the LR from a vgic_irq, but doesn't update anything in the shadow structure. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-15-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 49 +++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 6b7d7b4048f0..bcce7f35a6d6 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -107,7 +107,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) } /* Requires the irq to be locked already */ -void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; @@ -154,6 +154,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; + if (is_v2_sgi) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= ICH_LR_EOI; + } + } + + if (irq->group) + val |= ICH_LR_GROUP; + + val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; + + return val; +} + +void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 model = vcpu->kvm->arch.vgic.vgic_model; + u64 val = vgic_v3_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + + if (val & ICH_LR_PENDING_BIT) { if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; @@ -161,16 +190,9 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) model == KVM_DEV_TYPE_ARM_VGIC_V2) { u32 src = ffs(irq->source); - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { + irq->source &= ~BIT(src - 1); + if (irq->source) irq->pending_latch = true; - val |= ICH_LR_EOI; - } } } @@ -183,13 +205,6 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) 
irq->line_level = false; - if (irq->group) - val |= ICH_LR_GROUP; - - val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; - irq->on_lr = true; } From 5ceb3dac80229684c8e57993f12106cbad23f7ac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:05 +0000 Subject: [PATCH 28/82] KVM: arm64: GICv2: Preserve EOIcount on exit EOIcount is how the virtual CPU interface signals that the guest is deactivating interrupts outside of the LRs when EOImode==0. We therefore need to preserve that information so that we can find out what actually needs deactivating, just like we already do on GICv3. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-16-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 74efacba38d4..5cfbe5898342 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -437,6 +437,12 @@ void vgic_v2_save_state(struct kvm_vcpu *vcpu) return; if (used_lrs) { + if (vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr & GICH_HCR_LRENPIE) { + u32 val = readl_relaxed(base + GICH_HCR); + + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_EOICOUNT; + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= val & GICH_HCR_EOICOUNT; + } save_lrs(vcpu, base); writel_relaxed(0, base + GICH_HCR); } From a00c88ac1f90992e618cf4737e2d1c551c13aed6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:06 +0000 Subject: [PATCH 29/82] KVM: arm64: GICv2: Decouple GICH_HCR programming from LRs being loaded Not programming GICH_HCR while no LRs are populated is a bit of an issue, as we otherwise don't see any maintenance interrupt when the guest interacts with the LRs. Decouple the two and always program the control register, even when we don't have to touch the LRs. This is very similar to what we are already doing for GICv3. 
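The count lives in the GICH_HCR_EOICOUNT field (the GENMASK(31, 27) added earlier in this series), so extracting it on the GICv2 side is the direct analogue of the GICv3 case; a sketch:

	u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpu_if->vgic_hcr);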
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-17-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 5cfbe5898342..a0d803c5b08a 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -430,22 +430,25 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) void vgic_v2_save_state(struct kvm_vcpu *vcpu) { + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; void __iomem *base = kvm_vgic_global_state.vctrl_base; u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; if (!base) return; - if (used_lrs) { - if (vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr & GICH_HCR_LRENPIE) { - u32 val = readl_relaxed(base + GICH_HCR); - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_EOICOUNT; - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= val & GICH_HCR_EOICOUNT; - } + if (used_lrs) save_lrs(vcpu, base); - writel_relaxed(0, base + GICH_HCR); + + if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) { + u32 val = readl_relaxed(base + GICH_HCR); + + cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT; + cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT; } + + writel_relaxed(0, base + GICH_HCR); } void vgic_v2_restore_state(struct kvm_vcpu *vcpu) @@ -458,13 +461,10 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu) if (!base) return; - if (used_lrs) { - writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); - for (i = 0; i < used_lrs; i++) { - writel_relaxed(cpu_if->vgic_lr[i], - base + GICH_LR0 + (i * 4)); - } - } + writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); + + for (i = 0; i < used_lrs; i++) + writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); } void vgic_v2_load(struct kvm_vcpu *vcpu) From 3aa9a50c2007e4090b0b5b3c79aed6f63b5e6c49 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:07 +0000 Subject: [PATCH 30/82] KVM: arm64: GICv2: Extract LR folding primitive As we are going to need to handle deactivation for interrupts that are not in the LRs, split vgic_v2_fold_lr_state() into a helper that deals with a single interrupt, and the function that loops over the used LRs. 
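The observable effect (a consequence of the hunks below, not extra code): GICH_HCR is now live for the whole guest run, so an underflow or no-pending maintenance interrupt can fire even when entry happened with zero LRs in use:

	/* entry with an empty ap_list still programs the MI configuration */
	writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);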
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-18-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 67 +++++++++++++++++------------------ 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index a0d803c5b08a..fb8efdd4196b 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -39,43 +39,23 @@ static bool lr_signals_eoi_mi(u32 lr_val) !(lr_val & GICH_LR_HW); } -/* - * transfer the content of the LRs back into the corresponding ap_list: - * - active bit is transferred as is - * - pending bit is - * - transferred as is in case of edge sensitive IRQs - * - set to the line-level (resample time) for level sensitive IRQs - */ -void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; - int lr; + u32 cpuid, intid = val & GICH_LR_VIRTUALID; + struct vgic_irq *irq; + bool deactivated; - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + /* Extract the source vCPU id from the LR */ + cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7; - cpuif->vgic_hcr &= ~GICH_HCR_UIE; + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); - for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) { - u32 val = cpuif->vgic_lr[lr]; - u32 cpuid, intid = val & GICH_LR_VIRTUALID; - struct vgic_irq *irq; - bool deactivated; - - /* Extract the source vCPU id from the LR */ - cpuid = val & GICH_LR_PHYSID_CPUID; - cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; - cpuid &= 7; - - /* Notify fds when the guest EOI'ed a level-triggered SPI */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - - irq = vgic_get_vcpu_irq(vcpu, intid); - - raw_spin_lock(&irq->irq_lock); + irq = vgic_get_vcpu_irq(vcpu, intid); + scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit, note deactivation */ deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); @@ -102,11 +82,28 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); irq->on_lr = false; - - raw_spin_unlock(&irq->irq_lock); - vgic_put_irq(vcpu->kvm, irq); } + vgic_put_irq(vcpu->kvm, irq); +} + +/* + * transfer the content of the LRs back into the corresponding ap_list: + * - active bit is transferred as is + * - pending bit is + * - transferred as is in case of edge sensitive IRQs + * - set to the line-level (resample time) for level sensitive IRQs + */ +void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + + for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) + vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]); + cpuif->used_lrs = 0; } From 0660bc4a2b70e7158f63ea1777132d1c93188fe8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:08 +0000 Subject: [PATCH 31/82] KVM: arm64: GICv2: Extract LR computing primitive Split vgic_v2_populate_lr() into two helpers, so that we have another primitive that computes the LR from a 
vgic_irq, but doesn't update anything in the shadow structure. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-19-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 63 ++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index fb8efdd4196b..5a2165a8d22c 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -107,18 +107,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->used_lrs = 0; } -/* - * Populates the particular LR with the state of a given IRQ: - * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq - * - for a level sensitive IRQ the pending state value is unchanged; - * it is dictated directly by the input level - * - * If @irq describes an SGI with multiple sources, we choose the - * lowest-numbered source VCPU and clear that bit in the source bitmap. - * - * The irq_lock must be held by the caller. - */ -void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 val = irq->intid; bool allow_pending = true; @@ -164,22 +153,52 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) if (allow_pending && irq_is_pending(irq)) { val |= GICH_LR_PENDING_BIT; + if (vgic_irq_is_sgi(irq->intid)) { + u32 src = ffs(irq->source); + + if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", + irq->intid)) + return 0; + + val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; + if (irq->source & ~BIT(src - 1)) + val |= GICH_LR_EOI; + } + } + + /* The GICv2 LR only holds five bits of priority. */ + val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; + + return val; +} + +/* + * Populates the particular LR with the state of a given IRQ: + * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq + * - for a level sensitive IRQ the pending state value is unchanged; + * it is dictated directly by the input level + * + * If @irq describes an SGI with multiple sources, we choose the + * lowest-numbered source VCPU and clear that bit in the source bitmap. + * + * The irq_lock must be held by the caller. 
+ */ +void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) +{ + u32 val = vgic_v2_compute_lr(vcpu, irq); + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; + + if (val & GICH_LR_PENDING_BIT) { if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; if (vgic_irq_is_sgi(irq->intid)) { u32 src = ffs(irq->source); - if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", - irq->intid)) - return; - - val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; - irq->source &= ~(1 << (src - 1)); - if (irq->source) { + irq->source &= ~BIT(src - 1); + if (irq->source) irq->pending_latch = true; - val |= GICH_LR_EOI; - } } } @@ -196,8 +215,6 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; irq->on_lr = true; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val; } void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) From dd598fc1139f7181118719574a4e270e51e0a0eb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:09 +0000 Subject: [PATCH 32/82] KVM: arm64: Compute vgic state irrespective of the number of interrupts As we are going to rely on the [G]ICH_HCR{,_EL2} register to be programmed with MI information at all times, slightly de-optimise the flush/sync code to always be called. This is rather lightweight when no interrupts are in flight. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-20-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 7ee253a9fb77..39346baa2677 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -985,8 +985,6 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu) /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - int used_lrs; - /* If nesting, emulate the HW effect from L0 to L1 */ if (vgic_state_is_nested(vcpu)) { vgic_v3_sync_nested(vcpu); @@ -996,20 +994,10 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu)) vgic_v3_nested_update_mi(vcpu); - /* An empty ap_list_head implies used_lrs == 0 */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) - return; - if (can_access_vgic_from_kernel()) vgic_save_state(vcpu); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) - used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; - else - used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; - - if (used_lrs) - vgic_fold_lr_state(vcpu); + vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); } @@ -1053,29 +1041,10 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu)) vgic_v3_nested_update_mi(vcpu); - /* - * If there are no virtual interrupts active or pending for this - * VCPU, then there is no work to do and we can bail out without - * taking any lock. There is a potential race with someone injecting - * interrupts to the VCPU, but it is a benign race as the VCPU will - * either observe the new interrupt before or after doing this check, - * and introducing additional synchronization mechanism doesn't change - * this. - * - * Note that we still need to go through the whole thing if anything - * can be directly injected (GICv4). 
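The locking also moves over to scoped_guard(); as a reminder, the guard form used below is equivalent to the explicit sequence (per <linux/cleanup.h>):

	raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
	vgic_flush_lr_state(vcpu);
	raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);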
- */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && - !vgic_supports_direct_irqs(vcpu->kvm)) - return; - DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { - raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock) vgic_flush_lr_state(vcpu); - raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); - } if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); From cf72ee63711916ad808f82eb054dd9d69727a5bf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:10 +0000 Subject: [PATCH 33/82] KVM: arm64: Eagerly save VMCR on exit We currently save/restore the VMCR register in a pretty lazy way (on load/put, consistently with what we do with the APRs). However, we are going to need the group-enable bits that are backed by VMCR on each entry (so that we can avoid injecting interrupts for disabled groups). Move the synchronisation from put to sync, which results in some minor churn in the nVHE hypercalls to simplify things. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-21-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 2 +- arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kvm/arm.c | 3 +-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 7 ++++--- arch/arm64/kvm/hyp/vgic-v3-sr.c | 15 +++------------ arch/arm64/kvm/vgic/vgic-v2.c | 2 +- arch/arm64/kvm/vgic/vgic-v3-nested.c | 2 +- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- 8 files changed, 13 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9da54d4ee49e..f8adbd535b4a 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -79,7 +79,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_range, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, - __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_reserve_vm, __KVM_HOST_SMCCC_FUNC___pkvm_unreserve_vm, diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index e6be1f5d0967..dbf16a9f6772 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -82,7 +82,7 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..733195ef183e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -659,8 +659,7 @@ nommu: void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { if (is_protected_kvm_enabled()) { - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, - &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3); kvm_call_hyp_nvhe(__pkvm_vcpu_put); } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 29430c031095..a7c689152f68 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ 
b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -157,6 +157,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; for (i = 0; i < hyp_cpu_if->used_lrs; ++i) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } @@ -464,11 +465,11 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) __vgic_v3_init_lrs(); } -static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); } static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) @@ -616,7 +617,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_tlb_flush_vmid_range), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), HANDLE_FUNC(__pkvm_reserve_vm), HANDLE_FUNC(__pkvm_unreserve_vm), diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 2509b52bbd62..cafbb41b4c33 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -235,6 +235,8 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } } + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) { u64 val = read_gicreg(ICH_HCR_EL2); cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount; @@ -332,10 +334,6 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { u64 val; - if (!cpu_if->vgic_sre) { - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - } - /* Only restore SRE if the host implements the GICv2 interface */ if (static_branch_unlikely(&vgic_v3_has_v2_compat)) { val = read_gicreg(ICC_SRE_EL2); @@ -357,7 +355,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) write_gicreg(0, ICH_HCR_EL2); } -static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -518,13 +516,6 @@ static void __vgic_v3_write_vmcr(u32 vmcr) write_gicreg(vmcr, ICH_VMCR_EL2); } -void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) -{ - __vgic_v3_save_aprs(cpu_if); - if (cpu_if->vgic_sre) - cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); -} - void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) { __vgic_v3_compat_mode_enable(); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 5a2165a8d22c..07e93acafd04 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -451,6 +451,7 @@ void vgic_v2_save_state(struct kvm_vcpu *vcpu) if (!base) return; + cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); if (used_lrs) save_lrs(vcpu, base); @@ -495,6 +496,5 @@ void vgic_v2_put(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 1fc9e0780abe..1531e4907c65 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -340,7 
+340,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) u64 val; int i; - __vgic_v3_save_vmcr_aprs(s_cpu_if); + __vgic_v3_save_aprs(s_cpu_if); __vgic_v3_deactivate_traps(s_cpu_if); __vgic_v3_save_state(s_cpu_if); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index bcce7f35a6d6..5b276e303aab 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -815,7 +815,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) } if (likely(!is_protected_kvm_enabled())) - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) From 6780a756044c396f59e98befed537dbba4a085db Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:11 +0000 Subject: [PATCH 34/82] KVM: arm64: Revamp vgic maintenance interrupt configuration We currently don't use the maintenance interrupt very much, apart from EOI on level interrupts, and for LR underflow in limited cases. However, as we are moving toward a setup where active interrupts can live outside of the LRs, we need to use the MIs in a more diverse set of cases. Add a new helper that produces a digest of the ap_list, and use that summary to set the various control bits as required. This slightly changes the way v2 SGIs are handled, as they used to count for more than one interrupt, but not anymore. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-22-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 12 ++++- arch/arm64/kvm/vgic/vgic-v3.c | 18 ++++++- arch/arm64/kvm/vgic/vgic.c | 91 ++++++++++++----------------------- arch/arm64/kvm/vgic/vgic.h | 19 +++++++- 4 files changed, 74 insertions(+), 66 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 07e93acafd04..f53bc5528897 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -26,11 +26,19 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - cpuif->vgic_hcr |= GICH_HCR_UIE; + cpuif->vgic_hcr = GICH_HCR_EN; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= GICH_HCR_UIE; } static bool lr_signals_eoi_mi(u32 lr_val) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 5b276e303aab..81f1de9e3897 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -20,11 +20,25 @@ static bool common_trap; static bool dir_trap; static bool gicv4_enable; -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + cpuif->vgic_hcr = ICH_HCR_EL2_En; + + if (irqs_pending_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE; + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE; + if (irqs_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; + + if (!als->nr_sgi) + cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; } static bool lr_signals_eoi_mi(u64 lr_val) diff --git a/arch/arm64/kvm/vgic/vgic.c 
b/arch/arm64/kvm/vgic/vgic.c index 39346baa2677..7e6f02d48fff 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -791,38 +791,30 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr) vgic_v3_clear_lr(vcpu, lr); } -static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_set_underflow(vcpu); - else - vgic_v3_set_underflow(vcpu); -} - -/* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu, - bool *multi_sgi) +static void summarize_ap_list(struct kvm_vcpu *vcpu, + struct ap_list_summary *als) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; - int count = 0; - - *multi_sgi = false; lockdep_assert_held(&vgic_cpu->ap_list_lock); + *als = (typeof(*als)){}; + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - int w; + guard(raw_spinlock)(&irq->irq_lock); - raw_spin_lock(&irq->irq_lock); - /* GICv2 SGIs can count for more than one... */ - w = vgic_irq_get_lr_count(irq); - raw_spin_unlock(&irq->irq_lock); + if (unlikely(vgic_target_oracle(irq) != vcpu)) + continue; - count += w; - *multi_sgi |= (w > 1); + if (!irq->active) + als->nr_pend++; + else + als->nr_act++; + + if (irq->intid < VGIC_NR_SGIS) + als->nr_sgi++; } - return count; } /* @@ -908,60 +900,39 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu, static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct ap_list_summary als; struct vgic_irq *irq; - int count; - bool multi_sgi; - u8 prio = 0xff; - int i = 0; + int count = 0; lockdep_assert_held(&vgic_cpu->ap_list_lock); - count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) + summarize_ap_list(vcpu, &als); + + if (irqs_outside_lrs(&als)) vgic_sort_ap_list(vcpu); - count = 0; - list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - raw_spin_lock(&irq->irq_lock); + scoped_guard(raw_spinlock, &irq->irq_lock) { + if (likely(vgic_target_oracle(irq) == vcpu)) { + vgic_populate_lr(vcpu, irq, count++); + } + } - /* - * If we have multi-SGIs in the pipeline, we need to - * guarantee that they are all seen before any IRQ of - * lower priority. In that case, we need to filter out - * these interrupts by exiting early. This is easy as - * the AP list has been sorted already. 
- */ - if (multi_sgi && irq->priority > prio) { - raw_spin_unlock(&irq->irq_lock); + if (count == kvm_vgic_global_state.nr_lr) break; - } - - if (likely(vgic_target_oracle(irq) == vcpu)) { - vgic_populate_lr(vcpu, irq, count++); - - if (irq->source) - prio = irq->priority; - } - - raw_spin_unlock(&irq->irq_lock); - - if (count == kvm_vgic_global_state.nr_lr) { - if (!list_is_last(&irq->ap_list, - &vgic_cpu->ap_list_head)) - vgic_set_underflow(vcpu); - break; - } } /* Nuke remaining LRs */ - for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++) vgic_clear_lr(vcpu, i); - if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; - else + vgic_v2_configure_hcr(vcpu, &als); + } else { vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; + vgic_v3_configure_hcr(vcpu, &als); + } } static inline bool can_access_vgic_from_kernel(void) diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 0ecadfa00397..4a0733869cb5 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -236,6 +236,21 @@ struct its_ite { u32 event_id; }; +struct ap_list_summary { + unsigned int nr_pend; /* purely pending, not active */ + unsigned int nr_act; /* active, or active+pending */ + unsigned int nr_sgi; /* any SGI */ +}; + +#define irqs_outside_lrs(s) \ + (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr) + +#define irqs_pending_outside_lrs(s) \ + ((s)->nr_pend > kvm_vgic_global_state.nr_lr) + +#define irqs_active_outside_lrs(s) \ + ((s)->nr_act && irqs_outside_lrs(s)) + int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, @@ -262,7 +277,7 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); @@ -302,7 +317,7 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq) void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); From f04b8a5a83dbaff310ff919190123db238d35952 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:12 +0000 Subject: [PATCH 35/82] KVM: arm64: Turn kvm_vgic_vcpu_enable() into kvm_vgic_vcpu_reset() Now that we always reconfigure the vgic HCR register on entry, the "enable" part of kvm_vgic_vcpu_enable() is pretty useless. Removing the enable bits from these functions makes it plain that they are just about computing the reset state. Just rename the functions accordingly. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-23-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 8 ++++---- arch/arm64/kvm/vgic/vgic-v2.c | 5 +---- arch/arm64/kvm/vgic/vgic-v3.c | 5 +---- arch/arm64/kvm/vgic/vgic.h | 4 ++-- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1796b1a22a72..6d5e5d708f23 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -353,12 +353,12 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) return ret; } -static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu) +static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu) { if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_enable(vcpu); + vgic_v2_reset(vcpu); else - vgic_v3_enable(vcpu); + vgic_v3_reset(vcpu); } /* @@ -405,7 +405,7 @@ int vgic_init(struct kvm *kvm) } kvm_for_each_vcpu(idx, vcpu, kvm) - kvm_vgic_vcpu_enable(vcpu); + kvm_vgic_vcpu_reset(vcpu); ret = kvm_vgic_setup_default_irq_routing(kvm); if (ret) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index f53bc5528897..18856186be7b 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -285,7 +285,7 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; } -void vgic_v2_enable(struct kvm_vcpu *vcpu) +void vgic_v2_reset(struct kvm_vcpu *vcpu) { /* * By forcing VMCR to zero, the GIC will restore the binary @@ -293,9 +293,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu) * anyway. */ vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; } /* check for overlapping regions and for regions crossing the end of memory */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 81f1de9e3897..780cc92c79e0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -293,7 +293,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) -void vgic_v3_enable(struct kvm_vcpu *vcpu) +void vgic_v3_reset(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; @@ -323,9 +323,6 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) kvm_vgic_global_state.ich_vtr_el2); vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, kvm_vgic_global_state.ich_vtr_el2) + 1; - - /* Get the show on the road... 
*/ - vgic_v3->vgic_hcr = ICH_HCR_EL2_En; } void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 4a0733869cb5..e48294521541 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -285,7 +285,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v2_enable(struct kvm_vcpu *vcpu); +void vgic_v2_reset(struct kvm_vcpu *vcpu); int vgic_v2_probe(const struct gic_kvm_info *info); int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, @@ -320,7 +320,7 @@ void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_v3_enable(struct kvm_vcpu *vcpu); +void vgic_v3_reset(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); From 76b2eda65cccb452a2d112809a2995ee7533f963 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:13 +0000 Subject: [PATCH 36/82] KVM: arm64: Make vgic_target_oracle() globally available Make the internal crystal ball global, so that implementation-specific code can use it. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-24-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 2 +- arch/arm64/kvm/vgic/vgic.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 7e6f02d48fff..004010104659 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -237,7 +237,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) * * Requires the IRQ lock to be held. */ -static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) { lockdep_assert_held(&irq->irq_lock); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index e48294521541..037efb620082 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -261,6 +261,7 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); +struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); From 05984ba67eb6fe554afb355368a037f9eec1dd43 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:14 +0000 Subject: [PATCH 37/82] KVM: arm64: Invert ap_list sorting to push active interrupts out Having established that pending interrupts should have priority to be moved into the LRs over the active interrupts, implement this in the ap_list sorting. 
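To make the new ordering concrete, here is a minimal stand-alone sketch (not part of the patch) of the precedence rules the reworked comparator implements. The irq_stub type is a pared-down illustration, not the kernel's struct vgic_irq:

#include <stdbool.h>

/* Illustrative stand-in for struct vgic_irq: only the fields that
 * the ordering depends on are modelled here. */
struct irq_stub {
	bool enabled;
	bool pending;
	bool active;
	bool hw;
	unsigned char priority;	/* lower value == higher priority */
};

/* Deliverable pending interrupts sort first, then lower priority
 * values, then HW-backed interrupts ahead of purely virtual ones. */
static int irq_cmp_sketch(const struct irq_stub *a, const struct irq_stub *b)
{
	int penda = a->enabled && a->pending && !a->active;
	int pendb = b->enabled && b->pending && !b->active;
	int ret;

	ret = pendb - penda;
	if (ret)
		return ret;

	ret = (int)a->priority - (int)b->priority;
	if (ret)
		return ret;

	return (int)b->hw - (int)a->hw;
}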
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-25-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 004010104659..c7a5454ac4c9 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -270,10 +270,7 @@ struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) * well, the first items in the list being the first things populated in the * LRs. * - * A hard rule is that active interrupts can never be pushed out of the LRs - * (and therefore take priority) since we cannot reliably trap on deactivation - * of IRQs and therefore they have to be present in the LRs. - * + * Pending, non-active interrupts must be placed at the head of the list. * Otherwise things should be sorted by the priority field and the GIC * hardware support will take care of preemption of priority groups etc. * @@ -298,21 +295,21 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, raw_spin_lock(&irqa->irq_lock); raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); - if (irqa->active || irqb->active) { - ret = (int)irqb->active - (int)irqa->active; + penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active; + pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active; + + ret = (int)pendb - (int)penda; + if (ret) goto out; - } - penda = irqa->enabled && irq_is_pending(irqa); - pendb = irqb->enabled && irq_is_pending(irqb); - - if (!penda || !pendb) { - ret = (int)pendb - (int)penda; + /* Both pending and enabled, sort by priority (lower number first) */ + ret = (int)irqa->priority - (int)irqb->priority; + if (ret) goto out; - } - /* Both pending and enabled, sort by priority */ - ret = irqa->priority - irqb->priority; + /* Finally, HW bit active interrupts have priority over non-HW ones */ + ret = (int)irqb->hw - (int)irqa->hw; + out: raw_spin_unlock(&irqb->irq_lock); raw_spin_unlock(&irqa->irq_lock); From 33c1f60b3213c766f434f1be1988d8b211b106a9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:15 +0000 Subject: [PATCH 38/82] KVM: arm64: Move undeliverable interrupts to the end of ap_list Interrupts in the ap_list that cannot be acted upon because they are not enabled, or whose group is not enabled, shouldn't make it into the LRs if we are space-constrained. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-26-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index c7a5454ac4c9..abe01c9c6b36 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -265,6 +265,11 @@ struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) return NULL; } +struct vgic_sort_info { + struct kvm_vcpu *vcpu; + struct vgic_vmcr vmcr; +}; + /* * The order of items in the ap_lists defines how we'll pack things in LRs as * well, the first items in the list being the first things populated in the * @@ -273,6 +278,7 @@ struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) * Pending, non-active interrupts must be placed at the head of the list.
* Otherwise things should be sorted by the priority field and the GIC * hardware support will take care of preemption of priority groups etc. + * Interrupts that are not deliverable should be at the end of the list. * * Return negative if "a" sorts before "b", 0 to preserve order, and positive * to sort "b" before "a". @@ -282,6 +288,8 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, { struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list); struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list); + struct vgic_sort_info *info = priv; + struct kvm_vcpu *vcpu = info->vcpu; bool penda, pendb; int ret; @@ -295,6 +303,17 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a, raw_spin_lock(&irqa->irq_lock); raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); + /* Undeliverable interrupts should be last */ + ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu); + if (ret) + goto out; + + /* Same thing for interrupts targeting a disabled group */ + ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + ret -= (int)(irqa->group ? info->vmcr.grpen1 : info->vmcr.grpen0); + if (ret) + goto out; + penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active; pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active; @@ -320,10 +339,12 @@ out: static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_sort_info info = { .vcpu = vcpu, }; lockdep_assert_held(&vgic_cpu->ap_list_lock); - list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); + vgic_get_vmcr(vcpu, &info.vmcr); + list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp); } /* From a69e2d6f8934bdb9d08a6740ca6c7a44525e2e95 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:16 +0000 Subject: [PATCH 39/82] KVM: arm64: Use MI to detect groups being enabled/disabled Add the maintenance interrupt to force an exit when the guest enables/disables individual groups, so that we can re-sort the ap_list accordingly. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-27-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 5 +++++ arch/arm64/kvm/vgic/vgic-v3.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 18856186be7b..9a2de03f74c3 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -39,6 +39,11 @@ void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, cpuif->vgic_hcr |= GICH_HCR_LRENPIE; if (irqs_outside_lrs(als)) cpuif->vgic_hcr |= GICH_HCR_UIE; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ? + GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ? + GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE; } static bool lr_signals_eoi_mi(u32 lr_val) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 780cc92c79e0..312226cc2565 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -39,6 +39,11 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount; + + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ? + ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; + cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ?
+ ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; } static bool lr_signals_eoi_mi(u64 lr_val) From 3cfd59f81e0f3fbdf8a1b2f576bdc63ab6cc3277 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:17 +0000 Subject: [PATCH 40/82] KVM: arm64: GICv3: Handle LR overflow when EOImode==0 Now that we can identify interrupts that have not made it into the LRs, it becomes relatively easy to use EOIcount to walk the overflow list. What is a bit odd is that we compute a fake LR for the original state of the interrupt, clear the active bit, and feed it into the existing logic for processing. In a way, this is what would have happened if the interrupt was in an LR. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-28-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 46 +++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 312226cc2565..d4f27f451c8f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -112,16 +112,62 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) vgic_put_irq(vcpu->kvm, irq); } +static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + +static void vgic_v3_deactivate_phys(u32 intid) +{ + if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY)) + gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI); + else + gic_write_dir(intid); +} + void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr); + struct vgic_irq *irq; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); for (int lr = 0; lr < cpuif->used_lrs; lr++) vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]); + /* + * EOIMode=0: use EOIcount to emulate deactivation. We are + * guaranteed to deactivate in reverse order of the activation, so + * just pick one active interrupt after the other in the ap_list, + * and replay the deactivation as if the CPU was doing it. We also + * rely on priority drop to have taken place, and the list to be + * sorted by priority. + */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u64 lr; + + /* + * I would have loved to write this using a scoped_guard(), + * but using 'continue' here is a total train wreck. + */ + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; + } + + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + eoicount--; + } + cpuif->used_lrs = 0; } From cd4f6ee99b28f10692c2444c8dc0bab77357a25e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:18 +0000 Subject: [PATCH 41/82] KVM: arm64: GICv3: Handle deactivation via ICV_DIR_EL1 traps Deactivation via ICV_DIR_EL1 is both relatively straightforward (we have the interrupt that needs deactivation) and really awkward. The main issue is that the interrupt may either be in an LR on another CPU, or outside of any LR. In the former case, we process the deactivation as if it were a write to GICD_ICACTIVERn, which is already implemented as a big hammer IPI'ing all vcpus. In the latter case, we just perform a normal deactivation, similar to what we do for EOImode==0.
Another annoying aspect is that we need to tell the CPU owning the interrupt that its ap_list needs laundering. We use a brand new vcpu request to that effect. Note that this doesn't address deactivation via the GICV MMIO view, which will be taken care of in a later change. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-29-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 4 ++ arch/arm64/kvm/hyp/vgic-v3-sr.c | 3 ++ arch/arm64/kvm/sys_regs.c | 19 ++++++- arch/arm64/kvm/vgic/vgic-v3.c | 85 +++++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic.c | 11 ++++ arch/arm64/kvm/vgic/vgic.h | 1 + include/kvm/arm_vgic.h | 1 + 8 files changed, 123 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 64302c438355..7501a2ee4dd4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -54,6 +54,7 @@ #define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9) #define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10) +#define KVM_REQ_VGIC_PROCESS_UPDATE KVM_ARCH_REQ(11) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 733195ef183e..fe13f9777f9c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1041,6 +1041,10 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) */ kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); + /* Process interrupts deactivated through a trap */ + if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu)) + kvm_vgic_process_async_update(vcpu); + if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) kvm_update_stolen_time(vcpu); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index cafbb41b4c33..f2f585455144 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -1247,6 +1247,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; + /* Full exit if required to handle overflow deactivation... 
*/ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) + return 0; fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e67eb39ddc11..1b69d6e2d720 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -666,6 +666,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, return true; } +static bool access_gic_dir(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + + if (!p->is_write) + return undef_access(vcpu, p, r); + + vgic_v3_deactivate(vcpu, p->regval); + + return true; +} + static bool trap_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -3370,7 +3385,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, - { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, @@ -4495,7 +4510,7 @@ static const struct sys_reg_desc cp15_regs[] = { { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, - { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir }, { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index d4f27f451c8f..d83edf02d072 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -12,6 +12,7 @@ #include #include +#include "vgic-mmio.h" #include "vgic.h" static bool group0_trap; @@ -171,6 +172,90 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->used_lrs = 0; } +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + struct kvm_vcpu *target_vcpu = NULL; + struct vgic_irq *irq; + unsigned long flags; + bool mmio = false; + u64 lr = 0; + + /* + * We only deal with DIR when EOIMode==1, and only for SGI, + * PPI or SPI. + */ + if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) || + val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* + * EOIMode=1: we must rely on traps to handle deactivation of + * overflowing interrupts, as there is no ordering guarantee and + * EOIcount isn't being incremented. Priority drop will have taken + * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs. + * + * Three possibilities: + * + * - The irq is not queued on any CPU, and there is nothing to + * do, + * + * - Or the irq is in an LR, meaning that its state is not + * directly observable. Treat it bluntly by making it as if + * this was a write to GICD_ICACTIVER, which will force an + * exit on all vcpus. If it hurts, don't do that. 
+ * + * - Or the irq is active, but not in an LR, and we can + * directly deactivate it by building a pseudo-LR, fold it, + * and queue a request to prune the resulting ap_list. + */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; + } + + if (lr & ICH_LR_HW) + vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + + vgic_v3_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + /* Requires the irq to be locked already */ static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index abe01c9c6b36..cbba6c2988d1 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -990,6 +990,17 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) vgic_prune_ap_list(vcpu); } +/* Sync interrupts that were deactivated through a DIR trap */ +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + vgic_prune_ap_list(vcpu); + local_irq_restore(flags); +} + static inline void vgic_restore_state(struct kvm_vcpu *vcpu) { if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 037efb620082..01ff6d4aa9da 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -318,6 +318,7 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq) void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); +void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val); void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index ec349c5a4a8b..b798546755a3 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -421,6 +421,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1); From 295b69216558367e5e833eb6d92ab2b476a8ad64 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:19 +0000 Subject: [PATCH 42/82] KVM: arm64: GICv3: Add GICv2 SGI handling to deactivation primitive The GICv2 SGIs require additional handling for deactivation, as they are effectively multiple interrupts muxed into one. Make sure we check for the source CPU when deactivating.
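To illustrate the source matching described above, the sketch below (stand-alone, with simplified names and constants) splits an ICC_DIR_EL1 value the way the patch does: bits [12:10] carry the source CPU of a GICv2 SGI, and the deactivation only applies when that source matches the SGI's recorded active source:

#include <stdbool.h>
#include <stdint.h>

#define NR_SGIS		16
#define SGI_SRC_SHIFT	10
#define SGI_SRC_MASK	(0x7ull << SGI_SRC_SHIFT)	/* bits [12:10] */

/* Snapshot the source CPUID, strip it from the INTID, and decide
 * whether the deactivation applies. Illustrative only. */
static bool dir_deactivates_v2_sgi(uint64_t dir_val, bool v2_guest,
				   uint8_t active_source)
{
	uint8_t cpuid = (dir_val & SGI_SRC_MASK) >> SGI_SRC_SHIFT;
	uint64_t intid = dir_val & ~SGI_SRC_MASK;

	if (!v2_guest || intid >= NR_SGIS)
		return true;	/* not a GICv2 SGI: no source check */

	return cpuid == active_source;
}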
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-30-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index d83edf02d072..9fcee5121fe5 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -176,11 +176,20 @@ void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; + u32 model = vcpu->kvm->arch.vgic.vgic_model; struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false, is_v2_sgi; struct vgic_irq *irq; unsigned long flags; - bool mmio = false; u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 && + val < VGIC_NR_SGIS); /* * We only deal with DIR when EOIMode==1, and only for SGI, @@ -216,6 +225,9 @@ void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) * - Or the irq is active, but not in an LR, and we can * directly deactivate it by building a pseudo-LR, fold it, * and queue a request to prune the resulting ap_list, + * + * Special care must be taken to match the source CPUID when + * deactivating a GICv2 SGI. */ scoped_guard(raw_spinlock, &irq->irq_lock) { target_vcpu = irq->vcpu; @@ -233,6 +245,12 @@ void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val) goto put; } + /* GICv2 SGI: check that the cpuid matches */ + if (is_v2_sgi && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + /* (with a Dalek voice) DEACTIVATE!!!! */ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT; } From 70fd60bdedc9ff4c4830a8b379fb65e6ba1e819f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:20 +0000 Subject: [PATCH 43/82] KVM: arm64: GICv3: Set ICH_HCR_EL2.TDIR when interrupts overflow LR capacity Now that we are ready to handle deactivation through ICV_DIR_EL1, set the trap bit if we have active interrupts outside of the LRs. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-31-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 9fcee5121fe5..09f86bf6fe7b 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -45,6 +45,13 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE; cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ? ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; + + /* + * Note that we set the trap irrespective of EOIMode, as that + * can change behind our back without any warning... + */ + if (irqs_active_outside_lrs(als)) + cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; } static bool lr_signals_eoi_mi(u64 lr_val) From 1c3b3cadcd69f7415e8b3b1b1e81459e0e8c9f33 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:21 +0000 Subject: [PATCH 44/82] KVM: arm64: GICv3: Add SPI tracking to handle asymmetric deactivation SPIs are especially annoying, as they can be activated on one CPU and deactivated on another, which means that when an SPI is in flight anywhere, all CPUs need to have their TDIR trap bit set.
This translates into broadcasting an IPI across all CPUs to make sure they set their trap bit. The number of in-flight SPIs is kept in an atomic variable so that CPUs can turn the trap bit off as soon as possible. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-32-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-init.c | 1 + arch/arm64/kvm/vgic/vgic-v3.c | 21 +++++++++++++++------ arch/arm64/kvm/vgic/vgic.c | 25 +++++++++++++++++++++++-- include/kvm/arm_vgic.h | 3 +++ 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 6d5e5d708f23..52de99c0f01c 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -188,6 +188,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; + dist->active_spis = (atomic_t)ATOMIC_INIT(0); dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!dist->spis) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 09f86bf6fe7b..55847fbad4d0 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -47,10 +47,17 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE; /* + * Dealing with EOImode=1 is a massive source of headache. Not + * only do we need to track that we have active interrupts + * outside of the LRs and force DIR to be trapped, we also + * need to deal with SPIs that can be deactivated on another + * CPU. + * * Note that we set the trap irrespective of EOIMode, as that * can change behind our back without any warning... */ - if (irqs_active_outside_lrs(als)) + if (irqs_active_outside_lrs(als) || + atomic_read(&vcpu->kvm->arch.vgic.active_spis)) cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; } @@ -78,11 +85,6 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) if (!irq) /* An LPI could have been unmapped. */ return; - /* Notify fds when the guest EOI'ed a level-triggered IRQ */ - if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - scoped_guard(raw_spinlock, &irq->irq_lock) { /* Always preserve the active bit for !LPIs, note deactivation */ if (irq->intid >= VGIC_MIN_LPI) @@ -117,6 +119,13 @@ static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val) irq->on_lr = false; } + /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */ + if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) { + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis); + } + vgic_put_irq(vcpu->kvm, irq); } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index cbba6c2988d1..83969c18ef03 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -367,6 +367,17 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne static bool vgic_model_needs_bcst_kick(struct kvm *kvm) { /* + * A GICv3 (or GICv3-like) system exposing a GICv3 to the + * guest needs a broadcast kick to set TDIR globally, even if + * the bit doesn't really exist (we still need to check for + * the shadow bit in the DIR emulation fast-path). 
+ */ + return (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); +} + /* * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list. * Do the queuing if necessary, taking the right locks in the right order. @@ -379,6 +390,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags) __releases(&irq->irq_lock) { struct kvm_vcpu *vcpu; + bool bcast; lockdep_assert_held(&irq->irq_lock); @@ -453,11 +465,20 @@ retry: list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; + /* A new SPI may result in deactivation trapping on all vcpus */ + bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) && + vgic_valid_spi(vcpu->kvm, irq->intid) && + atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0); + raw_spin_unlock(&irq->irq_lock); raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); + if (!bcast) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } else { + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING); + } return true; } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index b798546755a3..6a4d3d205596 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -263,6 +263,9 @@ struct vgic_dist { /* The GIC maintenance IRQ for nested hypervisors. */ u32 mi_intid; + /* Track the number of in-flight active SPIs */ + atomic_t active_spis; + /* base addresses in guest physical address space: */ gpa_t vgic_dist_base; /* distributor */ union { From ca3c34da3644a24daf248be5dba72783c338dad4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:22 +0000 Subject: [PATCH 45/82] KVM: arm64: GICv3: Handle in-LR deactivation when possible Even when we have either an LR overflow or SPIs in flight, it is extremely likely that the interrupt being deactivated is still in the LRs, and that going all the way back to the generic trap handling code is a waste of time. Instead, try and deactivate in place when possible, and only if this fails, perform a full exit.
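A condensed model of that fast path, with a placeholder bit layout rather than the architectural ICH_LR format, might look as follows: scan the LRs for an active entry matching the INTID, clear its active state in place, and only fall back to a full exit when nothing matches:

#include <stdbool.h>
#include <stdint.h>

#define LR_ACTIVE	(1ull << 62)	/* placeholder active bit */
#define LR_VINTID_MASK	0xffffffffull	/* placeholder vINTID field */

/* Returns true if the deactivation was emulated in the LRs, false if
 * the caller must take the slow path (a full exit). Sketch only. */
static bool try_deactivate_in_lrs(uint64_t *lrs, int nr_lr, uint32_t intid)
{
	for (int i = 0; i < nr_lr; i++) {
		if ((lrs[i] & LR_VINTID_MASK) == intid &&
		    (lrs[i] & LR_ACTIVE)) {
			lrs[i] &= ~LR_ACTIVE;
			return true;
		}
	}

	return false;
}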
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-33-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 38 ++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index f2f585455144..71199e1a9294 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -792,7 +792,7 @@ static void __vgic_v3_bump_eoicount(void) write_gicreg(hcr, ICH_HCR_EL2); } -static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) { u32 vid = vcpu_get_reg(vcpu, rt); u64 lr_val; @@ -800,19 +800,25 @@ static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* EOImode == 0, nothing to be done here */ if (!(vmcr & ICH_VMCR_EOIM_MASK)) - return; + return 1; /* No deactivate to be performed on an LPI */ if (vid >= VGIC_MIN_LPI) - return; + return 1; lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); - if (lr == -1) { - __vgic_v3_bump_eoicount(); - return; + if (lr != -1) { + __vgic_v3_clear_active_lr(lr, lr_val); + return 1; } - __vgic_v3_clear_active_lr(lr, lr_val); + return 0; +} + +static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) +{ + if (!___vgic_v3_write_dir(vcpu, vmcr, rt)) + __vgic_v3_bump_eoicount(); } static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) @@ -1247,9 +1253,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) case SYS_ICC_DIR_EL1: if (unlikely(is_read)) return 0; - /* Full exit if required to handle overflow deactivation... */ - if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) - return 0; + /* + * Full exit if required to handle overflow deactivation, + * unless we can emulate it in the LRs (likely the majority + * of the cases). + */ + if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) { + int ret; + + ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(), + kvm_vcpu_sys_get_rt(vcpu)); + if (ret) + __kvm_skip_instr(vcpu); + + return ret; + } fn = __vgic_v3_write_dir; break; case SYS_ICC_RPR_EL1: From 84792050e0392fbc1f285f9d9a0266b8480f6f06 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:23 +0000 Subject: [PATCH 46/82] KVM: arm64: GICv3: Avoid broadcast kick on CPUs lacking TDIR CPUs lacking TDIR always trap ICV_DIR_EL1, no matter what, since we have ICH_HCR_EL2.TC set permanently. For these CPUs, it is useless to use a broadcast kick on SPI injection, as the sole purpose of this is to set TDIR. We can therefore skip this on these CPUs, which are challenged enough not to be burdened by extra IPIs. As a consequence, permanently set the TDIR bit in the shadow state to notify the fast-path emulation code of the exit reason. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-34-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3.c | 6 +++++- arch/arm64/kvm/vgic/vgic.c | 13 ++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 55847fbad4d0..968aa9d89be6 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -53,10 +53,14 @@ void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, * need to deal with SPIs that can be deactivated on another * CPU. 
* + * On systems that do not implement TDIR, force the bit in the + * shadow state anyway to avoid IPI-ing on these poor sods. + * * Note that we set the trap irrespective of EOIMode, as that * can change behind our back without any warning... */ - if (irqs_active_outside_lrs(als) || + if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) || + irqs_active_outside_lrs(als) || atomic_read(&vcpu->kvm->arch.vgic.active_spis)) cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR; } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 83969c18ef03..693ec005c996 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -370,12 +370,15 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne static bool vgic_model_needs_bcst_kick(struct kvm *kvm) { /* - * A GICv3 (or GICv3-like) system exposing a GICv3 to the - * guest needs a broadcast kick to set TDIR globally, even if - * the bit doesn't really exist (we still need to check for - * the shadow bit in the DIR emulation fast-path). + * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest + * needs a broadcast kick to set TDIR globally. + * + * For systems that do not have TDIR (ARM's own v8.0 CPUs), the + * shadow TDIR bit is always set, and so is the register's TC bit, + * so no need to kick the CPUs. */ - return (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); + return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3); } From eb33ffa2bd3f1842d2960aff7484869fc64aa2fb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:24 +0000 Subject: [PATCH 47/82] KVM: arm64: GICv3: nv: Resync LRs/VMCR/HCR early for better MI emulation The current approach to nested GICv3 support is to not do anything while L2 is running, wait for a transition from L2 to L1 to resync LRs, VMCR and HCR, and only then evaluate the state to decide whether to generate a maintenance interrupt. This doesn't provide a good quality of emulation, and it would be far preferable to find out early that we need to perform a switch. Move the LRs/VMCR and HCR resync into vgic_v3_sync_nested(), so that we have most of the state available. As we are turning the vgic off at this stage to avoid a screaming host MI, add a new helper vgic_v3_flush_nested() that switches the vgic on again. The MI can then be directly injected as required.
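The shape of that split, reduced to its bare minimum, is sketched below; the register cell and the trap-bit constant are stand-ins for ICH_HCR_EL2 and its control bits, not the real accessors:

#include <stdint.h>

static uint64_t ich_hcr_cell;		/* stand-in for ICH_HCR_EL2 */

#define HCR_TRAP_BITS	(0xfull << 10)	/* placeholder trap bits */

/* L2 exit: after resyncing LRs/VMCR/HCR (elided), silence the vgic so
 * the host does not take L1's maintenance interrupt while the synced
 * state is being evaluated. */
static void sync_nested_sketch(void)
{
	ich_hcr_cell = 0;
}

/* L2 entry: switch the vgic back on, merging the host trap bits into
 * the L1-provided value, mirroring what vgic_v3_flush_nested() does. */
static void flush_nested_sketch(uint64_t l1_hcr)
{
	ich_hcr_cell = l1_hcr | HCR_TRAP_BITS;
}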
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-35-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/hyp/vgic-v3-sr.c | 2 +- arch/arm64/kvm/vgic/vgic-v3-nested.c | 69 ++++++++++++++++------------ arch/arm64/kvm/vgic/vgic.c | 6 ++- arch/arm64/kvm/vgic/vgic.h | 1 + 5 files changed, 46 insertions(+), 33 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index dbf16a9f6772..76ce2b94bd97 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -77,6 +77,7 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); u64 __gic_v3_get_lr(unsigned int lr); +void __gic_v3_set_lr(u64 val, int lr); void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 71199e1a9294..99342c13e179 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -60,7 +60,7 @@ u64 __gic_v3_get_lr(unsigned int lr) unreachable(); } -static void __gic_v3_set_lr(u64 val, int lr) +void __gic_v3_set_lr(u64 val, int lr) { switch (lr & 0xf) { case 0: diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 1531e4907c65..40f7a37e0685 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -70,13 +70,14 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) * - on L2 put: perform the inverse transformation, so that the result of L2 * running becomes visible to L1 in the VNCR-accessible registers. * - * - there is nothing to do on L2 entry, as everything will have happened - * on load. However, this is the point where we detect that an interrupt - * targeting L1 and prepare the grand switcheroo. + * - there is nothing to do on L2 entry apart from enabling the vgic, as + * everything will have happened on load. However, this is the point where + * we detect an interrupt targeting L1 and prepare the grand + * switcheroo. * - * - on L2 exit: emulate the HW bit, and deactivate corresponding the L1 - * interrupt. The L0 active state will be cleared by the HW if the L1 - * interrupt was itself backed by a HW interrupt. + * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate + * the corresponding L1 interrupt. The L0 active state will be cleared by + * the HW if the L1 interrupt was itself backed by a HW interrupt.
* * Maintenance Interrupt (MI) management: * @@ -265,15 +266,30 @@ static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); } +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu) +{ + u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2); +} + void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); int i; for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { - u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + u64 val, host_lr, lr; struct vgic_irq *irq; + host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); + + /* Propagate the new LR state */ + lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + val = lr & ~ICH_LR_STATE; + val |= host_lr & ICH_LR_STATE; + __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); + if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) continue; @@ -286,12 +302,21 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ continue; - lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); - if (!(lr & ICH_LR_STATE)) + if (!(host_lr & ICH_LR_STATE)) irq->active = false; vgic_put_irq(vcpu->kvm, irq); } + + /* We need these to be synchronised to generate the MI */ + __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2)); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount); + __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount); + + write_sysreg_s(0, SYS_ICH_HCR_EL2); + isb(); + + vgic_v3_nested_update_mi(vcpu); } static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu, @@ -324,7 +349,8 @@ void vgic_v3_load_nested(struct kvm_vcpu *vcpu) __vgic_v3_restore_vmcr_aprs(cpu_if); __vgic_v3_activate_traps(cpu_if); - __vgic_v3_restore_state(cpu_if); + for (int i = 0; i < cpu_if->used_lrs; i++) + __gic_v3_set_lr(cpu_if->vgic_lr[i], i); /* * Propagate the number of used LRs for the benefit of the HYP @@ -337,36 +363,19 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif; - u64 val; int i; __vgic_v3_save_aprs(s_cpu_if); - __vgic_v3_deactivate_traps(s_cpu_if); - __vgic_v3_save_state(s_cpu_if); - - /* - * Translate the shadow state HW fields back to the virtual ones - * before copying the shadow struct back to the nested one. 
- */ - val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); - val &= ~ICH_HCR_EL2_EOIcount_MASK; - val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK); - __vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val); - __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr); for (i = 0; i < 4; i++) { __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]); __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]); } - for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { - val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); + for (i = 0; i < s_cpu_if->used_lrs; i++) + __gic_v3_set_lr(0, i); - val &= ~ICH_LR_STATE; - val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE; - - __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - } + __vgic_v3_deactivate_traps(s_cpu_if); vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 693ec005c996..892595fdbbff 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -1049,8 +1049,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) * abort the entry procedure and inject the exception at the * beginning of the run loop. * - * - Otherwise, do exactly *NOTHING*. The guest state is - * already loaded, and we can carry on with running it. + * - Otherwise, do exactly *NOTHING* apart from enabling the virtual + * CPU interface. The guest state is already loaded, and we can + * carry on with running it. * * If we have NV, but are not in a nested state, compute the * maintenance interrupt state, as it may fire. @@ -1059,6 +1060,7 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (kvm_vgic_vcpu_pending_irq(vcpu)) kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu); + vgic_v3_flush_nested(vcpu); return; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 01ff6d4aa9da..e93bdb485f07 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -445,6 +445,7 @@ static inline bool kvm_has_gicv3(struct kvm *kvm) return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } +void vgic_v3_flush_nested(struct kvm_vcpu *vcpu); void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); void vgic_v3_load_nested(struct kvm_vcpu *vcpu); void vgic_v3_put_nested(struct kvm_vcpu *vcpu); From 6dd333c8942b2e5bb5927af843b56ec2857db7c7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:25 +0000 Subject: [PATCH 48/82] KVM: arm64: GICv3: nv: Plug L1 LR sync into deactivation primitive Pretty much like the rest of the LR handling, deactivation of an L2 interrupt gets reflected in the L1 LRs, and therefore must be propagated into the L1 shadow state if the interrupt is HW-bound. Instead of directly handling the active state (which looks a bit off as it ignores locking and L1->L0 HW propagation), use the new deactivation primitive to perform the deactivation and deal with the required maintenance. 
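To make the condition concrete, here is a standalone sketch of the check the sync path performs (an illustration only, not the patch itself; the ICH_LR bit positions follow the GICv3 architecture):

#include <stdbool.h>
#include <stdint.h>

#define ICH_LR_HW	(1ULL << 61)	/* LR is backed by a HW interrupt */
#define ICH_LR_STATE	(3ULL << 62)	/* pending/active state field */

/*
 * A HW-backed L1 LR needs its deactivation emulated when it was in a
 * non-invalid state before the L2 run and ended up invalid afterwards.
 */
static bool lr_needs_deactivation(uint64_t lr_before, uint64_t lr_after)
{
	return (lr_before & ICH_LR_HW) &&
	       (lr_before & ICH_LR_STATE) &&
	       !(lr_after & ICH_LR_STATE);
}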
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-36-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v3-nested.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 40f7a37e0685..15e7033a7937 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -280,7 +280,6 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { u64 val, host_lr, lr; - struct vgic_irq *irq; host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); @@ -290,7 +289,14 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) val |= host_lr & ICH_LR_STATE; __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) + /* + * Deactivation of a HW interrupt: the LR must have the HW + * bit set, have been in a non-invalid state before the run, + * and now be in an invalid state. If any of that doesn't + * hold, we're done with this LR. + */ + if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) && + !(host_lr & ICH_LR_STATE))) continue; /* @@ -298,14 +304,7 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) * need to emulate the HW effect between the guest hypervisor * and the nested guest. */ - irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ - continue; - - if (!(host_lr & ICH_LR_STATE)) - irq->active = false; - - vgic_put_irq(vcpu->kvm, irq); + vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); } /* We need these to be synchronised to generate the MI */ From 78ffc28456f5981f0e54007fe124e20610abd0ea Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:26 +0000 Subject: [PATCH 49/82] KVM: arm64: GICv3: Force exit to sync ICH_HCR_EL2.En FEAT_NV2 is pretty terrible for anything that tries to enforce immediate effects, and writing to ICH_HCR_EL2 in the hope of disabling a maintenance interrupt is in vain. This only hits memory, and the guest hasn't cleared anything -- the MI will fire. For example, running the vgic_irq test under NV results in about 800 maintenance interrupts being actually handled by the L1 guest, when none were expected. As a cheap workaround, read back ICH_MISR_EL2 after writing 0 to ICH_HCR_EL2. This is very cheap on real HW, and causes a trap to the host in NV, giving it the opportunity to retire the pending MI. With this, the above test runs to completion without any MI being actually handled. Yes, this is really poor... Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-37-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 7 +++++++ arch/arm64/kvm/vgic/vgic-v3-nested.c | 6 ++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 99342c13e179..0b670a033fd8 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -244,6 +244,13 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) } write_gicreg(0, ICH_HCR_EL2); + + /* + * Hack alert: On NV, this results in a trap so that the above write + * actually takes effect... No synchronisation is necessary, as we + * only care about the effects when this traps.
+ */ + read_gicreg(ICH_MISR_EL2); } void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 15e7033a7937..61b44f3f2bf1 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -94,8 +94,10 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) * * - because most of the ICH_*_EL2 registers live in the VNCR page, the * quality of emulation is poor: L1 can setup the vgic so that an MI would - * immediately fire, and not observe anything until the next exit. Trying - * to read ICH_MISR_EL2 would do the trick, for example. + * immediately fire, and not observe anything until the next exit. + * Similarly, a pending MI is not immediately disabled by clearing + * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for + * example. * * System register emulation: * From 281c6c06e2a7bc331cbe02ad21f1390820d28d59 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:27 +0000 Subject: [PATCH 50/82] KVM: arm64: GICv2: Handle LR overflow when EOImode==0 Similarly to the GICv3 version, handle the EOIcount-driven deactivation by walking the overflow list. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-38-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-v2.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 9a2de03f74c3..bbd4d003fde8 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -100,6 +100,8 @@ static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val) vgic_put_irq(vcpu->kvm, irq); } +static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq); + /* * transfer the content of the LRs back into the corresponding ap_list: * - active bit is transferred as is @@ -111,12 +113,37 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr); + struct vgic_irq *irq; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]); + /* See the GICv3 equivalent for the EOIcount handling rationale */ + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { + u32 lr; + + if (!eoicount) { + break; + } else { + guard(raw_spinlock)(&irq->irq_lock); + + if (!(likely(vgic_target_oracle(irq) == vcpu) && + irq->active)) + continue; + + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + vgic_v2_fold_lr(vcpu, lr); + eoicount--; + } + cpuif->used_lrs = 0; } From 255de897e7fb918a34845167c572b5bf8e1d9d79 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:28 +0000 Subject: [PATCH 51/82] KVM: arm64: GICv2: Handle deactivation via GICV_DIR traps Add the plumbing of GICv2 interrupt deactivation via GICV_DIR. This requires adding a new device so that we can easily decode the DIR address. The deactivation itself is very similar to the GICv3 version. 
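For reference, the field layout decoded from a GICV_DIR write, as a standalone sketch (field positions per the GICv2 architecture; the CPUID field is only meaningful for SGIs):

#include <stdint.h>

/* GICC_DIR: INTID in bits [9:0], source CPUID in bits [12:10] */
static void decode_gicv2_dir(uint32_t val, uint32_t *intid, uint8_t *cpuid)
{
	*cpuid = (val >> 10) & 0x7;
	*intid = val & 0x3ff;
}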
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-39-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/vgic/vgic-mmio-v2.c | 24 +++++++++ arch/arm64/kvm/vgic/vgic-mmio.h | 1 + arch/arm64/kvm/vgic/vgic-v2.c | 85 ++++++++++++++++++++++++++++++ arch/arm64/kvm/vgic/vgic.h | 1 + include/kvm/arm_vgic.h | 1 + 5 files changed, 112 insertions(+) diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index f25fccb1f8e6..406845b3117c 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, vgic_set_vmcr(vcpu, &vmcr); } +static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_deactivate(vcpu, val); + else + vgic_v3_deactivate(vcpu, val); +} + static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = { REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT, vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4, VGIC_ACCESS_32bit), + REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE, + vgic_mmio_read_raz, vgic_mmio_write_dir, + vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, + 4, VGIC_ACCESS_32bit), }; unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) @@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev) return SZ_4K; } +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev) +{ + dev->regions = vgic_v2_cpu_registers; + dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); + + kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops); + + return KVM_VGIC_V2_CPU_SIZE; +} + int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) { const struct vgic_register_region *region; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index 5b490a4dfa5e..50dc80220b0f 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, const u32 val); unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); +unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index bbd4d003fde8..bc52d44a573d 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -9,6 +9,7 @@ #include #include +#include "vgic-mmio.h" #include "vgic.h" static inline void vgic_v2_write_lr(int lr, u32 val) @@ -147,6 +148,79 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->used_lrs = 0; } +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; + struct kvm_vcpu *target_vcpu = NULL; + bool mmio = false; + struct vgic_irq *irq; + unsigned long flags; + u64 lr = 0; + u8 cpuid; + + /* Snapshot CPUID, and remove it from the INTID */ + cpuid = FIELD_GET(GENMASK_ULL(12, 10), val); + val &= ~GENMASK_ULL(12, 10); + + /* We only deal with DIR when EOIMode==1 */ + if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK)) + return; + + /* Make sure we're in the same context as LR handling */ + local_irq_save(flags); + + irq = 
vgic_get_vcpu_irq(vcpu, val); + if (WARN_ON_ONCE(!irq)) + goto out; + + /* See the corresponding v3 code for the rationale */ + scoped_guard(raw_spinlock, &irq->irq_lock) { + target_vcpu = irq->vcpu; + + /* Not on any ap_list? */ + if (!target_vcpu) + goto put; + + /* + * Urgh. We're deactivating something that we cannot + * observe yet... Big hammer time. + */ + if (irq->on_lr) { + mmio = true; + goto put; + } + + /* SGI: check that the cpuid matches */ + if (val < VGIC_NR_SGIS && irq->active_source != cpuid) { + target_vcpu = NULL; + goto put; + } + + /* (with a Dalek voice) DEACTIVATE!!!! */ + lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT; + } + + if (lr & GICH_LR_HW) + writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr), + kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE); + + vgic_v2_fold_lr(vcpu, lr); + +put: + vgic_put_irq(vcpu->kvm, irq); + +out: + local_irq_restore(flags); + + if (mmio) + vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32)); + + /* Force the ap_list to be pruned */ + if (target_vcpu) + kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu); +} + static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { u32 val = irq->intid; @@ -346,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base) int vgic_v2_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned int len; int ret = 0; if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || @@ -369,6 +444,16 @@ int vgic_v2_map_resources(struct kvm *kvm) return ret; } + len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev); + dist->cpuif_iodev.base_addr = dist->vgic_cpu_base; + dist->cpuif_iodev.iodev_type = IODEV_CPUIF; + dist->cpuif_iodev.redist_vcpu = NULL; + + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base, + len, &dist->cpuif_iodev.dev); + if (ret) + return ret; + if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, kvm_vgic_global_state.vcpu_base, diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index e93bdb485f07..5f0fc96b4dc2 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -277,6 +277,7 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); +void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 6a4d3d205596..b261fb3968d0 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -287,6 +287,7 @@ struct vgic_dist { struct vgic_irq *spis; struct vgic_io_device dist_iodev; + struct vgic_io_device cpuif_iodev; bool has_its; bool table_write_in_progress; From 07bb1c5622a54e2fd3f5c5a86969a2e7ad7f7376 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:29 +0000 Subject: [PATCH 52/82] KVM: arm64: GICv2: Always trap GICV_DIR register Since we can't decide to trap the DIR register on a per-vcpu basis, always trap the second page of the GIC CPU interface. Yes, this is costly. On the bright side, no sane SW should use EOImode==1 on GICv2... 
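A sketch of the resulting trap layout (standalone illustration; in the kernel, GIC_CPU_DEACTIVATE is the 0x1000 offset at which GICC_DIR lives):

#include <stdbool.h>
#include <stdint.h>

#define GICC_DIR_OFFSET	0x1000	/* DIR sits in the second 4K page */

/*
 * Only the first page of the virtual CPU interface is mapped through
 * to hardware; accesses at or beyond the DIR page always trap so the
 * deactivation can be emulated.
 */
static bool access_must_trap(uint64_t fault_ipa, uint64_t cpuif_base)
{
	return (fault_ipa - cpuif_base) >= GICC_DIR_OFFSET;
}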
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-40-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 4 ++++ arch/arm64/kvm/vgic/vgic-v2.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c index 78579b31a420..5fd99763b54d 100644 --- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c +++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return -1; } + /* Handle deactivation as a normal exit */ + if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE) + return 0; + rd = kvm_vcpu_dabt_get_rd(vcpu); addr = kvm_vgic_global_state.vcpu_hyp_va; addr += fault_ipa - vgic->vgic_cpu_base; diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index bc52d44a573d..585491fbda80 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -457,7 +457,7 @@ int vgic_v2_map_resources(struct kvm *kvm) if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, kvm_vgic_global_state.vcpu_base, - KVM_VGIC_V2_CPU_SIZE, true); + KVM_VGIC_V2_CPU_SIZE - SZ_4K, true); if (ret) { kvm_err("Unable to remap VGIC CPU to VCPU\n"); return ret; From a1650de7c160aace941d27d39b60c38f6f795aa1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:30 +0000 Subject: [PATCH 53/82] KVM: arm64: selftests: gic_v3: Add irq group setting helper Being able to set the group of an interrupt is pretty useful. Add such a helper. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-41-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/include/arm64/gic.h | 1 + tools/testing/selftests/kvm/lib/arm64/gic.c | 6 ++++++ .../testing/selftests/kvm/lib/arm64/gic_private.h | 1 + tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 15 +++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/kvm/include/arm64/gic.h b/tools/testing/selftests/kvm/include/arm64/gic.h index baeb3c859389..cc7a7f34ed37 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic.h +++ b/tools/testing/selftests/kvm/include/arm64/gic.h @@ -57,6 +57,7 @@ void gic_irq_set_pending(unsigned int intid); void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_irq_set_group(unsigned int intid, bool group); void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, vm_paddr_t pend_table); diff --git a/tools/testing/selftests/kvm/lib/arm64/gic.c b/tools/testing/selftests/kvm/lib/arm64/gic.c index 7abbf8866512..b023868fe0b8 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic.c @@ -155,3 +155,9 @@ void gic_irq_set_config(unsigned int intid, bool is_edge) GUEST_ASSERT(gic_common_ops); gic_common_ops->gic_irq_set_config(intid, is_edge); } + +void gic_irq_set_group(unsigned int intid, bool group) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_group(intid, group); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_private.h b/tools/testing/selftests/kvm/lib/arm64/gic_private.h index d24e9ecc96c6..b6a7e30c3eb1 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_private.h +++ 
b/tools/testing/selftests/kvm/lib/arm64/gic_private.h @@ -25,6 +25,7 @@ struct gic_common_ops { void (*gic_irq_clear_pending)(uint32_t intid); bool (*gic_irq_get_pending)(uint32_t intid); void (*gic_irq_set_config)(uint32_t intid, bool is_edge); + void (*gic_irq_set_group)(uint32_t intid, bool group); }; extern const struct gic_common_ops gicv3_ops; diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 66d05506f78b..3e4e1a6a4f7c 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -293,6 +293,20 @@ static void gicv3_enable_redist(volatile void *redist_base) } } +static void gicv3_set_group(uint32_t intid, bool grp) +{ + uint32_t cpu_or_dist; + uint32_t val; + + cpu_or_dist = (get_intid_range(intid) == SPI_RANGE) ? DIST_BIT : guest_get_vcpuid(); + val = gicv3_reg_readl(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4); + if (grp) + val |= BIT(intid % 32); + else + val &= ~BIT(intid % 32); + gicv3_reg_writel(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4, val); +} + static void gicv3_cpu_init(unsigned int cpu) { volatile void *sgi_base; @@ -400,6 +414,7 @@ const struct gic_common_ops gicv3_ops = { .gic_irq_clear_pending = gicv3_irq_clear_pending, .gic_irq_get_pending = gicv3_irq_get_pending, .gic_irq_set_config = gicv3_irq_set_config, + .gic_irq_set_group = gicv3_set_group, }; void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, From 2366295c76c2e09b969b4a5a0829d750bb1ab062 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:31 +0000 Subject: [PATCH 54/82] KVM: arm64: selftests: gic_v3: Disable Group-0 interrupts by default Make sure G0 is disabled at the point of initialising the GIC. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-42-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 3e4e1a6a4f7c..5b0fd95c6b48 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -342,6 +342,8 @@ static void gicv3_cpu_init(unsigned int cpu) /* Set a default priority threshold */ write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); + /* Disable Group-0 interrupts */ + write_sysreg_s(0, SYS_ICC_IGRPEN0_EL1); /* Enable non-secure Group-1 interrupts */ write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); } From 27392612c8823f4b65240949eb0dc77de946285d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:32 +0000 Subject: [PATCH 55/82] KVM: arm64: selftests: vgic_irq: Fix GUEST_ASSERT_IAR_EMPTY() helper No, 0 is not a spurious INTID. Never been, never was.
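For context: the GIC architecture reserves INTIDs 1020-1023 for special purposes, with 1023 meaning "nothing to acknowledge"; INTID 0 is an ordinary, perfectly valid SGI, so accepting it as spurious could mask real bugs. In selftest terms (assuming the usual GICv3 value):

#define IAR_SPURIOUS	1023	/* the only INTID that means "no pending interrupt" */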
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-43-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 6338f5bbdb70..a77562b2976a 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -205,7 +205,7 @@ static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, do { \ uint32_t _intid; \ _intid = gic_get_and_ack_irq(); \ - GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS); \ + GUEST_ASSERT(_intid == IAR_SPURIOUS); \ } while (0) #define CAT_HELPER(a, b) a ## b From 8b7888c5114d280b071f341c072775ee222178b1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:33 +0000 Subject: [PATCH 56/82] KVM: arm64: selftests: vgic_irq: Change configuration before enabling interrupt The architecture is pretty clear that changing the configuration of an enabled interrupt is not OK. It doesn't really matter here, but doing the right thing is not more expensive. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-44-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index a77562b2976a..a8919ef3cea2 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -473,12 +473,12 @@ static void guest_code(struct test_args *args) gic_init(GIC_V3, 1); - for (i = 0; i < nr_irqs; i++) - gic_irq_enable(i); - for (i = MIN_SPI; i < nr_irqs; i++) gic_irq_set_config(i, !level_sensitive); + for (i = 0; i < nr_irqs; i++) + gic_irq_enable(i); + gic_set_eoi_split(args->eoi_split); reset_priorities(args); From 5053c2ab92a1e7cbfd3705be2f4371bf843aad2c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:34 +0000 Subject: [PATCH 57/82] KVM: arm64: selftests: vgic_irq: Exclude timer-controlled interrupts The PPI injection API is clear that you can't inject the timer PPIs from userspace, since they are controlled by the timers themselves. Add an exclusion list for this purpose. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-45-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 31 ++++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index a8919ef3cea2..b0415bdb8952 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -359,8 +359,9 @@ static uint32_t wait_for_and_activate_irq(void) * interrupts for the whole test.
*/ static void test_inject_preemption(struct test_args *args, - uint32_t first_intid, int num, - kvm_inject_cmd cmd) + uint32_t first_intid, int num, + const unsigned long *exclude, + kvm_inject_cmd cmd) { uint32_t intid, prio, step = KVM_PRIO_STEPS; int i; @@ -379,6 +380,10 @@ static void test_inject_preemption(struct test_args *args, for (i = 0; i < num; i++) { uint32_t tmp; intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + KVM_INJECT(cmd, intid); /* Each successive IRQ will preempt the previous one. */ tmp = wait_for_and_activate_irq(); @@ -390,6 +395,10 @@ static void test_inject_preemption(struct test_args *args, /* finish handling the IRQs starting with the highest priority one. */ for (i = 0; i < num; i++) { intid = num - i - 1 + first_intid; + + if (exclude && test_bit(intid - first_intid, exclude)) + continue; + gic_set_eoi(intid); if (args->eoi_split) gic_set_dir(intid); @@ -397,8 +406,12 @@ static void test_inject_preemption(struct test_args *args, local_irq_enable(); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { + if (exclude && test_bit(i, exclude)) + continue; + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + } GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); GUEST_ASSERT_IAR_EMPTY(); @@ -442,14 +455,20 @@ static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) * number of concurrently active IRQs. The number of LRs implemented is * IMPLEMENTATION DEFINED, however, it seems that most implement 4. */ + /* Timer PPIs cannot be injected from userspace */ + static const unsigned long ppi_exclude = (BIT(27 - MIN_PPI) | + BIT(30 - MIN_PPI) | + BIT(28 - MIN_PPI) | + BIT(26 - MIN_PPI)); + if (f->sgi) - test_inject_preemption(args, MIN_SGI, 4, f->cmd); + test_inject_preemption(args, MIN_SGI, 4, NULL, f->cmd); if (f->ppi) - test_inject_preemption(args, MIN_PPI, 4, f->cmd); + test_inject_preemption(args, MIN_PPI, 4, &ppi_exclude, f->cmd); if (f->spi) - test_inject_preemption(args, MIN_SPI, 4, f->cmd); + test_inject_preemption(args, MIN_SPI, 4, NULL, f->cmd); } static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) From fd5fa1c8d09a77c0986158af5b522f6d35830329 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:35 +0000 Subject: [PATCH 58/82] KVM: arm64: selftests: vgic_irq: Remove LR-bound limitation Good news: our GIC emulation is not completely broken, and we can activate as many interrupts as we want. Bump the test to cover all the SGIs, all the allowed PPIs, and 31 SPIs. Yes, 31, because we have 31 available priorities, and the test is not happy with having two interrupts with the same priority. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-46-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index b0415bdb8952..9d4761f1a320 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -449,12 +449,6 @@ static void test_injection_failure(struct test_args *args, static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) { - /* - * Test up to 4 levels of preemption. The reason is that KVM doesn't - * currently implement the ability to have more than the number-of-LRs - * number of concurrently active IRQs. 
The number of LRs implemented is - * IMPLEMENTATION DEFINED, however, it seems that most implement 4. - */ /* Timer PPIs cannot be injected from userspace */ static const unsigned long ppi_exclude = (BIT(27 - MIN_PPI) | BIT(30 - MIN_PPI) | @@ -462,26 +456,25 @@ static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) BIT(26 - MIN_PPI)); if (f->sgi) - test_inject_preemption(args, MIN_SGI, 4, NULL, f->cmd); + test_inject_preemption(args, MIN_SGI, 16, NULL, f->cmd); if (f->ppi) - test_inject_preemption(args, MIN_PPI, 4, &ppi_exclude, f->cmd); + test_inject_preemption(args, MIN_PPI, 16, &ppi_exclude, f->cmd); if (f->spi) - test_inject_preemption(args, MIN_SPI, 4, NULL, f->cmd); + test_inject_preemption(args, MIN_SPI, 31, NULL, f->cmd); } static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) { - /* Test up to 4 active IRQs. Same reason as in test_preemption. */ if (f->sgi) - guest_restore_active(args, MIN_SGI, 4, f->cmd); + guest_restore_active(args, MIN_SGI, 16, f->cmd); if (f->ppi) - guest_restore_active(args, MIN_PPI, 4, f->cmd); + guest_restore_active(args, MIN_PPI, 16, f->cmd); if (f->spi) - guest_restore_active(args, MIN_SPI, 4, f->cmd); + guest_restore_active(args, MIN_SPI, 31, f->cmd); } static void guest_code(struct test_args *args) From b6c68612ab4171e07a7c2ba8864b967207fc3add Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:36 +0000 Subject: [PATCH 59/82] KVM: arm64: selftests: vgic_irq: Perform EOImode==1 deactivation in ack order When EOImode==1, perform the deactivation in the order of activation, just to make things a bit worse for KVM. Yes, I'm nasty. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-47-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 9d4761f1a320..72f7bb0d201e 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -400,8 +400,18 @@ static void test_inject_preemption(struct test_args *args, continue; gic_set_eoi(intid); - if (args->eoi_split) - gic_set_dir(intid); + } + + if (args->eoi_split) { + for (i = 0; i < num; i++) { + intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + + if (args->eoi_split) + gic_set_dir(intid); + } } local_irq_enable(); From d2dee2e849834564293ec9c33165df56dd441399 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:37 +0000 Subject: [PATCH 60/82] KVM: arm64: selftests: vgic_irq: Add asymmetric SPI deactivation test Add a new test case that makes an interrupt pending on a vcpu, activates it, does the priority drop, and then gets *another* vcpu to do the deactivation. Special care is taken not to trigger an exit in the process, so that we are sure that the active interrupt is in an LR. Joy.
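The choreography of the test, as a timeline (paraphrasing the code added below):

 vCPU0                                  vCPU1
 -----                                  -----
 make MIN_SPI pending
 ack + activate it (IAR)
 EOI (priority drop only, EOImode==1)
 shared_data = MIN_SPI  ------------->  sees shared_data == MIN_SPI
                                        DIR(MIN_SPI): deactivate from
                                        another CPU while the LR is live
 sees shared_data == 0  <-------------  shared_data = 0
 assert MIN_SPI is no longer active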
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-48-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 105 +++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 72f7bb0d201e..a53ab809fe8a 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -29,6 +29,7 @@ struct test_args { bool level_sensitive; /* 1 is level, 0 is edge */ int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */ bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */ + uint32_t shared_data; }; /* @@ -801,6 +802,109 @@ done: kvm_vm_free(vm); } +static void guest_code_asym_dir(struct test_args *args, int cpuid) +{ + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + if (cpuid == 0) { + uint32_t intid; + + local_irq_disable(); + + gic_set_priority(MIN_PPI, IRQ_DEFAULT_PRIO); + gic_irq_enable(MIN_SPI); + gic_irq_set_pending(MIN_SPI); + + intid = wait_for_and_activate_irq(); + GUEST_ASSERT_EQ(intid, MIN_SPI); + + gic_set_eoi(intid); + isb(); + + WRITE_ONCE(args->shared_data, MIN_SPI); + dsb(ishst); + + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) == MIN_SPI); + GUEST_ASSERT(!gic_irq_get_active(MIN_SPI)); + } else { + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) != MIN_SPI); + + gic_set_dir(MIN_SPI); + isb(); + + WRITE_ONCE(args->shared_data, 0); + dsb(ishst); + } + + GUEST_DONE(); +} + +static void *test_vcpu_run(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + struct ucall uc; + + while (1) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + + return NULL; +} + +static void test_vgic_two_cpus(void *gcode) +{ + pthread_t thr[2]; + struct kvm_vcpu *vcpus[2]; + struct test_args args = {}; + struct kvm_vm *vm; + vm_vaddr_t args_gva; + int gic_fd, ret; + + vm = vm_create_with_vcpus(2, gcode, vcpus); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpus[0]); + vcpu_init_descriptor_tables(vcpus[1]); + + /* Setup the guest args page (so it gets the args). 
*/ + args_gva = vm_vaddr_alloc_page(vm); + memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); + vcpu_args_set(vcpus[0], 2, args_gva, 0); + vcpu_args_set(vcpus[1], 2, args_gva, 1); + + gic_fd = vgic_v3_setup(vm, 2, 64); + + ret = pthread_create(&thr[0], NULL, test_vcpu_run, vcpus[0]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 0 (%d)\n", ret); + ret = pthread_create(&thr[1], NULL, test_vcpu_run, vcpus[1]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 1 (%d)\n", ret); + + pthread_join(thr[0], NULL); + pthread_join(thr[1], NULL); + + close(gic_fd); + kvm_vm_free(vm); +} + static void help(const char *name) { printf( @@ -857,6 +961,7 @@ int main(int argc, char **argv) test_vgic(nr_irqs, false /* level */, true /* eoi_split */); test_vgic(nr_irqs, true /* level */, false /* eoi_split */); test_vgic(nr_irqs, true /* level */, true /* eoi_split */); + test_vgic_two_cpus(guest_code_asym_dir); } else { test_vgic(nr_irqs, level_sensitive, eoi_split); } From 1c9c71ac1b9f86b3d1841c703e3e928b2ec224c7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:38 +0000 Subject: [PATCH 61/82] KVM: arm64: selftests: vgic_irq: Add Group-0 enable test Add a new test case that injects a Group-0 interrupt together with a bunch of Group-1 interrupts, Acks/EOIs the G1 interrupts, and only then enables G0, expecting to get the G0 interrupt. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-49-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 49 ++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index a53ab809fe8a..ff2c75749f5c 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -846,6 +846,54 @@ static void guest_code_asym_dir(struct test_args *args, int cpuid) GUEST_DONE(); } +static void guest_code_group_en(struct test_args *args, int cpuid) +{ + uint32_t intid; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(0); + gic_set_priority_mask(CPU_PRIO_MASK); + /* SGI0 is G0, which is disabled */ + gic_irq_set_group(0, 0); + + /* Configure all SGIs with decreasing priority */ + for (intid = 0; intid < MIN_PPI; intid++) { + gic_set_priority(intid, (intid + 1) * 8); + gic_irq_enable(intid); + gic_irq_set_pending(intid); + } + + /* Ack and EOI all G1 interrupts */ + for (int i = 1; i < MIN_PPI; i++) { + intid = wait_for_and_activate_irq(); + + GUEST_ASSERT(intid < MIN_PPI); + gic_set_eoi(intid); + isb(); + } + + /* + * Check that SGI0 is still pending, inactive, and that we cannot + * ack anything.
+ */ + GUEST_ASSERT(gic_irq_get_pending(0)); + GUEST_ASSERT(!gic_irq_get_active(0)); + GUEST_ASSERT_IAR_EMPTY(); + GUEST_ASSERT(read_sysreg_s(SYS_ICC_IAR0_EL1) == IAR_SPURIOUS); + + /* Open the G0 gates, and verify we can ack SGI0 */ + write_sysreg_s(1, SYS_ICC_IGRPEN0_EL1); + isb(); + + do { + intid = read_sysreg_s(SYS_ICC_IAR0_EL1); + } while (intid == IAR_SPURIOUS); + + GUEST_ASSERT(intid == 0); + GUEST_DONE(); +} + static void *test_vcpu_run(void *arg) { struct kvm_vcpu *vcpu = arg; @@ -962,6 +1010,7 @@ int main(int argc, char **argv) test_vgic(nr_irqs, true /* level */, false /* eoi_split */); test_vgic(nr_irqs, true /* level */, true /* eoi_split */); test_vgic_two_cpus(guest_code_asym_dir); + test_vgic_two_cpus(guest_code_group_en); } else { test_vgic(nr_irqs, level_sensitive, eoi_split); } From de8842327728d07b5d836688a66ae5fa56902527 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:39 +0000 Subject: [PATCH 62/82] KVM: arm64: selftests: vgic_irq: Add timer deactivation test Add a new test case that triggers the HW deactivation emulation path when trapping ICV_DIR_EL1. This is obviously tied to the way KVM works now, but the test follows the expected architectural behaviour. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-50-maz@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_irq.c | 65 ++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index ff2c75749f5c..9858187c7b6e 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -894,6 +894,70 @@ static void guest_code_group_en(struct test_args *args, int cpuid) GUEST_DONE(); } +static void guest_code_timer_spi(struct test_args *args, int cpuid) +{ + uint32_t intid; + u64 val; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + /* Add a pending SPI so that KVM starts trapping DIR */ + gic_set_priority(MIN_SPI + cpuid, IRQ_DEFAULT_PRIO); + gic_irq_set_pending(MIN_SPI + cpuid); + + /* Configure the timer with a higher priority, make it pending */ + gic_set_priority(27, IRQ_DEFAULT_PRIO - 8); + + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* Enable both interrupts */ + gic_irq_enable(MIN_SPI + cpuid); + gic_irq_enable(27); + + /* The timer must fire */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + /* Check that we can deassert it */ + write_sysreg(0, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(!gic_irq_get_pending(27)); + + /* + * Priority drop, deactivation -- we expect that the host + * deactivation will have been effective + */ + gic_set_eoi(27); + gic_set_dir(27); + + GUEST_ASSERT(!gic_irq_get_active(27)); + + /* Do it one more time */ + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* The timer must fire again */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + GUEST_DONE(); +} + static void *test_vcpu_run(void *arg) { struct kvm_vcpu *vcpu = arg; @@ -1011,6 +1075,7 @@ int main(int argc, char **argv) test_vgic(nr_irqs, true /* level */, true /* eoi_split */); test_vgic_two_cpus(guest_code_asym_dir); 
test_vgic_two_cpus(guest_code_group_en); + test_vgic_two_cpus(guest_code_timer_spi); } else { test_vgic(nr_irqs, level_sensitive, eoi_split); } From 64d67e7add109bfc54eac454558a4355af879ba7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Nov 2025 16:01:44 +0000 Subject: [PATCH 63/82] KVM: arm64: Convert ICH_HCR_EL2_TDIR cap to EARLY_LOCAL_CPU_FEATURE Suzuki noticed that making the ICH_HCR_EL2_TDIR capability a system one isn't a very good idea, should we end up with CPUs that have asymmetric TDIR support (somewhat unlikely, but you never know what level of stupidity vendors are up to). For this hypothetical setup, making this an "EARLY_LOCAL_CPU_FEATURE" is a much better option. This is actually consistent with what we already do with the GICv5 legacy interface, so flip the capability over. Reported-by: Suzuki K Poulose Signed-off-by: Marc Zyngier Fixes: 2a28810cbb8b2 ("KVM: arm64: GICv3: Detect and work around the lack of ICV_DIR_EL1 trapping") Link: https://lore.kernel.org/r/5df713d4-8b79-4456-8fd1-707ca89a61b6@arm.com Reviewed-by: Suzuki K Poulose Link: https://msgid.link/20251125160144.1086511-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/cpufeature.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5de51cb1b8fe..75fb9a0efcc8 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2325,14 +2325,14 @@ static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry, BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV3_CPUIF); BUILD_BUG_ON(ARM64_HAS_ICH_HCR_EL2_TDIR <= ARM64_HAS_GICV5_LEGACY); - if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF) && + if (!this_cpu_has_cap(ARM64_HAS_GICV3_CPUIF) && !is_midr_in_range_list(has_vgic_v3)) return false; if (!is_hyp_mode_available()) return false; - if (cpus_have_cap(ARM64_HAS_GICV5_LEGACY)) + if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY)) return true; if (is_kernel_in_hyp_mode()) @@ -2863,7 +2863,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { */ .desc = "ICV_DIR_EL1 trapping", .capability = ARM64_HAS_ICH_HCR_EL2_TDIR, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .type = ARM64_CPUCAP_EARLY_LOCAL_CPU_FEATURE, .matches = can_trap_icv_dir_el1, }, #ifdef CONFIG_ARM64_E0PD From 6b49f70022ed607bab66da60c7b332f39cda4ff1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:46 -0800 Subject: [PATCH 64/82] KVM: arm64: Teach ptdump about FEAT_XNX permissions Although KVM doesn't make direct use of the feature, guest hypervisors can use FEAT_XNX, which influences the permissions of the shadow stage-2. Update ptdump to separately print the privileged and unprivileged execute permissions.
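For reference, the four stage-2 XN[1:0] encodings the new output distinguishes (matching the table in the diff below):

/*
 *  XN[1:0]  EL1 exec  EL0 exec  ptdump output
 *  0b00     yes       yes       "px ux "
 *  0b01     no        yes       "PXNux "
 *  0b10     no        no        "PXNUXN"
 *  0b11     yes       no        "px UXN"
 */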
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-5-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/ptdump.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c index dc5acfb00af9..6cbe018fd6fd 100644 --- a/arch/arm64/kvm/ptdump.c +++ b/arch/arm64/kvm/ptdump.c @@ -31,27 +31,46 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = { .val = PTE_VALID, .set = " ", .clear = "F", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R, .set = "R", .clear = " ", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, .set = "W", .clear = " ", - }, { + }, + { .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, - .val = KVM_PTE_LEAF_ATTR_HI_S2_XN, - .set = "NX", - .clear = "x ", - }, { + .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px ux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNux ", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "PXNUXN", + }, + { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN, + .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN), + .set = "px UXN", + }, + { .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF, .val = KVM_PTE_LEAF_ATTR_LO_S2_AF, .set = "AF", .clear = " ", - }, { + }, + { .mask = PMD_TYPE_MASK, .val = PMD_TYPE_SECT, .set = "BLK", From 692650bd7b12532798c2022cb53869a07a288fbe Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:47 -0800 Subject: [PATCH 65/82] KVM: arm64: nv: Advertise support for FEAT_XNX Everything is in place to support FEAT_XNX, advertise support. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-6-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 92b2a69f0b89..08839a320a45 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1559,7 +1559,6 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) val &= ~(ID_AA64MMFR1_EL1_CMOW | ID_AA64MMFR1_EL1_nTLBPA | ID_AA64MMFR1_EL1_ETS | - ID_AA64MMFR1_EL1_XNX | ID_AA64MMFR1_EL1_HAFDBS); /* FEAT_E2H0 implies no VHE */ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) From cdba9da34b145eb2f3c502279a454f9a1a8346c1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:48 -0800 Subject: [PATCH 66/82] KVM: arm64: Call helper for reading descriptors directly Going through a function pointer doesn't serve much purpose when there's only one implementation. 
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-7-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 08839a320a45..e4928a6a3672 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -124,7 +124,6 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) } struct s2_walk_info { - int (*read_desc)(phys_addr_t pa, u64 *desc, void *data); void *data; u64 baddr; unsigned int max_oa_bits; @@ -199,6 +198,13 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) return 0; } +static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) +{ + struct kvm_vcpu *vcpu = data; + + return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); +} + /* * This is essentially a C-version of the pseudo code from the ARM ARM * AArch64.TranslationTableWalk function. I strongly recommend looking at @@ -257,7 +263,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, >> (addr_bottom - 3); paddr = base_addr | index; - ret = wi->read_desc(paddr, &desc, wi->data); + ret = read_guest_s2_desc(paddr, &desc, wi->data); if (ret < 0) return ret; @@ -325,13 +331,6 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, return 0; } -static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) -{ - struct kvm_vcpu *vcpu = data; - - return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); -} - static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) { wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK; @@ -364,7 +363,6 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, if (!vcpu_has_nv(vcpu)) return 0; - wi.read_desc = read_guest_s2_desc; wi.data = vcpu; wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); From 977d1bf15c5179276d93468abbc00a224908ff72 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:49 -0800 Subject: [PATCH 67/82] KVM: arm64: nv: Stop passing vCPU through void ptr in S2 PTW The stage-2 table walker passes down the vCPU as a void pointer. That might've made sense if the walker was generic although at this point it is clear this will only ever be used in the context of a vCPU. 
Suggested-by: Marc Zyngier Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-8-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index e4928a6a3672..94eff8307aad 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -124,7 +124,6 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) } struct s2_walk_info { - void *data; u64 baddr; unsigned int max_oa_bits; unsigned int pgshift; @@ -198,10 +197,8 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) return 0; } -static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) +static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc) { - struct kvm_vcpu *vcpu = data; - return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); } @@ -212,7 +209,7 @@ static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data) * * Must be called with the kvm->srcu read lock held */ -static int walk_nested_s2_pgd(phys_addr_t ipa, +static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, struct s2_walk_info *wi, struct kvm_s2_trans *out) { int first_block_level, level, stride, input_size, base_lower_bound; @@ -263,7 +260,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, >> (addr_bottom - 3); paddr = base_addr | index; - ret = read_guest_s2_desc(paddr, &desc, wi->data); + ret = read_guest_s2_desc(vcpu, paddr, &desc); if (ret < 0) return ret; @@ -363,14 +360,13 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, if (!vcpu_has_nv(vcpu)) return 0; - wi.data = vcpu; wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); vtcr_to_walk_info(vtcr, &wi); wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE; - ret = walk_nested_s2_pgd(gipa, &wi, result); + ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result); if (ret) result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC); From fabf321cba4be0d0dcbb39e97c3deb572fec2f8d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:50 -0800 Subject: [PATCH 68/82] KVM: arm64: Handle endianness in read helper for emulated PTW Implementing FEAT_HAFDBS means adding another descriptor accessor that needs to deal with the guest-configured endianness. Prepare by moving the endianness handling into the read accessor and out of the main body of the S1/S2 PTWs. 
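The essence of the helper, as a standalone sketch (assuming a little-endian host, where the kernel's be64_to_cpu()/le64_to_cpu() accessors reduce to a byte swap or a no-op respectively):

#include <stdint.h>

/* Descriptors are stored in the guest hypervisor's endianness (SCTLR_ELx.EE). */
static uint64_t fixup_desc(uint64_t raw, int guest_is_big_endian)
{
	return guest_is_big_endian ? __builtin_bswap64(raw) : raw;
}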
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-9-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 25 +++++++++++++++++++------ arch/arm64/kvm/nested.c | 32 ++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index be26d5aa668c..a295a37dd3b1 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -362,6 +362,24 @@ transfault: return -EFAULT; } +static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, + struct s1_walk_info *wi) +{ + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; +} + static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { @@ -414,17 +432,12 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return ret; } - ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi); if (ret) { fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); return ret; } - if (wi->be) - desc = be64_to_cpu((__force __be64)desc); - else - desc = le64_to_cpu((__force __le64)desc); - /* Invalid descriptor */ if (!(desc & BIT(0))) goto transfault; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 94eff8307aad..75d26c0ba3e0 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -197,9 +197,26 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output) return 0; } -static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc) +static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc, + struct s2_walk_info *wi) { - return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc)); + u64 val; + int r; + + r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val)); + if (r) + return r; + + /* + * Handle reversed descriptors if endianness differs between the + * host and the guest hypervisor. + */ + if (wi->be) + *desc = be64_to_cpu((__force __be64)val); + else + *desc = le64_to_cpu((__force __le64)val); + + return 0; } /* @@ -260,19 +277,10 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, >> (addr_bottom - 3); paddr = base_addr | index; - ret = read_guest_s2_desc(vcpu, paddr, &desc); + ret = read_guest_s2_desc(vcpu, paddr, &desc, wi); if (ret < 0) return ret; - /- * Handle reversedescriptors if endianness differs between the - * host and the guest hypervisor. - */ - if (wi->be) - desc = be64_to_cpu((__force __be64)desc); - else - desc = le64_to_cpu((__force __le64)desc); - /* Check for valid descriptor at this point */ if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); From 590e694820bfd70e3de78fcb98b16c98a905230e Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:51 -0800 Subject: [PATCH 69/82] KVM: arm64: nv: Use pgtable definitions in stage-2 walk Use the existing page table definitions instead of magic numbers for the stage-2 table walk.
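For the record, the magic numbers being retired and the definitions that replace them (as used in the diff below):

/*
 *  desc & 1           ->  KVM_PTE_VALID
 *  (desc & 3) == 1    ->  FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK
 *  desc & BIT(10)     ->  KVM_PTE_LEAF_ATTR_LO_S2_AF      (access flag)
 *  desc & (0b01 << 6) ->  KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R  (S2AP read)
 *  desc & (0b10 << 6) ->  KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W  (S2AP write)
 */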
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-10-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 75d26c0ba3e0..a096766c6ec3 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -282,14 +282,23 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return ret; /* Check for valid descriptor at this point */ - if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { + if (!(desc & KVM_PTE_VALID)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); out->desc = desc; return 1; } - /* We're at the final level or block translation level */ - if ((desc & 3) == 1 || level == 3) + if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) { + if (level < 3) + break; + + out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); + out->desc = desc; + return 1; + } + + /* We're at the final level */ + if (level == 3) break; if (check_output_size(wi, desc)) { @@ -316,7 +325,7 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return 1; } - if (!(desc & BIT(10))) { + if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); out->desc = desc; return 1; @@ -329,8 +338,8 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, (ipa & GENMASK_ULL(addr_bottom - 1, 0)); out->output = paddr; out->block_size = 1UL << ((3 - level) * stride + wi->pgshift); - out->readable = desc & (0b01 << 6); - out->writable = desc & (0b10 << 6); + out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; out->level = level; out->desc = desc; return 0; From f6927b41d57390c597a126063e2e518911976878 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:52 -0800 Subject: [PATCH 70/82] KVM: arm64: Add helper for swapping guest descriptor Implementing FEAT_HAFDBS in KVM's software PTWs requires the ability to CAS a descriptor to update the in-memory value. Add an accessor to do exactly that, coping with the fact that guest descriptors are in user memory (duh). While FEAT_LSE is required on any system that implements NV, KVM now uses the stage-1 PTW for non-nested use cases, meaning an LL/SC implementation is necessary as well.
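The contract of the new helper, sketched with C11 atomics (a userspace analogue for illustration only; the real thing runs the CAS against user memory using LSE or the LL/SC fallback below):

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

/*
 * Returns 0 if *ptep was atomically changed from old to new, and
 * -EAGAIN if another agent updated the descriptor first, in which
 * case the caller is expected to restart the walk.
 */
static int swap_desc(_Atomic uint64_t *ptep, uint64_t old, uint64_t new)
{
	return atomic_compare_exchange_strong(ptep, &old, new) ? 0 : -EAGAIN;
}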
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-11-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_nested.h | 2 + arch/arm64/kvm/at.c | 87 +++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 5d967b60414c..6dbc2908aed9 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -403,4 +403,6 @@ void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); (FIX_VNCR - __c); \ }) +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new); + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index a295a37dd3b1..581c4c49d9cd 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1650,3 +1650,90 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) return ret; } } + +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + uaccess_enable_privileged(); + + asm volatile(__LSE_PREAMBLE + "1: cas %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} + +static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + int ret = 1; + u64 tmp; + + uaccess_enable_privileged(); + + asm volatile("prfm pstl1strm, %[addr]\n" + "1: ldxr %[tmp], %[addr]\n" + "sub %[tmp], %[tmp], %[old]\n" + "cbnz %[tmp], 3f\n" + "2: stlxr %w[ret], %[new], %[addr]\n" + "3:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) + : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp) + : [old] "r" (old), [new] "r" (new) + : "memory"); + + uaccess_disable_privileged(); + + /* STLXR didn't update the descriptor, or the compare failed */ + if (ret == 1) + return -EAGAIN; + + return ret; +} + +int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + u64 __user *ptep; + bool writable; + int offset; + gfn_t gfn; + int r; + + lockdep_assert(srcu_read_lock_held(&kvm->srcu)); + + gfn = ipa >> PAGE_SHIFT; + offset = offset_in_page(ipa); + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return -EINVAL; + if (!writable) + return -EPERM; + + ptep = (u64 __user *)hva + offset; + if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + r = __lse_swap_desc(ptep, old, new); + else + r = __llsc_swap_desc(ptep, old, new); + + if (r < 0) + return r; + + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} From 92c6443222ca4289191d797ac79176c560886998 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:53 -0800 Subject: [PATCH 71/82] KVM: arm64: Propagate PTW errors up to AT emulation KVM's software PTW will soon support 'hardware' updates to the access flag. Similar to fault handling, races to update the descriptor will be handled by restarting the instruction. Prepare for this by propagating errors up to the AT emulation, only retiring the instruction if the walk succeeds. 
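The resulting control flow, sketched end to end (function names as in the diff below):

  AT instruction traps
    -> handle_at_slow() -> walk_s1() fails a descriptor update (-EAGAIN)
    -> the error bubbles up through __kvm_at_s1e01()/__kvm_at_s1e2()/__kvm_at_s12()
    -> handle_at_*() returns false, so the trapped instruction is not retired
    -> the guest re-executes the AT instruction and the walk restarts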
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-12-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 6 ++--- arch/arm64/kvm/at.c | 43 ++++++++++++++++++++++---------- arch/arm64/kvm/sys_regs.c | 9 ++++--- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9da54d4ee49e..090f7b740bdc 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -246,9 +246,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding); extern void __kvm_timer_set_cntvoff(u64 cntvoff); -extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); -extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); -extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 581c4c49d9cd..2a99380ada6f 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1234,7 +1234,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu, wr->pr &= !pan; } -static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par) { struct s1_walk_result wr = {}; struct s1_walk_info wi = {}; @@ -1259,6 +1259,11 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* + * Race to update a descriptor -- restart the walk. + */ + if (ret == -EAGAIN) + return ret; if (ret) goto compute_par; @@ -1292,7 +1297,8 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); compute_par: - return compute_par_s1(vcpu, &wi, &wr); + *par = compute_par_s1(vcpu, &wi, &wr); + return 0; } /* @@ -1420,9 +1426,10 @@ static bool par_check_s1_access_fault(u64 par) !(par & SYS_PAR_EL1_S)); } -void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + int ret; /* * If PAR_EL1 reports that AT failed on a S1 permission or access @@ -1434,15 +1441,20 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) */ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par) && - !par_check_s1_access_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + !par_check_s1_access_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { u64 par; + int ret; /* * We've trapped, so everything is live on the CPU. 
As we will be @@ -1489,13 +1501,17 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) } /* We failed the translation, let's replay it in slow motion */ - if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) - par = handle_at_slow(vcpu, op, vaddr); + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) { + ret = handle_at_slow(vcpu, op, vaddr, &par); + if (ret) + return ret; + } vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } -void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) { struct kvm_s2_trans out = {}; u64 ipa, par; @@ -1522,13 +1538,13 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) break; default: WARN_ON_ONCE(1); - return; + return 0; } __kvm_at_s1e01(vcpu, op, vaddr); par = vcpu_read_sys_reg(vcpu, PAR_EL1); if (par & SYS_PAR_EL1_F) - return; + return 0; /* * If we only have a single stage of translation (EL2&0), exit @@ -1536,14 +1552,14 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) */ if (compute_translation_regime(vcpu, op) == TR_EL20 || !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) - return; + return 0; /* Do the stage-2 translation */ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); out.esr = 0; ret = kvm_walk_nested_s2(vcpu, ipa, &out); if (ret < 0) - return; + return ret; /* Check the access permission */ if (!out.esr && @@ -1552,6 +1568,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) par = compute_par_s12(vcpu, par, &out); vcpu_write_sys_reg(vcpu, par, PAR_EL1); + return 0; } /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e67eb39ddc11..61830eb3607c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3767,7 +3767,8 @@ static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - __kvm_at_s1e01(vcpu, op, p->regval); + if (__kvm_at_s1e01(vcpu, op, p->regval)) + return false; return true; } @@ -3784,7 +3785,8 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; } - __kvm_at_s1e2(vcpu, op, p->regval); + if (__kvm_at_s1e2(vcpu, op, p->regval)) + return false; return true; } @@ -3794,7 +3796,8 @@ static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - __kvm_at_s12(vcpu, op, p->regval); + if (__kvm_at_s12(vcpu, op, p->regval)) + return false; return true; } From bff8aa213dee742b09151a34494418050afed948 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:54 -0800 Subject: [PATCH 72/82] KVM: arm64: Implement HW access flag management in stage-1 SW PTW Atomically update the Access flag at stage-1 when the guest has configured the MMU to do so. Make the implementation choice (and liberal interpretation of speculation) that any access type updates the Access flag, including AT and CMO instructions. Restart the entire walk by returning to the exception-generating instruction in the case of a failed Access flag update. 
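From the guest's side the effect is as follows (a sketch, mirroring what the selftest added later in this series does):

  /* Opt in to hardware Access flag updates for the stage-1 regime */
  sysreg_clear_set(tcr_el1, 0, TCR_HA);
  isb();

  /*
   * From here, any software walk KVM performs for this regime
   * (including AT and CMO emulation) sets PTE_AF atomically rather
   * than reporting an Access flag fault.
   */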
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-13-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_nested.h | 1 + arch/arm64/kvm/at.c | 33 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 6dbc2908aed9..905c658057a4 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -353,6 +353,7 @@ struct s1_walk_info { bool be; bool s2; bool pa52bit; + bool ha; }; struct s1_walk_result { diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 2a99380ada6f..e39f814d247f 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -346,6 +346,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + wi->ha = tcr & TCR_HA; + return 0; addrsz: @@ -380,10 +382,24 @@ static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc, return 0; } +static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, + struct s1_walk_info *wi) +{ + if (wi->be) { + old = cpu_to_be64(old); + new = cpu_to_be64(new); + } else { + old = cpu_to_le64(old); + new = cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { - u64 va_top, va_bottom, baddr, desc; + u64 va_top, va_bottom, baddr, desc, new_desc, ipa; int level, stride, ret; level = wi->sl; @@ -393,7 +409,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, va_top = get_ia_size(wi) - 1; while (1) { - u64 index, ipa; + u64 index; va_bottom = (3 - level) * stride + wi->pgshift; index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); @@ -438,6 +454,8 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, return ret; } + new_desc = desc; + /* Invalid descriptor */ if (!(desc & BIT(0))) goto transfault; @@ -490,6 +508,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) goto addrsz; + if (wi->ha) + new_desc |= PTE_AF; + + if (new_desc != desc) { + ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + if (!(desc & PTE_AF)) { fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false); return -EACCES; From e4c7dfac2f1ab848ffd356b3e76827ea404bbd94 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:55 -0800 Subject: [PATCH 73/82] KVM: arm64: nv: Implement HW access flag management in stage-2 SW PTW Give the stage-2 walk similar treatment to stage-1: update the access flag during the table walk and do so for any walk context. 
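This is the stage-2 analogue of the previous change: the guest hypervisor opts in through its VTCR, and the walker picks the bit up when converting VTCR_EL2 into walk info (see vtcr_to_walk_info() in the diff below). Guest hypervisor side, as a sketch:

  /* Enable HW Access flag updates for the nested stage-2 */
  sysreg_clear_set(vtcr_el2, 0, VTCR_EL2_HA);
  isb();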
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-14-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 5 +++++ arch/arm64/kvm/nested.c | 44 ++++++++++++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 96f1786c72fe..b9aebca90f59 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2012,6 +2012,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) u32 esr; ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret == -EAGAIN) { + ret = 1; + goto out_unlock; + } + if (ret) { esr = kvm_s2_trans_esr(&nested_trans); kvm_inject_s2_fault(vcpu, esr); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index a096766c6ec3..6495442f400a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -124,12 +124,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) } struct s2_walk_info { - u64 baddr; - unsigned int max_oa_bits; - unsigned int pgshift; - unsigned int sl; - unsigned int t0sz; - bool be; + u64 baddr; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int sl; + unsigned int t0sz; + bool be; + bool ha; }; static u32 compute_fsc(int level, u32 fsc) @@ -219,6 +220,20 @@ static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc, return 0; } +static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new, + struct s2_walk_info *wi) +{ + if (wi->be) { + old = cpu_to_be64(old); + new = cpu_to_be64(new); + } else { + old = cpu_to_le64(old); + new = cpu_to_le64(new); + } + + return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); +} + /* * This is essentially a C-version of the pseudo code from the ARM ARM * AArch64.TranslationTableWalk function. I strongly recommend looking at @@ -232,7 +247,7 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, int first_block_level, level, stride, input_size, base_lower_bound; phys_addr_t base_addr; unsigned int addr_top, addr_bottom; - u64 desc; /* page table entry */ + u64 desc, new_desc; /* page table entry */ int ret; phys_addr_t paddr; @@ -281,6 +296,8 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, if (ret < 0) return ret; + new_desc = desc; + /* Check for valid descriptor at this point */ if (!(desc & KVM_PTE_VALID)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); @@ -325,6 +342,17 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa, return 1; } + if (wi->ha) + new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + + if (new_desc != desc) { + ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi); + if (ret) + return ret; + + desc = new_desc; + } + if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); out->desc = desc; @@ -363,6 +391,8 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi) /* Global limit for now, should eventually be per-VM */ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); + + wi->ha = vtcr & VTCR_EL2_HA; } int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, From d5bbb76f447420681abdcfa4ad32344d11188d00 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:56 -0800 Subject: [PATCH 74/82] KVM: arm64: nv: Expose hardware access flag management to NV guests Everything is in place to update the access flag at S1 and S2. Expose support for the access flag flavor of FEAT_HAFDBS to NV guests. 
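A nested hypervisor can then discover the feature in the usual way (a sketch; SYS_FIELD_GET() as used by the selftest later in the series):

  u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1);

  switch (SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, mmfr1)) {
  case 0:         /* no HW updates: Access flag faults are taken instead */
          break;
  case 1:         /* Access flag only: the most KVM now advertises to NV */
          break;
  default:        /* AF+DBS (>= 2): clamped to AF by the limit in the diff below */
          break;
  }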
Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-15-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/nested.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 6495442f400a..88d7dfb44410 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1599,11 +1599,13 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val) case SYS_ID_AA64MMFR1_EL1: val &= ~(ID_AA64MMFR1_EL1_CMOW | ID_AA64MMFR1_EL1_nTLBPA | - ID_AA64MMFR1_EL1_ETS | - ID_AA64MMFR1_EL1_HAFDBS); + ID_AA64MMFR1_EL1_ETS); + /* FEAT_E2H0 implies no VHE */ if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) val &= ~ID_AA64MMFR1_EL1_VH; + + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF); break; case SYS_ID_AA64MMFR2_EL1: From 66f188858385d640163fbf866d9c11b7741da91a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 11:01:57 -0800 Subject: [PATCH 75/82] KVM: arm64: selftests: Add test for AT emulation Add a basic test for AT emulation in the EL2&0 and EL1&0 translation regimes. Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Link: https://msgid.link/20251124190158.177318-16-oupton@kernel.org Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/Makefile.kvm | 1 + tools/testing/selftests/kvm/arm64/at.c | 166 ++++++++++++++++++ .../testing/selftests/kvm/include/kvm_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 10 ++ 4 files changed, 178 insertions(+) create mode 100644 tools/testing/selftests/kvm/arm64/at.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..81b3aa54678a 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -156,6 +156,7 @@ TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs TEST_GEN_PROGS_arm64 += arm64/arch_timer_edge_cases +TEST_GEN_PROGS_arm64 += arm64/at TEST_GEN_PROGS_arm64 += arm64/debug-exceptions TEST_GEN_PROGS_arm64 += arm64/hello_el2 TEST_GEN_PROGS_arm64 += arm64/host_sve diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c new file mode 100644 index 000000000000..acecb6ab5071 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * at - Test for KVM's AT emulation in the EL2&0 and EL1&0 translation regimes. + */ +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include "ucall.h" + +#include + +#define TEST_ADDR 0x80000000 + +enum { + CLEAR_ACCESS_FLAG, + TEST_ACCESS_FLAG, +}; + +static u64 *ptep_hva; + +#define copy_el2_to_el1(reg) \ + write_sysreg_s(read_sysreg_s(SYS_##reg##_EL1), SYS_##reg##_EL12) + +/* Yes, this is an ugly hack */ +#define __at(op, addr) write_sysreg_s(addr, op) + +#define test_at_insn(op, expect_fault) \ +do { \ + u64 par, fsc; \ + bool fault; \ + \ + GUEST_SYNC(CLEAR_ACCESS_FLAG); \ + \ + __at(OP_AT_##op, TEST_ADDR); \ + isb(); \ + par = read_sysreg(par_el1); \ + \ + fault = par & SYS_PAR_EL1_F; \ + fsc = FIELD_GET(SYS_PAR_EL1_FST, par); \ + \ + __GUEST_ASSERT((expect_fault) == fault, \ + "AT "#op": %sexpected fault (par: %lx)1", \ + (expect_fault) ? 
"" : "un", par); \ + if ((expect_fault)) { \ + __GUEST_ASSERT(fsc == ESR_ELx_FSC_ACCESS_L(3), \ + "AT "#op": expected access flag fault (par: %lx)", \ + par); \ + } else { \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \ + GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \ + GUEST_SYNC(TEST_ACCESS_FLAG); \ + } \ +} while (0) + +static void test_at(bool expect_fault) +{ + test_at_insn(S1E2R, expect_fault); + test_at_insn(S1E2W, expect_fault); + + /* Reuse the stage-1 MMU context from EL2 at EL1 */ + copy_el2_to_el1(SCTLR); + copy_el2_to_el1(MAIR); + copy_el2_to_el1(TCR); + copy_el2_to_el1(TTBR0); + copy_el2_to_el1(TTBR1); + + /* Disable stage-2 translation and enter a non-host context */ + write_sysreg(0, vtcr_el2); + write_sysreg(0, vttbr_el2); + sysreg_clear_set(hcr_el2, HCR_EL2_TGE | HCR_EL2_VM, 0); + isb(); + + test_at_insn(S1E1R, expect_fault); + test_at_insn(S1E1W, expect_fault); +} + +static void guest_code(void) +{ + sysreg_clear_set(tcr_el1, TCR_HA, 0); + isb(); + + test_at(true); + + if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1))) + GUEST_DONE(); + + /* + * KVM's software PTW makes the implementation choice that the AT + * instruction sets the access flag. + */ + sysreg_clear_set(tcr_el1, 0, TCR_HA); + isb(); + test_at(false); + + GUEST_DONE(); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + switch (uc->args[1]) { + case CLEAR_ACCESS_FLAG: + /* + * Delete + reinstall the memslot to invalidate stage-2 + * mappings of the stage-1 page tables, forcing KVM to + * use the 'slow' AT emulation path. + * + * This and clearing the access flag from host userspace + * ensures that the access flag cannot be set speculatively + * and is reliably cleared at the time of the AT instruction. 
+ */ + clear_bit(__ffs(PTE_AF), ptep_hva); + vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]); + break; + case TEST_ACCESS_FLAG: + TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva), + "Expected access flag to be set (desc: %lu)", *ptep_hva); + break; + default: + TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]); + } +} + +static void run_test(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + return; + case UCALL_SYNC: + handle_sync(vcpu, &uc); + continue; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + default: + TEST_FAIL("Unexpeced ucall: %lu", uc.cmd); + } + } +} + +int main(void) +{ + struct kvm_vcpu_init init; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_EL2)); + + vm = vm_create(1); + + kvm_get_default_vcpu_target(vm, &init); + init.features[0] |= BIT(KVM_ARM_VCPU_HAS_EL2); + vcpu = aarch64_vcpu_add(vm, 0, &init, guest_code); + kvm_arch_vm_finalize_vcpus(vm); + + virt_map(vm, TEST_ADDR, TEST_ADDR, 1); + ptep_hva = virt_get_pte_hva_at_level(vm, TEST_ADDR, 3); + run_test(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d3f3e455c031..41467dad9178 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -715,6 +715,7 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) #endif void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1a93d6361671..d6538bb17740 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1201,6 +1201,16 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) ret, errno, slot, flags); } +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot) +{ + struct userspace_mem_region *region = memslot2region(vm, slot); + struct kvm_userspace_memory_region2 tmp = region->region; + + tmp.memory_size = 0; + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &tmp); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); +} + /* * VM Memory Region Move * From 36fe022f884bb936d911a0d2e93819aba11daceb Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 24 Nov 2025 15:54:09 -0800 Subject: [PATCH 76/82] KVM: arm64: Fix compilation when CONFIG_ARM64_USE_LSE_ATOMICS=n __lse_swap_desc() is compiled unconditionally, even if LSE is disabled using the config option. Align with the spirit of the config option and fix some build errors due to __LSE_PREAMBLE being undefined with the application of some ifdeffery. 
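Note that the -EINVAL stub should be unreachable in practice, assuming the ARM64_HAS_LSE_ATOMICS capability is only ever detected when CONFIG_ARM64_LSE_ATOMICS=y; the runtime dispatch in __kvm_at_swap_desc() then simply degrades to the LL/SC path (sketch of the existing dispatch):

  if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))  /* false with LSE off */
          r = __lse_swap_desc(ptep, old, new);
  else
          r = __llsc_swap_desc(ptep, old, new);    /* always taken */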
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511250700.kAutzJFm-lkp@intel.com/ Link: https://msgid.link/20251124235409.1731253-1-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index e39f814d247f..f774a02d9393 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1697,6 +1697,7 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) } } +#ifdef CONFIG_ARM64_LSE_ATOMICS static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) { u64 tmp = old; @@ -1721,6 +1722,12 @@ static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) return ret; } +#else +static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + return -EINVAL; +} +#endif static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new) { From b0fc8329ec98ea891a5e47821db6aee1d564bff6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 25 Nov 2025 20:48:48 +0000 Subject: [PATCH 77/82] KVM: arm64: Add endian casting to kvm_swap_s[12]_desc() Keep sparse quiet by explicitly casting endianness conversion when swapping S1 and S2 descriptors. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511260246.JQDGsQKa-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202511260344.9XehvH5Q-lkp@intel.com/ Fixes: c59ca4b5b0c3f ("KVM: arm64: Implement HW access flag management in stage-1 SW PTW") Fixes: 39db933ba67f8 ("KVM: arm64: nv: Implement HW access flag management in stage-2 SW PTW") Signed-off-by: Marc Zyngier Link: https://msgid.link/20251125204848.1136383-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 8 ++++---- arch/arm64/kvm/nested.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index f774a02d9393..d25fef0f66e2 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -386,11 +386,11 @@ static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new, struct s1_walk_info *wi) { if (wi->be) { - old = cpu_to_be64(old); - new = cpu_to_be64(new); + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); } else { - old = cpu_to_le64(old); - new = cpu_to_le64(new); + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); } return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 88d7dfb44410..911fc99ed99d 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -224,11 +224,11 @@ static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u6 struct s2_walk_info *wi) { if (wi->be) { - old = cpu_to_be64(old); - new = cpu_to_be64(new); + old = (__force u64)cpu_to_be64(old); + new = (__force u64)cpu_to_be64(new); } else { - old = cpu_to_le64(old); - new = cpu_to_le64(new); + old = (__force u64)cpu_to_le64(old); + new = (__force u64)cpu_to_le64(new); } return __kvm_at_swap_desc(vcpu->kvm, pa, old, new); From d98a04dc190672bbd29654a159f7621e0c63adcf Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 25 Nov 2025 10:59:15 -0700 Subject: [PATCH 78/82] KVM: arm64: Add break to default case in kvm_pgtable_stage2_pte_prot() Clang warns (or errors with CONFIG_WERROR=y / W=e): arch/arm64/kvm/hyp/pgtable.c:757:2: error: label at end of compound statement is a C23 extension [-Werror,-Wc23-extensions] 757 | } | ^ With older versions of clang (15 and older) and GCC 
(at least the minimum supported, 8.1), this is an unconditional hard error: arch/arm64/kvm/hyp/pgtable.c: In function 'kvm_pgtable_stage2_pte_prot': arch/arm64/kvm/hyp/pgtable.c:756:2: error: label at end of compound statement default: ^~~~~~~ arch/arm64/kvm/hyp/pgtable.c:756:10: error: label at end of compound statement: expected statement default: ^ ; Add a break statement to this default case to clear up the error/warning. Fixes: 2608563b466b ("KVM: arm64: Add support for FEAT_XNX stage-2 permissions") Signed-off-by: Nathan Chancellor Acked-by: Marc Zyngier Link: https://msgid.link/20251125-arm64-kvm-hyp-pgtable-fix-c23-ext-warn-v1-1-98b506ddefbf@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index e1d75f965027..5a1cf42df4c5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -754,6 +754,7 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) prot |= KVM_PGTABLE_PROT_PX; break; default: + break; } return prot; From 05474b7bc75d215a147b44b339ba4e9638b74382 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 28 Nov 2025 17:51:24 +0000 Subject: [PATCH 79/82] KVM: arm64: Fix spelling mistake "Unexpeced" -> "Unexpected" There is a spelling mistake in a TEST_FAIL message. Fix it. Signed-off-by: Colin Ian King Link: https://msgid.link/20251128175124.319094-1-colin.i.king@gmail.com Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/at.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c index acecb6ab5071..c8ee6f520734 100644 --- a/tools/testing/selftests/kvm/arm64/at.c +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -137,7 +137,7 @@ static void run_test(struct kvm_vcpu *vcpu) REPORT_GUEST_ASSERT(uc); return; default: - TEST_FAIL("Unexpeced ucall: %lu", uc.cmd); + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); } } } From 93e8d997812b315bbec946e874171a8b7d785eaf Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Fri, 28 Nov 2025 10:09:43 +0000 Subject: [PATCH 80/82] KVM: arm64: Document KVM_PGTABLE_PROT_{UX,PX} Commit 2608563b466b ("KVM: arm64: Add support for FEAT_XNX stage-2 permissions") added the KVM_PGTABLE_PROX_{UX,PX} permissions to stage 2 and to EL2 translation regimes, but left them undocumented. Let's fix that. Signed-off-by: Alexandru Elisei Link: https://msgid.link/20251128100946.74210-2-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index c72149a607d6..611e62331763 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -240,7 +240,9 @@ enum kvm_pgtable_stage2_flags { /** * enum kvm_pgtable_prot - Page-table permissions and attributes. - * @KVM_PGTABLE_PROT_X: Execute permission. + * @KVM_PGTABLE_PROT_UX: Unprivileged execute permission. + * @KVM_PGTABLE_PROT_PX: Privileged execute permission. + * @KVM_PGTABLE_PROT_X: Privileged and unprivileged execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. 
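Usage sketch for the newly documented permissions, assuming (as the kernel-doc for KVM_PGTABLE_PROT_X implies) that X is simply the composition of PX and UX:

  enum kvm_pgtable_prot prot;

  prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_UX; /* unprivileged-exec only */
  prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_PX; /* privileged-exec only */
  prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X;  /* both, i.e. PX | UX */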
From e88d60c0aa0abbaade177b84f54a868c67231cd7 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Fri, 28 Nov 2025 10:09:44 +0000 Subject: [PATCH 81/82] KVM: arm64: at: Use correct HA bit in TCR_EL2 when regime is EL2 According to ARM DDI 0487L.b, the HA bit in TCR_EL2 when the translation regime is EL2 (or !ELIsInHost(EL2)) is bit 21, not 39. Fixes: c59ca4b5b0c3 ("KVM: arm64: Implement HW access flag management in stage-1 SW PTW") Signed-off-by: Alexandru Elisei Link: https://msgid.link/20251128100946.74210-3-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/kvm/at.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1da290aeedce..e500600e4b9b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -111,6 +111,7 @@ #define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) #define TCR_EL2_HPD (1 << 24) +#define TCR_EL2_HA (1 << 21) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index d25fef0f66e2..6d41a95f6c60 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -346,7 +346,9 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); - wi->ha = tcr & TCR_HA; + wi->ha = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HA, tcr) : + FIELD_GET(TCR_HA, tcr)); return 0; From d52aca1635654805a682afef176473793a63b588 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Fri, 28 Nov 2025 10:09:46 +0000 Subject: [PATCH 82/82] KVM: arm64: at: Update AF on software walk only if VM has FEAT_HAFDBS A guest can write 1 to TCR_ELx.HA, making the KVM software walker update the access flag in a table descriptor even if FEAT_HAFDBS is not present. Avoid this by making wi->ha depend on FEAT_HAFDBS being enabled in the VM, similar to how the software walker treats FEAT_HPDS. This is not needed for VTCR_EL2.HA, since a guest will always write to the in-memory copy of the register, where the HA bit is masked (set to 0) by KVM if the VM doesn't have FEAT_HAFDBS. Fixes: c59ca4b5b0c3 ("KVM: arm64: Implement HW access flag management in stage-1 SW PTW") Signed-off-by: Alexandru Elisei Link: https://msgid.link/20251128100946.74210-5-alexandru.elisei@arm.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/at.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 6d41a95f6c60..53bf70126f81 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -346,7 +346,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); - wi->ha = (wi->regime == TR_EL2 ? + wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF); + wi->ha &= (wi->regime == TR_EL2 ? FIELD_GET(TCR_EL2_HA, tcr) : FIELD_GET(TCR_HA, tcr));