Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Fix another set of FP/SIMD/SVE bugs affecting NV, and plugging some
     missing synchronisation

   - A small fix for the irqbypass hook fixes, tightening the check and
     ensuring that we only deal with MSI for both the old and the new
     route entry

   - Rework the way the shadow LRs are addressed in a nesting
     configuration, plugging an embarrassing bug as well as simplifying
     the whole process

   - Add yet another fix for the dreaded arch_timer_edge_cases selftest

  RISC-V:

   - Fix the size parameter check in SBI SFENCE calls

   - Don't treat SBI HFENCE calls as NOPs

  x86 TDX:

   - Complete API for handling complex TDVMCALLs in userspace.

     This was delayed because the spec lacked a way for userspace to
     deny supporting these calls; the new exit code is now approved"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: TDX: Exit to userspace for GetTdVmCallInfo
  KVM: TDX: Handle TDG.VP.VMCALL<GetQuote>
  KVM: TDX: Add new TDVMCALL status code for unsupported subfuncs
  KVM: arm64: VHE: Centralize ISBs when returning to host
  KVM: arm64: Remove cpacr_clear_set()
  KVM: arm64: Remove ad-hoc CPTR manipulation from kvm_hyp_handle_fpsimd()
  KVM: arm64: Remove ad-hoc CPTR manipulation from fpsimd_sve_sync()
  KVM: arm64: Reorganise CPTR trap manipulation
  KVM: arm64: VHE: Synchronize CPTR trap deactivation
  KVM: arm64: VHE: Synchronize restore of host debug registers
  KVM: arm64: selftests: Close the GIC FD in arch_timer_edge_cases
  KVM: arm64: Explicitly treat routing entry type changes as changes
  KVM: arm64: nv: Fix tracking of shadow list registers
  RISC-V: KVM: Don't treat SBI HFENCE calls as NOPs
  RISC-V: KVM: Fix the size parameter check in SBI SFENCE calls
This commit is contained in:
Linus Torvalds
2025-06-22 09:58:23 -07:00
14 changed files with 376 additions and 283 deletions

View File

@@ -6645,7 +6645,8 @@ to the byte array.
.. note::
For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
KVM_EXIT_EPR, KVM_EXIT_HYPERCALL, KVM_EXIT_TDX,
KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
operations are complete (and guest state is consistent) only after userspace
has re-entered the kernel with KVM_RUN. The kernel side will first finish
incomplete operations and then check for pending signals.
@@ -7174,6 +7175,62 @@ The valid value for 'flags' is:
- KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid
in VMCS. It would run into unknown result if resume the target VM.
::
/* KVM_EXIT_TDX */
struct {
__u64 flags;
__u64 nr;
union {
struct {
u64 ret;
u64 data[5];
} unknown;
struct {
u64 ret;
u64 gpa;
u64 size;
} get_quote;
struct {
u64 ret;
u64 leaf;
u64 r11, r12, r13, r14;
} get_tdvmcall_info;
};
} tdx;
Process a TDVMCALL from the guest. KVM forwards select TDVMCALL based
on the Guest-Hypervisor Communication Interface (GHCI) specification;
KVM bridges these requests to the userspace VMM with minimal changes,
placing the inputs in the union and copying them back to the guest
on re-entry.
Flags are currently always zero, whereas ``nr`` contains the TDVMCALL
number from register R11. The remaining field of the union provide the
inputs and outputs of the TDVMCALL. Currently the following values of
``nr`` are defined:
* ``TDVMCALL_GET_QUOTE``: the guest has requested to generate a TD-Quote
signed by a service hosting TD-Quoting Enclave operating on the host.
Parameters and return value are in the ``get_quote`` field of the union.
The ``gpa`` field and ``size`` specify the guest physical address
(without the shared bit set) and the size of a shared-memory buffer, in
which the TDX guest passes a TD Report. The ``ret`` field represents
the return value of the GetQuote request. When the request has been
queued successfully, the TDX guest can poll the status field in the
shared-memory area to check whether the Quote generation is completed or
not. When completed, the generated Quote is returned via the same buffer.
* ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support
status of TDVMCALLs. The output values for the given leaf should be
placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info``
field of the union.
KVM may add support for more values in the future that may cause a userspace
exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case,
it will enter with output fields already valid; in the common case, the
``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``.
Userspace need not do anything if it does not wish to support a TDVMCALL.
::
/* Fix the size of the union. */

View File

@@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
vcpu_set_flag((v), e); \
} while (0)
#define __build_check_all_or_none(r, bits) \
BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits))
#define __cpacr_to_cptr_clr(clr, set) \
({ \
u64 cptr = 0; \
\
if ((set) & CPACR_EL1_FPEN) \
cptr |= CPTR_EL2_TFP; \
if ((set) & CPACR_EL1_ZEN) \
cptr |= CPTR_EL2_TZ; \
if ((set) & CPACR_EL1_SMEN) \
cptr |= CPTR_EL2_TSM; \
if ((clr) & CPACR_EL1_TTA) \
cptr |= CPTR_EL2_TTA; \
if ((clr) & CPTR_EL2_TAM) \
cptr |= CPTR_EL2_TAM; \
if ((clr) & CPTR_EL2_TCPAC) \
cptr |= CPTR_EL2_TCPAC; \
\
cptr; \
})
#define __cpacr_to_cptr_set(clr, set) \
({ \
u64 cptr = 0; \
\
if ((clr) & CPACR_EL1_FPEN) \
cptr |= CPTR_EL2_TFP; \
if ((clr) & CPACR_EL1_ZEN) \
cptr |= CPTR_EL2_TZ; \
if ((clr) & CPACR_EL1_SMEN) \
cptr |= CPTR_EL2_TSM; \
if ((set) & CPACR_EL1_TTA) \
cptr |= CPTR_EL2_TTA; \
if ((set) & CPTR_EL2_TAM) \
cptr |= CPTR_EL2_TAM; \
if ((set) & CPTR_EL2_TCPAC) \
cptr |= CPTR_EL2_TCPAC; \
\
cptr; \
})
#define cpacr_clear_set(clr, set) \
do { \
BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \
BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \
__build_check_all_or_none((clr), CPACR_EL1_FPEN); \
__build_check_all_or_none((set), CPACR_EL1_FPEN); \
__build_check_all_or_none((clr), CPACR_EL1_ZEN); \
__build_check_all_or_none((set), CPACR_EL1_ZEN); \
__build_check_all_or_none((clr), CPACR_EL1_SMEN); \
__build_check_all_or_none((set), CPACR_EL1_SMEN); \
\
if (has_vhe() || has_hvhe()) \
sysreg_clear_set(cpacr_el1, clr, set); \
else \
sysreg_clear_set(cptr_el2, \
__cpacr_to_cptr_clr(clr, set), \
__cpacr_to_cptr_set(clr, set));\
} while (0)
/*
* Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE
* format if E2H isn't set.

View File

@@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm);
})
/*
* The couple of isb() below are there to guarantee the same behaviour
* on VHE as on !VHE, where the eret to EL1 acts as a context
* synchronization event.
* The isb() below is there to guarantee the same behaviour on VHE as on !VHE,
* where the eret to EL1 acts as a context synchronization event.
*/
#define kvm_call_hyp(f, ...) \
do { \
@@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm);
\
if (has_vhe()) { \
ret = f(__VA_ARGS__); \
isb(); \
} else { \
ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \
} \

View File

@@ -2764,7 +2764,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
if (new->type != KVM_IRQ_ROUTING_MSI)
if (old->type != KVM_IRQ_ROUTING_MSI ||
new->type != KVM_IRQ_ROUTING_MSI)
return true;
return memcmp(&old->msi, &new->msi, sizeof(new->msi));

View File

@@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
{
u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA;
/*
* Always trap SME since it's not supported in KVM.
* TSM is RES1 if SME isn't implemented.
*/
val |= CPTR_EL2_TSM;
if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
val |= CPTR_EL2_TZ;
if (!guest_owns_fp_regs())
val |= CPTR_EL2_TFP;
write_sysreg(val, cptr_el2);
}
static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
{
/*
* With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
* CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
* except for some missing controls, such as TAM.
* In this case, CPTR_EL2.TAM has the same position with or without
* VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
* shift value for trapping the AMU accesses.
*/
u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA;
u64 cptr;
if (guest_owns_fp_regs()) {
val |= CPACR_EL1_FPEN;
if (vcpu_has_sve(vcpu))
val |= CPACR_EL1_ZEN;
}
if (!vcpu_has_nv(vcpu))
goto write;
/*
* The architecture is a bit crap (what a surprise): an EL2 guest
* writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
* as they are RES0 in the guest's view. To work around it, trap the
* sucker using the very same bit it can't set...
*/
if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
val |= CPTR_EL2_TCPAC;
/*
* Layer the guest hypervisor's trap configuration on top of our own if
* we're in a nested context.
*/
if (is_hyp_ctxt(vcpu))
goto write;
cptr = vcpu_sanitised_cptr_el2(vcpu);
/*
* Pay attention, there's some interesting detail here.
*
* The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
* meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
*
* - CPTR_EL2.xEN = x0, traps are enabled
* - CPTR_EL2.xEN = x1, traps are disabled
*
* In other words, bit[0] determines if guest accesses trap or not. In
* the interest of simplicity, clear the entire field if the guest
* hypervisor has traps enabled to dispel any illusion of something more
* complicated taking place.
*/
if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
val &= ~CPACR_EL1_FPEN;
if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
val &= ~CPACR_EL1_ZEN;
if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
val |= cptr & CPACR_EL1_E0POE;
val |= cptr & CPTR_EL2_TCPAC;
write:
write_sysreg(val, cpacr_el1);
}
static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu)
{
if (!guest_owns_fp_regs())
__activate_traps_fpsimd32(vcpu);
if (has_vhe() || has_hvhe())
__activate_cptr_traps_vhe(vcpu);
else
__activate_cptr_traps_nvhe(vcpu);
}
static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
{
u64 val = CPTR_NVHE_EL2_RES1;
if (!cpus_have_final_cap(ARM64_SVE))
val |= CPTR_EL2_TZ;
if (!cpus_have_final_cap(ARM64_SME))
val |= CPTR_EL2_TSM;
write_sysreg(val, cptr_el2);
}
static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
{
u64 val = CPACR_EL1_FPEN;
if (cpus_have_final_cap(ARM64_SVE))
val |= CPACR_EL1_ZEN;
if (cpus_have_final_cap(ARM64_SME))
val |= CPACR_EL1_SMEN;
write_sysreg(val, cpacr_el1);
}
static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
{
if (has_vhe() || has_hvhe())
__deactivate_cptr_traps_vhe(vcpu);
else
__deactivate_cptr_traps_nvhe(vcpu);
}
#define reg_to_fgt_masks(reg) \
({ \
struct fgt_masks *m; \
@@ -486,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
*/
if (system_supports_sve()) {
__hyp_sve_save_host();
/* Re-enable SVE traps if not supported for the guest vcpu. */
if (!vcpu_has_sve(vcpu))
cpacr_clear_set(CPACR_EL1_ZEN, 0);
} else {
__fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
}
@@ -541,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve()))
cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
else
cpacr_clear_set(0, CPACR_EL1_FPEN);
__deactivate_cptr_traps(vcpu);
isb();
/* Write out the host state if it's in the registers */
@@ -566,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
*host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED;
/*
* Re-enable traps necessary for the current state of the guest, e.g.
* those enabled by a guest hypervisor. The ERET to the guest will
* provide the necessary context synchronization.
*/
__activate_cptr_traps(vcpu);
return true;
}

View File

@@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
if (!guest_owns_fp_regs())
return;
cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
/*
* Traps have been disabled by __deactivate_cptr_traps(), but there
* hasn't necessarily been a context synchronization event yet.
*/
isb();
if (vcpu_has_sve(vcpu))

View File

@@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks;
extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
{
u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */
if (!guest_owns_fp_regs())
__activate_traps_fpsimd32(vcpu);
if (has_hvhe()) {
val |= CPACR_EL1_TTA;
if (guest_owns_fp_regs()) {
val |= CPACR_EL1_FPEN;
if (vcpu_has_sve(vcpu))
val |= CPACR_EL1_ZEN;
}
write_sysreg(val, cpacr_el1);
} else {
val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
/*
* Always trap SME since it's not supported in KVM.
* TSM is RES1 if SME isn't implemented.
*/
val |= CPTR_EL2_TSM;
if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
val |= CPTR_EL2_TZ;
if (!guest_owns_fp_regs())
val |= CPTR_EL2_TFP;
write_sysreg(val, cptr_el2);
}
}
static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
{
if (has_hvhe()) {
u64 val = CPACR_EL1_FPEN;
if (cpus_have_final_cap(ARM64_SVE))
val |= CPACR_EL1_ZEN;
if (cpus_have_final_cap(ARM64_SME))
val |= CPACR_EL1_SMEN;
write_sysreg(val, cpacr_el1);
} else {
u64 val = CPTR_NVHE_EL2_RES1;
if (!cpus_have_final_cap(ARM64_SVE))
val |= CPTR_EL2_TZ;
if (!cpus_have_final_cap(ARM64_SME))
val |= CPTR_EL2_TSM;
write_sysreg(val, cptr_el2);
}
}
static void __activate_traps(struct kvm_vcpu *vcpu)
{
___activate_traps(vcpu, vcpu->arch.hcr_el2);

View File

@@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE);
}
static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
{
u64 cptr;
/*
* With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
* CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
* except for some missing controls, such as TAM.
* In this case, CPTR_EL2.TAM has the same position with or without
* VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
* shift value for trapping the AMU accesses.
*/
u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM;
if (guest_owns_fp_regs()) {
val |= CPACR_EL1_FPEN;
if (vcpu_has_sve(vcpu))
val |= CPACR_EL1_ZEN;
} else {
__activate_traps_fpsimd32(vcpu);
}
if (!vcpu_has_nv(vcpu))
goto write;
/*
* The architecture is a bit crap (what a surprise): an EL2 guest
* writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
* as they are RES0 in the guest's view. To work around it, trap the
* sucker using the very same bit it can't set...
*/
if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
val |= CPTR_EL2_TCPAC;
/*
* Layer the guest hypervisor's trap configuration on top of our own if
* we're in a nested context.
*/
if (is_hyp_ctxt(vcpu))
goto write;
cptr = vcpu_sanitised_cptr_el2(vcpu);
/*
* Pay attention, there's some interesting detail here.
*
* The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
* meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
*
* - CPTR_EL2.xEN = x0, traps are enabled
* - CPTR_EL2.xEN = x1, traps are disabled
*
* In other words, bit[0] determines if guest accesses trap or not. In
* the interest of simplicity, clear the entire field if the guest
* hypervisor has traps enabled to dispel any illusion of something more
* complicated taking place.
*/
if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
val &= ~CPACR_EL1_FPEN;
if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
val &= ~CPACR_EL1_ZEN;
if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
val |= cptr & CPACR_EL1_E0POE;
val |= cptr & CPTR_EL2_TCPAC;
write:
write_sysreg(val, cpacr_el1);
}
static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
{
u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN;
if (cpus_have_final_cap(ARM64_SME))
val |= CPACR_EL1_SMEN_EL1EN;
write_sysreg(val, cpacr_el1);
}
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -639,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
host_ctxt = host_data_ptr(host_ctxt);
guest_ctxt = &vcpu->arch.ctxt;
sysreg_save_host_state_vhe(host_ctxt);
fpsimd_lazy_switch_to_guest(vcpu);
sysreg_save_host_state_vhe(host_ctxt);
/*
* Note that ARM erratum 1165522 requires us to configure both stage 1
* and stage 2 translation for the guest context before we clear
@@ -667,15 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
__deactivate_traps(vcpu);
fpsimd_lazy_switch_to_host(vcpu);
sysreg_restore_host_state_vhe(host_ctxt);
__debug_switch_to_host(vcpu);
/*
* Ensure that all system register writes above have taken effect
* before returning to the host. In VHE mode, CPTR traps for
* FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be
* manipulated after the ISB.
*/
isb();
fpsimd_lazy_switch_to_host(vcpu);
if (guest_owns_fp_regs())
__fpsimd_save_fpexc32(vcpu);
__debug_switch_to_host(vcpu);
return exit_code;
}
NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
@@ -705,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
*/
local_daif_restore(DAIF_PROCCTX_NOIRQ);
/*
* When we exit from the guest we change a number of CPU configuration
* parameters, such as traps. We rely on the isb() in kvm_call_hyp*()
* to make sure these changes take effect before running the host or
* additional guests.
*/
return ret;
}

View File

@@ -36,6 +36,11 @@ struct shadow_if {
static DEFINE_PER_CPU(struct shadow_if, shadow_if);
static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
{
return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
}
/*
* Nesting GICv3 support
*
@@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
return reg;
}
static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
{
struct vgic_irq *irq;
if (!(lr & ICH_LR_HW))
return lr;
/* We have the HW bit set, check for validity of pINTID */
irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
/* If there was no real mapping, nuke the HW bit */
if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
lr &= ~ICH_LR_HW;
/* Translate the virtual mapping to the real one, even if invalid */
if (irq) {
lr &= ~ICH_LR_PHYS_ID_MASK;
lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
vgic_put_irq(vcpu->kvm, irq);
}
return lr;
}
/*
* For LRs which have HW bit set such as timer interrupts, we modify them to
* have the host hardware interrupt number instead of the virtual one programmed
@@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
struct vgic_v3_cpu_if *s_cpu_if)
{
unsigned long lr_map = 0;
int index = 0;
struct shadow_if *shadow_if;
shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
shadow_if->lr_map = 0;
for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
struct vgic_irq *irq;
if (!(lr & ICH_LR_STATE))
lr = 0;
continue;
if (!(lr & ICH_LR_HW))
goto next;
lr = translate_lr_pintid(vcpu, lr);
/* We have the HW bit set, check for validity of pINTID */
irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) {
/* There was no real mapping, so nuke the HW bit */
lr &= ~ICH_LR_HW;
if (irq)
vgic_put_irq(vcpu->kvm, irq);
goto next;
}
/* Translate the virtual mapping to the real one */
lr &= ~ICH_LR_PHYS_ID_MASK;
lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
vgic_put_irq(vcpu->kvm, irq);
next:
s_cpu_if->vgic_lr[index] = lr;
if (lr) {
lr_map |= BIT(i);
index++;
}
s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
shadow_if->lr_map |= BIT(i);
}
container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
s_cpu_if->used_lrs = index;
s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
struct shadow_if *shadow_if = get_shadow_if();
int i, index = 0;
int i;
for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
struct vgic_irq *irq;
if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
goto next;
continue;
/*
* If we had a HW lr programmed by the guest hypervisor, we
@@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
*/
irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
goto next;
continue;
lr = __gic_v3_get_lr(index);
lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
if (!(lr & ICH_LR_STATE))
irq->active = false;
vgic_put_irq(vcpu->kvm, irq);
next:
index++;
}
}
@@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
val &= ~ICH_LR_STATE;
val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE;
val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;
__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
s_cpu_if->vgic_lr[i] = 0;
}
shadow_if->lr_map = 0;
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

View File

@@ -103,7 +103,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
if (cp->a2 == 0 && cp->a3 == 0)
if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask);
else
kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask,
@@ -111,7 +111,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
if (cp->a2 == 0 && cp->a3 == 0)
if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
hbase, hmask, cp->a4);
else
@@ -127,9 +127,9 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID:
/*
* Until nested virtualization is implemented, the
* SBI HFENCE calls should be treated as NOPs
* SBI HFENCE calls should return not supported
* hence fallthrough.
*/
break;
default:
retdata->err_val = SBI_ERR_NOT_SUPPORTED;
}

View File

@@ -80,6 +80,7 @@
#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL
#define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL
#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL
#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL
/*
* Bitmasks of exposed registers (with VMM).

View File

@@ -1212,11 +1212,13 @@ static int tdx_map_gpa(struct kvm_vcpu *vcpu)
/*
* Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
* userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
* bit set. If not, the error code is not defined in GHCI for TDX, use
* TDVMCALL_STATUS_INVALID_OPERAND for this case.
* bit set. This is a base call so it should always be supported, but
* KVM has no way to ensure that userspace implements the GHCI correctly.
* So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
* to the guest.
*/
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
ret = TDVMCALL_STATUS_INVALID_OPERAND;
ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
goto error;
}
@@ -1449,20 +1451,85 @@ error:
return 1;
}
static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
/*
* For now, there is no TDVMCALL beyond GHCI base API supported by KVM
* directly without the support from userspace, just set the value
* returned from userspace.
*/
tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
return 1;
}
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
if (tdx->vp_enter_args.r12)
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
else {
switch (tdx->vp_enter_args.r12) {
case 0:
tdx->vp_enter_args.r11 = 0;
tdx->vp_enter_args.r12 = 0;
tdx->vp_enter_args.r13 = 0;
tdx->vp_enter_args.r14 = 0;
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
return 1;
case 1:
vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
vcpu->run->exit_reason = KVM_EXIT_TDX;
vcpu->run->tdx.flags = 0;
vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
return 0;
default:
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
return 1;
}
}
static int tdx_complete_simple(struct kvm_vcpu *vcpu)
{
tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
return 1;
}
static int tdx_get_quote(struct kvm_vcpu *vcpu)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
u64 gpa = tdx->vp_enter_args.r12;
u64 size = tdx->vp_enter_args.r13;
/* The gpa of buffer must have shared bit set. */
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
return 1;
}
vcpu->run->exit_reason = KVM_EXIT_TDX;
vcpu->run->tdx.flags = 0;
vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
vcpu->run->tdx.get_quote.size = size;
vcpu->arch.complete_userspace_io = tdx_complete_simple;
return 0;
}
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
{
switch (tdvmcall_leaf(vcpu)) {
@@ -1472,11 +1539,13 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
return tdx_report_fatal_error(vcpu);
case TDVMCALL_GET_TD_VM_CALL_INFO:
return tdx_get_td_vm_call_info(vcpu);
case TDVMCALL_GET_QUOTE:
return tdx_get_quote(vcpu);
default:
break;
}
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
return 1;
}

View File

@@ -178,6 +178,7 @@ struct kvm_xen_exit {
#define KVM_EXIT_NOTIFY 37
#define KVM_EXIT_LOONGARCH_IOCSR 38
#define KVM_EXIT_MEMORY_FAULT 39
#define KVM_EXIT_TDX 40
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -447,6 +448,27 @@ struct kvm_run {
__u64 gpa;
__u64 size;
} memory_fault;
/* KVM_EXIT_TDX */
struct {
__u64 flags;
__u64 nr;
union {
struct {
__u64 ret;
__u64 data[5];
} unknown;
struct {
__u64 ret;
__u64 gpa;
__u64 size;
} get_quote;
struct {
__u64 ret;
__u64 leaf;
__u64 r11, r12, r13, r14;
} get_tdvmcall_info;
};
} tdx;
/* Fix the size of the union. */
char padding[256];
};

View File

@@ -954,6 +954,8 @@ static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq);
}
static int gic_fd;
static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu,
enum arch_timer timer)
{
@@ -968,12 +970,20 @@ static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu,
vcpu_args_set(*vcpu, 1, timer);
test_init_timer_irq(*vm, *vcpu);
vgic_v3_setup(*vm, 1, 64);
gic_fd = vgic_v3_setup(*vm, 1, 64);
__TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3");
sync_global_to_guest(*vm, test_args);
sync_global_to_guest(*vm, CVAL_MAX);
sync_global_to_guest(*vm, DEF_CNT);
}
static void test_vm_cleanup(struct kvm_vm *vm)
{
close(gic_fd);
kvm_vm_free(vm);
}
static void test_print_help(char *name)
{
pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v]\n"
@@ -1060,13 +1070,13 @@ int main(int argc, char *argv[])
if (test_args.test_virtual) {
test_vm_create(&vm, &vcpu, VIRTUAL);
test_run(vm, vcpu);
kvm_vm_free(vm);
test_vm_cleanup(vm);
}
if (test_args.test_physical) {
test_vm_create(&vm, &vcpu, PHYSICAL);
test_run(vm, vcpu);
kvm_vm_free(vm);
test_vm_cleanup(vm);
}
return 0;