
Enable/disable local IRQs, i.e. set/clear RFLAGS.IF, in the common
svm_vcpu_enter_exit() just after/before guest_state_{enter,exit}_irqoff()
so that VMRUN is not executed in an STI shadow. AMD CPUs have a quirk
(some would say "bug"), where the STI shadow bleeds into the guest's
intr_state field if a #VMEXIT occurs during injection of an event, i.e. if
the VMRUN doesn't complete before the subsequent #VMEXIT.
The spurious "interrupts masked" state is relatively benign, as it only
occurs during event injection and is transient. Because KVM is already
injecting an event, the guest can't be in HLT, and if KVM is querying IRQ
blocking for injection, then KVM would need to force an immediate exit
anyways since injecting multiple events is impossible.
However, because KVM copies int_state verbatim from vmcb02 to vmcb12, the
spurious STI shadow is visible to L1 when running a nested VM, which can
trip sanity checks, e.g. in VMware's VMM.
Hoist the STI+CLI all the way to C code, as the aforementioned calls to
guest_state_{enter,exit}_irqoff() already inform lockdep that IRQs are
enabled/disabled, and taking a fault on VMRUN with RFLAGS.IF=1 is already
possible. I.e. if there's kernel code that is confused by running with
RFLAGS.IF=1, then it's already a problem. In practice, since GIF=0 also
blocks NMIs, the only change in exposure to non-KVM code (relative to
surrounding VMRUN with STI+CLI) is exception handling code, and except for
the kvm_rebooting=1 case, all exceptions in the core VM-Enter/VM-Exit path
are fatal.
Use the "raw" variants to enable/disable IRQs to avoid tracing in the
"no instrumentation" code; the guest state helpers also take care of
tracing IRQ state.
Opportunistically document why KVM needs to do STI in the first place.
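
For reference, a minimal sketch of the resulting flow in svm_vcpu_enter_exit()
(illustrative only, not the verbatim diff; the SEV-ES path is wrapped the same
way and its extra arguments are omitted here):

	static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
	{
		struct vcpu_svm *svm = to_svm(vcpu);

		guest_state_enter_irqoff();

		/*
		 * Toggle RFLAGS.IF in C with the raw variants (this is noinstr
		 * code, and guest_state_{enter,exit}_irqoff() already inform
		 * lockdep), so that VMRUN is never executed in an STI shadow.
		 */
		raw_local_irq_enable();

		__svm_vcpu_run(svm, spec_ctrl_intercepted);

		raw_local_irq_disable();

		guest_state_exit_irqoff();
	}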
Reported-by: Doug Covelli <doug.covelli@broadcom.com>
Closes: https://lore.kernel.org/all/CADH9ctBs1YPmE4aCfGPNBwA10cA8RuAk2gO7542DjMZgs4uzJQ@mail.gmail.com
Fixes: f14eec0a32 ("KVM: SVM: move more vmentry code to assembly")
Cc: stable@vger.kernel.org
Reviewed-by: Jim Mattson <jmattson@google.com>
Link: https://lore.kernel.org/r/20250224165442.2338294-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
#include <asm/frame.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
#include "kvm-asm-offsets.h"

#define WORD_SIZE (BITS_PER_LONG / 8)

/* Intentionally omit RAX as it's context switched by hardware */
#define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE)
#define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE)
#define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE)
/* Intentionally omit RSP as it's context switched by hardware */
#define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE)
#define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE)
#define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE)

#ifdef CONFIG_X86_64
#define VCPU_R8  (SVM_vcpu_arch_regs + __VCPU_REGS_R8  * WORD_SIZE)
#define VCPU_R9  (SVM_vcpu_arch_regs + __VCPU_REGS_R9  * WORD_SIZE)
#define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE)
#define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE)
#define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE)
#define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE)
#define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE)
#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE)
#endif

#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa)

.section .noinstr.text, "ax"

.macro RESTORE_GUEST_SPEC_CTRL
	/* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
	ALTERNATIVE_2 "", \
		"jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
		"", X86_FEATURE_V_SPEC_CTRL
801:
.endm
.macro RESTORE_GUEST_SPEC_CTRL_BODY
800:
	/*
	 * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
	 * host's, write the MSR. This is kept out-of-line so that the common
	 * case does not have to jump.
	 *
	 * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
	 * there must not be any returns or indirect branches between this code
	 * and vmentry.
	 */
	movl SVM_spec_ctrl(%_ASM_DI), %eax
	cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
	je 801b
	mov $MSR_IA32_SPEC_CTRL, %ecx
	xor %edx, %edx
	wrmsr
	jmp 801b
.endm

.macro RESTORE_HOST_SPEC_CTRL
	/* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
	ALTERNATIVE_2 "", \
		"jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
		"", X86_FEATURE_V_SPEC_CTRL
901:
.endm
.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req
900:
	/* Same for after vmexit. */
	mov $MSR_IA32_SPEC_CTRL, %ecx

	/*
	 * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
	 * if it was not intercepted during guest execution.
	 */
	cmpb $0, \spec_ctrl_intercepted
	jnz 998f
	rdmsr
	movl %eax, SVM_spec_ctrl(%_ASM_DI)
998:

	/* Now restore the host value of the MSR if different from the guest's. */
	movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
	cmp SVM_spec_ctrl(%_ASM_DI), %eax
	je 901b
	xor %edx, %edx
	wrmsr
	jmp 901b
.endm


/**
 * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
 * @svm: struct vcpu_svm *
 * @spec_ctrl_intercepted: bool
 */
SYM_FUNC_START(__svm_vcpu_run)
	push %_ASM_BP
	mov %_ASM_SP, %_ASM_BP
#ifdef CONFIG_X86_64
	push %r15
	push %r14
	push %r13
	push %r12
#else
	push %edi
	push %esi
#endif
	push %_ASM_BX

	/*
	 * Save variables needed after vmexit on the stack, in inverse
	 * order compared to when they are needed.
	 */

	/* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
	push %_ASM_ARG2

	/* Needed to restore access to percpu variables. */
	__ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)

	/* Finally save @svm. */
	push %_ASM_ARG1

.ifnc _ASM_ARG1, _ASM_DI
	/*
	 * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
	 * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
	 */
	mov %_ASM_ARG1, %_ASM_DI
.endif

	/* Clobbers RAX, RCX, RDX. */
	RESTORE_GUEST_SPEC_CTRL

	/*
	 * Use a single vmcb (vmcb01 because it's always valid) for
	 * context switching guest state via VMLOAD/VMSAVE, that way
	 * the state doesn't need to be copied between vmcb01 and
	 * vmcb02 when switching vmcbs for nested virtualization.
	 */
	mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
1:	vmload %_ASM_AX
2:

	/* Get svm->current_vmcb->pa into RAX. */
	mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
	mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX

	/* Load guest registers. */
	mov VCPU_RCX(%_ASM_DI), %_ASM_CX
	mov VCPU_RDX(%_ASM_DI), %_ASM_DX
	mov VCPU_RBX(%_ASM_DI), %_ASM_BX
	mov VCPU_RBP(%_ASM_DI), %_ASM_BP
	mov VCPU_RSI(%_ASM_DI), %_ASM_SI
#ifdef CONFIG_X86_64
	mov VCPU_R8 (%_ASM_DI), %r8
	mov VCPU_R9 (%_ASM_DI), %r9
	mov VCPU_R10(%_ASM_DI), %r10
	mov VCPU_R11(%_ASM_DI), %r11
	mov VCPU_R12(%_ASM_DI), %r12
	mov VCPU_R13(%_ASM_DI), %r13
	mov VCPU_R14(%_ASM_DI), %r14
	mov VCPU_R15(%_ASM_DI), %r15
#endif
	mov VCPU_RDI(%_ASM_DI), %_ASM_DI

	/* Enter guest mode */
3:	vmrun %_ASM_AX
4:
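	/*
	 * #VMEXIT restores RAX and RSP from the host save area and clears
	 * GIF; all other GPRs still hold guest values at this point.
	 */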
	/* Pop @svm to RAX while it's the only available register. */
	pop %_ASM_AX

	/* Save all guest registers. */
	mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
	mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
	mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
	mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
	mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
	mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
#ifdef CONFIG_X86_64
	mov %r8,  VCPU_R8 (%_ASM_AX)
	mov %r9,  VCPU_R9 (%_ASM_AX)
	mov %r10, VCPU_R10(%_ASM_AX)
	mov %r11, VCPU_R11(%_ASM_AX)
	mov %r12, VCPU_R12(%_ASM_AX)
	mov %r13, VCPU_R13(%_ASM_AX)
	mov %r14, VCPU_R14(%_ASM_AX)
	mov %r15, VCPU_R15(%_ASM_AX)
#endif

	/* @svm can stay in RDI from now on. */
	mov %_ASM_AX, %_ASM_DI

	mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
5:	vmsave %_ASM_AX
6:

	/* Restores GSBASE among other things, allowing access to percpu data. */
	pop %_ASM_AX
7:	vmload %_ASM_AX
8:

	/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
	FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT

	/* Clobbers RAX, RCX, RDX. */
	RESTORE_HOST_SPEC_CTRL

	/*
	 * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
	 * untrained as soon as we exit the VM and are back to the
	 * kernel. This should be done before re-enabling interrupts
	 * because interrupt handlers won't sanitize 'ret' if the return is
	 * from the kernel.
	 */
	UNTRAIN_RET_VM

	/*
	 * Clear all general purpose registers except RSP and RAX to prevent
	 * speculative use of the guest's values, even those that are reloaded
	 * via the stack. In theory, an L1 cache miss when restoring registers
	 * could lead to speculative execution with the guest's values.
	 * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
	 * free. RSP and RAX are exempt as they are restored by hardware
	 * during VM-Exit.
	 */
	xor %ecx, %ecx
	xor %edx, %edx
	xor %ebx, %ebx
	xor %ebp, %ebp
	xor %esi, %esi
	xor %edi, %edi
#ifdef CONFIG_X86_64
	xor %r8d,  %r8d
	xor %r9d,  %r9d
	xor %r10d, %r10d
	xor %r11d, %r11d
	xor %r12d, %r12d
	xor %r13d, %r13d
	xor %r14d, %r14d
	xor %r15d, %r15d
#endif

	/* "Pop" @spec_ctrl_intercepted. */
	pop %_ASM_BX

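	/* Restore the callee-saved RBX that was pushed at function entry. */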
	pop %_ASM_BX

#ifdef CONFIG_X86_64
	pop %r12
	pop %r13
	pop %r14
	pop %r15
#else
	pop %esi
	pop %edi
#endif
	pop %_ASM_BP
	RET

	RESTORE_GUEST_SPEC_CTRL_BODY
	RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP)

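	/*
	 * Out-of-line fixups for faults on the VMLOAD (1), VMRUN (3),
	 * VMSAVE (5) and second VMLOAD (7) above: if KVM is shutting down
	 * (kvm_rebooting), resume at the label following the faulting
	 * instruction, otherwise the fault is fatal.
	 */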
10:	cmpb $0, _ASM_RIP(kvm_rebooting)
	jne 2b
	ud2
30:	cmpb $0, _ASM_RIP(kvm_rebooting)
	jne 4b
	ud2
50:	cmpb $0, _ASM_RIP(kvm_rebooting)
	jne 6b
	ud2
70:	cmpb $0, _ASM_RIP(kvm_rebooting)
	jne 8b
	ud2

	_ASM_EXTABLE(1b, 10b)
	_ASM_EXTABLE(3b, 30b)
	_ASM_EXTABLE(5b, 50b)
	_ASM_EXTABLE(7b, 70b)

SYM_FUNC_END(__svm_vcpu_run)

#ifdef CONFIG_KVM_AMD_SEV


#ifdef CONFIG_X86_64
#define SEV_ES_GPRS_BASE 0x300
#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE)
#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE)
#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE)
#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE)
#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE)
#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE)
#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE)
#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE)
#endif

/**
 * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
 * @svm: struct vcpu_svm *
 * @spec_ctrl_intercepted: bool
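 * @hostsa: host save area (passed in RDX), used to stash host GPRs across VMRUN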
 */
SYM_FUNC_START(__svm_sev_es_vcpu_run)
	FRAME_BEGIN

	/*
	 * Save non-volatile (callee-saved) registers to the host save area.
	 * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not
	 * saved on VMRUN.
	 */
	mov %rbp, SEV_ES_RBP (%rdx)
	mov %r15, SEV_ES_R15 (%rdx)
	mov %r14, SEV_ES_R14 (%rdx)
	mov %r13, SEV_ES_R13 (%rdx)
	mov %r12, SEV_ES_R12 (%rdx)
	mov %rbx, SEV_ES_RBX (%rdx)

	/*
	 * Save volatile registers that hold arguments that are needed after
	 * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted).
	 */
	mov %rdi, SEV_ES_RDI (%rdx)
	mov %rsi, SEV_ES_RSI (%rdx)

	/* Clobbers RAX, RCX, RDX (@hostsa). */
	RESTORE_GUEST_SPEC_CTRL

	/* Get svm->current_vmcb->pa into RAX. */
	mov SVM_current_vmcb(%rdi), %rax
	mov KVM_VMCB_pa(%rax), %rax

	/* Enter guest mode */
1:	vmrun %rax
2:
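	/*
	 * No guest GPRs to save here: for SEV-ES the guest's register state
	 * lives in the encrypted VMSA, and #VMEXIT already restored the host
	 * GPRs saved above.
	 */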
	/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
	FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT

	/* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */
	RESTORE_HOST_SPEC_CTRL

	/*
	 * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
	 * untrained as soon as we exit the VM and are back to the
	 * kernel. This should be done before re-enabling interrupts
	 * because interrupt handlers won't sanitize RET if the return is
	 * from the kernel.
	 */
	UNTRAIN_RET_VM

	FRAME_END
	RET

	RESTORE_GUEST_SPEC_CTRL_BODY
	RESTORE_HOST_SPEC_CTRL_BODY %sil

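	/* Out-of-line fixup for a fault on the VMRUN above, mirroring __svm_vcpu_run(). */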
3:	cmpb $0, kvm_rebooting(%rip)
	jne 2b
	ud2

	_ASM_EXTABLE(1b, 3b)

SYM_FUNC_END(__svm_sev_es_vcpu_run)
#endif /* CONFIG_KVM_AMD_SEV */