mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-09-04 20:19:47 +08:00

Compare commits


33 Commits

Author SHA1 Message Date
Linus Torvalds
11e7861d68 ARM:
- Correctly handle 'invariant' system registers for protected VMs
 
 - Improved handling of VNCR data aborts, including external aborts
 
 - Fixes for handling of FEAT_RAS for NV guests, providing a sane
   fault context during SEA injection and preventing the use of
   RASv1p1 fault injection hardware
 
 - Ensure that page table destruction when a VM is destroyed gives an
   opportunity to reschedule
 
 - Large fix to KVM's infrastructure for managing guest context loaded
   on the CPU, addressing issues where the output of AT emulation
   doesn't get reflected to the guest
 
 - Fix AT S12 emulation to actually perform stage-2 translation when
   necessary
 
 - Avoid attempting vLPI irqbypass when GICv4 has been explicitly
   disabled for a VM
 
 - Minor KVM + selftest fixes
 
 RISC-V:
 
 - Fix pte settings within kvm_riscv_gstage_ioremap()
 
 - Fix comments in kvm_riscv_check_vcpu_requests()
 
 - Fix stack overrun when setting vlenb via ONE_REG
 
 x86:
 
 - Use array_index_nospec() to sanitize the target vCPU ID when handling PV
   IPIs and yields as the ID is guest-controlled.
 
 - Drop a superfluous cpumask_empty() check when reclaiming SEV memory, as
   the common case, by far, is that at least one CPU will have entered the
   VM, and wbnoinvd_on_cpus_mask() will naturally handle the rare case where
   the set of have_run_cpus is empty.
 
 Selftests (not KVM):
 
 - Rename the is_signed_type() macro in kselftest_harness.h to is_signed_var()
   to fix a collision with linux/overflow.h.  The collision generates compiler
   warnings due to the two macros having different meaning.
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmix3OMUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroOZGAf+K+xTAhbMuY4bK5Sn93/QssYUVsFv
 wWc/q5FXUd8t21eAN+b/qhGF4d71eDuoIUNzOBwbJ9qY/0F42Xgihfr7BarSBBqD
 anqQBnhhtCyPCa1tF8SyBv34HewNKts3bgSxnwo2V2CBGWqomm6cZ9Uh3yALFBGJ
 kqHi0kKql+QL9G9DbRQ8lEJAPnCnktFFtA94T5B+o7yh1vvPeBsK40chH8bi19nh
 vCdoGhNLr+k+MoYpfJ8lyOJ7QctijJBK7OlsteksMvCXKQdfz1/X7TnoF11rb4yV
 MPfMUDOGlIVEBaVBkokyHXXPv0Fg4zGlt/SYzOZWRHIYgQNQ+aSscAKODA==
 =W51r
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Correctly handle 'invariant' system registers for protected VMs

   - Improved handling of VNCR data aborts, including external aborts

   - Fixes for handling of FEAT_RAS for NV guests, providing a sane
     fault context during SEA injection and preventing the use of
     RASv1p1 fault injection hardware

   - Ensure that page table destruction when a VM is destroyed gives an
     opportunity to reschedule

   - Large fix to KVM's infrastructure for managing guest context loaded
     on the CPU, addressing issues where the output of AT emulation
     doesn't get reflected to the guest

   - Fix AT S12 emulation to actually perform stage-2 translation when
     necessary

   - Avoid attempting vLPI irqbypass when GICv4 has been explicitly
     disabled for a VM

   - Minor KVM + selftest fixes

  RISC-V:

   - Fix pte settings within kvm_riscv_gstage_ioremap()

   - Fix comments in kvm_riscv_check_vcpu_requests()

   - Fix stack overrun when setting vlenb via ONE_REG

  x86:

   - Use array_index_nospec() to sanitize the target vCPU ID when
     handling PV IPIs and yields as the ID is guest-controlled.

   - Drop a superfluous cpumask_empty() check when reclaiming SEV
     memory, as the common case, by far, is that at least one CPU will
     have entered the VM, and wbnoinvd_on_cpus_mask() will naturally
     handle the rare case where the set of have_run_cpus is empty.

  Selftests (not KVM):

   - Rename the is_signed_type() macro in kselftest_harness.h to
     is_signed_var() to fix a collision with linux/overflow.h. The
     collision generates compiler warnings due to the two macros having
     different meaning"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (29 commits)
  KVM: arm64: nv: Fix ATS12 handling of single-stage translation
  KVM: arm64: Remove __vcpu_{read,write}_sys_reg_{from,to}_cpu()
  KVM: arm64: Fix vcpu_{read,write}_sys_reg() accessors
  KVM: arm64: Simplify sysreg access on exception delivery
  KVM: arm64: Check for SYSREGS_ON_CPU before accessing the 32bit state
  RISC-V: KVM: fix stack overrun when loading vlenb
  RISC-V: KVM: Correct kvm_riscv_check_vcpu_requests() comment
  RISC-V: KVM: Fix pte settings within kvm_riscv_gstage_ioremap()
  KVM: arm64: selftests: Sync ID_AA64MMFR3_EL1 in set_id_regs
  KVM: arm64: Get rid of ARM64_FEATURE_MASK()
  KVM: arm64: Make ID_AA64PFR1_EL1.RAS_frac writable
  KVM: arm64: Make ID_AA64PFR0_EL1.RAS writable
  KVM: arm64: Ignore HCR_EL2.FIEN set by L1 guest's EL2
  KVM: arm64: Handle RASv1p1 registers
  arm64: Add capability denoting FEAT_RASv1p1
  KVM: arm64: Reschedule as needed when destroying the stage-2 page-tables
  KVM: arm64: Split kvm_pgtable_stage2_destroy()
  selftests: harness: Rename is_signed_type() to avoid collision with overflow.h
  KVM: SEV: don't check have_run_cpus in sev_writeback_caches()
  KVM: arm64: Correctly populate FAR_EL2 on nested SEA injection
  ...
2025-08-29 13:54:26 -07:00
Paolo Bonzini
42a0305ab1 KVM/arm64 changes for 6.17, take #2
- Correctly handle 'invariant' system registers for protected VMs
 
  - Improved handling of VNCR data aborts, including external aborts
 
  - Fixes for handling of FEAT_RAS for NV guests, providing a sane
    fault context during SEA injection and preventing the use of
    RASv1p1 fault injection hardware
 
  - Ensure that page table destruction when a VM is destroyed gives an
    opportunity to reschedule
 
  - Large fix to KVM's infrastructure for managing guest context loaded
    on the CPU, addressing issues where the output of AT emulation
    doesn't get reflected to the guest
 
  - Fix AT S12 emulation to actually perform stage-2 translation when
    necessary
 
  - Avoid attempting vLPI irqbypass when GICv4 has been explicitly
    disabled for a VM
 
  - Minor KVM + selftest fixes
 -----BEGIN PGP SIGNATURE-----
 
 iI0EABYIADUWIQSNXHjWXuzMZutrKNKivnWIJHzdFgUCaLC0JBccb2xpdmVyLnVw
 dG9uQGxpbnV4LmRldgAKCRCivnWIJHzdFogJAQCyxHd5tuvXWWT/iC2EYFlPWYkU
 LOQbNhus16QjQ9f2ggD8CoA+6UAxzYW7ZU6IzYkDhJkN/3dKQEQhh8Cx0GXXRAs=
 =uky+
 -----END PGP SIGNATURE-----

Merge tag 'kvmarm-fixes-6.17-1' of https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 changes for 6.17, take #2

 - Correctly handle 'invariant' system registers for protected VMs

 - Improved handling of VNCR data aborts, including external aborts

 - Fixes for handling of FEAT_RAS for NV guests, providing a sane
   fault context during SEA injection and preventing the use of
   RASv1p1 fault injection hardware

 - Ensure that page table destruction when a VM is destroyed gives an
   opportunity to reschedule

 - Large fix to KVM's infrastructure for managing guest context loaded
   on the CPU, addressing issues where the output of AT emulation
   doesn't get reflected to the guest

 - Fix AT S12 emulation to actually perform stage-2 translation when
   necessary

 - Avoid attempting vLPI irqbypass when GICv4 has been explicitly
   disabled for a VM

 - Minor KVM + selftest fixes
2025-08-29 12:57:31 -04:00
Paolo Bonzini
085e899aa1 KVM/riscv fixes for 6.17, take #1
- Fix pte settings within kvm_riscv_gstage_ioremap()
 - Fix comments in kvm_riscv_check_vcpu_requests()
 - Fix stack overrun when setting vlenb via ONE_REG
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEZdn75s5e6LHDQ+f/rUjsVaLHLAcFAmixlM4ACgkQrUjsVaLH
 LAfbyw//V5AWOlLdqqYzjZ8jlbemeNjvr03UaWsOwb/IYPkf4h5mbk1v2mBrXp1d
 ZSrswa71kD29tFcD79zdvpo8yBFHHLymhuRns7Ocz1x2n51SluD4Kzq/DRhD11+D
 DFa+vteiC0rNHqfBqq6YIeeHFjD9WEI46JQPS5b7Ri+Rpi3uOHjXTO1qUxlrwCSw
 Yrc89QxBNyPFp1hOa0794452okbdjtGQdWDFhwE9zpQXlsRh3H41p/r3WYZnDZT+
 FdrGKVBCs0/VV7+2uVdr2s7bmhbk9YOWUbFKF8RrkdJMAvQUd/bePduvuaKpJ0ja
 YnQvhx8217nRbTxRt5dGG68my93ne17nSJh7Zp4WNWy3KbZ4upikKl+8YyKjzh8q
 j5lISF1Dj88EgGHDQUQ+HAolJSZEoeAmrmTjyxFTzvseliJEkKYkJy6XTng3uTyX
 vxCvCwLwlt1+COb8DGJf8hI5sxq26isZtC0yA98H6ZCrmCm6Iwlpzn6q4gngyx+l
 Kkuqu9nCOyyo5FT/6K8b5VB+dy0k1LHOddQamkTOeTkMT35AkDPZC/kIWSA8kdPg
 Wcwwb0lKEZYiYFkQDR/4ytTA0kr0pXF7kHqnIWyUOVDR8GhTRmoVCkDBUwVbrdDN
 Q4ebW2nLHPZAdaZxnJGENHR2wOgBLtJQHIfAsaPa8Stf7JKswHw=
 =GMGp
 -----END PGP SIGNATURE-----

Merge tag 'kvm-riscv-fixes-6.17-1' of https://github.com/kvm-riscv/linux into HEAD

KVM/riscv fixes for 6.17, take #1

- Fix pte settings within kvm_riscv_gstage_ioremap()
- Fix comments in kvm_riscv_check_vcpu_requests()
- Fix stack overrun when setting vlenb via ONE_REG
2025-08-29 12:57:18 -04:00
Marc Zyngier
ee372e6451 KVM: arm64: nv: Fix ATS12 handling of single-stage translation
Volodymyr reports that using a Xen DomU as a nested guest (where
HCR_EL2.E2H == 0), ATS12 results in a translation that stops at
the L2's S1, which isn't something you'd normally expect.

Comparing the code against the spec proves to be illuminating,
and suggests that the author of such code must have been tired,
cross-eyed, drunk, or maybe all of the above.

The gist of it is that, apart from HCR_EL2.VM or HCR_EL2.DC being
0, only the use of the EL2&0 translation regime limits the walk
to S1 only, and that we must finish the S2 walk in any other case.
Which solves the above issue, as E2H==0 indicates that ATS12 walks
the EL1&0 translation regime.

Explicitly checking for EL2&0 fixes this.

Reported-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
Suggested-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Fixes: be04cebf3e ("KVM: arm64: nv: Add emulation of AT S12E{0,1}{R,W}")
Link: https://lore.kernel.org/r/20250806141707.3479194-2-volodymyr_babchuk@epam.com
Link: https://lore.kernel.org/r/20250809144811.2314038-2-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-28 12:44:42 -07:00
Marc Zyngier
3328d17e70 KVM: arm64: Remove __vcpu_{read,write}_sys_reg_{from,to}_cpu()
There is no point having __vcpu_{read,write}_sys_reg_{from,to}_cpu()
exposed to the rest of the kernel, as the only callers are in
sys_regs.c.

Move them where they belong, which is another opportunity to
simplify things a bit.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250817121926.217900-5-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-28 11:39:48 -07:00
Marc Zyngier
ec0ab059d4 KVM: arm64: Fix vcpu_{read,write}_sys_reg() accessors
Volodymyr reports (again!) that under some circumstances (E2H==0,
walking S1 PTs), PAR_EL1 doesn't report the value of the latest
walk in the CPU register, but that instead the value is written to
the backing store.

Further investigation indicates that the root cause of this is
that a group of registers (PAR_EL1, TPIDR*_EL{0,1}, the *32_EL2 dregs)
should always be considered as "on CPU", as they are not remapped
between EL1 and EL2.

We fail to treat them accordingly, and end up considering that
the register (PAR_EL1 in this example) should be written to memory
instead of to the register.

While it would be possible to quickly work around it, it is obvious
that the way we track these things at the moment is pretty horrible,
and could do with some improvement.

Revamp the whole thing by:

- defining a location for a register (memory, cpu), potentially
  depending on the state of the vcpu

- define a transformation for this register (mapped register, potential
  translation, special register needing some particular attention)

- convey this information in a structure that can be easily passed
  around

As a result, the accessors themselves become much simpler, as the
state is explicit instead of being driven by hard-to-understand
conventions.

We get rid of the "pure EL2 register" notion, which wasn't very
useful, and add sanitisation of the values by applying the RESx
masks as required, something that was missing so far.

And of course, we add the missing registers to the list, with the
indication that they are always loaded.

Reported-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Fixes: fedc612314 ("KVM: arm64: nv: Handle virtual EL2 registers in vcpu_read/write_sys_reg()")
Link: https://lore.kernel.org/r/20250806141707.3479194-3-volodymyr_babchuk@epam.com
Link: https://lore.kernel.org/r/20250817121926.217900-4-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-28 11:39:48 -07:00
Marc Zyngier
e3f6836a63 KVM: arm64: Simplify sysreg access on exception delivery
Distinguishing between NV and VHE is slightly pointless, and only
serves as an extra complication, or a way to introduce bugs, such
as the way SPSR_EL1 gets written without checking for the state
being resident.

Get rid of this silly distinction, and fix the bug in one go.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250817121926.217900-3-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-28 11:39:48 -07:00
Marc Zyngier
b720269334 KVM: arm64: Check for SYSREGS_ON_CPU before accessing the 32bit state
Just like c6e35dff58 ("KVM: arm64: Check for SYSREGS_ON_CPU before
accessing the CPU state") fixed the 64bit state access, add a check
for the 32bit state actually being on the CPU before writing it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250817121926.217900-2-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-28 11:39:48 -07:00
Paolo Bonzini
22b2ca023f KVM x86 fixes and a selftest fix for 6.17-rcN
- Use array_index_nospec() to sanitize the target vCPU ID when handling PV
    IPIs and yields as the ID is guest-controlled.
 
  - Drop a superfluous cpumask_empty() check when reclaiming SEV memory, as
    the common case, by far, is that at least one CPU will have entered the
    VM, and wbnoinvd_on_cpus_mask() will naturally handle the rare case where
    the set of have_run_cpus is empty.
 
  - Rename the is_signed_type() macro in kselftest_harness.h to is_signed_var()
    to fix a collision with linux/overflow.h.  The collision generates compiler
    warnings due to the two macros having different implementations.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAminjaMACgkQOlYIJqCj
 N/0QDhAAhZgUqW2BGqGzOU/pjzXr0riJvVsNeAP85pcxygCc8qO8Hg1OWQz50YL5
 q4sitjZ+Ot39bSzjDMiwkrtuX25OsvlTnZDeN/liIim5rKMiYwvoKqQe5PPNxx5M
 NI4dc4B2AMedJ42gP7thBO+sMGf3J07445C69nJ4K9BppHAoZHH40grVeV0oDw+0
 XoujnyjI0KjghkbWgJlg51TZg6et14prjNZeiAuSulSTaMaPBfjPadkjlG1bsBFV
 lDeZypPvsh/ZLhhAgUFjZCKl7+XCKwKeze5MnpwqFYKhEBL8QqS11WGhyNmPFd1u
 spDe7MjMiNMOOyPlWpJktjMJXz908MJKjrn1Rd78iieqVeM1HQyhHAeC26a+A5Xi
 gFI9lrnNbZ4mlas/xyiX+Tld2yR4Ns3zF4D+eSM4KwII6MF+kEcF3j++U+PqLRvh
 M7r+OKjdvry9cIgHZ/5pa3VshdAfTE6EwNPakdPl+D0hVhPqKHIzi0H8rPmkuNwM
 aIKYSCa9SVmU6DS2vh0qWmTgYsc4Nk7W0bBmce7NftI+PDCYl7+GJAiZ1DBt4N9P
 +i9dKK19tYJCRButj5GZXnYzRpQ3WuPBzEv9C63GPwNaRuzAxvJP5ErrkxT2xE/5
 2WJgd+/J+JvQ14o8HtALc7fZckdWflGlN+pyvGyyQnkNRFpBNN0=
 =8zIJ
 -----END PGP SIGNATURE-----

Merge tag 'kvm-x86-fixes-6.17-rc7' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes and a selftest fix for 6.17-rcN

 - Use array_index_nospec() to sanitize the target vCPU ID when handling PV
   IPIs and yields as the ID is guest-controlled.

 - Drop a superfluous cpumask_empty() check when reclaiming SEV memory, as
   the common case, by far, is that at least one CPU will have entered the
   VM, and wbnoinvd_on_cpus_mask() will naturally handle the rare case where
   the set of have_run_cpus is empty.

 - Rename the is_signed_type() macro in kselftest_harness.h to is_signed_var()
   to fix a collision with linux/overflow.h.  The collision generates compiler
   warnings due to the two macros having different implementations.
2025-08-27 04:18:01 -04:00
Radim Krčmář
799766208f RISC-V: KVM: fix stack overrun when loading vlenb
The userspace load can put up to 2048 bits into an xlen bit stack
buffer.  We want only xlen bits, so check the size beforehand.

Fixes: 2fa290372d ("RISC-V: KVM: add 'vlenb' Vector CSR")
Cc: stable@vger.kernel.org
Signed-off-by: Radim Krčmář <rkrcmar@ventanamicro.com>
Reviewed-by: Nutty Liu <liujingqi@lanxincomputing.com>
Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
Link: https://lore.kernel.org/r/20250805104418.196023-4-rkrcmar@ventanamicro.com
Signed-off-by: Anup Patel <anup@brainfault.org>
2025-08-25 10:26:20 +05:30
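A minimal, self-contained C sketch of the check described in the commit above; the struct and helper names here are illustrative stand-ins, not the kernel's ONE_REG interface:

#include <errno.h>
#include <stdint.h>
#include <string.h>

/* Userspace claims a register size; reject anything other than the xlen-bit
 * width of vlenb before copying into the fixed-size destination, instead of
 * trusting the width implied by the request. */
struct one_reg_req {
	const void *addr;	/* userspace buffer (stand-in for the copy_from_user() source) */
	size_t size;		/* size userspace claims the register has */
};

static int set_vlenb(uint64_t *vlenb, const struct one_reg_req *req)
{
	uint64_t val;

	if (req->size != sizeof(val))	/* only xlen bits are wanted */
		return -EINVAL;

	memcpy(&val, req->addr, sizeof(val));	/* stand-in for copy_from_user() */
	*vlenb = val;
	return 0;
}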
Quan Zhou
e61a12a4ba RISC-V: KVM: Correct kvm_riscv_check_vcpu_requests() comment
Correct `check_vcpu_requests` to `kvm_riscv_check_vcpu_requests` in
comments.

Fixes: f55ffaf896 ("RISC-V: KVM: Enable ring-based dirty memory tracking")
Signed-off-by: Quan Zhou <zhouquan@iscas.ac.cn>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/49680363098c45516ec4b305283d662d26fa9386.1754326285.git.zhouquan@iscas.ac.cn
Signed-off-by: Anup Patel <anup@brainfault.org>
2025-08-25 10:26:17 +05:30
Fangyu Yu
9bca8be646 RISC-V: KVM: Fix pte settings within kvm_riscv_gstage_ioremap()
Currently, kvm_riscv_gstage_ioremap() is used to map IMSIC gpa to the
spa of IMSIC guest interrupt file.

The PAGE_KERNEL_IO property includes the global setting but not the user
mode setting, so when the IMSIC address is accessed from the virtual
machine, a guest page fault occurs, which is not expected.

According to the RISC-V Privileged Architecture Spec, for G-stage address
translation, all memory accesses are considered to be user-level accesses
as though executed in U-mode.

Fixes: 659ad6d82c ("RISC-V: KVM: Use PAGE_KERNEL_IO in kvm_riscv_gstage_ioremap()")
Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
Reviewed-by: Radim Krčmář <rkrcmar@ventanamicro.com>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Link: https://lore.kernel.org/r/20250807070729.89701-1-fangyu.yu@linux.alibaba.com
Signed-off-by: Anup Patel <anup@brainfault.org>
2025-08-25 10:26:16 +05:30
Mark Brown
01860bcc53 KVM: arm64: selftests: Sync ID_AA64MMFR3_EL1 in set_id_regs
When we added coverage for ID_AA64MMFR3_EL1 we didn't add it to the list
of registers we read in the guest; do so now.

Fixes: 0b593ef12a ("KVM: arm64: selftests: Catch up set_id_regs with the kernel")
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20250818-kvm-arm64-selftests-mmfr3-idreg-v1-1-2f85114d0163@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:36:30 -07:00
Marc Zyngier
0843e0ced3 KVM: arm64: Get rid of ARM64_FEATURE_MASK()
The ARM64_FEATURE_MASK() macro was a hack introduced whilst the
automatic generation of sysreg encodings was brought in, and was
too unreliable to be entirely trusted.

We are in a better place now, and we could really do without this
macro. Get rid of it altogether.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250817202158.395078-7-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:31:56 -07:00
Marc Zyngier
7a765aa88e KVM: arm64: Make ID_AA64PFR1_EL1.RAS_frac writable
Allow userspace to write to RAS_frac, under the condition that
the host supports RASv1p1 with RAS_frac==1. Other configurations
will result in RAS_frac being exposed as 0, and therefore implicitly
not writable.

To avoid the clutter, the ID_AA64PFR1_EL1 sanitisation is moved to
its own function.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Link: https://lore.kernel.org/r/20250817202158.395078-6-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:31:56 -07:00
Marc Zyngier
1fab657cb2 KVM: arm64: Make ID_AA64PFR0_EL1.RAS writable
Make ID_AA64PFR0_EL1.RAS writable so that we can restore a VM from
a system without RAS to a RAS-equipped machine (or disable RAS
in the guest).

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Link: https://lore.kernel.org/r/20250817202158.395078-5-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:28:46 -07:00
Marc Zyngier
9049fb1227 KVM: arm64: Ignore HCR_EL2.FIEN set by L1 guest's EL2
An EL2 guest can set HCR_EL2.FIEN, which gives access to the RASv1p1
fault injection mechanism. This would allow an EL1 guest to inject
error records into the system, which does sound like a terrible idea.

Prevent this situation by adding FIEN to the list of bits we silently
exclude from being inserted into the host configuration.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Joey Gouly <joey.gouly@arm.com>
Link: https://lore.kernel.org/r/20250817202158.395078-4-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:28:46 -07:00
Marc Zyngier
d7b3e23f94 KVM: arm64: Handle RASv1p1 registers
FEAT_RASv1p1 system registers are not handled at all so far.
KVM will give an embarrassed warning on the console and inject
an UNDEF, despite RASv1p1 being exposed to the guest on suitable HW.

Handle these registers similarly to FEAT_RAS, with the added fun
that there are *two* ways to indicate the presence of FEAT_RASv1p1.

Reviewed-by: Joey Gouly <joey.gouly@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Link: https://lore.kernel.org/r/20250817202158.395078-3-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:28:46 -07:00
Marc Zyngier
8049164653 arm64: Add capability denoting FEAT_RASv1p1
Detecting FEAT_RASv1p1 is rather complicated, as there are two
ways for the architecture to advertise the same thing (always a
delight...).

Add a capability that will advertise this in a synthetic way to
the rest of the kernel.

Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Link: https://lore.kernel.org/r/20250817202158.395078-2-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:28:46 -07:00
Raghavendra Rao Ananta
e9abe311f3 KVM: arm64: Reschedule as needed when destroying the stage-2 page-tables
When a large VM, specifically one that holds a significant number of PTEs,
gets abruptly destroyed, the following warning is seen during the
page-table walk:

 sched: CPU 0 need_resched set for > 100018840 ns (100 ticks) without schedule
 CPU: 0 UID: 0 PID: 9617 Comm: kvm_page_table_ Tainted: G O 6.16.0-smp-DEV #3 NONE
 Tainted: [O]=OOT_MODULE
 Call trace:
  show_stack+0x20/0x38 (C)
  dump_stack_lvl+0x3c/0xb8
  dump_stack+0x18/0x30
  resched_latency_warn+0x7c/0x88
  sched_tick+0x1c4/0x268
  update_process_times+0xa8/0xd8
  tick_nohz_handler+0xc8/0x168
  __hrtimer_run_queues+0x11c/0x338
  hrtimer_interrupt+0x104/0x308
  arch_timer_handler_phys+0x40/0x58
  handle_percpu_devid_irq+0x8c/0x1b0
  generic_handle_domain_irq+0x48/0x78
  gic_handle_irq+0x1b8/0x408
  call_on_irq_stack+0x24/0x30
  do_interrupt_handler+0x54/0x78
  el1_interrupt+0x44/0x88
  el1h_64_irq_handler+0x18/0x28
  el1h_64_irq+0x84/0x88
  stage2_free_walker+0x30/0xa0 (P)
  __kvm_pgtable_walk+0x11c/0x258
  __kvm_pgtable_walk+0x180/0x258
  __kvm_pgtable_walk+0x180/0x258
  __kvm_pgtable_walk+0x180/0x258
  kvm_pgtable_walk+0xc4/0x140
  kvm_pgtable_stage2_destroy+0x5c/0xf0
  kvm_free_stage2_pgd+0x6c/0xe8
  kvm_uninit_stage2_mmu+0x24/0x48
  kvm_arch_flush_shadow_all+0x80/0xa0
  kvm_mmu_notifier_release+0x38/0x78
  __mmu_notifier_release+0x15c/0x250
  exit_mmap+0x68/0x400
  __mmput+0x38/0x1c8
  mmput+0x30/0x68
  exit_mm+0xd4/0x198
  do_exit+0x1a4/0xb00
  do_group_exit+0x8c/0x120
  get_signal+0x6d4/0x778
  do_signal+0x90/0x718
  do_notify_resume+0x70/0x170
  el0_svc+0x74/0xd8
  el0t_64_sync_handler+0x60/0xc8
  el0t_64_sync+0x1b0/0x1b8

The warning is mostly seen on host kernels that are configured
not to force-preempt, such as CONFIG_PREEMPT_NONE=y. To avoid this,
instead of walking the entire page-table in one go, split it into
smaller ranges and call cond_resched() between each range.
Since the path is executed during VM destruction, after the
page-table structure is unlinked from the KVM MMU, relying on
cond_resched_rwlock_write() isn't necessary.

Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Suggested-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250820162242.2624752-3-rananta@google.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:27:03 -07:00
Raghavendra Rao Ananta
0e89ca13ee KVM: arm64: Split kvm_pgtable_stage2_destroy()
Split kvm_pgtable_stage2_destroy() into two:
  - kvm_pgtable_stage2_destroy_range(), that performs the
    page-table walk and free the entries over a range of addresses.
  - kvm_pgtable_stage2_destroy_pgd(), that frees the PGD.

This refactoring enables subsequent patches to free large page-tables
in chunks, calling cond_resched() between each chunk, to yield the
CPU as necessary.

Existing callers of kvm_pgtable_stage2_destroy(), that probably cannot
take advantage of this (such as nVHE), will continue to function as is.

Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Suggested-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250820162242.2624752-2-rananta@google.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-21 16:27:03 -07:00
Sean Christopherson
dce1b33ed7 selftests: harness: Rename is_signed_type() to avoid collision with overflow.h
Rename is_signed_type() to is_signed_var() to avoid colliding with a macro
of the same name defined by tools' linux/overflow.h.  This fixes warnings
(and presumably potential test failures) in tests that utilize the
selftests harness and happen to (indirectly) include overflow.h.

  In file included from tools/include/linux/bits.h:34,
                   from tools/include/linux/bitops.h:14,
                   from tools/include/linux/hashtable.h:13,
                   from include/kvm_util.h:11,
                   from x86/userspace_msr_exit_test.c:11:
  tools/include/linux/overflow.h:31:9: error: "is_signed_type" redefined [-Werror]
     31 | #define is_signed_type(type)       (((type)(-1)) < (type)1)
        |         ^~~~~~~~~~~~~~
  In file included from include/kvm_test_harness.h:11,
                   from x86/userspace_msr_exit_test.c:9:
  ../kselftest_harness.h:754:9: note: this is the location of the previous definition
    754 | #define is_signed_type(var)       (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))
        |         ^~~~~~~~~~~~~~

Use a separate definition, at least for now, as many selftests build
without tools/include in their include path.

Fixes: fc92099902 ("tools headers: Synchronize linux/bits.h with the kernel sources")
Cc: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20250624231930.583689-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-08-20 08:04:09 -07:00
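For reference, a self-contained sketch of the rename using the macro body quoted in the build error above; only the name changes, and the type-based macro in tools' overflow.h is left untouched:

#include <stdio.h>

/* Harness macro after the rename (body as quoted in the error above). */
#define is_signed_var(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))

/* tools/include/linux/overflow.h keeps its own, type-based macro:
 *   #define is_signed_type(type) (((type)(-1)) < (type)1)
 */

int main(void)
{
	int si = 0;
	unsigned int ui = 0;

	/* Prints "1 0": int is signed, unsigned int is not. */
	printf("%d %d\n", is_signed_var(si), is_signed_var(ui));
	return 0;
}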
Yury Norov
923fcb3dbc KVM: SEV: don't check have_run_cpus in sev_writeback_caches()
Drop KVM's check on an empty cpumask when flushing caches when memory is
being reclaimed from an SEV VM, as smp_call_function_many_cond() naturally
(and correctly) handles an empty cpumask.  This avoids an extra O(n)
lookup in the common case where at least one pCPU has enterred the guest,
which could be noticeable in some setups, e.g. if a small VM is pinned to
the last few pCPUs in the system.

Fixes: 6f38f8c574 ("KVM: SVM: Flush cache only on CPUs running SEV guest")
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
[sean: rewrite changelog to capture performance angle]
Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-08-18 14:31:27 -07:00
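A minimal sketch of the simplification described above; wbnoinvd_on_cpus_mask() and have_run_cpus are names taken from the changelog, while the cpumask type below is a simplified stand-in rather than the kernel API:

/* Simplified stand-in for a CPU mask. */
struct cpumask { unsigned long bits; };

/* The flush helper walks the mask itself; an empty mask simply does nothing,
 * so callers gain nothing from testing for emptiness first. */
void wbnoinvd_on_cpus_mask(const struct cpumask *mask)
{
	for (unsigned long b = mask->bits; b; b &= b - 1)
		;	/* the real helper issues WBNOINVD on each set CPU */
}

void sev_writeback_caches(const struct cpumask *have_run_cpus)
{
	/* Dropped: "if (cpumask_empty(have_run_cpus)) return;", an extra O(n)
	 * scan that only duplicates what the helper already does. */
	wbnoinvd_on_cpus_mask(have_run_cpus);
}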
Marc Zyngier
d19c541d26 KVM: arm64: Correctly populate FAR_EL2 on nested SEA injection
vcpu_write_sys_reg()'s signature is not totally obvious, and it
is rather easy to write something that looks correct, except that...
Oh wait...

Swap addr and FAR_EL2 to restore some sanity in the nested SEA
department.

Fixes: 9aba641b9e ("KVM: arm64: nv: Respect exception routing rules for SEAs")
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250813163747.2591317-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-15 11:51:18 -07:00
Thijs Raymakers
c87bd4dd43 KVM: x86: use array_index_nospec with indices that come from guest
min and dest_id are guest-controlled indices. Using array_index_nospec()
after the bounds checks clamps these values to mitigate speculative execution
side-channels.

Signed-off-by: Thijs Raymakers <thijs@raymakers.nl>
Cc: stable@vger.kernel.org
Cc: Sean Christopherson <seanjc@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fixes: 715062970f ("KVM: X86: Implement PV sched yield hypercall")
Fixes: bdf7ffc899 ("KVM: LAPIC: Fix pv ipis out-of-bounds access")
Fixes: 4180bf1b65 ("KVM: X86: Implement "send IPI" hypercall")
Link: https://lore.kernel.org/r/20250804064405.4802-1-thijs@raymakers.nl
Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-08-15 11:33:21 -07:00
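A self-contained sketch of the pattern described in the commit above. array_index_nospec() is the kernel helper named in the changelog; the simplified index_nospec() below only illustrates the idea of clamping an already bounds-checked, guest-controlled index with a data-dependent mask, and is not the kernel's implementation:

#include <stddef.h>

/* After the architectural bounds check, force the index to stay in range even
 * under branch misprediction by masking it rather than trusting the branch.
 * (The kernel builds the mask without a comparison branch; this sketch keeps
 * it readable.) */
static inline size_t index_nospec(size_t index, size_t size)
{
	size_t mask = (index < size) ? ~(size_t)0 : 0;

	return index & mask;
}

/* Guest-controlled dest_id: bounds check first, then clamp before use. */
int mark_vcpu_ready(unsigned int dest_id, int *vcpu_ready, size_t nr_vcpus)
{
	if (dest_id >= nr_vcpus)
		return -1;

	dest_id = index_nospec(dest_id, nr_vcpus);
	vcpu_ready[dest_id] = 1;
	return 0;
}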
Marc Zyngier
85acc29f90 KVM: arm64: selftest: Add standalone test checking for KVM's own UUID
Tinkering with UUIDs is a perilous task, and the KVM UUID gets
broken at times. In order to spot this early enough, add a selftest
that will shout if the expected value isn't found.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Sebastian Ott <sebott@redhat.com>
Link: https://lore.kernel.org/r/20250721130558.50823-1-jackabt.amazon@gmail.com
Link: https://lore.kernel.org/r/20250806171341.1521210-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-08 01:30:16 -07:00
Fuad Tabba
f1edb15920 arm64: vgic-v2: Fix guest endianness check in hVHE mode
In hVHE when running at the hypervisor, SCTLR_EL1 refers to the
hypervisor's System Control Register rather than the guest's. Make sure
to access the guest's register to determine its endianness.

Reported-by: Will Deacon <will@kernel.org>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://lore.kernel.org/r/20250807120133.871892-4-tabba@google.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-08 01:29:32 -07:00
Fuad Tabba
798eb59787 KVM: arm64: Sync protected guest VBAR_EL1 on injecting an undef exception
In pKVM, a race condition can occur if a guest updates its VBAR_EL1
register and, before a vCPU exit synchronizes this change, the
hypervisor needs to inject an undefined exception into a protected
guest.

In this scenario, the vCPU still holds the stale VBAR_EL1 value from
before the guest's update. When pKVM injects the exception, it ends up
using the stale value.

Explicitly read the live value of VBAR_EL1 from the guest and update the
vCPU value immediately before pending the exception. This ensures the
vCPU's value is the same as the guest's and that the exception will be
handled at the correct address upon resuming the guest.

Reported-by: Keir Fraser <keirf@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://lore.kernel.org/r/20250807120133.871892-3-tabba@google.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-08 01:29:32 -07:00
Fuad Tabba
eaa43934b4 KVM: arm64: Handle AIDR_EL1 and REVIDR_EL1 in host for protected VMs
Since commit 17efc1acee ("arm64: Expose AIDR_EL1 via sysfs"), AIDR_EL1
is read early during boot. Therefore, a guest running as a protected VM
will fail to boot because when it attempts to access AIDR_EL1, access to
that register is restricted in pKVM for protected guests.

Similar to how MIDR_EL1 is handled by the host for protected VMs, let
the host handle accesses to AIDR_EL1 as well as REVIDR_EL1. However note
that, unlike MIDR_EL1, AIDR_EL1 and REVIDR_EL1 are trapped by
HCR_EL2.TID1. Therefore, explicitly mark them as handled by the host for
protected VMs. TID1 is always set in pKVM, because it needs to restrict
access to SMIDR_EL1, which is also trapped by that bit.

Reported-by: Will Deacon <will@kernel.org>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://lore.kernel.org/r/20250807120133.871892-2-tabba@google.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-08 01:29:31 -07:00
Arnd Bergmann
700d6868fe kvm: arm64: use BUG() instead of BUG_ON(1)
The BUG_ON() macro adds a little bit of complexity over BUG(), and in
some cases this ends up confusing the compiler's control flow analysis
in a way that results in a warning. This one now shows up with clang-21:

arch/arm64/kvm/vgic/vgic-mmio.c:1094:3: error: variable 'len' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized]
 1094 |                 BUG_ON(1);

Change both instances of BUG_ON(1) to a plain BUG() in the arm64 kvm
code, to avoid the false-positive warning.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20250807072132.4170088-1-arnd@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-08 01:28:57 -07:00
Oliver Upton
69f8fe955d KVM: arm64: nv: Handle SEAs due to VNCR redirection
System register accesses redirected to the VNCR page can also generate
external aborts just like any other form of memory access. Route to
kvm_handle_guest_sea() for potential APEI handling, falling back to a
vSError if the kernel didn't handle the abort.

Take the opportunity to throw out the useless kvm_ras.h which provided a
helper with a single callsite...

Cc: Jiaqi Yan <jiaqiyan@google.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250729182342.3281742-1-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-04 22:16:10 -07:00
Marc Zyngier
07f557f60a KVM: arm64: nv: Properly check ESR_EL2.VNCR on taking a VNCR_EL2 related fault
Instead of checking for the ESR_EL2.VNCR bit being set (the only case
we should be here), we are actually testing random bits in ESR_EL2.DFSC.

13 obviously being a lucky number, it matches both permission and
translation fault status codes, which explains why we never saw it
failing. This was found by inspection, while reviewing a vaguely
related patch.

Whilst we're at it, turn the BUG_ON() into a WARN_ON_ONCE(), as
exploding here is just silly.

Fixes: 069a05e535 ("KVM: arm64: nv: Handle VNCR_EL2-triggered faults")
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Joey Gouly <joey.gouly@arm.com>
Link: https://lore.kernel.org/r/20250730101828.1168707-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-08-04 22:15:29 -07:00
Raghavendra Rao Ananta
7b8346bd9f KVM: arm64: Don't attempt vLPI mappings when vPE allocation is disabled
commit c652887a92 ("KVM: arm64: vgic-v3: Allow userspace to write
GICD_TYPER2.nASSGIcap") makes the allocation of vPEs depend on nASSGIcap
for GICv4.1 hosts. While the vGIC v4 initialization and teardown is
handled correctly, it erroneously attempts to establish a vLPI mapping
to a VM that has no vPEs allocated:

  Unable to handle kernel NULL pointer dereference at virtual address 00000000000000a8
   Mem abort info:
     ESR = 0x0000000096000044
     EC = 0x25: DABT (current EL), IL = 32 bits
     SET = 0, FnV = 0
     EA = 0, S1PTW = 0
     FSC = 0x04: level 0 translation fault
   Data abort info:
     ISV = 0, ISS = 0x00000044, ISS2 = 0x00000000
     CM = 0, WnR = 1, TnD = 0, TagAccess = 0
     GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
   user pgtable: 4k pages, 48-bit VAs, pgdp=00000073a453b000
   [00000000000000a8] pgd=0000000000000000, p4d=0000000000000000
   Internal error: Oops: 0000000096000044 [#1] SMP
   pstate: 23400009 (nzCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
   pc : its_irq_set_vcpu_affinity+0x58c/0x95c
   lr : its_irq_set_vcpu_affinity+0x1e0/0x95c
   sp : ffff8001029bb9e0
   pmr_save: 00000060
   x29: ffff8001029bba20 x28: ffff0001ca5e28c0 x27: 0000000000000000
   x26: 0000000000000000 x25: ffff00019eee9f80 x24: ffff0001992b3f00
   x23: ffff8001029bbab8 x22: ffff00001159fb80 x21: 00000000000024a7
   x20: 00000000000024a7 x19: ffff00019eee9fb4 x18: 0000000000000494
   x17: 000000000000000e x16: 0000000000000494 x15: 0000000000000002
   x14: ffff0001a7f34600 x13: ffffccaad1203000 x12: 0000000000000018
   x11: ffff000011991000 x10: 0000000000000000 x9 : 00000000000000a2
   x8 : 00000000000020a8 x7 : 0000000000000000 x6 : 000000000000003f
   x5 : 0000000000000040 x4 : 0000000000000000 x3 : 0000000000000004
   x2 : 0000000000000000 x1 : ffff8001029bbab8 x0 : 00000000000000a8
   Call trace:
    its_irq_set_vcpu_affinity+0x58c/0x95c
    irq_set_vcpu_affinity+0x74/0xc8
    its_map_vlpi+0x4c/0x94
    kvm_vgic_v4_set_forwarding+0x134/0x298
    kvm_arch_irq_bypass_add_producer+0x28/0x34
    irq_bypass_register_producer+0xf8/0x1d8
    vfio_msi_set_vector_signal+0x2c8/0x308
    vfio_pci_set_msi_trigger+0x198/0x2d4
    vfio_pci_set_irqs_ioctl+0xf0/0x104
    vfio_pci_core_ioctl+0x6ac/0xc5c
    vfio_device_fops_unl_ioctl+0x128/0x370
    __arm64_sys_ioctl+0x98/0xd0
    el0_svc_common+0xd8/0x1d8
    do_el0_svc+0x28/0x34
    el0_svc+0x40/0xb8
    el0t_64_sync_handler+0x70/0xbc
    el0t_64_sync+0x1a8/0x1ac
   Code: 321f0129 f940094a 8b080148 d1400900 (39000009)
   ---[ end trace 0000000000000000 ]---

Fix it by moving the GICv4.1 special-casing to
vgic_supports_direct_msis(), returning false if the user explicitly
disabled nASSGIcap for the VM.

Fixes: c652887a92 ("KVM: arm64: vgic-v3: Allow userspace to write GICD_TYPER2.nASSGIcap")
Suggested-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Link: https://lore.kernel.org/r/20250729210644.830364-1-rananta@google.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
2025-07-29 14:17:31 -07:00
41 changed files with 585 additions and 368 deletions

View File

@ -1160,115 +1160,8 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64);
__v; \
})
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
{
/*
* *** VHE ONLY ***
*
* System registers listed in the switch are not saved on every
* exit from the guest but are only saved on vcpu_put.
*
* SYSREGS_ON_CPU *MUST* be checked before using this helper.
*
* Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
* should never be listed below, because the guest cannot modify its
* own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
* thread when emulating cross-VCPU communication.
*/
if (!has_vhe())
return false;
switch (reg) {
case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break;
case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break;
case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break;
case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break;
case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break;
case TCR2_EL1: *val = read_sysreg_s(SYS_TCR2_EL12); break;
case PIR_EL1: *val = read_sysreg_s(SYS_PIR_EL12); break;
case PIRE0_EL1: *val = read_sysreg_s(SYS_PIRE0_EL12); break;
case POR_EL1: *val = read_sysreg_s(SYS_POR_EL12); break;
case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break;
case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break;
case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break;
case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break;
case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break;
case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break;
case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break;
case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break;
case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break;
case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break;
case SPSR_EL1: *val = read_sysreg_s(SYS_SPSR_EL12); break;
case PAR_EL1: *val = read_sysreg_par(); break;
case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break;
case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
case ZCR_EL1: *val = read_sysreg_s(SYS_ZCR_EL12); break;
case SCTLR2_EL1: *val = read_sysreg_s(SYS_SCTLR2_EL12); break;
default: return false;
}
return true;
}
static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
{
/*
* *** VHE ONLY ***
*
* System registers listed in the switch are not restored on every
* entry to the guest but are only restored on vcpu_load.
*
* SYSREGS_ON_CPU *MUST* be checked before using this helper.
*
* Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
* should never be listed below, because the MPIDR should only be set
* once, before running the VCPU, and never changed later.
*/
if (!has_vhe())
return false;
switch (reg) {
case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break;
case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break;
case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break;
case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break;
case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break;
case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
default: return false;
}
return true;
}
u64 vcpu_read_sys_reg(const struct kvm_vcpu *, enum vcpu_sysreg);
void vcpu_write_sys_reg(struct kvm_vcpu *, u64, enum vcpu_sysreg);
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;

View File

@ -180,6 +180,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable);
int kvm_handle_guest_sea(struct kvm_vcpu *vcpu);
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);

View File

@ -355,6 +355,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
return pteref;
}
static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
{
return pteref;
}
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
{
/*
@ -384,6 +389,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke
return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED));
}
static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref)
{
return rcu_dereference_raw(pteref);
}
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
{
if (walker->flags & KVM_PGTABLE_WALK_SHARED)
@ -551,6 +561,26 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2
*/
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
/**
* kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address at which to place the mapping.
* @size: Size of the mapping.
*
* The page-table is assumed to be unreachable by any hardware walkers prior
* to freeing and therefore no TLB invalidation is performed.
*/
void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
u64 addr, u64 size);
/**
* kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
*
* It is assumed that the rest of the page-table is freed before this operation.
*/
void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
/**
* kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
* @mm_ops: Memory management callbacks.

View File

@ -179,7 +179,9 @@ struct pkvm_mapping {
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops);
void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
u64 addr, u64 size);
void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt);
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
enum kvm_pgtable_prot prot, void *mc,
enum kvm_pgtable_walk_flags flags);

View File

@ -1,25 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2018 - Arm Ltd */
#ifndef __ARM64_KVM_RAS_H__
#define __ARM64_KVM_RAS_H__
#include <linux/acpi.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <asm/acpi.h>
/*
* Was this synchronous external abort a RAS notification?
* Returns '0' for errors handled by some RAS subsystem, or -ENOENT.
*/
static inline int kvm_handle_guest_sea(void)
{
/* apei_claim_sea(NULL) expects to mask interrupts itself */
lockdep_assert_irqs_enabled();
return apei_claim_sea(NULL);
}
#endif /* __ARM64_KVM_RAS_H__ */

View File

@ -1142,9 +1142,6 @@
#define ARM64_FEATURE_FIELD_BITS 4
/* Defined for compatibility only, do not add new users. */
#define ARM64_FEATURE_MASK(x) (x##_MASK)
#ifdef __ASSEMBLY__
.macro mrs_s, rt, sreg

View File

@ -2269,6 +2269,24 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused)
/* Firmware may have left a deferred SError in this register. */
write_sysreg_s(0, SYS_DISR_EL1);
}
static bool has_rasv1p1(const struct arm64_cpu_capabilities *__unused, int scope)
{
const struct arm64_cpu_capabilities rasv1p1_caps[] = {
{
ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, V1P1)
},
{
ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP)
},
{
ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, RAS_frac, RASv1p1)
},
};
return (has_cpuid_feature(&rasv1p1_caps[0], scope) ||
(has_cpuid_feature(&rasv1p1_caps[1], scope) &&
has_cpuid_feature(&rasv1p1_caps[2], scope)));
}
#endif /* CONFIG_ARM64_RAS_EXTN */
#ifdef CONFIG_ARM64_PTR_AUTH
@ -2687,6 +2705,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.cpu_enable = cpu_clear_disr,
ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP)
},
{
.desc = "RASv1p1 Extension Support",
.capability = ARM64_HAS_RASV1P1_EXTN,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_rasv1p1,
},
#endif /* CONFIG_ARM64_RAS_EXTN */
#ifdef CONFIG_ARM64_AMU_EXTN
{

View File

@ -2408,12 +2408,12 @@ static u64 get_hyp_id_aa64pfr0_el1(void)
*/
u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
val &= ~(ID_AA64PFR0_EL1_CSV2 |
ID_AA64PFR0_EL1_CSV3);
val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV2,
arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
val |= FIELD_PREP(ID_AA64PFR0_EL1_CSV3,
arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
return val;

View File

@ -1420,10 +1420,10 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
return;
/*
* If we only have a single stage of translation (E2H=0 or
* TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
* If we only have a single stage of translation (EL2&0), exit
* early. Same thing if {VM,DC}=={0,0}.
*/
if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
if (compute_translation_regime(vcpu, op) == TR_EL20 ||
!(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
return;

View File

@ -2833,7 +2833,7 @@ int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr)
iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW);
esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL;
vcpu_write_sys_reg(vcpu, FAR_EL2, addr);
vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE)
return kvm_inject_nested(vcpu, esr, except_type_serror);

View File

@ -22,36 +22,28 @@
static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
u64 val;
if (unlikely(vcpu_has_nv(vcpu)))
if (has_vhe())
return vcpu_read_sys_reg(vcpu, reg);
else if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
__vcpu_read_sys_reg_from_cpu(reg, &val))
return val;
return __vcpu_sys_reg(vcpu, reg);
}
static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
if (unlikely(vcpu_has_nv(vcpu)))
if (has_vhe())
vcpu_write_sys_reg(vcpu, val, reg);
else if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU) ||
!__vcpu_write_sys_reg_to_cpu(val, reg))
else
__vcpu_assign_sys_reg(vcpu, reg, val);
}
static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
u64 val)
{
if (unlikely(vcpu_has_nv(vcpu))) {
if (has_vhe()) {
if (target_mode == PSR_MODE_EL1h)
vcpu_write_sys_reg(vcpu, val, SPSR_EL1);
else
vcpu_write_sys_reg(vcpu, val, SPSR_EL2);
} else if (has_vhe()) {
write_sysreg_el1(val, SYS_SPSR);
} else {
__vcpu_assign_sys_reg(vcpu, SPSR_EL1, val);
}
@ -59,7 +51,7 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
{
if (has_vhe())
if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_abt);
else
vcpu->arch.ctxt.spsr_abt = val;
@ -67,7 +59,7 @@ static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
{
if (has_vhe())
if (has_vhe() && vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
write_sysreg(val, spsr_und);
else
vcpu->arch.ctxt.spsr_und = val;

View File

@ -17,7 +17,7 @@ static inline __must_check bool nvhe_check_data_corruption(bool v)
bool corruption = unlikely(condition); \
if (corruption) { \
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \
BUG_ON(1); \
BUG(); \
} else \
WARN_ON(1); \
} \

View File

@ -253,6 +253,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
*vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
*vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR);
__vcpu_assign_sys_reg(vcpu, read_sysreg_el1(SYS_VBAR), VBAR_EL1);
kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
@ -372,6 +373,9 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Debug and Trace Registers are restricted. */
/* Group 1 ID registers */
HOST_HANDLED(SYS_REVIDR_EL1),
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
AARCH32(SYS_ID_PFR0_EL1),
@ -460,6 +464,7 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
HOST_HANDLED(SYS_CCSIDR_EL1),
HOST_HANDLED(SYS_CLIDR_EL1),
HOST_HANDLED(SYS_AIDR_EL1),
HOST_HANDLED(SYS_CSSELR_EL1),
HOST_HANDLED(SYS_CTR_EL0),

View File

@ -1551,21 +1551,38 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
return 0;
}
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
u64 addr, u64 size)
{
size_t pgd_sz;
struct kvm_pgtable_walker walker = {
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
};
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
}
void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
{
size_t pgd_sz;
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
/*
* Since the pgtable is unlinked at this point, and not shared with
* other walkers, safely deference pgd with kvm_dereference_pteref_raw()
*/
pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz);
pgt->pgd = NULL;
}
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
kvm_pgtable_stage2_destroy_pgd(pgt);
}
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;

View File

@ -20,7 +20,7 @@ static bool __is_be(struct kvm_vcpu *vcpu)
if (vcpu_mode_is_32bit(vcpu))
return !!(read_sysreg_el2(SYS_SPSR) & PSR_AA32_E_BIT);
return !!(read_sysreg(SCTLR_EL1) & SCTLR_ELx_EE);
return !!(read_sysreg_el1(SYS_SCTLR) & SCTLR_ELx_EE);
}
/*

View File

@ -43,8 +43,11 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
*
* - API/APK: they are already accounted for by vcpu_load(), and can
* only take effect across a load/put cycle (such as ERET)
*
* - FIEN: no way we let a guest have access to the RAS "Common Fault
* Injection" thing, whatever that does
*/
#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK)
#define NV_HCR_GUEST_EXCLUDE (HCR_TGE | HCR_API | HCR_APK | HCR_FIEN)
static u64 __compute_hcr(struct kvm_vcpu *vcpu)
{

View File

@ -4,19 +4,20 @@
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
*/
#include <linux/acpi.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/acpi.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
@ -903,6 +904,38 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
return 0;
}
/*
* Assume that @pgt is valid and unlinked from the KVM MMU to free the
* page-table without taking the kvm_mmu_lock and without performing any
* TLB invalidations.
*
* Also, the range of addresses can be large enough to cause need_resched
* warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
* cond_resched() periodically to prevent hogging the CPU for a long time
* and schedule something else, if required.
*/
static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
phys_addr_t end)
{
u64 next;
do {
next = stage2_range_addr_end(addr, end);
KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
next - addr);
if (next != end)
cond_resched();
} while (addr = next, addr != end);
}
static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
{
unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
stage2_destroy_range(pgt, 0, BIT(ia_bits));
KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
}
/**
* kvm_init_stage2_mmu - Initialise a S2 MMU structure
* @kvm: The pointer to the KVM structure
@ -979,7 +1012,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
return 0;
out_destroy_pgtable:
KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
kvm_stage2_destroy(pgt);
out_free_pgtable:
kfree(pgt);
return err;
@ -1076,7 +1109,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
write_unlock(&kvm->mmu_lock);
if (pgt) {
KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
kvm_stage2_destroy(pgt);
kfree(pgt);
}
}
@ -1811,6 +1844,19 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
read_unlock(&vcpu->kvm->mmu_lock);
}
int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
{
/*
* Give APEI the opportunity to claim the abort before handling it
* within KVM. apei_claim_sea() expects to be called with IRQs enabled.
*/
lockdep_assert_irqs_enabled();
if (apei_claim_sea(NULL) == 0)
return 1;
return kvm_inject_serror(vcpu);
}
/**
* kvm_handle_guest_abort - handles all 2nd stage aborts
* @vcpu: the VCPU pointer
@ -1834,17 +1880,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
gfn_t gfn;
int ret, idx;
/* Synchronous External Abort? */
if (kvm_vcpu_abt_issea(vcpu)) {
/*
* For RAS the host kernel may handle this abort.
* There is no need to pass the error into the guest.
*/
if (kvm_handle_guest_sea())
return kvm_inject_serror(vcpu);
return 1;
}
if (kvm_vcpu_abt_issea(vcpu))
return kvm_handle_guest_sea(vcpu);
esr = kvm_vcpu_get_esr(vcpu);

View File

@ -1287,7 +1287,10 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
u64 esr = kvm_vcpu_get_esr(vcpu);
BUG_ON(!(esr & ESR_ELx_VNCR_SHIFT));
WARN_ON_ONCE(!(esr & ESR_ELx_VNCR));
if (kvm_vcpu_abt_issea(vcpu))
return kvm_handle_guest_sea(vcpu);
if (esr_fsc_is_permission_fault(esr)) {
inject_vncr_perm(vcpu);

View File

@ -316,9 +316,16 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
return 0;
}
void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
u64 addr, u64 size)
{
__pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
__pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
}
void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
{
/* Expected to be called after all pKVM mappings have been released. */
WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
}
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,

View File

@ -82,43 +82,105 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
"sys_reg write to read-only register");
}
#define PURE_EL2_SYSREG(el2) \
case el2: { \
*el1r = el2; \
return true; \
}
enum sr_loc_attr {
SR_LOC_MEMORY = 0, /* Register definitely in memory */
SR_LOC_LOADED = BIT(0), /* Register on CPU, unless it cannot */
SR_LOC_MAPPED = BIT(1), /* Register in a different CPU register */
SR_LOC_XLATED = BIT(2), /* Register translated to fit another reg */
SR_LOC_SPECIAL = BIT(3), /* Demanding register, implies loaded */
};
#define MAPPED_EL2_SYSREG(el2, el1, fn) \
case el2: { \
*xlate = fn; \
*el1r = el1; \
return true; \
}
struct sr_loc {
enum sr_loc_attr loc;
enum vcpu_sysreg map_reg;
u64 (*xlate)(u64);
};
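
The sr_loc_attr bits replace the old out-parameter scheme with a single answer to "where does this register currently live and what is needed to reach it". The stand-alone sketch below shows how such flags compose and are tested; the enum values mirror the hunk, while describe() and the sample combinations are purely illustrative.

#include <stdio.h>

enum sr_loc_attr {
    SR_LOC_MEMORY  = 0,        /* register definitely in the in-memory copy */
    SR_LOC_LOADED  = 1U << 0,  /* register currently lives on the CPU */
    SR_LOC_MAPPED  = 1U << 1,  /* ...but under a different CPU register */
    SR_LOC_XLATED  = 1U << 2,  /* ...and its value needs translation */
    SR_LOC_SPECIAL = 1U << 3,  /* needs dedicated handling, implies loaded */
};

static void describe(unsigned int loc)
{
    if (loc == SR_LOC_MEMORY) {
        printf("read/write the vcpu's in-memory array\n");
        return;
    }
    if (loc & SR_LOC_SPECIAL)
        printf("dedicated accessor (e.g. CNTHCTL_EL2)\n");
    if (loc & SR_LOC_LOADED)
        printf("access the CPU register%s%s\n",
               (loc & SR_LOC_MAPPED) ? ", via its EL1 counterpart" : "",
               (loc & SR_LOC_XLATED) ? ", translating the value" : "");
}

int main(void)
{
    describe(SR_LOC_MEMORY);
    describe(SR_LOC_LOADED);
    describe(SR_LOC_LOADED | SR_LOC_MAPPED | SR_LOC_XLATED);
    return 0;
}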
static bool get_el2_to_el1_mapping(unsigned int reg,
unsigned int *el1r, u64 (**xlate)(u64))
static enum sr_loc_attr locate_direct_register(const struct kvm_vcpu *vcpu,
enum vcpu_sysreg reg)
{
switch (reg) {
PURE_EL2_SYSREG( VPIDR_EL2 );
PURE_EL2_SYSREG( VMPIDR_EL2 );
PURE_EL2_SYSREG( ACTLR_EL2 );
PURE_EL2_SYSREG( HCR_EL2 );
PURE_EL2_SYSREG( MDCR_EL2 );
PURE_EL2_SYSREG( HSTR_EL2 );
PURE_EL2_SYSREG( HACR_EL2 );
PURE_EL2_SYSREG( VTTBR_EL2 );
PURE_EL2_SYSREG( VTCR_EL2 );
PURE_EL2_SYSREG( TPIDR_EL2 );
PURE_EL2_SYSREG( HPFAR_EL2 );
PURE_EL2_SYSREG( HCRX_EL2 );
PURE_EL2_SYSREG( HFGRTR_EL2 );
PURE_EL2_SYSREG( HFGWTR_EL2 );
PURE_EL2_SYSREG( HFGITR_EL2 );
PURE_EL2_SYSREG( HDFGRTR_EL2 );
PURE_EL2_SYSREG( HDFGWTR_EL2 );
PURE_EL2_SYSREG( HAFGRTR_EL2 );
PURE_EL2_SYSREG( CNTVOFF_EL2 );
PURE_EL2_SYSREG( CNTHCTL_EL2 );
case SCTLR_EL1:
case CPACR_EL1:
case TTBR0_EL1:
case TTBR1_EL1:
case TCR_EL1:
case TCR2_EL1:
case PIR_EL1:
case PIRE0_EL1:
case POR_EL1:
case ESR_EL1:
case AFSR0_EL1:
case AFSR1_EL1:
case FAR_EL1:
case MAIR_EL1:
case VBAR_EL1:
case CONTEXTIDR_EL1:
case AMAIR_EL1:
case CNTKCTL_EL1:
case ELR_EL1:
case SPSR_EL1:
case ZCR_EL1:
case SCTLR2_EL1:
/*
* EL1 registers which have an ELx2 mapping are loaded if
* we're not in hypervisor context.
*/
return is_hyp_ctxt(vcpu) ? SR_LOC_MEMORY : SR_LOC_LOADED;
case TPIDR_EL0:
case TPIDRRO_EL0:
case TPIDR_EL1:
case PAR_EL1:
case DACR32_EL2:
case IFSR32_EL2:
case DBGVCR32_EL2:
/* These registers are always loaded, no matter what */
return SR_LOC_LOADED;
default:
/* Non-mapped EL2 registers are by definition in memory. */
return SR_LOC_MEMORY;
}
}
static void locate_mapped_el2_register(const struct kvm_vcpu *vcpu,
enum vcpu_sysreg reg,
enum vcpu_sysreg map_reg,
u64 (*xlate)(u64),
struct sr_loc *loc)
{
if (!is_hyp_ctxt(vcpu)) {
loc->loc = SR_LOC_MEMORY;
return;
}
loc->loc = SR_LOC_LOADED | SR_LOC_MAPPED;
loc->map_reg = map_reg;
WARN_ON(locate_direct_register(vcpu, map_reg) != SR_LOC_MEMORY);
if (xlate != NULL && !vcpu_el2_e2h_is_set(vcpu)) {
loc->loc |= SR_LOC_XLATED;
loc->xlate = xlate;
}
}
#define MAPPED_EL2_SYSREG(r, m, t) \
case r: { \
locate_mapped_el2_register(vcpu, r, m, t, loc); \
break; \
}
static void locate_register(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg,
struct sr_loc *loc)
{
if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU)) {
loc->loc = SR_LOC_MEMORY;
return;
}
switch (reg) {
MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1,
translate_sctlr_el2_to_sctlr_el1 );
MAPPED_EL2_SYSREG(CPTR_EL2, CPACR_EL1,
@ -144,125 +206,189 @@ static bool get_el2_to_el1_mapping(unsigned int reg,
MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL );
MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL );
MAPPED_EL2_SYSREG(SCTLR2_EL2, SCTLR2_EL1, NULL );
case CNTHCTL_EL2:
/* CNTHCTL_EL2 is super special, until we support NV2.1 */
loc->loc = ((is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
SR_LOC_SPECIAL : SR_LOC_MEMORY);
break;
default:
return false;
loc->loc = locate_direct_register(vcpu, reg);
}
}
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
static u64 read_sr_from_cpu(enum vcpu_sysreg reg)
{
u64 val = 0x8badf00d8badf00d;
u64 (*xlate)(u64) = NULL;
unsigned int el1r;
if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
goto memory_read;
switch (reg) {
case SCTLR_EL1: val = read_sysreg_s(SYS_SCTLR_EL12); break;
case CPACR_EL1: val = read_sysreg_s(SYS_CPACR_EL12); break;
case TTBR0_EL1: val = read_sysreg_s(SYS_TTBR0_EL12); break;
case TTBR1_EL1: val = read_sysreg_s(SYS_TTBR1_EL12); break;
case TCR_EL1: val = read_sysreg_s(SYS_TCR_EL12); break;
case TCR2_EL1: val = read_sysreg_s(SYS_TCR2_EL12); break;
case PIR_EL1: val = read_sysreg_s(SYS_PIR_EL12); break;
case PIRE0_EL1: val = read_sysreg_s(SYS_PIRE0_EL12); break;
case POR_EL1: val = read_sysreg_s(SYS_POR_EL12); break;
case ESR_EL1: val = read_sysreg_s(SYS_ESR_EL12); break;
case AFSR0_EL1: val = read_sysreg_s(SYS_AFSR0_EL12); break;
case AFSR1_EL1: val = read_sysreg_s(SYS_AFSR1_EL12); break;
case FAR_EL1: val = read_sysreg_s(SYS_FAR_EL12); break;
case MAIR_EL1: val = read_sysreg_s(SYS_MAIR_EL12); break;
case VBAR_EL1: val = read_sysreg_s(SYS_VBAR_EL12); break;
case CONTEXTIDR_EL1: val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
case AMAIR_EL1: val = read_sysreg_s(SYS_AMAIR_EL12); break;
case CNTKCTL_EL1: val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
case ELR_EL1: val = read_sysreg_s(SYS_ELR_EL12); break;
case SPSR_EL1: val = read_sysreg_s(SYS_SPSR_EL12); break;
case ZCR_EL1: val = read_sysreg_s(SYS_ZCR_EL12); break;
case SCTLR2_EL1: val = read_sysreg_s(SYS_SCTLR2_EL12); break;
case TPIDR_EL0: val = read_sysreg_s(SYS_TPIDR_EL0); break;
case TPIDRRO_EL0: val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
case TPIDR_EL1: val = read_sysreg_s(SYS_TPIDR_EL1); break;
case PAR_EL1: val = read_sysreg_par(); break;
case DACR32_EL2: val = read_sysreg_s(SYS_DACR32_EL2); break;
case IFSR32_EL2: val = read_sysreg_s(SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
default: WARN_ON_ONCE(1);
}
if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
if (!is_hyp_ctxt(vcpu))
goto memory_read;
return val;
}
static void write_sr_to_cpu(enum vcpu_sysreg reg, u64 val)
{
switch (reg) {
case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break;
case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break;
case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break;
case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break;
case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break;
case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break;
case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break;
case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break;
case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
default: WARN_ON_ONCE(1);
}
}
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg reg)
{
struct sr_loc loc = {};
locate_register(vcpu, reg, &loc);
WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
if (loc.loc & SR_LOC_SPECIAL) {
u64 val;
WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
/*
* CNTHCTL_EL2 requires some special treatment to
* account for the bits that can be set via CNTKCTL_EL1.
* CNTHCTL_EL2 requires some special treatment to account
* for the bits that can be set via CNTKCTL_EL1 when E2H==1.
*/
switch (reg) {
case CNTHCTL_EL2:
if (vcpu_el2_e2h_is_set(vcpu)) {
val = read_sysreg_el1(SYS_CNTKCTL);
val &= CNTKCTL_VALID_BITS;
val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
return val;
default:
WARN_ON_ONCE(1);
}
break;
}
/*
* If this register does not have an EL1 counterpart,
* then read the stored EL2 version.
*/
if (reg == el1r)
goto memory_read;
if (loc.loc & SR_LOC_LOADED) {
enum vcpu_sysreg map_reg = reg;
/*
* If we have a non-VHE guest and the sysreg
* requires translation to be used at EL1, use the
* in-memory copy instead.
*/
if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
goto memory_read;
if (loc.loc & SR_LOC_MAPPED)
map_reg = loc.map_reg;
if (!(loc.loc & SR_LOC_XLATED)) {
u64 val = read_sr_from_cpu(map_reg);
/* Get the current version of the EL1 counterpart. */
WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val));
if (reg >= __SANITISED_REG_START__)
val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
return val;
}
}
/* EL1 register can't be on the CPU if the guest is in vEL2. */
if (unlikely(is_hyp_ctxt(vcpu)))
goto memory_read;
if (__vcpu_read_sys_reg_from_cpu(reg, &val))
return val;
memory_read:
return __vcpu_sys_reg(vcpu, reg);
}
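
For the SR_LOC_SPECIAL read above, CNTHCTL_EL2 is assembled by merging the bits that live in the hardware CNTKCTL_EL1 register with the bits kept in memory, as the removed open-coded version shows. A hedged sketch of that bit-merging pattern follows; the VALID_BITS value is made up (it is not the real CNTKCTL layout) and plain variables replace the system-register accessors.

#include <stdint.h>
#include <stdio.h>

/* Illustrative mask of the bits backed by hardware. */
#define VALID_BITS 0x000000ffULL

static uint64_t merged_read(uint64_t hw_reg, uint64_t mem_copy)
{
    uint64_t val;

    val  = hw_reg & VALID_BITS;     /* live bits come from the CPU register */
    val |= mem_copy & ~VALID_BITS;  /* everything else comes from memory */
    return val;
}

int main(void)
{
    /* hardware provides the low byte, memory provides the rest */
    printf("%#llx\n",
           (unsigned long long)merged_read(0x12345678ULL, 0xabcdef0011223344ULL));
    return 0;
}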
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, enum vcpu_sysreg reg)
{
u64 (*xlate)(u64) = NULL;
unsigned int el1r;
struct sr_loc loc = {};
if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
goto memory_write;
locate_register(vcpu, reg, &loc);
if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
if (!is_hyp_ctxt(vcpu))
goto memory_write;
WARN_ON_ONCE(!has_vhe() && loc.loc != SR_LOC_MEMORY);
/*
* Always store a copy of the write to memory to avoid having
* to reverse-translate virtual EL2 system registers for a
* non-VHE guest hypervisor.
*/
__vcpu_assign_sys_reg(vcpu, reg, val);
if (loc.loc & SR_LOC_SPECIAL) {
WARN_ON_ONCE(loc.loc & ~SR_LOC_SPECIAL);
switch (reg) {
case CNTHCTL_EL2:
/*
* If E2H=0, CNTHCTL_EL2 is a pure shadow register.
* Otherwise, some of the bits are backed by
* If E2H=1, some of the bits are backed by
* CNTKCTL_EL1, while the rest is kept in memory.
* Yes, this is fun stuff.
*/
if (vcpu_el2_e2h_is_set(vcpu))
write_sysreg_el1(val, SYS_CNTKCTL);
return;
break;
default:
WARN_ON_ONCE(1);
}
}
/* No EL1 counterpart? We're done here. */
if (reg == el1r)
return;
if (loc.loc & SR_LOC_LOADED) {
enum vcpu_sysreg map_reg = reg;
u64 xlated_val;
if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
val = xlate(val);
if (reg >= __SANITISED_REG_START__)
val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);
/* Redirect this to the EL1 version of the register. */
WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r));
return;
if (loc.loc & SR_LOC_MAPPED)
map_reg = loc.map_reg;
if (loc.loc & SR_LOC_XLATED)
xlated_val = loc.xlate(val);
else
xlated_val = val;
write_sr_to_cpu(map_reg, xlated_val);
/*
* Fall through to write the backing store anyway, which
* allows translated registers to be directly read without a
* reverse translation.
*/
}
/* EL1 register can't be on the CPU if the guest is in vEL2. */
if (unlikely(is_hyp_ctxt(vcpu)))
goto memory_write;
if (__vcpu_write_sys_reg_to_cpu(val, reg))
return;
memory_write:
__vcpu_assign_sys_reg(vcpu, reg, val);
}
@ -1584,6 +1710,7 @@ static u8 pmuver_to_perfmon(u8 pmuver)
}
static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val);
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
/* Read a sanitised cpufeature ID register by sys_reg_desc */
@ -1606,19 +1733,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val = sanitise_id_aa64pfr0_el1(vcpu, val);
break;
case SYS_ID_AA64PFR1_EL1:
if (!kvm_has_mte(vcpu->kvm)) {
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac);
}
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);
val = sanitise_id_aa64pfr1_el1(vcpu, val);
break;
case SYS_ID_AA64PFR2_EL1:
val &= ID_AA64PFR2_EL1_FPMR |
@ -1628,18 +1743,18 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
val &= ~(ID_AA64ISAR1_EL1_APA |
ID_AA64ISAR1_EL1_API |
ID_AA64ISAR1_EL1_GPA |
ID_AA64ISAR1_EL1_GPI);
break;
case SYS_ID_AA64ISAR2_EL1:
if (!vcpu_has_ptrauth(vcpu))
val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
val &= ~(ID_AA64ISAR2_EL1_APA3 |
ID_AA64ISAR2_EL1_GPA3);
if (!cpus_have_final_cap(ARM64_HAS_WFXT) ||
has_broken_cntvoff())
val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
val &= ~ID_AA64ISAR2_EL1_WFxT;
break;
case SYS_ID_AA64ISAR3_EL1:
val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX;
@ -1655,7 +1770,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
ID_AA64MMFR3_EL1_S1PIE;
break;
case SYS_ID_MMFR4_EL1:
val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
val &= ~ID_MMFR4_EL1_CCIDX;
break;
}
@ -1836,6 +1951,31 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
return val;
}
static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val)
{
u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
if (!kvm_has_mte(vcpu->kvm)) {
val &= ~ID_AA64PFR1_EL1_MTE;
val &= ~ID_AA64PFR1_EL1_MTE_frac;
}
if (!(cpus_have_final_cap(ARM64_HAS_RASV1P1_EXTN) &&
SYS_FIELD_GET(ID_AA64PFR0_EL1, RAS, pfr0) == ID_AA64PFR0_EL1_RAS_IMP))
val &= ~ID_AA64PFR1_EL1_RAS_frac;
val &= ~ID_AA64PFR1_EL1_SME;
val &= ~ID_AA64PFR1_EL1_RNDR_trap;
val &= ~ID_AA64PFR1_EL1_NMI;
val &= ~ID_AA64PFR1_EL1_GCS;
val &= ~ID_AA64PFR1_EL1_THE;
val &= ~ID_AA64PFR1_EL1_MTEX;
val &= ~ID_AA64PFR1_EL1_PFAR;
val &= ~ID_AA64PFR1_EL1_MPAM_frac;
return val;
}
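
The sanitisation helper above hides features by clearing whole ID-register fields with a mask, and the series drops the explicit ARM64_FEATURE_MASK() wrapper on the assumption that the generated field constants already denote shifted masks. Below is a generic sketch of the clear-and-extract pattern using a hypothetical 4-bit field at bits [27:24]; the MYFEAT_* names are invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 4-bit feature field at bits [27:24] of an ID register. */
#define MYFEAT_MASK   (0xfULL << 24)
#define MYFEAT_SHIFT  24

static uint64_t hide_feature(uint64_t idreg)
{
    return idreg & ~MYFEAT_MASK;  /* report the feature as not implemented */
}

static unsigned int get_feature(uint64_t idreg)
{
    return (idreg & MYFEAT_MASK) >> MYFEAT_SHIFT;  /* FIELD_GET-style extraction */
}

int main(void)
{
    uint64_t idreg = 0x0000000012345678ULL;

    printf("before: field=%u\n", get_feature(idreg));
    idreg = hide_feature(idreg);
    printf("after:  field=%u\n", get_feature(idreg));
    return 0;
}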
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
@ -2697,6 +2837,18 @@ static bool access_ras(struct kvm_vcpu *vcpu,
struct kvm *kvm = vcpu->kvm;
switch(reg_to_encoding(r)) {
case SYS_ERXPFGCDN_EL1:
case SYS_ERXPFGCTL_EL1:
case SYS_ERXPFGF_EL1:
case SYS_ERXMISC2_EL1:
case SYS_ERXMISC3_EL1:
if (!(kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) ||
(kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) &&
kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)))) {
kvm_inject_undefined(vcpu);
return false;
}
break;
default:
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) {
kvm_inject_undefined(vcpu);
@ -2929,7 +3081,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
~(ID_AA64PFR0_EL1_AMU |
ID_AA64PFR0_EL1_MPAM |
ID_AA64PFR0_EL1_SVE |
ID_AA64PFR0_EL1_RAS |
ID_AA64PFR0_EL1_AdvSIMD |
ID_AA64PFR0_EL1_FP)),
ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
@ -2943,7 +3094,6 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR1_EL1_SME |
ID_AA64PFR1_EL1_RES0 |
ID_AA64PFR1_EL1_MPAM_frac |
ID_AA64PFR1_EL1_RAS_frac |
ID_AA64PFR1_EL1_MTE)),
ID_WRITABLE(ID_AA64PFR2_EL1,
ID_AA64PFR2_EL1_FPMR |
@ -3063,8 +3213,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ERXCTLR_EL1), access_ras },
{ SYS_DESC(SYS_ERXSTATUS_EL1), access_ras },
{ SYS_DESC(SYS_ERXADDR_EL1), access_ras },
{ SYS_DESC(SYS_ERXPFGF_EL1), access_ras },
{ SYS_DESC(SYS_ERXPFGCTL_EL1), access_ras },
{ SYS_DESC(SYS_ERXPFGCDN_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC0_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC1_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC2_EL1), access_ras },
{ SYS_DESC(SYS_ERXMISC3_EL1), access_ras },
MTE_REG(TFSR_EL1),
MTE_REG(TFSRE0_EL1),

View File

@ -50,6 +50,14 @@ bool vgic_has_its(struct kvm *kvm)
bool vgic_supports_direct_msis(struct kvm *kvm)
{
/*
* Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
* indirectly allowing userspace to control whether or not vPEs are
* allocated for the VM.
*/
if (system_supports_direct_sgis() && !vgic_supports_direct_sgis(kvm))
return false;
return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
}

View File

@ -1091,7 +1091,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
len = vgic_v3_init_dist_iodev(io_device);
break;
default:
BUG_ON(1);
BUG();
}
io_device->base_addr = dist_base_address;

View File

@ -396,15 +396,7 @@ bool vgic_supports_direct_sgis(struct kvm *kvm);
static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
{
/*
* Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
* indirectly allowing userspace to control whether or not vPEs are
* allocated for the VM.
*/
if (system_supports_direct_sgis())
return vgic_supports_direct_sgis(kvm);
return vgic_supports_direct_msis(kvm);
return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
}
int vgic_v4_init(struct kvm *kvm);

View File

@ -53,6 +53,7 @@ HAS_S1PIE
HAS_S1POE
HAS_SCTLR2
HAS_RAS_EXTN
HAS_RASV1P1_EXTN
HAS_RNG
HAS_SB
HAS_STAGE2_FWB

View File

@ -39,6 +39,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
unsigned long size, bool writable, bool in_atomic)
{
int ret = 0;
pgprot_t prot;
unsigned long pfn;
phys_addr_t addr, end;
struct kvm_mmu_memory_cache pcache = {
@ -55,10 +56,12 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
pfn = __phys_to_pfn(hpa);
prot = pgprot_noncached(PAGE_WRITE);
for (addr = gpa; addr < end; addr += PAGE_SIZE) {
map.addr = addr;
map.pte = pfn_pte(pfn, PAGE_KERNEL_IO);
map.pte = pfn_pte(pfn, prot);
map.pte = pte_mkdirty(map.pte);
map.level = 0;
if (!writable)

View File

@ -683,7 +683,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
}
/**
* check_vcpu_requests - check and handle pending vCPU requests
* kvm_riscv_check_vcpu_requests - check and handle pending vCPU requests
* @vcpu: the VCPU pointer
*
* Return: 1 if we should enter the guest

View File

@ -182,6 +182,8 @@ int kvm_riscv_vcpu_set_reg_vector(struct kvm_vcpu *vcpu,
struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
unsigned long reg_val;
if (reg_size != sizeof(reg_val))
return -EINVAL;
if (copy_from_user(&reg_val, uaddr, reg_size))
return -EFAULT;
if (reg_val != cntx->vector.vlenb)
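
The added check rejects any user-supplied register size other than sizeof(reg_val) before copy_from_user() writes into the on-stack variable, so an oversized request can no longer spill past it. A user-space analogue of the same defensive pattern is sketched below; validate_and_copy() and the buffer sizes are illustrative only.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy src into a fixed-size destination only if the declared size matches. */
static int validate_and_copy(uint64_t *dst, const void *src, size_t size)
{
    if (size != sizeof(*dst))
        return -EINVAL;  /* reject before any copy happens */
    memcpy(dst, src, size);
    return 0;
}

int main(void)
{
    uint64_t reg_val = 0;
    uint8_t oversized[32] = { 1, 2, 3 };

    /* An oversized request is rejected instead of overrunning reg_val. */
    printf("oversized: %d\n", validate_and_copy(&reg_val, oversized, sizeof(oversized)));
    printf("exact:     %d\n", validate_and_copy(&reg_val, oversized, sizeof(reg_val)));
    return 0;
}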

View File

@ -810,6 +810,8 @@ static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
if (min > map->max_apic_id)
return 0;
min = array_index_nospec(min, map->max_apic_id + 1);
for_each_set_bit(i, ipi_bitmap,
min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
if (map->phys_map[min + i]) {

View File

@ -718,13 +718,6 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
static void sev_writeback_caches(struct kvm *kvm)
{
/*
* Note, the caller is responsible for ensuring correctness if the mask
* can be modified, e.g. if a CPU could be doing VMRUN.
*/
if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
return;
/*
* Ensure that all dirty guest tagged cache entries are written back
* before releasing the pages back to the system for use. CLFLUSH will
@ -739,6 +732,9 @@ static void sev_writeback_caches(struct kvm *kvm)
* serializing multiple calls and having responding CPUs (to the IPI)
* mark themselves as still running if they are running (or about to
* run) a vCPU for the VM.
*
* Note, the caller is responsible for ensuring correctness if the mask
* can be modified, e.g. if a CPU could be doing VMRUN.
*/
wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
}

View File

@ -9908,8 +9908,11 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
rcu_read_lock();
map = rcu_dereference(vcpu->kvm->arch.apic_map);
if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
if (likely(map) && dest_id <= map->max_apic_id) {
dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
if (map->phys_map[dest_id])
target = map->phys_map[dest_id]->vcpu;
}
rcu_read_unlock();
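
Both x86 hunks clamp a guest-controlled index with array_index_nospec() before it indexes the APIC map, so even a mispredicted bounds check cannot be steered at an out-of-bounds entry. The user-space sketch below follows the widely used generic mask-derivation pattern (an index at or past the bound collapses to 0 without a branch); the table contents are made up, and the real kernel helper may be architecture-specific.

#include <stdint.h>
#include <stdio.h>

/*
 * Branch-free clamp: all-ones when idx < size, zero otherwise.
 * Relies on arithmetic right shift of a signed value, as the generic
 * kernel fallback does.
 */
static uint64_t index_mask(uint64_t idx, uint64_t size)
{
    return ~(int64_t)(idx | (size - 1 - idx)) >> 63;
}

int main(void)
{
    const char *phys_map[4] = { "vcpu0", "vcpu1", "vcpu2", "vcpu3" };
    uint64_t size = 4;

    for (uint64_t idx = 0; idx < 6; idx++) {
        uint64_t safe = idx & index_mask(idx, size);

        /* An out-of-range index collapses to entry 0 instead of escaping. */
        printf("idx=%llu -> phys_map[%llu]=%s\n",
               (unsigned long long)idx, (unsigned long long)safe,
               phys_map[safe]);
    }
    return 0;
}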

View File

@ -1080,9 +1080,6 @@
#define ARM64_FEATURE_FIELD_BITS 4
/* Defined for compatibility only, do not add new users. */
#define ARM64_FEATURE_MASK(x) (x##_MASK)
#ifdef __ASSEMBLY__
.macro mrs_s, rt, sreg

View File

@ -751,7 +751,7 @@
for (; _metadata->trigger; _metadata->trigger = \
__bail(_assert, _metadata))
#define is_signed_type(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))
#define is_signed_var(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))
#define __EXPECT(_expected, _expected_str, _seen, _seen_str, _t, _assert) do { \
/* Avoid multiple evaluation of the cases */ \
@ -759,7 +759,7 @@
__typeof__(_seen) __seen = (_seen); \
if (!(__exp _t __seen)) { \
/* Report with actual signedness to avoid weird output. */ \
switch (is_signed_type(__exp) * 2 + is_signed_type(__seen)) { \
switch (is_signed_var(__exp) * 2 + is_signed_var(__seen)) { \
case 0: { \
uintmax_t __exp_print = (uintmax_t)__exp; \
uintmax_t __seen_print = (uintmax_t)__seen; \
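
The renamed is_signed_var() decides the signedness of an expression by casting -1 to the expression's own type and comparing it with 1; the __EXPECT() machinery then combines the two results to pick matching printf conversions. A tiny stand-alone check of the trick (user-space, relying on the __typeof__ extension) is below.

#include <stdio.h>

/* Same trick as the kselftest macro: (-1 < 1) only holds for signed types. */
#define is_signed_var(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))

int main(void)
{
    int si = 0;
    unsigned int ui = 0;
    char c = 0;  /* signedness of plain char is implementation-defined */

    printf("int:          %d\n", is_signed_var(si));
    printf("unsigned int: %d\n", is_signed_var(ui));
    printf("char:         %d\n", is_signed_var(c));
    return 0;
}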

View File

@ -169,6 +169,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq
TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress
TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access
TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3
TEST_GEN_PROGS_arm64 += arm64/kvm-uuid
TEST_GEN_PROGS_arm64 += access_tracking_perf_test
TEST_GEN_PROGS_arm64 += arch_timer
TEST_GEN_PROGS_arm64 += coalesced_io_test

View File

@ -146,7 +146,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu)
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val);
el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val);
return el0 == ID_AA64PFR0_EL1_EL0_IMP;
}
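
This and the following selftest conversions pass the bare field name (e.g. ID_AA64PFR0_EL1_EL0) straight to FIELD_GET() without the ARM64_FEATURE_MASK() wrapper; the hunks do not show the header change, so it is assumed here that the generated constant already denotes the shifted field mask. Below is a generic sketch of FIELD_GET-style extraction with a hypothetical mask and a local helper in place of the bitfield.h macro.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 4-bit EL0 field at bits [3:0] of an ID register. */
#define MY_EL0_MASK  0x000000000000000fULL
#define MY_EL0_IMP   1

/* FIELD_GET-style extraction: mask, then shift down by the mask's lowest set bit. */
static uint64_t field_get(uint64_t mask, uint64_t reg)
{
    return (reg & mask) >> __builtin_ctzll(mask);  /* mask must be non-zero */
}

int main(void)
{
    uint64_t pfr0 = 0x0000000000000021ULL;  /* made-up value, EL0 field = 1 */

    if (field_get(MY_EL0_MASK, pfr0) == MY_EL0_IMP)
        printf("AArch64 EL0 implemented\n");
    return 0;
}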

View File

@ -116,12 +116,12 @@ static void reset_debug_state(void)
/* Reset all bcr/bvr/wcr/wvr registers */
dfr0 = read_sysreg(id_aa64dfr0_el1);
brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), dfr0);
brps = FIELD_GET(ID_AA64DFR0_EL1_BRPs, dfr0);
for (i = 0; i <= brps; i++) {
write_dbgbcr(i, 0);
write_dbgbvr(i, 0);
}
wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), dfr0);
wrps = FIELD_GET(ID_AA64DFR0_EL1_WRPs, dfr0);
for (i = 0; i <= wrps; i++) {
write_dbgwcr(i, 0);
write_dbgwvr(i, 0);
@ -418,7 +418,7 @@ static void guest_code_ss(int test_cnt)
static int debug_version(uint64_t id_aa64dfr0)
{
return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0);
return FIELD_GET(ID_AA64DFR0_EL1_DebugVer, id_aa64dfr0);
}
static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn)
@ -539,14 +539,14 @@ void test_guest_debug_exceptions_all(uint64_t aa64dfr0)
int b, w, c;
/* Number of breakpoints */
brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), aa64dfr0) + 1;
brp_num = FIELD_GET(ID_AA64DFR0_EL1_BRPs, aa64dfr0) + 1;
__TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required");
/* Number of watchpoints */
wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), aa64dfr0) + 1;
wrp_num = FIELD_GET(ID_AA64DFR0_EL1_WRPs, aa64dfr0) + 1;
/* Number of context aware breakpoints */
ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_CTX_CMPs), aa64dfr0) + 1;
ctx_brp_num = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs, aa64dfr0) + 1;
pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__,
brp_num, wrp_num, ctx_brp_num);

View File

@ -0,0 +1,70 @@
// SPDX-License-Identifier: GPL-2.0
// Check that nobody has tampered with KVM's UID
#include <errno.h>
#include <linux/arm-smccc.h>
#include <asm/kvm.h>
#include <kvm_util.h>
#include "processor.h"
/*
* Do NOT redefine these constants, or try to replace them with some
* "common" version. They are hardcoded here to detect any potential
* breakage happening in the rest of the kernel.
*
* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74
*/
#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U
#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU
#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U
#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU
static void guest_code(void)
{
struct arm_smccc_res res = {};
smccc_hvc(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0, 0, 0, 0, 0, 0, 0, &res);
__GUEST_ASSERT(res.a0 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 &&
res.a1 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 &&
res.a2 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 &&
res.a3 == ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3,
"Unexpected KVM-specific UID %lx %lx %lx %lx\n", res.a0, res.a1, res.a2, res.a3);
GUEST_DONE();
}
int main (int argc, char *argv[])
{
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
struct ucall uc;
bool guest_done = false;
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
while (!guest_done) {
vcpu_run(vcpu);
switch (get_ucall(vcpu, &uc)) {
case UCALL_SYNC:
break;
case UCALL_DONE:
guest_done = true;
break;
case UCALL_ABORT:
REPORT_GUEST_ASSERT(uc);
break;
case UCALL_PRINTF:
printf("%s", uc.buffer);
break;
default:
TEST_FAIL("Unexpected guest exit");
}
}
kvm_vm_free(vm);
return 0;
}

View File

@ -54,7 +54,7 @@ static void guest_code(void)
* Check that we advertise that ID_AA64PFR0_EL1.GIC == 0, having
* hidden the feature at runtime without any other userspace action.
*/
__GUEST_ASSERT(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC),
__GUEST_ASSERT(FIELD_GET(ID_AA64PFR0_EL1_GIC,
read_sysreg(id_aa64pfr0_el1)) == 0,
"GICv3 wrongly advertised");
@ -165,7 +165,7 @@ int main(int argc, char *argv[])
vm = vm_create_with_one_vcpu(&vcpu, NULL);
pfr0 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
__TEST_REQUIRE(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), pfr0),
__TEST_REQUIRE(FIELD_GET(ID_AA64PFR0_EL1_GIC, pfr0),
"GICv3 not supported.");
kvm_vm_free(vm);

View File

@ -95,14 +95,14 @@ static bool guest_check_lse(void)
uint64_t isar0 = read_sysreg(id_aa64isar0_el1);
uint64_t atomic;
atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC), isar0);
atomic = FIELD_GET(ID_AA64ISAR0_EL1_ATOMIC, isar0);
return atomic >= 2;
}
static bool guest_check_dc_zva(void)
{
uint64_t dczid = read_sysreg(dczid_el0);
uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_EL0_DZP), dczid);
uint64_t dzp = FIELD_GET(DCZID_EL0_DZP, dczid);
return dzp == 0;
}
@ -195,7 +195,7 @@ static bool guest_set_ha(void)
uint64_t hadbs, tcr;
/* Skip if HA is not supported. */
hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS), mmfr1);
hadbs = FIELD_GET(ID_AA64MMFR1_EL1_HAFDBS, mmfr1);
if (hadbs == 0)
return false;

View File

@ -243,6 +243,7 @@ static void guest_code(void)
GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1);
GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1);
GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1);
GUEST_REG_SYNC(SYS_ID_AA64MMFR3_EL1);
GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1);
GUEST_REG_SYNC(SYS_CTR_EL0);
GUEST_REG_SYNC(SYS_MIDR_EL1);
@ -594,8 +595,8 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu)
*/
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1));
mte = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), val);
mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val);
mte = FIELD_GET(ID_AA64PFR1_EL1_MTE, val);
mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val);
if (mte != ID_AA64PFR1_EL1_MTE_MTE2 ||
mte_frac != ID_AA64PFR1_EL1_MTE_frac_NI) {
ksft_test_result_skip("MTE_ASYNC or MTE_ASYMM are supported, nothing to test\n");
@ -612,7 +613,7 @@ static void test_user_set_mte_reg(struct kvm_vcpu *vcpu)
}
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR1_EL1));
mte_frac = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac), val);
mte_frac = FIELD_GET(ID_AA64PFR1_EL1_MTE_frac, val);
if (mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI)
ksft_test_result_pass("ID_AA64PFR1_EL1.MTE_frac=0 accepted and still 0xF\n");
else
@ -774,7 +775,7 @@ int main(void)
/* Check for AARCH64 only system */
val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1));
el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val);
el0 = FIELD_GET(ID_AA64PFR0_EL1_EL0, val);
aarch64_only = (el0 == ID_AA64PFR0_EL1_EL0_IMP);
ksft_print_header();

View File

@ -441,7 +441,7 @@ static void create_vpmu_vm(void *guest_code)
/* Make sure that PMUv3 support is indicated in the ID register */
dfr0 = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1));
pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0);
pmuver = FIELD_GET(ID_AA64DFR0_EL1_PMUVer, dfr0);
TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF &&
pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP,
"Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver);

View File

@ -573,15 +573,15 @@ void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k,
err = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd));
gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val);
gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN4, val);
*ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI,
ID_AA64MMFR0_EL1_TGRAN4_52_BIT);
gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val);
gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN64, val);
*ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI,
ID_AA64MMFR0_EL1_TGRAN64_IMP);
gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val);
gran = FIELD_GET(ID_AA64MMFR0_EL1_TGRAN16, val);
*ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI,
ID_AA64MMFR0_EL1_TGRAN16_52_BIT);