Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Paolo Bonzini:
 "As far as x86 goes this pull request "only" includes TDX host support.

  Quotes are appropriate because (at 6k lines and 100+ commits) it is
  much bigger than the rest, which will come later this week and
  consists mostly of bugfixes and selftests. s390 changes will also come
  in the second batch.

  ARM:

   - Add large stage-2 mapping (THP) support for non-protected guests
     when pKVM is enabled, clawing back some performance.

   - Enable nested virtualisation support on systems that support it,
     though it is disabled by default.

   - Add UBSAN support to the standalone EL2 object used in nVHE/hVHE
     and protected modes.

   - Large rework of the way KVM tracks architecture features and links
     them with the effects of control bits. While this has no functional
     impact, it ensures correctness of emulation (the data is
     automatically extracted from the published JSON files), and helps
     dealing with the evolution of the architecture.

   - Significant changes to the way pKVM tracks ownership of pages,
     avoiding page table walks by storing the state in the hypervisor's
     vmemmap. This in turn enables the THP support described above.

   - New selftest checking the pKVM ownership transition rules

   - Fixes for FEAT_MTE_ASYNC being accidentally advertised to guests
     even if the host didn't have it.

   - Fixes for the address translation emulation, which happened to be
     rather buggy in some specific contexts.

   - Fixes for the PMU emulation in NV contexts, decoupling PMCR_EL0.N
     from the number of counters exposed to a guest and addressing a
     number of issues in the process.

   - Add a new selftest for the SVE host state being corrupted by a
     guest.

   - Keep HCR_EL2.xMO set at all times for systems running with the
     kernel at EL2, ensuring that the window for interrupts is slightly
     bigger, and avoiding a pretty bad erratum on the AmpereOne HW.

   - Add workaround for AmpereOne's erratum AC04_CPU_23, which suffers
     from a pretty bad case of TLB corruption unless accesses to HCR_EL2
     are heavily synchronised.

   - Add a per-VM, per-ITS debugfs entry to dump the state of the ITS
     tables in a human-friendly fashion.

   - and the usual random cleanups.

  LoongArch:

   - Don't flush tlb if the host supports hardware page table walks.

   - Add KVM selftests support.

  RISC-V:

   - Add vector registers to get-reg-list selftest

   - VCPU reset related improvements

   - Remove scounteren initialization from VCPU reset

   - Support VCPU reset from userspace using set_mpstate() ioctl

  x86:

   - Initial support for TDX in KVM.

     This finally makes it possible to use the TDX module to run
     confidential guests on Intel processors. This is quite a large
     series, including support for private page tables (managed by the
     TDX module and mirrored in KVM for efficiency), forwarding some
     TDVMCALLs to userspace, and handling several special VM exits from
     the TDX module.

     This has been in the works for literally years and it's not really
     possible to describe everything here, so I'll defer to the various
     merge commits up to and including commit 7bcf7246c4 ('Merge
     branch 'kvm-tdx-finish-initial' into HEAD')"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (248 commits)
  x86/tdx: mark tdh_vp_enter() as __flatten
  Documentation: virt/kvm: remove unreferenced footnote
  RISC-V: KVM: lock the correct mp_state during reset
  KVM: arm64: Fix documentation for vgic_its_iter_next()
  KVM: arm64: np-guest CMOs with PMD_SIZE fixmap
  KVM: arm64: Stage-2 huge mappings for np-guests
  KVM: arm64: Add a range to pkvm_mappings
  KVM: arm64: Convert pkvm_mappings to interval tree
  KVM: arm64: Add a range to __pkvm_host_test_clear_young_guest()
  KVM: arm64: Add a range to __pkvm_host_wrprotect_guest()
  KVM: arm64: Add a range to __pkvm_host_unshare_guest()
  KVM: arm64: Add a range to __pkvm_host_share_guest()
  KVM: arm64: Introduce for_each_hyp_page
  KVM: arm64: Handle huge mappings for np-guest CMOs
  KVM: arm64: nv: Release faulted-in VNCR page from mmu_lock critical section
  KVM: arm64: nv: Handle TLBI S1E2 for VNCR invalidation with mmu_lock held
  KVM: arm64: nv: Hold mmu_lock when invalidating VNCR SW-TLB before translating
  RISC-V: KVM: add KVM_CAP_RISCV_MP_STATE_RESET
  RISC-V: KVM: Remove scounteren initialization
  KVM: RISC-V: remove unnecessary SBI reset state
  ...
Linus Torvalds 2025-05-29 08:10:01 -07:00
commit 43db111107
158 changed files with 13224 additions and 1989 deletions


@ -57,6 +57,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| Ampere         | AmpereOne AC04  | AC04_CPU_10     | AMPERE_ERRATUM_AC03_CPU_38  |
+----------------+-----------------+-----------------+-----------------------------+
| Ampere         | AmpereOne AC04  | AC04_CPU_23     | AMPERE_ERRATUM_AC04_CPU_23  |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| ARM            | Cortex-A510     | #2457168        | ARM64_ERRATUM_2457168       |
+----------------+-----------------+-----------------+-----------------------------+


@ -1411,6 +1411,9 @@ the memory region are automatically reflected into the guest. For example, an
mmap() that affects the region will be made visible immediately. Another
example is madvise(MADV_DROP).
For a TDX guest, deleting or moving a memory region loses the guest memory contents.
Read-only regions aren't supported. Only as-id 0 is supported.
Note: On arm64, a write generated by the page-table walker (to update
the Access and Dirty flags, for example) never results in a
KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This
@ -3460,7 +3463,8 @@ The initial values are defined as:
- FPSIMD/NEON registers: set to 0
- SVE registers: set to 0
- System registers: Reset to their architecturally defined
values as for a warm reset to EL1 (resp. SVC)
values as for a warm reset to EL1 (resp. SVC) or EL2 (in the
case of EL2 being enabled).
Note that because some registers reflect machine topology, all vcpus
should be created before this ioctl is invoked.
@ -3527,6 +3531,17 @@ Possible features:
- the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can
no longer be written using KVM_SET_ONE_REG.
- KVM_ARM_VCPU_HAS_EL2: Enable Nested Virtualisation support,
booting the guest from EL2 instead of EL1.
Depends on KVM_CAP_ARM_EL2.
The VM is running with HCR_EL2.E2H being RES1 (VHE) unless
KVM_ARM_VCPU_HAS_EL2_E2H0 is also set.
- KVM_ARM_VCPU_HAS_EL2_E2H0: Restrict Nested Virtualisation
support to HCR_EL2.E2H being RES0 (non-VHE).
Depends on KVM_CAP_ARM_EL2_E2H0.
KVM_ARM_VCPU_HAS_EL2 must also be set.
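A minimal userspace sketch of opting in to EL2 (illustrative only, not part of
the original ABI text; it assumes the KVM_CAP_ARM_EL2 and KVM_ARM_VCPU_HAS_EL2
definitions above are exported through <linux/kvm.h>)::

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Create a vCPU that boots at EL2 (VHE layout unless ..._E2H0 is also set). */
static int vcpu_init_el2(int vm_fd, int vcpu_fd)
{
        struct kvm_vcpu_init init;

        if (!ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_EL2))
                return -1;                      /* nested virt not supported */

        memset(&init, 0, sizeof(init));
        if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init))
                return -1;
        init.features[0] |= 1U << KVM_ARM_VCPU_HAS_EL2;

        return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
}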
4.83 KVM_ARM_PREFERRED_TARGET
-----------------------------
@ -4768,7 +4783,7 @@ H_GET_CPU_CHARACTERISTICS hypercall.
:Capability: basic
:Architectures: x86
:Type: vm
:Type: vm ioctl, vcpu ioctl
:Parameters: an opaque platform specific structure (in/out)
:Returns: 0 on success; -1 on error
@ -4776,9 +4791,11 @@ If the platform supports creating encrypted VMs then this ioctl can be used
for issuing platform-specific memory encryption commands to manage those
encrypted VMs.
Currently, this ioctl is used for issuing Secure Encrypted Virtualization
(SEV) commands on AMD Processors. The SEV commands are defined in
Documentation/virt/kvm/x86/amd-memory-encryption.rst.
Currently, this ioctl is used for issuing both Secure Encrypted Virtualization
(SEV) commands on AMD Processors and Trusted Domain Extensions (TDX) commands
on Intel Processors. The detailed commands are defined in
Documentation/virt/kvm/x86/amd-memory-encryption.rst and
Documentation/virt/kvm/x86/intel-tdx.rst.
4.111 KVM_MEMORY_ENCRYPT_REG_REGION
-----------------------------------
@ -6827,6 +6844,7 @@ should put the acknowledged interrupt vector into the 'epr' field.
#define KVM_SYSTEM_EVENT_WAKEUP 4
#define KVM_SYSTEM_EVENT_SUSPEND 5
#define KVM_SYSTEM_EVENT_SEV_TERM 6
#define KVM_SYSTEM_EVENT_TDX_FATAL 7
__u32 type;
__u32 ndata;
__u64 data[16];
@ -6853,6 +6871,11 @@ Valid values for 'type' are:
reset/shutdown of the VM.
- KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination.
The guest physical address of the guest's GHCB is stored in `data[0]`.
- KVM_SYSTEM_EVENT_TDX_FATAL -- a TDX guest reported a fatal error state.
KVM doesn't do any parsing or conversion; it just dumps 16 general-purpose
registers to userspace, in ascending order of the 4-bit indices for x86-64
general-purpose registers in instruction encoding, as defined in the Intel
SDM.
- KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and
KVM has recognized a wakeup event. Userspace may honor this event by
marking the exiting vCPU as runnable, or deny it and call KVM_RUN again.
@ -8194,6 +8217,28 @@ KVM_X86_QUIRK_STUFF_FEATURE_MSRS By default, at vCPU creation, KVM sets the
and 0x489), as KVM does now allow them to
be set by userspace (KVM sets them based on
guest CPUID, for safety purposes).
KVM_X86_QUIRK_IGNORE_GUEST_PAT By default, on Intel platforms, KVM ignores
guest PAT and forces the effective memory
type to WB in EPT. The quirk is not available
on Intel platforms which are incapable of
safely honoring guest PAT (i.e., without CPU
self-snoop, KVM always ignores guest PAT and
forces effective memory type to WB). It is
also ignored on AMD platforms or, on Intel,
when a VM has non-coherent DMA devices
assigned; KVM always honors guest PAT in
such case. The quirk is needed to avoid
slowdowns on certain Intel Xeon platforms
(e.g. ICX, SPR) where self-snoop feature is
supported but UC is slow enough to cause
issues with some older guests that use
UC instead of WC to map the video RAM.
Userspace can disable the quirk to honor
guest PAT if it knows that there is no such
guest software, for example if it does not
expose a bochs graphics device (which is
known to have had a buggy driver).
=================================== ============================================
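Userspace can turn the quirk off with the existing KVM_CAP_DISABLE_QUIRKS2
mechanism. A minimal sketch (illustrative only; it assumes the
KVM_X86_QUIRK_IGNORE_GUEST_PAT bit above is exported through <linux/kvm.h>)::

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Disable the quirk so that KVM honors guest PAT (VM-scoped, set up early). */
static int disable_ignore_guest_pat(int vm_fd)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_DISABLE_QUIRKS2;
        cap.args[0] = KVM_X86_QUIRK_IGNORE_GUEST_PAT;   /* bitmask of quirks to disable */

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}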
7.32 KVM_CAP_MAX_VCPU_ID
@ -8496,6 +8541,17 @@ aforementioned registers before the first KVM_RUN. These registers are VM
scoped, meaning that the same set of values are presented on all vCPUs in a
given VM.
7.43 KVM_CAP_RISCV_MP_STATE_RESET
---------------------------------
:Architectures: riscv
:Type: VM
:Parameters: None
:Returns: 0 on success, -EINVAL if arg[0] is not zero
When this capability is enabled, KVM resets the VCPU when MP_STATE_INIT_RECEIVED
is set via the KVM_SET_MP_STATE ioctl. The original MP_STATE is preserved.
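An illustrative sketch of the flow (assuming the capability and mp_state
constants are available through <linux/kvm.h>): enable the capability once on
the VM, then reset a vCPU from userspace by setting its MP state::

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int riscv_userspace_reset(int vm_fd, int vcpu_fd)
{
        struct kvm_enable_cap cap;
        struct kvm_mp_state mp_state;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_RISCV_MP_STATE_RESET;         /* VM-scoped, args must be 0 */
        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
                return -1;

        memset(&mp_state, 0, sizeof(mp_state));
        mp_state.mp_state = KVM_MP_STATE_INIT_RECEIVED; /* triggers the vCPU reset */
        return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp_state);
}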
8. Other capabilities.
======================


@ -137,6 +137,30 @@ exit_reason = KVM_EXIT_FAIL_ENTRY and populate the fail_entry struct by setting
hardware_entry_failure_reason field to KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED and
the cpu field to the processor id.
1.5 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS
--------------------------------------------------
:Parameters: in kvm_device_attr.addr the address to an unsigned int
representing the maximum value taken by PMCR_EL0.N
:Returns:
======= ====================================================
-EBUSY  PMUv3 already initialized, a VCPU has already run or
        an event filter has already been set
-EFAULT Error accessing the value pointed to by addr
-ENODEV PMUv3 not supported or GIC not initialized
-EINVAL No PMUv3 explicitly selected, or value of N out of
        range
======= ====================================================
Set the number of implemented event counters in the virtual PMU. This
mandates that a PMU has explicitly been selected via
KVM_ARM_VCPU_PMU_V3_SET_PMU, and will fail when no PMU has been
explicitly selected, or the number of counters is out of range for the
selected PMU. Selecting a new PMU cancels the effect of setting this
attribute.
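For illustration only (a sketch, assuming the attribute is reachable through
the usual vcpu device-attribute path), the attribute is set like any other
KVM_ARM_VCPU_PMU_V3_CTRL attribute::

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Limit the guest PMU to 'n' event counters, i.e. the value of PMCR_EL0.N. */
static int set_pmu_nr_counters(int vcpu_fd, unsigned int n)
{
        struct kvm_device_attr attr = {
                .group = KVM_ARM_VCPU_PMU_V3_CTRL,
                .attr  = KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS,
                .addr  = (__u64)(unsigned long)&n,
        };

        return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}

This must be done after selecting a PMU with KVM_ARM_VCPU_PMU_V3_SET_PMU and
before the vCPU first runs, per the error conditions listed above.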
2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
=================================


@ -11,6 +11,7 @@ KVM for x86 systems
cpuid
errata
hypercalls
intel-tdx
mmu
msr
nested-vmx


@ -0,0 +1,255 @@
.. SPDX-License-Identifier: GPL-2.0
===================================
Intel Trust Domain Extensions (TDX)
===================================
Overview
========
Intel's Trust Domain Extensions (TDX) protect confidential guest VMs from the
host and from physical attacks. A CPU-attested software module called 'the TDX
module' runs inside a new CPU-isolated range and provides the functionality to
manage and run protected VMs, a.k.a. TDX guests or TDs.
Please refer to [1] for the whitepaper, specifications and other resources.
This documentation describes the TDX-specific KVM ABIs. The TDX module needs to
be initialized before KVM can use it to run any TDX guests. The host core kernel
provides the support for initializing the TDX module, which is described in
Documentation/arch/x86/tdx.rst.
API description
===============
KVM_MEMORY_ENCRYPT_OP
---------------------
:Type: vm ioctl, vcpu ioctl
For TDX operations, KVM_MEMORY_ENCRYPT_OP is re-purposed as a generic ioctl
with TDX-specific sub-ioctl() commands.
::
/* Trust Domain Extensions sub-ioctl() commands. */
enum kvm_tdx_cmd_id {
KVM_TDX_CAPABILITIES = 0,
KVM_TDX_INIT_VM,
KVM_TDX_INIT_VCPU,
KVM_TDX_INIT_MEM_REGION,
KVM_TDX_FINALIZE_VM,
KVM_TDX_GET_CPUID,
KVM_TDX_CMD_NR_MAX,
};
struct kvm_tdx_cmd {
/* enum kvm_tdx_cmd_id */
__u32 id;
/* flags for sub-command. If sub-command doesn't use this, set zero. */
__u32 flags;
/*
* data for each sub-command. An immediate or a pointer to the actual
* data in process virtual address. If sub-command doesn't use it,
* set zero.
*/
__u64 data;
/*
* Auxiliary error code. The sub-command may return TDX SEAMCALL
* status code in addition to -Exxx.
*/
__u64 hw_error;
};
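As an illustration (a sketch, not part of the ABI definition), a sub-command
is issued by filling struct kvm_tdx_cmd and calling KVM_MEMORY_ENCRYPT_OP on
the relevant VM or vCPU file descriptor; the helper below is reused by the
later sketches in this document::

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int tdx_cmd(int fd, __u32 id, __u32 flags, __u64 data, __u64 *hw_error)
{
        struct kvm_tdx_cmd cmd;
        int ret;

        memset(&cmd, 0, sizeof(cmd));
        cmd.id    = id;
        cmd.flags = flags;
        cmd.data  = data;                       /* immediate or pointer, per sub-command */

        ret = ioctl(fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
        if (hw_error)
                *hw_error = cmd.hw_error;       /* TDX SEAMCALL status, if any */
        return ret;
}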
KVM_TDX_CAPABILITIES
--------------------
:Type: vm ioctl
:Returns: 0 on success, <0 on error
Return the TDX capabilities that the current KVM supports with the specific TDX
module loaded in the system. It reports which features/capabilities are allowed
to be configured for the TDX guest.
- id: KVM_TDX_CAPABILITIES
- flags: must be 0
- data: pointer to struct kvm_tdx_capabilities
- hw_error: must be 0
::
struct kvm_tdx_capabilities {
__u64 supported_attrs;
__u64 supported_xfam;
__u64 reserved[254];
/* Configurable CPUID bits for userspace */
struct kvm_cpuid2 cpuid;
};
KVM_TDX_INIT_VM
---------------
:Type: vm ioctl
:Returns: 0 on success, <0 on error
Perform TDX specific VM initialization. This needs to be called after
KVM_CREATE_VM and before creating any VCPUs.
- id: KVM_TDX_INIT_VM
- flags: must be 0
- data: pointer to struct kvm_tdx_init_vm
- hw_error: must be 0
::
struct kvm_tdx_init_vm {
__u64 attributes;
__u64 xfam;
__u64 mrconfigid[6]; /* sha384 digest */
__u64 mrowner[6]; /* sha384 digest */
__u64 mrownerconfig[6]; /* sha384 digest */
/* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
__u64 reserved[12];
/*
* Call KVM_TDX_INIT_VM before vcpu creation, thus before
* KVM_SET_CPUID2.
* This configuration supersedes KVM_SET_CPUID2 for the VCPUs because the
* TDX module directly virtualizes those CPUIDs without the VMM. The
* userspace VMM, e.g. QEMU, should make KVM_SET_CPUID2 consistent with
* these values. If it doesn't, KVM may have a wrong idea of the guest's
* CPUIDs, and may wrongly emulate CPUIDs or MSRs that the TDX module
* doesn't virtualize.
*/
struct kvm_cpuid2 cpuid;
};
KVM_TDX_INIT_VCPU
-----------------
:Type: vcpu ioctl
:Returns: 0 on success, <0 on error
Perform TDX specific VCPU initialization.
- id: KVM_TDX_INIT_VCPU
- flags: must be 0
- data: initial value of the guest TD VCPU RCX
- hw_error: must be 0
KVM_TDX_INIT_MEM_REGION
-----------------------
:Type: vcpu ioctl
:Returns: 0 on success, <0 on error
Initialize @nr_pages of TDX guest private memory starting at @gpa with
userspace-provided data from @source_addr.
Note that before calling this sub-command, the memory attribute of the range
[gpa, gpa + nr_pages] needs to be private. Userspace can use
KVM_SET_MEMORY_ATTRIBUTES to set the attribute.
If KVM_TDX_MEASURE_MEMORY_REGION flag is specified, it also extends measurement.
- id: KVM_TDX_INIT_MEM_REGION
- flags: currently only KVM_TDX_MEASURE_MEMORY_REGION is defined
- data: pointer to struct kvm_tdx_init_mem_region
- hw_error: must be 0
::
#define KVM_TDX_MEASURE_MEMORY_REGION (1UL << 0)
struct kvm_tdx_init_mem_region {
__u64 source_addr;
__u64 gpa;
__u64 nr_pages;
};
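An illustrative sketch (reusing the hypothetical tdx_cmd() helper above):
load and measure one chunk of initial TD memory::

static int tdx_load_initial_memory(int vcpu_fd, __u64 source_addr, __u64 gpa,
                                   __u64 nr_pages)
{
        struct kvm_tdx_init_mem_region region = {
                .source_addr = source_addr,     /* userspace buffer with the payload */
                .gpa         = gpa,             /* range must already be private */
                .nr_pages    = nr_pages,
        };

        return tdx_cmd(vcpu_fd, KVM_TDX_INIT_MEM_REGION,
                       KVM_TDX_MEASURE_MEMORY_REGION,
                       (__u64)(unsigned long)&region, NULL);
}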
KVM_TDX_FINALIZE_VM
-------------------
:Type: vm ioctl
:Returns: 0 on success, <0 on error
Complete measurement of the initial TD contents and mark it ready to run.
- id: KVM_TDX_FINALIZE_VM
- flags: must be 0
- data: must be 0
- hw_error: must be 0
KVM_TDX_GET_CPUID
-----------------
:Type: vcpu ioctl
:Returns: 0 on success, <0 on error
Get the CPUID values that the TDX module virtualizes for the TD guest.
When it returns -E2BIG, the user space should allocate a larger buffer and
retry. The minimum buffer size is updated in the nent field of the
struct kvm_cpuid2.
- id: KVM_TDX_GET_CPUID
- flags: must be 0
- data: pointer to struct kvm_cpuid2 (in/out)
- hw_error: must be 0 (out)
::
struct kvm_cpuid2 {
__u32 nent;
__u32 padding;
struct kvm_cpuid_entry2 entries[0];
};
struct kvm_cpuid_entry2 {
__u32 function;
__u32 index;
__u32 flags;
__u32 eax;
__u32 ebx;
__u32 ecx;
__u32 edx;
__u32 padding[3];
};
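An illustrative sketch of the -E2BIG retry pattern described above, again
reusing the hypothetical tdx_cmd() helper::

#include <errno.h>
#include <stdlib.h>

static struct kvm_cpuid2 *tdx_get_cpuid(int vcpu_fd)
{
        __u32 nent = 64;                        /* arbitrary initial guess */

        for (;;) {
                struct kvm_cpuid2 *cpuid;

                cpuid = calloc(1, sizeof(*cpuid) +
                                  nent * sizeof(struct kvm_cpuid_entry2));
                if (!cpuid)
                        return NULL;
                cpuid->nent = nent;

                if (!tdx_cmd(vcpu_fd, KVM_TDX_GET_CPUID, 0,
                             (__u64)(unsigned long)cpuid, NULL))
                        return cpuid;           /* success */

                if (errno != E2BIG) {
                        free(cpuid);
                        return NULL;
                }
                nent = cpuid->nent;             /* minimum size reported by KVM */
                free(cpuid);
        }
}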
KVM TDX creation flow
=====================
In addition to the standard KVM flow, new TDX ioctls need to be called. The
control flow is as follows:
#. Check system wide capability
* KVM_CAP_VM_TYPES: Check if VM type is supported and if KVM_X86_TDX_VM
is supported.
#. Create VM
* KVM_CREATE_VM
* KVM_TDX_CAPABILITIES: Query TDX capabilities for creating TDX guests.
* KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPUS): Query maximum VCPUs the TD can
support at VM level (TDX has its own limitation on this).
* KVM_SET_TSC_KHZ: Configure the TD's TSC frequency if a TSC frequency different
from the host's is desired. This is optional.
* KVM_TDX_INIT_VM: Pass TDX specific VM parameters.
#. Create VCPU
* KVM_CREATE_VCPU
* KVM_TDX_INIT_VCPU: Pass TDX specific VCPU parameters.
* KVM_SET_CPUID2: Configure TD's CPUIDs.
* KVM_SET_MSRS: Configure TD's MSRs.
#. Initialize initial guest memory
* Prepare content of initial guest memory.
* KVM_TDX_INIT_MEM_REGION: Add initial guest memory.
* KVM_TDX_FINALIZE_VM: Finalize the measurement of the TDX guest.
#. Run VCPU
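The same ordering, condensed into an illustrative sketch (error paths trimmed;
tdx_cmd() is the hypothetical helper sketched earlier, and the memory
population step is only outlined in comments)::

static int tdx_create_td(int kvm_fd, struct kvm_tdx_init_vm *vm_params,
                         struct kvm_cpuid2 *cpuid)
{
        int vm_fd, vcpu_fd;

        /* Create a TDX VM and pass the TDX-specific, VM-scoped parameters. */
        vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_TDX_VM);
        if (vm_fd < 0)
                return -1;
        if (tdx_cmd(vm_fd, KVM_TDX_INIT_VM, 0,
                    (__u64)(unsigned long)vm_params, NULL))
                return -1;

        /* Create and initialize the vCPU, then configure its CPUID. */
        vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
        if (vcpu_fd < 0)
                return -1;
        if (tdx_cmd(vcpu_fd, KVM_TDX_INIT_VCPU, 0, 0, NULL))
                return -1;
        if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid))
                return -1;

        /*
         * Initial memory (not shown): register the memslots, mark the
         * ranges private with KVM_SET_MEMORY_ATTRIBUTES, then load the
         * payload with KVM_TDX_INIT_MEM_REGION on the vCPU fd.
         */

        /* Seal the measurement; the TD can now be run with KVM_RUN. */
        return tdx_cmd(vm_fd, KVM_TDX_FINALIZE_VM, 0, 0, NULL);
}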
References
==========
https://www.intel.com/content/www/us/en/developer/tools/trust-domain-extensions/documentation.html


@ -13148,6 +13148,8 @@ F: Documentation/virt/kvm/loongarch/
F: arch/loongarch/include/asm/kvm*
F: arch/loongarch/include/uapi/asm/kvm*
F: arch/loongarch/kvm/
F: tools/testing/selftests/kvm/*/loongarch/
F: tools/testing/selftests/kvm/lib/loongarch/
KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips)
M: Huacai Chen <chenhuacai@kernel.org>


@ -464,6 +464,23 @@ config AMPERE_ERRATUM_AC03_CPU_38
If unsure, say Y.
config AMPERE_ERRATUM_AC04_CPU_23
bool "AmpereOne: AC04_CPU_23: Failure to synchronize writes to HCR_EL2 may corrupt address translations."
default y
help
This option adds an alternative code sequence to work around Ampere
erratum AC04_CPU_23 on AmpereOne.
Updates to HCR_EL2 can rarely corrupt simultaneous translations for
data addresses initiated by load/store instructions. Only
instruction initiated translations are vulnerable, not translations
from prefetches for example. A DSB before the store to HCR_EL2 is
sufficient to prevent older instructions from hitting the window
for corruption, and an ISB after is sufficient to prevent younger
instructions from hitting the window for corruption.
If unsure, say Y.
config ARM64_WORKAROUND_CLEAN_CACHE
bool


@ -38,7 +38,7 @@
orr x0, x0, #HCR_E2H
.LnVHE_\@:
msr hcr_el2, x0
msr_hcr_el2 x0
isb
.endm
@ -215,8 +215,8 @@
cbz x1, .Lskip_sme_fgt_\@
/* Disable nVHE traps of TPIDR2 and SMPRI */
orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK
orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK
orr x0, x0, #HFGRTR_EL2_nSMPRI_EL1_MASK
orr x0, x0, #HFGRTR_EL2_nTPIDR2_EL0_MASK
.Lskip_sme_fgt_\@:
mrs_s x1, SYS_ID_AA64MMFR3_EL1
@ -224,8 +224,8 @@
cbz x1, .Lskip_pie_fgt_\@
/* Disable trapping of PIR_EL1 / PIRE0_EL1 */
orr x0, x0, #HFGxTR_EL2_nPIR_EL1
orr x0, x0, #HFGxTR_EL2_nPIRE0_EL1
orr x0, x0, #HFGRTR_EL2_nPIR_EL1
orr x0, x0, #HFGRTR_EL2_nPIRE0_EL1
.Lskip_pie_fgt_\@:
mrs_s x1, SYS_ID_AA64MMFR3_EL1
@ -233,7 +233,7 @@
cbz x1, .Lskip_poe_fgt_\@
/* Disable trapping of POR_EL0 */
orr x0, x0, #HFGxTR_EL2_nPOR_EL0
orr x0, x0, #HFGRTR_EL2_nPOR_EL0
.Lskip_poe_fgt_\@:
/* GCS depends on PIE so we don't check it if PIE is absent */
@ -242,8 +242,8 @@
cbz x1, .Lskip_gce_fgt_\@
/* Disable traps of access to GCS registers at EL0 and EL1 */
orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK
orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK
orr x0, x0, #HFGRTR_EL2_nGCS_EL1_MASK
orr x0, x0, #HFGRTR_EL2_nGCS_EL0_MASK
.Lskip_gce_fgt_\@:


@ -20,7 +20,8 @@
#define ESR_ELx_EC_FP_ASIMD UL(0x07)
#define ESR_ELx_EC_CP10_ID UL(0x08) /* EL2 only */
#define ESR_ELx_EC_PAC UL(0x09) /* EL2 and above */
/* Unallocated EC: 0x0A - 0x0B */
#define ESR_ELx_EC_OTHER UL(0x0A)
/* Unallocated EC: 0x0B */
#define ESR_ELx_EC_CP14_64 UL(0x0C)
#define ESR_ELx_EC_BTI UL(0x0D)
#define ESR_ELx_EC_ILL UL(0x0E)
@ -99,6 +100,8 @@
#define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT)
/* Shared ISS field definitions for Data/Instruction aborts */
#define ESR_ELx_VNCR_SHIFT (13)
#define ESR_ELx_VNCR (UL(1) << ESR_ELx_VNCR_SHIFT)
#define ESR_ELx_SET_SHIFT (11)
#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT)
#define ESR_ELx_FnV_SHIFT (10)
@ -181,6 +184,13 @@
#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0)
#define ESR_ELx_xVC_IMM_MASK ((UL(1) << 16) - 1)
/* ISS definitions for LD64B/ST64B/{T,P}SBCSYNC instructions */
#define ESR_ELx_ISS_OTHER_ST64BV (0)
#define ESR_ELx_ISS_OTHER_ST64BV0 (1)
#define ESR_ELx_ISS_OTHER_LDST64B (2)
#define ESR_ELx_ISS_OTHER_TSBCSYNC (3)
#define ESR_ELx_ISS_OTHER_PSBCSYNC (4)
#define DISR_EL1_IDS (UL(1) << 24)
/*
* DISR_EL1 and ESR_ELx share the bottom 13 bits, but the RES0 bits may mean
@ -442,6 +452,11 @@ static inline bool esr_is_cfi_brk(unsigned long esr)
(esr_brk_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE;
}
static inline bool esr_is_ubsan_brk(unsigned long esr)
{
return (esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM;
}
static inline bool esr_fsc_is_translation_fault(unsigned long esr)
{
esr = esr & ESR_ELx_FSC;


@ -48,6 +48,12 @@ enum fixed_addresses {
FIX_EARLYCON_MEM_BASE,
FIX_TEXT_POKE0,
#ifdef CONFIG_KVM
/* One slot per CPU, mapping the guest's VNCR page at EL2. */
FIX_VNCR_END,
FIX_VNCR = FIX_VNCR_END + NR_CPUS,
#endif
#ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
FIX_APEI_GHES_IRQ,


@ -41,7 +41,7 @@ do { \
\
___hcr = read_sysreg(hcr_el2); \
if (!(___hcr & HCR_TGE)) { \
write_sysreg(___hcr | HCR_TGE, hcr_el2); \
write_sysreg_hcr(___hcr | HCR_TGE); \
isb(); \
} \
/* \
@ -82,7 +82,7 @@ do { \
*/ \
barrier(); \
if (!___ctx->cnt && !(___hcr & HCR_TGE)) \
write_sysreg(___hcr, hcr_el2); \
write_sysreg_hcr(___hcr); \
} while (0)
static inline void ack_bad_irq(unsigned int irq)


@ -12,67 +12,70 @@
#include <asm/sysreg.h>
#include <asm/types.h>
/* Hyp Configuration Register (HCR) bits */
/*
* Because I'm terribly lazy and that repainting the whole of the KVM
* code with the proper names is a pain, use a helper to map the names
* inherited from AArch32 with the new fancy nomenclature. One day...
*/
#define __HCR(x) HCR_EL2_##x
#define HCR_TID5 (UL(1) << 58)
#define HCR_DCT (UL(1) << 57)
#define HCR_ATA_SHIFT 56
#define HCR_ATA (UL(1) << HCR_ATA_SHIFT)
#define HCR_TTLBOS (UL(1) << 55)
#define HCR_TTLBIS (UL(1) << 54)
#define HCR_ENSCXT (UL(1) << 53)
#define HCR_TOCU (UL(1) << 52)
#define HCR_AMVOFFEN (UL(1) << 51)
#define HCR_TICAB (UL(1) << 50)
#define HCR_TID4 (UL(1) << 49)
#define HCR_FIEN (UL(1) << 47)
#define HCR_FWB (UL(1) << 46)
#define HCR_NV2 (UL(1) << 45)
#define HCR_AT (UL(1) << 44)
#define HCR_NV1 (UL(1) << 43)
#define HCR_NV (UL(1) << 42)
#define HCR_API (UL(1) << 41)
#define HCR_APK (UL(1) << 40)
#define HCR_TEA (UL(1) << 37)
#define HCR_TERR (UL(1) << 36)
#define HCR_TLOR (UL(1) << 35)
#define HCR_E2H (UL(1) << 34)
#define HCR_ID (UL(1) << 33)
#define HCR_CD (UL(1) << 32)
#define HCR_RW_SHIFT 31
#define HCR_RW (UL(1) << HCR_RW_SHIFT)
#define HCR_TRVM (UL(1) << 30)
#define HCR_HCD (UL(1) << 29)
#define HCR_TDZ (UL(1) << 28)
#define HCR_TGE (UL(1) << 27)
#define HCR_TVM (UL(1) << 26)
#define HCR_TTLB (UL(1) << 25)
#define HCR_TPU (UL(1) << 24)
#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */
#define HCR_TSW (UL(1) << 22)
#define HCR_TACR (UL(1) << 21)
#define HCR_TIDCP (UL(1) << 20)
#define HCR_TSC (UL(1) << 19)
#define HCR_TID3 (UL(1) << 18)
#define HCR_TID2 (UL(1) << 17)
#define HCR_TID1 (UL(1) << 16)
#define HCR_TID0 (UL(1) << 15)
#define HCR_TWE (UL(1) << 14)
#define HCR_TWI (UL(1) << 13)
#define HCR_DC (UL(1) << 12)
#define HCR_BSU (3 << 10)
#define HCR_BSU_IS (UL(1) << 10)
#define HCR_FB (UL(1) << 9)
#define HCR_VSE (UL(1) << 8)
#define HCR_VI (UL(1) << 7)
#define HCR_VF (UL(1) << 6)
#define HCR_AMO (UL(1) << 5)
#define HCR_IMO (UL(1) << 4)
#define HCR_FMO (UL(1) << 3)
#define HCR_PTW (UL(1) << 2)
#define HCR_SWIO (UL(1) << 1)
#define HCR_VM (UL(1) << 0)
#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39))
#define HCR_TID5 __HCR(TID5)
#define HCR_DCT __HCR(DCT)
#define HCR_ATA_SHIFT __HCR(ATA_SHIFT)
#define HCR_ATA __HCR(ATA)
#define HCR_TTLBOS __HCR(TTLBOS)
#define HCR_TTLBIS __HCR(TTLBIS)
#define HCR_ENSCXT __HCR(EnSCXT)
#define HCR_TOCU __HCR(TOCU)
#define HCR_AMVOFFEN __HCR(AMVOFFEN)
#define HCR_TICAB __HCR(TICAB)
#define HCR_TID4 __HCR(TID4)
#define HCR_FIEN __HCR(FIEN)
#define HCR_FWB __HCR(FWB)
#define HCR_NV2 __HCR(NV2)
#define HCR_AT __HCR(AT)
#define HCR_NV1 __HCR(NV1)
#define HCR_NV __HCR(NV)
#define HCR_API __HCR(API)
#define HCR_APK __HCR(APK)
#define HCR_TEA __HCR(TEA)
#define HCR_TERR __HCR(TERR)
#define HCR_TLOR __HCR(TLOR)
#define HCR_E2H __HCR(E2H)
#define HCR_ID __HCR(ID)
#define HCR_CD __HCR(CD)
#define HCR_RW __HCR(RW)
#define HCR_TRVM __HCR(TRVM)
#define HCR_HCD __HCR(HCD)
#define HCR_TDZ __HCR(TDZ)
#define HCR_TGE __HCR(TGE)
#define HCR_TVM __HCR(TVM)
#define HCR_TTLB __HCR(TTLB)
#define HCR_TPU __HCR(TPU)
#define HCR_TPC __HCR(TPCP)
#define HCR_TSW __HCR(TSW)
#define HCR_TACR __HCR(TACR)
#define HCR_TIDCP __HCR(TIDCP)
#define HCR_TSC __HCR(TSC)
#define HCR_TID3 __HCR(TID3)
#define HCR_TID2 __HCR(TID2)
#define HCR_TID1 __HCR(TID1)
#define HCR_TID0 __HCR(TID0)
#define HCR_TWE __HCR(TWE)
#define HCR_TWI __HCR(TWI)
#define HCR_DC __HCR(DC)
#define HCR_BSU __HCR(BSU)
#define HCR_BSU_IS __HCR(BSU_IS)
#define HCR_FB __HCR(FB)
#define HCR_VSE __HCR(VSE)
#define HCR_VI __HCR(VI)
#define HCR_VF __HCR(VF)
#define HCR_AMO __HCR(AMO)
#define HCR_IMO __HCR(IMO)
#define HCR_FMO __HCR(FMO)
#define HCR_PTW __HCR(PTW)
#define HCR_SWIO __HCR(SWIO)
#define HCR_VM __HCR(VM)
/*
* The bits we set in HCR:
@ -312,56 +315,19 @@
GENMASK(15, 0))
/*
* FGT register definitions
*
* RES0 and polarity masks as of DDI0487J.a, to be updated as needed.
* We're not using the generated masks as they are usually ahead of
* the published ARM ARM, which we use as a reference.
*
* Once we get to a point where the two describe the same thing, we'll
* merge the definitions. One day.
* Polarity masks for HCRX_EL2, limited to the bits that we know about
* at this point in time. It doesn't mean that we actually *handle*
* them, but that at least those that are not advertised to a guest
* will be RES0 for that guest.
*/
#define __HFGRTR_EL2_RES0 HFGxTR_EL2_RES0
#define __HFGRTR_EL2_MASK GENMASK(49, 0)
#define __HFGRTR_EL2_nMASK ~(__HFGRTR_EL2_RES0 | __HFGRTR_EL2_MASK)
/*
* The HFGWTR bits are a subset of HFGRTR bits. To ensure we don't miss any
* future additions, define __HFGWTR* macros relative to __HFGRTR* ones.
*/
#define __HFGRTR_ONLY_MASK (BIT(46) | BIT(42) | BIT(40) | BIT(28) | \
GENMASK(26, 25) | BIT(21) | BIT(18) | \
GENMASK(15, 14) | GENMASK(10, 9) | BIT(2))
#define __HFGWTR_EL2_RES0 (__HFGRTR_EL2_RES0 | __HFGRTR_ONLY_MASK)
#define __HFGWTR_EL2_MASK (__HFGRTR_EL2_MASK & ~__HFGRTR_ONLY_MASK)
#define __HFGWTR_EL2_nMASK ~(__HFGWTR_EL2_RES0 | __HFGWTR_EL2_MASK)
#define __HFGITR_EL2_RES0 HFGITR_EL2_RES0
#define __HFGITR_EL2_MASK (BIT(62) | BIT(60) | GENMASK(54, 0))
#define __HFGITR_EL2_nMASK ~(__HFGITR_EL2_RES0 | __HFGITR_EL2_MASK)
#define __HDFGRTR_EL2_RES0 HDFGRTR_EL2_RES0
#define __HDFGRTR_EL2_MASK (BIT(63) | GENMASK(58, 50) | GENMASK(48, 43) | \
GENMASK(41, 40) | GENMASK(37, 22) | \
GENMASK(19, 9) | GENMASK(7, 0))
#define __HDFGRTR_EL2_nMASK ~(__HDFGRTR_EL2_RES0 | __HDFGRTR_EL2_MASK)
#define __HDFGWTR_EL2_RES0 HDFGWTR_EL2_RES0
#define __HDFGWTR_EL2_MASK (GENMASK(57, 52) | GENMASK(50, 48) | \
GENMASK(46, 44) | GENMASK(42, 41) | \
GENMASK(37, 35) | GENMASK(33, 31) | \
GENMASK(29, 23) | GENMASK(21, 10) | \
GENMASK(8, 7) | GENMASK(5, 0))
#define __HDFGWTR_EL2_nMASK ~(__HDFGWTR_EL2_RES0 | __HDFGWTR_EL2_MASK)
#define __HAFGRTR_EL2_RES0 HAFGRTR_EL2_RES0
#define __HAFGRTR_EL2_MASK (GENMASK(49, 17) | GENMASK(4, 0))
#define __HAFGRTR_EL2_nMASK ~(__HAFGRTR_EL2_RES0 | __HAFGRTR_EL2_MASK)
/* Similar definitions for HCRX_EL2 */
#define __HCRX_EL2_RES0 HCRX_EL2_RES0
#define __HCRX_EL2_MASK (BIT(6))
#define __HCRX_EL2_nMASK ~(__HCRX_EL2_RES0 | __HCRX_EL2_MASK)
#define __HCRX_EL2_MASK (BIT_ULL(6))
#define __HCRX_EL2_nMASK (GENMASK_ULL(24, 14) | \
GENMASK_ULL(11, 7) | \
GENMASK_ULL(5, 0))
#define __HCRX_EL2_RES0 ~(__HCRX_EL2_nMASK | __HCRX_EL2_MASK)
#define __HCRX_EL2_RES1 ~(__HCRX_EL2_nMASK | \
__HCRX_EL2_MASK | \
__HCRX_EL2_RES0)
/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~UL(0xf))


@ -39,7 +39,7 @@
#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
#define KVM_VCPU_MAX_FEATURES 7
#define KVM_VCPU_MAX_FEATURES 9
#define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1)
#define KVM_REQ_SLEEP \
@ -53,6 +53,7 @@
#define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7)
#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8)
#define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9)
#define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
@ -273,11 +274,17 @@ struct kvm_sysreg_masks;
enum fgt_group_id {
__NO_FGT_GROUP__,
HFGxTR_GROUP,
HFGRTR_GROUP,
HFGWTR_GROUP = HFGRTR_GROUP,
HDFGRTR_GROUP,
HDFGWTR_GROUP = HDFGRTR_GROUP,
HFGITR_GROUP,
HAFGRTR_GROUP,
HFGRTR2_GROUP,
HFGWTR2_GROUP = HFGRTR2_GROUP,
HDFGRTR2_GROUP,
HDFGWTR2_GROUP = HDFGRTR2_GROUP,
HFGITR2_GROUP,
/* Must be last */
__NR_FGT_GROUP_IDS__
@ -359,8 +366,8 @@ struct kvm_arch {
cpumask_var_t supported_cpus;
/* PMCR_EL0.N value for the guest */
u8 pmcr_n;
/* Maximum number of counters for the guest */
u8 nr_pmu_counters;
/* Iterator for idreg debugfs */
u8 idreg_debugfs_iter;
@ -389,6 +396,9 @@ struct kvm_arch {
/* Masks for VNCR-backed and general EL2 sysregs */
struct kvm_sysreg_masks *sysreg_masks;
/* Count the number of VNCR_EL2 currently mapped */
atomic_t vncr_map_count;
/*
* For an untrusted host VM, 'pkvm.handle' is used to lookup
* the associated pKVM instance in the hypervisor.
@ -561,6 +571,13 @@ enum vcpu_sysreg {
VNCR(HDFGRTR_EL2),
VNCR(HDFGWTR_EL2),
VNCR(HAFGRTR_EL2),
VNCR(HFGRTR2_EL2),
VNCR(HFGWTR2_EL2),
VNCR(HFGITR2_EL2),
VNCR(HDFGRTR2_EL2),
VNCR(HDFGWTR2_EL2),
VNCR(VNCR_EL2),
VNCR(CNTVOFF_EL2),
VNCR(CNTV_CVAL_EL0),
@ -606,6 +623,37 @@ struct kvm_sysreg_masks {
} mask[NR_SYS_REGS - __SANITISED_REG_START__];
};
struct fgt_masks {
const char *str;
u64 mask;
u64 nmask;
u64 res0;
};
extern struct fgt_masks hfgrtr_masks;
extern struct fgt_masks hfgwtr_masks;
extern struct fgt_masks hfgitr_masks;
extern struct fgt_masks hdfgrtr_masks;
extern struct fgt_masks hdfgwtr_masks;
extern struct fgt_masks hafgrtr_masks;
extern struct fgt_masks hfgrtr2_masks;
extern struct fgt_masks hfgwtr2_masks;
extern struct fgt_masks hfgitr2_masks;
extern struct fgt_masks hdfgrtr2_masks;
extern struct fgt_masks hdfgwtr2_masks;
extern struct fgt_masks kvm_nvhe_sym(hfgrtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgwtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgitr_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgrtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgwtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hafgrtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgrtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgwtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgitr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgrtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgwtr2_masks);
struct kvm_cpu_context {
struct user_pt_regs regs; /* sp = sp_el0 */
@ -654,6 +702,8 @@ struct kvm_host_data {
#define KVM_HOST_DATA_FLAG_HAS_TRBE 1
#define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4
#define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5
#define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 6
#define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED 7
unsigned long flags;
struct kvm_cpu_context host_ctxt;
@ -730,6 +780,8 @@ struct vcpu_reset_state {
bool reset;
};
struct vncr_tlb;
struct kvm_vcpu_arch {
struct kvm_cpu_context ctxt;
@ -824,6 +876,9 @@ struct kvm_vcpu_arch {
/* Per-vcpu CCSIDR override or NULL */
u32 *ccsidr;
/* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */
struct vncr_tlb *vncr_tlb;
};
/*
@ -971,20 +1026,22 @@ struct kvm_vcpu_arch {
#define vcpu_sve_zcr_elx(vcpu) \
(unlikely(is_hyp_ctxt(vcpu)) ? ZCR_EL2 : ZCR_EL1)
#define vcpu_sve_state_size(vcpu) ({ \
#define sve_state_size_from_vl(sve_max_vl) ({ \
size_t __size_ret; \
unsigned int __vcpu_vq; \
unsigned int __vq; \
\
if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \
if (WARN_ON(!sve_vl_valid(sve_max_vl))) { \
__size_ret = 0; \
} else { \
__vcpu_vq = vcpu_sve_max_vq(vcpu); \
__size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \
__vq = sve_vq_from_vl(sve_max_vl); \
__size_ret = SVE_SIG_REGS_SIZE(__vq); \
} \
\
__size_ret; \
})
#define vcpu_sve_state_size(vcpu) sve_state_size_from_vl((vcpu)->arch.sve_max_vl)
#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
KVM_GUESTDBG_USE_SW_BP | \
KVM_GUESTDBG_USE_HW | \
@ -1550,12 +1607,16 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
kvm_cmp_feat_signed(kvm, id, fld, op, limit) : \
kvm_cmp_feat_unsigned(kvm, id, fld, op, limit))
#define kvm_has_feat(kvm, id, fld, limit) \
#define __kvm_has_feat(kvm, id, fld, limit) \
kvm_cmp_feat(kvm, id, fld, >=, limit)
#define kvm_has_feat_enum(kvm, id, fld, val) \
#define kvm_has_feat(kvm, ...) __kvm_has_feat(kvm, __VA_ARGS__)
#define __kvm_has_feat_enum(kvm, id, fld, val) \
kvm_cmp_feat_unsigned(kvm, id, fld, ==, val)
#define kvm_has_feat_enum(kvm, ...) __kvm_has_feat_enum(kvm, __VA_ARGS__)
#define kvm_has_feat_range(kvm, id, fld, min, max) \
(kvm_cmp_feat(kvm, id, fld, >=, min) && \
kvm_cmp_feat(kvm, id, fld, <=, max))
@ -1593,4 +1654,9 @@ static inline bool kvm_arch_has_irq_bypass(void)
return true;
}
void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt);
void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1);
void check_feature_map(void);
#endif /* __ARM64_KVM_HOST_H__ */


@ -231,6 +231,38 @@ static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans)
shift; \
})
static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid)
{
u64 base, tg, num, scale;
int shift;
tg = FIELD_GET(GENMASK(47, 46), val);
switch(tg) {
case 1:
shift = 12;
break;
case 2:
shift = 14;
break;
case 3:
default: /* IMPDEF: handle tg==0 as 64k */
shift = 16;
break;
}
base = (val & GENMASK(36, 0)) << shift;
if (asid)
*asid = FIELD_GET(TLBIR_ASID_MASK, val);
scale = FIELD_GET(GENMASK(45, 44), val);
num = FIELD_GET(GENMASK(43, 39), val);
*range = __TLBI_RANGE_PAGES(num, scale) << shift;
return base;
}
static inline unsigned int ps_to_output_size(unsigned int ps)
{
switch (ps) {
@ -245,4 +277,72 @@ static inline unsigned int ps_to_output_size(unsigned int ps)
}
}
enum trans_regime {
TR_EL10,
TR_EL20,
TR_EL2,
};
struct s1_walk_info {
u64 baddr;
enum trans_regime regime;
unsigned int max_oa_bits;
unsigned int pgshift;
unsigned int txsz;
int sl;
bool as_el0;
bool hpd;
bool e0poe;
bool poe;
bool pan;
bool be;
bool s2;
};
struct s1_walk_result {
union {
struct {
u64 desc;
u64 pa;
s8 level;
u8 APTable;
bool nG;
u16 asid;
bool UXNTable;
bool PXNTable;
bool uwxn;
bool uov;
bool ur;
bool uw;
bool ux;
bool pwxn;
bool pov;
bool pr;
bool pw;
bool px;
};
struct {
u8 fst;
bool ptw;
bool s2;
};
};
bool failed;
};
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va);
/* VNCR management */
int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu);
int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu);
void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val);
#define vncr_fixmap(c) \
({ \
u32 __c = (c); \
BUG_ON(__c >= NR_CPUS); \
(FIX_VNCR - __c); \
})
#endif /* __ARM64_KVM_NESTED_H */


@ -59,6 +59,11 @@ typedef u64 kvm_pte_t;
#define KVM_PHYS_INVALID (-1ULL)
#define KVM_PTE_TYPE BIT(1)
#define KVM_PTE_TYPE_BLOCK 0
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
@ -413,7 +418,7 @@ static inline bool kvm_pgtable_walk_lock_held(void)
*/
struct kvm_pgtable {
union {
struct rb_root pkvm_mappings;
struct rb_root_cached pkvm_mappings;
struct {
u32 ia_bits;
s8 start_level;


@ -135,6 +135,12 @@ static inline unsigned long host_s2_pgtable_pages(void)
return res;
}
#ifdef CONFIG_NVHE_EL2_DEBUG
static inline unsigned long pkvm_selftest_pages(void) { return 32; }
#else
static inline unsigned long pkvm_selftest_pages(void) { return 0; }
#endif
#define KVM_FFA_MBOX_NR_PAGES 1
static inline unsigned long hyp_ffa_proxy_pages(void)
@ -167,6 +173,8 @@ struct pkvm_mapping {
struct rb_node node;
u64 gfn;
u64 pfn;
u64 nr_pages;
u64 __subtree_last; /* Internal member for interval tree */
};
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,


@ -11,6 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[];
extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
extern char __hyp_text_start[], __hyp_text_end[];
extern char __hyp_data_start[], __hyp_data_end[];
extern char __hyp_rodata_start[], __hyp_rodata_end[];
extern char __hyp_reloc_begin[], __hyp_reloc_end[];
extern char __hyp_bss_start[], __hyp_bss_end[];


@ -117,6 +117,7 @@
#define SB_BARRIER_INSN __SYS_BARRIER_INSN(0, 7, 31)
/* Data cache zero operations */
#define SYS_DC_ISW sys_insn(1, 0, 7, 6, 2)
#define SYS_DC_IGSW sys_insn(1, 0, 7, 6, 4)
#define SYS_DC_IGDSW sys_insn(1, 0, 7, 6, 6)
@ -153,11 +154,13 @@
#define SYS_DC_CIGVAC sys_insn(1, 3, 7, 14, 3)
#define SYS_DC_CIGDVAC sys_insn(1, 3, 7, 14, 5)
/* Data cache zero operations */
#define SYS_DC_ZVA sys_insn(1, 3, 7, 4, 1)
#define SYS_DC_GVA sys_insn(1, 3, 7, 4, 3)
#define SYS_DC_GZVA sys_insn(1, 3, 7, 4, 4)
#define SYS_DC_CIVAPS sys_insn(1, 0, 7, 15, 1)
#define SYS_DC_CIGDVAPS sys_insn(1, 0, 7, 15, 5)
/*
* Automatically generated definitions for system registers, the
* manual encodings below are in the process of being converted to
@ -497,12 +500,22 @@
#define __PMEV_op2(n) ((n) & 0x7)
#define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3))
#define SYS_PMEVCNTSVRn_EL1(n) sys_reg(2, 0, 14, __CNTR_CRm(n), __PMEV_op2(n))
#define SYS_PMEVCNTRn_EL0(n) sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n))
#define __TYPER_CRm(n) (0xc | (((n) >> 3) & 0x3))
#define SYS_PMEVTYPERn_EL0(n) sys_reg(3, 3, 14, __TYPER_CRm(n), __PMEV_op2(n))
#define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7)
#define SYS_SPMCGCRn_EL1(n) sys_reg(2, 0, 9, 13, ((n) & 1))
#define __SPMEV_op2(n) ((n) & 0x7)
#define __SPMEV_crm(p, n) ((((p) & 7) << 1) | (((n) >> 3) & 1))
#define SYS_SPMEVCNTRn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b000, n), __SPMEV_op2(n))
#define SYS_SPMEVFILT2Rn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b011, n), __SPMEV_op2(n))
#define SYS_SPMEVFILTRn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b010, n), __SPMEV_op2(n))
#define SYS_SPMEVTYPERn_EL0(n) sys_reg(2, 3, 14, __SPMEV_crm(0b001, n), __SPMEV_op2(n))
#define SYS_VPIDR_EL2 sys_reg(3, 4, 0, 0, 0)
#define SYS_VMPIDR_EL2 sys_reg(3, 4, 0, 0, 5)
@ -521,7 +534,6 @@
#define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0)
#define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2)
#define SYS_VNCR_EL2 sys_reg(3, 4, 2, 2, 0)
#define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6)
#define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0)
#define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1)
@ -608,28 +620,18 @@
/* VHE encodings for architectural EL0/1 system registers */
#define SYS_BRBCR_EL12 sys_reg(2, 5, 9, 0, 0)
#define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0)
#define SYS_CPACR_EL12 sys_reg(3, 5, 1, 0, 2)
#define SYS_SCTLR2_EL12 sys_reg(3, 5, 1, 0, 3)
#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0)
#define SYS_TRFCR_EL12 sys_reg(3, 5, 1, 2, 1)
#define SYS_SMCR_EL12 sys_reg(3, 5, 1, 2, 6)
#define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0)
#define SYS_TTBR1_EL12 sys_reg(3, 5, 2, 0, 1)
#define SYS_TCR_EL12 sys_reg(3, 5, 2, 0, 2)
#define SYS_TCR2_EL12 sys_reg(3, 5, 2, 0, 3)
#define SYS_SPSR_EL12 sys_reg(3, 5, 4, 0, 0)
#define SYS_ELR_EL12 sys_reg(3, 5, 4, 0, 1)
#define SYS_AFSR0_EL12 sys_reg(3, 5, 5, 1, 0)
#define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1)
#define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0)
#define SYS_TFSR_EL12 sys_reg(3, 5, 5, 6, 0)
#define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0)
#define SYS_PMSCR_EL12 sys_reg(3, 5, 9, 9, 0)
#define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0)
#define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0)
#define SYS_VBAR_EL12 sys_reg(3, 5, 12, 0, 0)
#define SYS_CONTEXTIDR_EL12 sys_reg(3, 5, 13, 0, 1)
#define SYS_SCXTNUM_EL12 sys_reg(3, 5, 13, 0, 7)
#define SYS_CNTKCTL_EL12 sys_reg(3, 5, 14, 1, 0)
#define SYS_CNTP_TVAL_EL02 sys_reg(3, 5, 14, 2, 0)
@ -1091,6 +1093,15 @@
__emit_inst(0xd5000000|(\sreg)|(.L__gpr_num_\rt))
.endm
.macro msr_hcr_el2, reg
#if IS_ENABLED(CONFIG_AMPERE_ERRATUM_AC04_CPU_23)
dsb nsh
msr hcr_el2, \reg
isb
#else
msr hcr_el2, \reg
#endif
.endm
#else
#include <linux/bitfield.h>
@ -1178,6 +1189,13 @@
write_sysreg(__scs_new, sysreg); \
} while (0)
#define sysreg_clear_set_hcr(clear, set) do { \
u64 __scs_val = read_sysreg(hcr_el2); \
u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \
if (__scs_new != __scs_val) \
write_sysreg_hcr(__scs_new); \
} while (0)
#define sysreg_clear_set_s(sysreg, clear, set) do { \
u64 __scs_val = read_sysreg_s(sysreg); \
u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \
@ -1185,6 +1203,17 @@
write_sysreg_s(__scs_new, sysreg); \
} while (0)
#define write_sysreg_hcr(__val) do { \
if (IS_ENABLED(CONFIG_AMPERE_ERRATUM_AC04_CPU_23) && \
(!system_capabilities_finalized() || \
alternative_has_cap_unlikely(ARM64_WORKAROUND_AMPERE_AC04_CPU_23))) \
asm volatile("dsb nsh; msr hcr_el2, %x0; isb" \
: : "rZ" (__val)); \
else \
asm volatile("msr hcr_el2, %x0" \
: : "rZ" (__val)); \
} while (0)
#define read_sysreg_par() ({ \
u64 par; \
asm(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412)); \


@ -35,6 +35,8 @@
#define VNCR_CNTP_CTL_EL0 0x180
#define VNCR_SCXTNUM_EL1 0x188
#define VNCR_TFSR_EL1 0x190
#define VNCR_HDFGRTR2_EL2 0x1A0
#define VNCR_HDFGWTR2_EL2 0x1B0
#define VNCR_HFGRTR_EL2 0x1B8
#define VNCR_HFGWTR_EL2 0x1C0
#define VNCR_HFGITR_EL2 0x1C8
@ -52,6 +54,9 @@
#define VNCR_PIRE0_EL1 0x290
#define VNCR_PIR_EL1 0x2A0
#define VNCR_POR_EL1 0x2A8
#define VNCR_HFGRTR2_EL2 0x2C0
#define VNCR_HFGWTR2_EL2 0x2C8
#define VNCR_HFGITR2_EL2 0x310
#define VNCR_ICH_LR0_EL2 0x400
#define VNCR_ICH_LR1_EL2 0x408
#define VNCR_ICH_LR2_EL2 0x410


@ -431,10 +431,11 @@ enum {
/* Device Control API on vcpu fd */
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
#define KVM_ARM_VCPU_PMU_V3_IRQ 0
#define KVM_ARM_VCPU_PMU_V3_INIT 1
#define KVM_ARM_VCPU_PMU_V3_FILTER 2
#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3
#define KVM_ARM_VCPU_PMU_V3_IRQ 0
#define KVM_ARM_VCPU_PMU_V3_INIT 1
#define KVM_ARM_VCPU_PMU_V3_FILTER 2
#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3
#define KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS 4
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1


@ -557,6 +557,13 @@ static const struct midr_range erratum_ac03_cpu_38_list[] = {
};
#endif
#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23
static const struct midr_range erratum_ac04_cpu_23_list[] = {
MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
{},
};
#endif
const struct arm64_cpu_capabilities arm64_errata[] = {
#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
{
@ -875,6 +882,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
.capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list),
},
#endif
#ifdef CONFIG_AMPERE_ERRATUM_AC04_CPU_23
{
.desc = "AmpereOne erratum AC04_CPU_23",
.capability = ARM64_WORKAROUND_AMPERE_AC04_CPU_23,
ERRATA_MIDR_RANGE_LIST(erratum_ac04_cpu_23_list),
},
#endif
{
.desc = "Broken CNTVOFF_EL2",


@ -305,6 +305,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0),
S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0),
@ -2885,6 +2886,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_cpuid_feature,
ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP)
},
{
.desc = "Fine Grained Traps 2",
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.capability = ARM64_HAS_FGT2,
.matches = has_cpuid_feature,
ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2)
},
#ifdef CONFIG_ARM64_SME
{
.desc = "Scalable Matrix Extension",


@ -97,7 +97,7 @@ SYM_CODE_START_LOCAL(__finalise_el2)
2:
// Engage the VHE magic!
mov_q x0, HCR_HOST_VHE_FLAGS
msr hcr_el2, x0
msr_hcr_el2 x0
isb
// Use the EL1 allocated stack, per-cpu offset


@ -126,6 +126,8 @@ KVM_NVHE_ALIAS(__hyp_text_start);
KVM_NVHE_ALIAS(__hyp_text_end);
KVM_NVHE_ALIAS(__hyp_bss_start);
KVM_NVHE_ALIAS(__hyp_bss_end);
KVM_NVHE_ALIAS(__hyp_data_start);
KVM_NVHE_ALIAS(__hyp_data_end);
KVM_NVHE_ALIAS(__hyp_rodata_start);
KVM_NVHE_ALIAS(__hyp_rodata_end);


@ -1118,7 +1118,7 @@ static struct break_hook kasan_break_hook = {
#ifdef CONFIG_UBSAN_TRAP
static int ubsan_handler(struct pt_regs *regs, unsigned long esr)
{
die(report_ubsan_failure(regs, esr & UBSAN_BRK_MASK), regs, esr);
die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr);
return DBG_HOOK_HANDLED;
}
@ -1145,7 +1145,7 @@ int __init early_brk64(unsigned long addr, unsigned long esr,
return kasan_handler(regs, esr) != DBG_HOOK_HANDLED;
#endif
#ifdef CONFIG_UBSAN_TRAP
if ((esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM)
if (esr_is_ubsan_brk(esr))
return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED;
#endif
return bug_handler(regs, esr) != DBG_HOOK_HANDLED;


@ -13,7 +13,7 @@
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
#define HYPERVISOR_DATA_SECTIONS \
#define HYPERVISOR_RODATA_SECTIONS \
HYP_SECTION_NAME(.rodata) : { \
. = ALIGN(PAGE_SIZE); \
__hyp_rodata_start = .; \
@ -23,6 +23,15 @@
__hyp_rodata_end = .; \
}
#define HYPERVISOR_DATA_SECTION \
HYP_SECTION_NAME(.data) : { \
. = ALIGN(PAGE_SIZE); \
__hyp_data_start = .; \
*(HYP_SECTION_NAME(.data)) \
. = ALIGN(PAGE_SIZE); \
__hyp_data_end = .; \
}
#define HYPERVISOR_PERCPU_SECTION \
. = ALIGN(PAGE_SIZE); \
HYP_SECTION_NAME(.data..percpu) : { \
@ -51,7 +60,8 @@
#define SBSS_ALIGN PAGE_SIZE
#else /* CONFIG_KVM */
#define HYPERVISOR_EXTABLE
#define HYPERVISOR_DATA_SECTIONS
#define HYPERVISOR_RODATA_SECTIONS
#define HYPERVISOR_DATA_SECTION
#define HYPERVISOR_PERCPU_SECTION
#define HYPERVISOR_RELOC_SECTION
#define SBSS_ALIGN 0
@ -190,7 +200,7 @@ SECTIONS
/* everything from this point to __init_begin will be marked RO NX */
RO_DATA(PAGE_SIZE)
HYPERVISOR_DATA_SECTIONS
HYPERVISOR_RODATA_SECTIONS
.got : { *(.got) }
/*
@ -295,6 +305,8 @@ SECTIONS
_sdata = .;
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
HYPERVISOR_DATA_SECTION
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback


@ -14,7 +14,7 @@ CFLAGS_sys_regs.o += -Wno-override-init
CFLAGS_handle_exit.o += -Wno-override-init
kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
inject_fault.o va_layout.o handle_exit.o config.o \
guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \


@ -368,6 +368,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_EL1_32BIT:
r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
break;
case KVM_CAP_ARM_EL2:
r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT);
break;
case KVM_CAP_ARM_EL2_E2H0:
r = cpus_have_final_cap(ARM64_HAS_HCR_NV1);
break;
case KVM_CAP_GUEST_DEBUG_HW_BPS:
r = get_num_brps();
break;
@ -843,6 +849,10 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
return ret;
if (vcpu_has_nv(vcpu)) {
ret = kvm_vcpu_allocate_vncr_tlb(vcpu);
if (ret)
return ret;
ret = kvm_vgic_vcpu_nv_init(vcpu);
if (ret)
return ret;
@ -2450,6 +2460,19 @@ static void kvm_hyp_init_symbols(void)
kvm_nvhe_sym(__icache_flags) = __icache_flags;
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
/* Propagate the FGT state to the nVHE side */
kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks;
kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks;
kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks;
kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks;
kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks;
kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks;
kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks;
kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks;
kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks;
kvm_nvhe_sym(hdfgrtr2_masks) = hdfgrtr2_masks;
kvm_nvhe_sym(hdfgwtr2_masks) = hdfgwtr2_masks;
/*
* Flush entire BSS since part of its data containing init symbols is read
* while the MMU is off.
@ -2604,6 +2627,13 @@ static int __init init_hyp_mode(void)
goto out_err;
}
err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start),
kvm_ksym_ref(__hyp_data_end), PAGE_HYP);
if (err) {
kvm_err("Cannot map .hyp.data section\n");
goto out_err;
}
err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
if (err) {


@ -10,61 +10,11 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
enum trans_regime {
TR_EL10,
TR_EL20,
TR_EL2,
};
struct s1_walk_info {
u64 baddr;
enum trans_regime regime;
unsigned int max_oa_bits;
unsigned int pgshift;
unsigned int txsz;
int sl;
bool hpd;
bool e0poe;
bool poe;
bool pan;
bool be;
bool s2;
};
struct s1_walk_result {
union {
struct {
u64 desc;
u64 pa;
s8 level;
u8 APTable;
bool UXNTable;
bool PXNTable;
bool uwxn;
bool uov;
bool ur;
bool uw;
bool ux;
bool pwxn;
bool pov;
bool pr;
bool pw;
bool px;
};
struct {
u8 fst;
bool ptw;
bool s2;
};
};
bool failed;
};
static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
{
wr->fst = fst;
wr->ptw = ptw;
wr->s2 = s2;
wr->ptw = s1ptw;
wr->s2 = s1ptw;
wr->failed = true;
}
@ -145,20 +95,15 @@ static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
}
}
static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va)
{
u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
unsigned int stride, x;
bool va55, tbi, lva, as_el0;
bool va55, tbi, lva;
hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
wi->regime = compute_translation_regime(vcpu, op);
as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
(*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
va55 = va & BIT(55);
if (wi->regime == TR_EL2 && va55)
@ -319,7 +264,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
/* R_BNDVG and following statements */
if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
goto transfault_l0;
/* AArch64.S1StartLevel() */
@ -345,11 +290,11 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
return 0;
addrsz: /* Address Size Fault level 0 */
fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
return -EFAULT;
transfault_l0: /* Translation Fault level 0 */
fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false);
return -EFAULT;
}
@ -380,13 +325,13 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (ret) {
fail_s1_walk(wr,
(s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
true, true);
true);
return ret;
}
if (!kvm_s2_trans_readable(&s2_trans)) {
fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
true, true);
true);
return -EPERM;
}
@ -396,8 +341,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
if (ret) {
fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
true, false);
fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
return ret;
}
@ -457,6 +401,11 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (check_output_size(desc & GENMASK(47, va_bottom), wi))
goto addrsz;
if (!(desc & PTE_AF)) {
fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
return -EACCES;
}
va_bottom += contiguous_bit_shift(desc, wi, level);
wr->failed = false;
@ -465,13 +414,40 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
wr->pa = desc & GENMASK(47, va_bottom);
wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
if (wr->nG) {
u64 asid_ttbr, tcr;
switch (wi->regime) {
case TR_EL10:
tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
asid_ttbr = ((tcr & TCR_A1) ?
vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
vcpu_read_sys_reg(vcpu, TTBR0_EL1));
break;
case TR_EL20:
tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
asid_ttbr = ((tcr & TCR_A1) ?
vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
vcpu_read_sys_reg(vcpu, TTBR0_EL2));
break;
default:
BUG();
}
wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr);
if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
!(tcr & TCR_ASID16))
wr->asid &= GENMASK(7, 0);
}
return 0;
addrsz:
fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
return -EINVAL;
transfault:
fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
return -ENOENT;
}
@ -488,7 +464,6 @@ struct mmu_config {
u64 sctlr;
u64 vttbr;
u64 vtcr;
u64 hcr;
};
static void __mmu_config_save(struct mmu_config *config)
@ -511,13 +486,10 @@ static void __mmu_config_save(struct mmu_config *config)
config->sctlr = read_sysreg_el1(SYS_SCTLR);
config->vttbr = read_sysreg(vttbr_el2);
config->vtcr = read_sysreg(vtcr_el2);
config->hcr = read_sysreg(hcr_el2);
}
static void __mmu_config_restore(struct mmu_config *config)
{
write_sysreg(config->hcr, hcr_el2);
/*
* ARM errata 1165522 and 1530923 require TGE to be 1 before
* we update the guest state.
@ -1155,7 +1127,12 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
bool perm_fail = false;
int ret, idx;
ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
wi.regime = compute_translation_regime(vcpu, op);
wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
(*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
if (ret)
goto compute_par;
@ -1198,7 +1175,7 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
}
if (perm_fail)
fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
compute_par:
return compute_par_s1(vcpu, &wr, wi.regime);
@ -1210,7 +1187,8 @@ compute_par:
* If the translation is unsuccessful, the value may only contain
* PAR_EL1.F, and cannot be taken at face value. It isn't an
* indication of the translation having failed, only that the fast
* path did not succeed, *unless* it indicates a S1 permission fault.
* path did not succeed, *unless* it indicates a S1 permission or
* access fault.
*/
static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
@ -1266,8 +1244,8 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
__load_stage2(mmu, mmu->arch);
skip_mmu_switch:
/* Clear TGE, enable S2 translation, we're rolling */
write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2);
/* Temporarily switch back to guest context */
write_sysreg_hcr(vcpu->arch.hcr_el2);
isb();
switch (op) {
@ -1299,6 +1277,8 @@ skip_mmu_switch:
if (!fail)
par = read_sysreg_par();
write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
__mmu_config_restore(&config);
@ -1313,19 +1293,29 @@ static bool par_check_s1_perm_fault(u64 par)
!(par & SYS_PAR_EL1_S));
}
static bool par_check_s1_access_fault(u64 par)
{
u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
!(par & SYS_PAR_EL1_S));
}
void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
/*
* If PAR_EL1 reports that AT failed on a S1 permission fault, we
* know for sure that the PTW was able to walk the S1 tables and
* there's nothing else to do.
* If PAR_EL1 reports that AT failed on a S1 permission or access
* fault, we know for sure that the PTW was able to walk the S1
* tables and there's nothing else to do.
*
* If AT failed for any other reason, then we must walk the guest S1
* to emulate the instruction.
*/
if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
if ((par & SYS_PAR_EL1_F) &&
!par_check_s1_perm_fault(par) &&
!par_check_s1_access_fault(par))
par = handle_at_slow(vcpu, op, vaddr);
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
@ -1350,7 +1340,7 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
if (!vcpu_el2_e2h_is_set(vcpu))
val |= HCR_NV | HCR_NV1;
write_sysreg(val, hcr_el2);
write_sysreg_hcr(val);
isb();
par = SYS_PAR_EL1_F;
@ -1375,7 +1365,7 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
if (!fail)
par = read_sysreg_par();
write_sysreg(hcr, hcr_el2);
write_sysreg_hcr(hcr);
isb();
}
@ -1444,3 +1434,31 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
par = compute_par_s12(vcpu, par, &out);
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
}
/*
* Translate a VA for a given EL in a given translation regime, with
* or without PAN. This requires wi->{regime, as_el0, pan} to be
* set. The rest of the wi and wr should be 0-initialised.
*/
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va)
{
int ret;
ret = setup_s1_walk(vcpu, wi, wr, va);
if (ret)
return ret;
if (wr->level == S1_MMU_DISABLED) {
wr->ur = wr->uw = wr->ux = true;
wr->pr = wr->pw = wr->px = true;
} else {
ret = walk_s1(vcpu, wi, wr, va);
if (ret)
return ret;
compute_s1_permissions(vcpu, wi, wr);
}
return 0;
}
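
For reference, a minimal sketch of a hypothetical caller of __kvm_translate_va() — the helper name and the EL1 read check are illustrative, not part of this change — following the calling convention spelled out in the comment above:

	/* Illustrative only: check whether a guest VA is readable at vEL1. */
	static int example_check_el1_read(struct kvm_vcpu *vcpu, u64 va)
	{
		struct s1_walk_info wi = {};
		struct s1_walk_result wr = {};
		int ret;

		/* Only regime/as_el0/pan are pre-set, as the comment requires. */
		wi.regime = TR_EL10;
		wi.as_el0 = false;
		wi.pan = false;

		ret = __kvm_translate_va(vcpu, &wi, &wr, va);
		if (ret)
			return ret;

		/* wr.pr holds the EL1 read permission once the walk completes. */
		return wr.pr ? 0 : -EACCES;
	}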

arch/arm64/kvm/config.c: new file, 1085 lines (diff suppressed because it is too large).


@ -622,6 +622,11 @@ struct encoding_to_trap_config {
const unsigned int line;
};
/*
* WARNING: using ranges is a treacherous endeavour, as sysregs that
* are part of an architectural range are not necessarily contiguous
* in the [Op0,Op1,CRn,CRm,Op2] space. Tread carefully.
*/
#define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \
{ \
.encoding = sr_start, \
@ -1279,98 +1284,128 @@ enum fg_filter_id {
__NR_FG_FILTER_IDS__
};
#define SR_FGF(sr, g, b, p, f) \
{ \
.encoding = sr, \
.end = sr, \
.tc = { \
#define __FGT(g, b, p, f) \
{ \
.fgt = g ## _GROUP, \
.bit = g ## _EL2_ ## b ## _SHIFT, \
.pol = p, \
.fgf = f, \
}, \
}
#define FGT(g, b, p) __FGT(g, b, p, __NO_FGF__)
/*
* See the warning next to SR_RANGE_TRAP(), and apply the same
* level of caution.
*/
#define SR_FGF_RANGE(sr, e, g, b, p, f) \
{ \
.encoding = sr, \
.end = e, \
.tc = __FGT(g, b, p, f), \
.line = __LINE__, \
}
#define SR_FGT(sr, g, b, p) SR_FGF(sr, g, b, p, __NO_FGF__)
#define SR_FGF(sr, g, b, p, f) SR_FGF_RANGE(sr, sr, g, b, p, f)
#define SR_FGT(sr, g, b, p) SR_FGF_RANGE(sr, sr, g, b, p, __NO_FGF__)
#define SR_FGT_RANGE(sr, end, g, b, p) \
SR_FGF_RANGE(sr, end, g, b, p, __NO_FGF__)
static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
/* HFGRTR_EL2, HFGWTR_EL2 */
SR_FGT(SYS_AMAIR2_EL1, HFGxTR, nAMAIR2_EL1, 0),
SR_FGT(SYS_MAIR2_EL1, HFGxTR, nMAIR2_EL1, 0),
SR_FGT(SYS_S2POR_EL1, HFGxTR, nS2POR_EL1, 0),
SR_FGT(SYS_POR_EL1, HFGxTR, nPOR_EL1, 0),
SR_FGT(SYS_POR_EL0, HFGxTR, nPOR_EL0, 0),
SR_FGT(SYS_PIR_EL1, HFGxTR, nPIR_EL1, 0),
SR_FGT(SYS_PIRE0_EL1, HFGxTR, nPIRE0_EL1, 0),
SR_FGT(SYS_RCWMASK_EL1, HFGxTR, nRCWMASK_EL1, 0),
SR_FGT(SYS_TPIDR2_EL0, HFGxTR, nTPIDR2_EL0, 0),
SR_FGT(SYS_SMPRI_EL1, HFGxTR, nSMPRI_EL1, 0),
SR_FGT(SYS_GCSCR_EL1, HFGxTR, nGCS_EL1, 0),
SR_FGT(SYS_GCSPR_EL1, HFGxTR, nGCS_EL1, 0),
SR_FGT(SYS_GCSCRE0_EL1, HFGxTR, nGCS_EL0, 0),
SR_FGT(SYS_GCSPR_EL0, HFGxTR, nGCS_EL0, 0),
SR_FGT(SYS_ACCDATA_EL1, HFGxTR, nACCDATA_EL1, 0),
SR_FGT(SYS_ERXADDR_EL1, HFGxTR, ERXADDR_EL1, 1),
SR_FGT(SYS_ERXPFGCDN_EL1, HFGxTR, ERXPFGCDN_EL1, 1),
SR_FGT(SYS_ERXPFGCTL_EL1, HFGxTR, ERXPFGCTL_EL1, 1),
SR_FGT(SYS_ERXPFGF_EL1, HFGxTR, ERXPFGF_EL1, 1),
SR_FGT(SYS_ERXMISC0_EL1, HFGxTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXMISC1_EL1, HFGxTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXMISC2_EL1, HFGxTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXMISC3_EL1, HFGxTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXSTATUS_EL1, HFGxTR, ERXSTATUS_EL1, 1),
SR_FGT(SYS_ERXCTLR_EL1, HFGxTR, ERXCTLR_EL1, 1),
SR_FGT(SYS_ERXFR_EL1, HFGxTR, ERXFR_EL1, 1),
SR_FGT(SYS_ERRSELR_EL1, HFGxTR, ERRSELR_EL1, 1),
SR_FGT(SYS_ERRIDR_EL1, HFGxTR, ERRIDR_EL1, 1),
SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGxTR, ICC_IGRPENn_EL1, 1),
SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGxTR, ICC_IGRPENn_EL1, 1),
SR_FGT(SYS_VBAR_EL1, HFGxTR, VBAR_EL1, 1),
SR_FGT(SYS_TTBR1_EL1, HFGxTR, TTBR1_EL1, 1),
SR_FGT(SYS_TTBR0_EL1, HFGxTR, TTBR0_EL1, 1),
SR_FGT(SYS_TPIDR_EL0, HFGxTR, TPIDR_EL0, 1),
SR_FGT(SYS_TPIDRRO_EL0, HFGxTR, TPIDRRO_EL0, 1),
SR_FGT(SYS_TPIDR_EL1, HFGxTR, TPIDR_EL1, 1),
SR_FGT(SYS_TCR_EL1, HFGxTR, TCR_EL1, 1),
SR_FGT(SYS_TCR2_EL1, HFGxTR, TCR_EL1, 1),
SR_FGT(SYS_SCXTNUM_EL0, HFGxTR, SCXTNUM_EL0, 1),
SR_FGT(SYS_SCXTNUM_EL1, HFGxTR, SCXTNUM_EL1, 1),
SR_FGT(SYS_SCTLR_EL1, HFGxTR, SCTLR_EL1, 1),
SR_FGT(SYS_REVIDR_EL1, HFGxTR, REVIDR_EL1, 1),
SR_FGT(SYS_PAR_EL1, HFGxTR, PAR_EL1, 1),
SR_FGT(SYS_MPIDR_EL1, HFGxTR, MPIDR_EL1, 1),
SR_FGT(SYS_MIDR_EL1, HFGxTR, MIDR_EL1, 1),
SR_FGT(SYS_MAIR_EL1, HFGxTR, MAIR_EL1, 1),
SR_FGT(SYS_LORSA_EL1, HFGxTR, LORSA_EL1, 1),
SR_FGT(SYS_LORN_EL1, HFGxTR, LORN_EL1, 1),
SR_FGT(SYS_LORID_EL1, HFGxTR, LORID_EL1, 1),
SR_FGT(SYS_LOREA_EL1, HFGxTR, LOREA_EL1, 1),
SR_FGT(SYS_LORC_EL1, HFGxTR, LORC_EL1, 1),
SR_FGT(SYS_ISR_EL1, HFGxTR, ISR_EL1, 1),
SR_FGT(SYS_FAR_EL1, HFGxTR, FAR_EL1, 1),
SR_FGT(SYS_ESR_EL1, HFGxTR, ESR_EL1, 1),
SR_FGT(SYS_DCZID_EL0, HFGxTR, DCZID_EL0, 1),
SR_FGT(SYS_CTR_EL0, HFGxTR, CTR_EL0, 1),
SR_FGT(SYS_CSSELR_EL1, HFGxTR, CSSELR_EL1, 1),
SR_FGT(SYS_CPACR_EL1, HFGxTR, CPACR_EL1, 1),
SR_FGT(SYS_CONTEXTIDR_EL1, HFGxTR, CONTEXTIDR_EL1, 1),
SR_FGT(SYS_CLIDR_EL1, HFGxTR, CLIDR_EL1, 1),
SR_FGT(SYS_CCSIDR_EL1, HFGxTR, CCSIDR_EL1, 1),
SR_FGT(SYS_APIBKEYLO_EL1, HFGxTR, APIBKey, 1),
SR_FGT(SYS_APIBKEYHI_EL1, HFGxTR, APIBKey, 1),
SR_FGT(SYS_APIAKEYLO_EL1, HFGxTR, APIAKey, 1),
SR_FGT(SYS_APIAKEYHI_EL1, HFGxTR, APIAKey, 1),
SR_FGT(SYS_APGAKEYLO_EL1, HFGxTR, APGAKey, 1),
SR_FGT(SYS_APGAKEYHI_EL1, HFGxTR, APGAKey, 1),
SR_FGT(SYS_APDBKEYLO_EL1, HFGxTR, APDBKey, 1),
SR_FGT(SYS_APDBKEYHI_EL1, HFGxTR, APDBKey, 1),
SR_FGT(SYS_APDAKEYLO_EL1, HFGxTR, APDAKey, 1),
SR_FGT(SYS_APDAKEYHI_EL1, HFGxTR, APDAKey, 1),
SR_FGT(SYS_AMAIR_EL1, HFGxTR, AMAIR_EL1, 1),
SR_FGT(SYS_AIDR_EL1, HFGxTR, AIDR_EL1, 1),
SR_FGT(SYS_AFSR1_EL1, HFGxTR, AFSR1_EL1, 1),
SR_FGT(SYS_AFSR0_EL1, HFGxTR, AFSR0_EL1, 1),
SR_FGT(SYS_AMAIR2_EL1, HFGRTR, nAMAIR2_EL1, 0),
SR_FGT(SYS_MAIR2_EL1, HFGRTR, nMAIR2_EL1, 0),
SR_FGT(SYS_S2POR_EL1, HFGRTR, nS2POR_EL1, 0),
SR_FGT(SYS_POR_EL1, HFGRTR, nPOR_EL1, 0),
SR_FGT(SYS_POR_EL0, HFGRTR, nPOR_EL0, 0),
SR_FGT(SYS_PIR_EL1, HFGRTR, nPIR_EL1, 0),
SR_FGT(SYS_PIRE0_EL1, HFGRTR, nPIRE0_EL1, 0),
SR_FGT(SYS_RCWMASK_EL1, HFGRTR, nRCWMASK_EL1, 0),
SR_FGT(SYS_TPIDR2_EL0, HFGRTR, nTPIDR2_EL0, 0),
SR_FGT(SYS_SMPRI_EL1, HFGRTR, nSMPRI_EL1, 0),
SR_FGT(SYS_GCSCR_EL1, HFGRTR, nGCS_EL1, 0),
SR_FGT(SYS_GCSPR_EL1, HFGRTR, nGCS_EL1, 0),
SR_FGT(SYS_GCSCRE0_EL1, HFGRTR, nGCS_EL0, 0),
SR_FGT(SYS_GCSPR_EL0, HFGRTR, nGCS_EL0, 0),
SR_FGT(SYS_ACCDATA_EL1, HFGRTR, nACCDATA_EL1, 0),
SR_FGT(SYS_ERXADDR_EL1, HFGRTR, ERXADDR_EL1, 1),
SR_FGT(SYS_ERXPFGCDN_EL1, HFGRTR, ERXPFGCDN_EL1, 1),
SR_FGT(SYS_ERXPFGCTL_EL1, HFGRTR, ERXPFGCTL_EL1, 1),
SR_FGT(SYS_ERXPFGF_EL1, HFGRTR, ERXPFGF_EL1, 1),
SR_FGT(SYS_ERXMISC0_EL1, HFGRTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXMISC1_EL1, HFGRTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXMISC2_EL1, HFGRTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXMISC3_EL1, HFGRTR, ERXMISCn_EL1, 1),
SR_FGT(SYS_ERXSTATUS_EL1, HFGRTR, ERXSTATUS_EL1, 1),
SR_FGT(SYS_ERXCTLR_EL1, HFGRTR, ERXCTLR_EL1, 1),
SR_FGT(SYS_ERXFR_EL1, HFGRTR, ERXFR_EL1, 1),
SR_FGT(SYS_ERRSELR_EL1, HFGRTR, ERRSELR_EL1, 1),
SR_FGT(SYS_ERRIDR_EL1, HFGRTR, ERRIDR_EL1, 1),
SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGRTR, ICC_IGRPENn_EL1, 1),
SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGRTR, ICC_IGRPENn_EL1, 1),
SR_FGT(SYS_VBAR_EL1, HFGRTR, VBAR_EL1, 1),
SR_FGT(SYS_TTBR1_EL1, HFGRTR, TTBR1_EL1, 1),
SR_FGT(SYS_TTBR0_EL1, HFGRTR, TTBR0_EL1, 1),
SR_FGT(SYS_TPIDR_EL0, HFGRTR, TPIDR_EL0, 1),
SR_FGT(SYS_TPIDRRO_EL0, HFGRTR, TPIDRRO_EL0, 1),
SR_FGT(SYS_TPIDR_EL1, HFGRTR, TPIDR_EL1, 1),
SR_FGT(SYS_TCR_EL1, HFGRTR, TCR_EL1, 1),
SR_FGT(SYS_TCR2_EL1, HFGRTR, TCR_EL1, 1),
SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1),
SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1),
SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1),
SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1),
SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1),
SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1),
SR_FGT(SYS_MIDR_EL1, HFGRTR, MIDR_EL1, 1),
SR_FGT(SYS_MAIR_EL1, HFGRTR, MAIR_EL1, 1),
SR_FGT(SYS_LORSA_EL1, HFGRTR, LORSA_EL1, 1),
SR_FGT(SYS_LORN_EL1, HFGRTR, LORN_EL1, 1),
SR_FGT(SYS_LORID_EL1, HFGRTR, LORID_EL1, 1),
SR_FGT(SYS_LOREA_EL1, HFGRTR, LOREA_EL1, 1),
SR_FGT(SYS_LORC_EL1, HFGRTR, LORC_EL1, 1),
SR_FGT(SYS_ISR_EL1, HFGRTR, ISR_EL1, 1),
SR_FGT(SYS_FAR_EL1, HFGRTR, FAR_EL1, 1),
SR_FGT(SYS_ESR_EL1, HFGRTR, ESR_EL1, 1),
SR_FGT(SYS_DCZID_EL0, HFGRTR, DCZID_EL0, 1),
SR_FGT(SYS_CTR_EL0, HFGRTR, CTR_EL0, 1),
SR_FGT(SYS_CSSELR_EL1, HFGRTR, CSSELR_EL1, 1),
SR_FGT(SYS_CPACR_EL1, HFGRTR, CPACR_EL1, 1),
SR_FGT(SYS_CONTEXTIDR_EL1, HFGRTR, CONTEXTIDR_EL1, 1),
SR_FGT(SYS_CLIDR_EL1, HFGRTR, CLIDR_EL1, 1),
SR_FGT(SYS_CCSIDR_EL1, HFGRTR, CCSIDR_EL1, 1),
SR_FGT(SYS_APIBKEYLO_EL1, HFGRTR, APIBKey, 1),
SR_FGT(SYS_APIBKEYHI_EL1, HFGRTR, APIBKey, 1),
SR_FGT(SYS_APIAKEYLO_EL1, HFGRTR, APIAKey, 1),
SR_FGT(SYS_APIAKEYHI_EL1, HFGRTR, APIAKey, 1),
SR_FGT(SYS_APGAKEYLO_EL1, HFGRTR, APGAKey, 1),
SR_FGT(SYS_APGAKEYHI_EL1, HFGRTR, APGAKey, 1),
SR_FGT(SYS_APDBKEYLO_EL1, HFGRTR, APDBKey, 1),
SR_FGT(SYS_APDBKEYHI_EL1, HFGRTR, APDBKey, 1),
SR_FGT(SYS_APDAKEYLO_EL1, HFGRTR, APDAKey, 1),
SR_FGT(SYS_APDAKEYHI_EL1, HFGRTR, APDAKey, 1),
SR_FGT(SYS_AMAIR_EL1, HFGRTR, AMAIR_EL1, 1),
SR_FGT(SYS_AIDR_EL1, HFGRTR, AIDR_EL1, 1),
SR_FGT(SYS_AFSR1_EL1, HFGRTR, AFSR1_EL1, 1),
SR_FGT(SYS_AFSR0_EL1, HFGRTR, AFSR0_EL1, 1),
/* HFGRTR2_EL2, HFGWTR2_EL2 */
SR_FGT(SYS_ACTLRALIAS_EL1, HFGRTR2, nACTLRALIAS_EL1, 0),
SR_FGT(SYS_ACTLRMASK_EL1, HFGRTR2, nACTLRMASK_EL1, 0),
SR_FGT(SYS_CPACRALIAS_EL1, HFGRTR2, nCPACRALIAS_EL1, 0),
SR_FGT(SYS_CPACRMASK_EL1, HFGRTR2, nCPACRMASK_EL1, 0),
SR_FGT(SYS_PFAR_EL1, HFGRTR2, nPFAR_EL1, 0),
SR_FGT(SYS_RCWSMASK_EL1, HFGRTR2, nRCWSMASK_EL1, 0),
SR_FGT(SYS_SCTLR2ALIAS_EL1, HFGRTR2, nSCTLRALIAS2_EL1, 0),
SR_FGT(SYS_SCTLR2MASK_EL1, HFGRTR2, nSCTLR2MASK_EL1, 0),
SR_FGT(SYS_SCTLRALIAS_EL1, HFGRTR2, nSCTLRALIAS_EL1, 0),
SR_FGT(SYS_SCTLRMASK_EL1, HFGRTR2, nSCTLRMASK_EL1, 0),
SR_FGT(SYS_TCR2ALIAS_EL1, HFGRTR2, nTCR2ALIAS_EL1, 0),
SR_FGT(SYS_TCR2MASK_EL1, HFGRTR2, nTCR2MASK_EL1, 0),
SR_FGT(SYS_TCRALIAS_EL1, HFGRTR2, nTCRALIAS_EL1, 0),
SR_FGT(SYS_TCRMASK_EL1, HFGRTR2, nTCRMASK_EL1, 0),
SR_FGT(SYS_ERXGSR_EL1, HFGRTR2, nERXGSR_EL1, 0),
/* HFGITR_EL2 */
SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1),
SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1),
@ -1480,6 +1515,11 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_IC_IVAU, HFGITR, ICIVAU, 1),
SR_FGT(SYS_IC_IALLU, HFGITR, ICIALLU, 1),
SR_FGT(SYS_IC_IALLUIS, HFGITR, ICIALLUIS, 1),
/* HFGITR2_EL2 */
SR_FGT(SYS_DC_CIGDVAPS, HFGITR2, nDCCIVAPS, 0),
SR_FGT(SYS_DC_CIVAPS, HFGITR2, nDCCIVAPS, 0),
/* HDFGRTR_EL2 */
SR_FGT(SYS_PMBIDR_EL1, HDFGRTR, PMBIDR_EL1, 1),
SR_FGT(SYS_PMSNEVFR_EL1, HDFGRTR, nPMSNEVFR_EL1, 0),
@ -1789,68 +1829,12 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_PMCNTENSET_EL0, HDFGRTR, PMCNTEN, 1),
SR_FGT(SYS_PMCCNTR_EL0, HDFGRTR, PMCCNTR_EL0, 1),
SR_FGT(SYS_PMCCFILTR_EL0, HDFGRTR, PMCCFILTR_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(0), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(1), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(2), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(3), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(4), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(5), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(6), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(7), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(8), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(9), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(10), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(11), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(12), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(13), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(14), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(15), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(16), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(17), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(18), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(19), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(20), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(21), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(22), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(23), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(24), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(25), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(26), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(27), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(28), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(29), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVTYPERn_EL0(30), HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(0), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(1), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(2), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(3), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(4), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(5), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(6), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(7), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(8), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(9), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(10), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(11), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(12), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(13), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(14), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(15), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(16), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(17), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(18), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(19), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(20), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(21), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(22), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(23), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(24), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(25), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(26), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(27), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(28), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(29), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_PMEVCNTRn_EL0(30), HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT_RANGE(SYS_PMEVTYPERn_EL0(0),
SYS_PMEVTYPERn_EL0(30),
HDFGRTR, PMEVTYPERn_EL0, 1),
SR_FGT_RANGE(SYS_PMEVCNTRn_EL0(0),
SYS_PMEVCNTRn_EL0(30),
HDFGRTR, PMEVCNTRn_EL0, 1),
SR_FGT(SYS_OSDLR_EL1, HDFGRTR, OSDLR_EL1, 1),
SR_FGT(SYS_OSECCR_EL1, HDFGRTR, OSECCR_EL1, 1),
SR_FGT(SYS_OSLSR_EL1, HDFGRTR, OSLSR_EL1, 1),
@ -1928,6 +1912,59 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_DBGBCRn_EL1(13), HDFGRTR, DBGBCRn_EL1, 1),
SR_FGT(SYS_DBGBCRn_EL1(14), HDFGRTR, DBGBCRn_EL1, 1),
SR_FGT(SYS_DBGBCRn_EL1(15), HDFGRTR, DBGBCRn_EL1, 1),
/* HDFGRTR2_EL2 */
SR_FGT(SYS_MDSELR_EL1, HDFGRTR2, nMDSELR_EL1, 0),
SR_FGT(SYS_MDSTEPOP_EL1, HDFGRTR2, nMDSTEPOP_EL1, 0),
SR_FGT(SYS_PMCCNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0),
SR_FGT_RANGE(SYS_PMEVCNTSVRn_EL1(0),
SYS_PMEVCNTSVRn_EL1(30),
HDFGRTR2, nPMSSDATA, 0),
SR_FGT(SYS_PMICNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0),
SR_FGT(SYS_PMECR_EL1, HDFGRTR2, nPMECR_EL1, 0),
SR_FGT(SYS_PMIAR_EL1, HDFGRTR2, nPMIAR_EL1, 0),
SR_FGT(SYS_PMICFILTR_EL0, HDFGRTR2, nPMICFILTR_EL0, 0),
SR_FGT(SYS_PMICNTR_EL0, HDFGRTR2, nPMICNTR_EL0, 0),
SR_FGT(SYS_PMSSCR_EL1, HDFGRTR2, nPMSSCR_EL1, 0),
SR_FGT(SYS_PMUACR_EL1, HDFGRTR2, nPMUACR_EL1, 0),
SR_FGT(SYS_SPMACCESSR_EL1, HDFGRTR2, nSPMACCESSR_EL1, 0),
SR_FGT(SYS_SPMCFGR_EL1, HDFGRTR2, nSPMID, 0),
SR_FGT(SYS_SPMDEVARCH_EL1, HDFGRTR2, nSPMID, 0),
SR_FGT(SYS_SPMCGCRn_EL1(0), HDFGRTR2, nSPMID, 0),
SR_FGT(SYS_SPMCGCRn_EL1(1), HDFGRTR2, nSPMID, 0),
SR_FGT(SYS_SPMIIDR_EL1, HDFGRTR2, nSPMID, 0),
SR_FGT(SYS_SPMCNTENCLR_EL0, HDFGRTR2, nSPMCNTEN, 0),
SR_FGT(SYS_SPMCNTENSET_EL0, HDFGRTR2, nSPMCNTEN, 0),
SR_FGT(SYS_SPMCR_EL0, HDFGRTR2, nSPMCR_EL0, 0),
SR_FGT(SYS_SPMDEVAFF_EL1, HDFGRTR2, nSPMDEVAFF_EL1, 0),
/*
* We have up to 64 of these registers in ranges of 16, banked via
* SPMSELR_EL0.BANK. We're only concerned with the accessors here,
* not the architectural registers.
*/
SR_FGT_RANGE(SYS_SPMEVCNTRn_EL0(0),
SYS_SPMEVCNTRn_EL0(15),
HDFGRTR2, nSPMEVCNTRn_EL0, 0),
SR_FGT_RANGE(SYS_SPMEVFILT2Rn_EL0(0),
SYS_SPMEVFILT2Rn_EL0(15),
HDFGRTR2, nSPMEVTYPERn_EL0, 0),
SR_FGT_RANGE(SYS_SPMEVFILTRn_EL0(0),
SYS_SPMEVFILTRn_EL0(15),
HDFGRTR2, nSPMEVTYPERn_EL0, 0),
SR_FGT_RANGE(SYS_SPMEVTYPERn_EL0(0),
SYS_SPMEVTYPERn_EL0(15),
HDFGRTR2, nSPMEVTYPERn_EL0, 0),
SR_FGT(SYS_SPMINTENCLR_EL1, HDFGRTR2, nSPMINTEN, 0),
SR_FGT(SYS_SPMINTENSET_EL1, HDFGRTR2, nSPMINTEN, 0),
SR_FGT(SYS_SPMOVSCLR_EL0, HDFGRTR2, nSPMOVS, 0),
SR_FGT(SYS_SPMOVSSET_EL0, HDFGRTR2, nSPMOVS, 0),
SR_FGT(SYS_SPMSCR_EL1, HDFGRTR2, nSPMSCR_EL1, 0),
SR_FGT(SYS_SPMSELR_EL0, HDFGRTR2, nSPMSELR_EL0, 0),
SR_FGT(SYS_TRCITECR_EL1, HDFGRTR2, nTRCITECR_EL1, 0),
SR_FGT(SYS_PMBMAR_EL1, HDFGRTR2, nPMBMAR_EL1, 0),
SR_FGT(SYS_PMSDSFR_EL1, HDFGRTR2, nPMSDSFR_EL1, 0),
SR_FGT(SYS_TRBMPAM_EL1, HDFGRTR2, nTRBMPAM_EL1, 0),
/*
* HDFGWTR_EL2
*
@ -1938,12 +1975,19 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
* read-side mappings, and only the write-side mappings that
* differ from the read side, and the trap handler will pick
* the correct shadow register based on the access type.
*
* Same model applies to the FEAT_FGT2 registers.
*/
SR_FGT(SYS_TRFCR_EL1, HDFGWTR, TRFCR_EL1, 1),
SR_FGT(SYS_TRCOSLAR, HDFGWTR, TRCOSLAR, 1),
SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1),
SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1),
SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1),
/* HDFGWTR2_EL2 */
SR_FGT(SYS_PMZR_EL0, HDFGWTR2, nPMZR_EL0, 0),
SR_FGT(SYS_SPMZR_EL0, HDFGWTR2, nSPMEVCNTRn_EL0, 0),
/*
* HAFGRTR_EL2
*/
@ -1989,6 +2033,20 @@ static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1),
};
/*
* Additional FGTs that do not fire with ESR_EL2.EC==0x18. This table
* isn't used for exception routing, but only as a promise that the
* trap is handled somewhere else.
*/
static const union trap_config non_0x18_fgt[] __initconst = {
FGT(HFGITR, PSBCSYNC, 1),
FGT(HFGITR, nGCSSTR_EL1, 0),
FGT(HFGITR, SVC_EL1, 1),
FGT(HFGITR, SVC_EL0, 1),
FGT(HFGITR, ERET, 1),
FGT(HFGITR2, TSBCSYNC, 1),
};
static union trap_config get_trap_config(u32 sysreg)
{
return (union trap_config) {
@ -2033,6 +2091,130 @@ static u32 encoding_next(u32 encoding)
return sys_reg(op0 + 1, 0, 0, 0, 0);
}
#define FGT_MASKS(__n, __m) \
struct fgt_masks __n = { .str = #__m, .res0 = __m, }
FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0);
FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0);
FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0);
FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0);
FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0);
FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0);
FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0);
FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0);
FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0);
FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0);
FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0);
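
For reference, each FGT_MASKS() invocation simply instantiates a named struct fgt_masks carrying the register's RES0 mask and a printable name; the first line above expands to roughly:

	struct fgt_masks hfgrtr_masks = {
		.str  = "HFGRTR_EL2_RES0",
		.res0 = HFGRTR_EL2_RES0,
	};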
static __init bool aggregate_fgt(union trap_config tc)
{
struct fgt_masks *rmasks, *wmasks;
switch (tc.fgt) {
case HFGRTR_GROUP:
rmasks = &hfgrtr_masks;
wmasks = &hfgwtr_masks;
break;
case HDFGRTR_GROUP:
rmasks = &hdfgrtr_masks;
wmasks = &hdfgwtr_masks;
break;
case HAFGRTR_GROUP:
rmasks = &hafgrtr_masks;
wmasks = NULL;
break;
case HFGITR_GROUP:
rmasks = &hfgitr_masks;
wmasks = NULL;
break;
case HFGRTR2_GROUP:
rmasks = &hfgrtr2_masks;
wmasks = &hfgwtr2_masks;
break;
case HDFGRTR2_GROUP:
rmasks = &hdfgrtr2_masks;
wmasks = &hdfgwtr2_masks;
break;
case HFGITR2_GROUP:
rmasks = &hfgitr2_masks;
wmasks = NULL;
break;
}
/*
* A bit can be reserved in either the R or W register, but
* not both.
*/
if ((BIT(tc.bit) & rmasks->res0) &&
(!wmasks || (BIT(tc.bit) & wmasks->res0)))
return false;
if (tc.pol)
rmasks->mask |= BIT(tc.bit) & ~rmasks->res0;
else
rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0;
if (wmasks) {
if (tc.pol)
wmasks->mask |= BIT(tc.bit) & ~wmasks->res0;
else
wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0;
}
return true;
}
static __init int check_fgt_masks(struct fgt_masks *masks)
{
unsigned long duplicate = masks->mask & masks->nmask;
u64 res0 = masks->res0;
int ret = 0;
if (duplicate) {
int i;
for_each_set_bit(i, &duplicate, 64) {
kvm_err("%s[%d] bit has both polarities\n",
masks->str, i);
}
ret = -EINVAL;
}
masks->res0 = ~(masks->mask | masks->nmask);
if (masks->res0 != res0)
kvm_info("Implicit %s = %016llx, expecting %016llx\n",
masks->str, masks->res0, res0);
return ret;
}
static __init int check_all_fgt_masks(int ret)
{
static struct fgt_masks * const masks[] __initconst = {
&hfgrtr_masks,
&hfgwtr_masks,
&hfgitr_masks,
&hdfgrtr_masks,
&hdfgwtr_masks,
&hafgrtr_masks,
&hfgrtr2_masks,
&hfgwtr2_masks,
&hfgitr2_masks,
&hdfgrtr2_masks,
&hdfgwtr2_masks,
};
int err = 0;
for (int i = 0; i < ARRAY_SIZE(masks); i++)
err |= check_fgt_masks(masks[i]);
return ret ?: err;
}
#define for_each_encoding_in(__x, __s, __e) \
for (u32 __x = __s; __x <= __e; __x = encoding_next(__x))
int __init populate_nv_trap_config(void)
{
int ret = 0;
@ -2041,6 +2223,7 @@ int __init populate_nv_trap_config(void)
BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS));
BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS));
BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS));
BUILD_BUG_ON(__HCRX_EL2_MASK & __HCRX_EL2_nMASK);
for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) {
const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i];
@ -2051,7 +2234,7 @@ int __init populate_nv_trap_config(void)
ret = -EINVAL;
}
for (u32 enc = cgt->encoding; enc <= cgt->end; enc = encoding_next(enc)) {
for_each_encoding_in(enc, cgt->encoding, cgt->end) {
prev = xa_store(&sr_forward_xa, enc,
xa_mk_value(cgt->tc.val), GFP_KERNEL);
if (prev && !xa_is_err(prev)) {
@ -2066,6 +2249,10 @@ int __init populate_nv_trap_config(void)
}
}
if (__HCRX_EL2_RES0 != HCRX_EL2_RES0)
kvm_info("Sanitised HCR_EL2_RES0 = %016llx, expecting %016llx\n",
__HCRX_EL2_RES0, HCRX_EL2_RES0);
kvm_info("nv: %ld coarse grained trap handlers\n",
ARRAY_SIZE(encoding_to_cgt));
@ -2082,23 +2269,39 @@ int __init populate_nv_trap_config(void)
print_nv_trap_error(fgt, "Invalid FGT", ret);
}
tc = get_trap_config(fgt->encoding);
for_each_encoding_in(enc, fgt->encoding, fgt->end) {
tc = get_trap_config(enc);
if (tc.fgt) {
ret = -EINVAL;
print_nv_trap_error(fgt, "Duplicate FGT", ret);
}
if (tc.fgt) {
ret = -EINVAL;
print_nv_trap_error(fgt, "Duplicate FGT", ret);
}
tc.val |= fgt->tc.val;
prev = xa_store(&sr_forward_xa, fgt->encoding,
xa_mk_value(tc.val), GFP_KERNEL);
tc.val |= fgt->tc.val;
prev = xa_store(&sr_forward_xa, enc,
xa_mk_value(tc.val), GFP_KERNEL);
if (xa_is_err(prev)) {
ret = xa_err(prev);
print_nv_trap_error(fgt, "Failed FGT insertion", ret);
if (xa_is_err(prev)) {
ret = xa_err(prev);
print_nv_trap_error(fgt, "Failed FGT insertion", ret);
}
if (!aggregate_fgt(tc)) {
ret = -EINVAL;
print_nv_trap_error(fgt, "FGT bit is reserved", ret);
}
}
}
for (int i = 0; i < ARRAY_SIZE(non_0x18_fgt); i++) {
if (!aggregate_fgt(non_0x18_fgt[i])) {
ret = -EINVAL;
kvm_err("non_0x18_fgt[%d] is reserved\n", i);
}
}
ret = check_all_fgt_masks(ret);
kvm_info("nv: %ld fine grained trap handlers\n",
ARRAY_SIZE(encoding_to_fgt));
@ -2215,11 +2418,11 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr)
return masks->mask[sr - __VNCR_START__].res0;
}
static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read,
u64 val, const union trap_config tc)
static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr,
const union trap_config tc)
{
struct kvm *kvm = vcpu->kvm;
enum vcpu_sysreg sr;
u64 val;
/*
* KVM doesn't know about any FGTs that apply to the host, and hopefully
@ -2228,6 +2431,8 @@ static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read,
if (is_hyp_ctxt(vcpu))
return false;
val = __vcpu_sys_reg(vcpu, sr);
if (tc.pol)
return (val & BIT(tc.bit));
@ -2242,38 +2447,17 @@ static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read,
if (val & BIT(tc.bit))
return false;
switch ((enum fgt_group_id)tc.fgt) {
case HFGxTR_GROUP:
sr = is_read ? HFGRTR_EL2 : HFGWTR_EL2;
break;
case HDFGRTR_GROUP:
sr = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2;
break;
case HAFGRTR_GROUP:
sr = HAFGRTR_EL2;
break;
case HFGITR_GROUP:
sr = HFGITR_EL2;
break;
default:
WARN_ONCE(1, "Unhandled FGT group");
return false;
}
return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit));
}
bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index)
{
enum vcpu_sysreg fgtreg;
union trap_config tc;
enum trap_behaviour b;
bool is_read;
u32 sysreg;
u64 esr, val;
u64 esr;
esr = kvm_vcpu_get_esr(vcpu);
sysreg = esr_sys64_to_sysreg(esr);
@ -2319,26 +2503,20 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index)
case __NO_FGT_GROUP__:
break;
case HFGxTR_GROUP:
if (is_read)
val = __vcpu_sys_reg(vcpu, HFGRTR_EL2);
else
val = __vcpu_sys_reg(vcpu, HFGWTR_EL2);
case HFGRTR_GROUP:
fgtreg = is_read ? HFGRTR_EL2 : HFGWTR_EL2;
break;
case HDFGRTR_GROUP:
if (is_read)
val = __vcpu_sys_reg(vcpu, HDFGRTR_EL2);
else
val = __vcpu_sys_reg(vcpu, HDFGWTR_EL2);
fgtreg = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2;
break;
case HAFGRTR_GROUP:
val = __vcpu_sys_reg(vcpu, HAFGRTR_EL2);
fgtreg = HAFGRTR_EL2;
break;
case HFGITR_GROUP:
val = __vcpu_sys_reg(vcpu, HFGITR_EL2);
fgtreg = HFGITR_EL2;
switch (tc.fgf) {
u64 tmp;
@ -2352,13 +2530,26 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index)
}
break;
case __NR_FGT_GROUP_IDS__:
case HFGRTR2_GROUP:
fgtreg = is_read ? HFGRTR2_EL2 : HFGWTR2_EL2;
break;
case HDFGRTR2_GROUP:
fgtreg = is_read ? HDFGRTR2_EL2 : HDFGWTR2_EL2;
break;
case HFGITR2_GROUP:
fgtreg = HFGITR2_EL2;
break;
default:
/* Something is really wrong, bail out */
WARN_ONCE(1, "__NR_FGT_GROUP_IDS__");
WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n",
sysreg, tc.val);
goto local;
}
if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc))
if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, fgtreg, tc))
goto inject;
b = compute_trap_behaviour(vcpu, tc);
@ -2471,13 +2662,6 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
{
u64 spsr, elr, esr;
/*
* Forward this trap to the virtual EL2 if the virtual
* HCR_EL2.NV bit is set and this is coming from !EL2.
*/
if (forward_hcr_traps(vcpu, HCR_NV))
return;
spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2);
spsr = kvm_check_illegal_exception_return(vcpu, spsr);


@ -10,6 +10,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/ubsan.h>
#include <asm/esr.h>
#include <asm/exception.h>
@ -298,6 +299,81 @@ static int handle_svc(struct kvm_vcpu *vcpu)
return 1;
}
static int kvm_handle_gcs(struct kvm_vcpu *vcpu)
{
/* We don't expect GCS, so treat it with contempt */
if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, GCS, IMP))
WARN_ON_ONCE(1);
kvm_inject_undefined(vcpu);
return 1;
}
static int handle_other(struct kvm_vcpu *vcpu)
{
bool is_l2 = vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu);
u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2);
u64 esr = kvm_vcpu_get_esr(vcpu);
u64 iss = ESR_ELx_ISS(esr);
struct kvm *kvm = vcpu->kvm;
bool allowed, fwd = false;
/*
* We only trap for two reasons:
*
* - the feature is disabled, and the only outcome is to
* generate an UNDEF.
*
* - the feature is enabled, but a NV guest wants to trap the
* feature used by its L2 guest. We forward the exception in
* this case.
*
* What we don't expect is to end up here if the guest is
* expected to be able to directly use the feature, hence the
* WARN_ON below.
*/
switch (iss) {
case ESR_ELx_ISS_OTHER_ST64BV:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V);
if (is_l2)
fwd = !(hcrx & HCRX_EL2_EnASR);
break;
case ESR_ELx_ISS_OTHER_ST64BV0:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA);
if (is_l2)
fwd = !(hcrx & HCRX_EL2_EnAS0);
break;
case ESR_ELx_ISS_OTHER_LDST64B:
allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64);
if (is_l2)
fwd = !(hcrx & HCRX_EL2_EnALS);
break;
case ESR_ELx_ISS_OTHER_TSBCSYNC:
allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1);
if (is_l2)
fwd = (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC);
break;
case ESR_ELx_ISS_OTHER_PSBCSYNC:
allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5);
if (is_l2)
fwd = (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC);
break;
default:
/* Clearly, we're missing something. */
WARN_ON_ONCE(1);
allowed = false;
}
WARN_ON_ONCE(allowed && !fwd);
if (allowed && fwd)
kvm_inject_nested_sync(vcpu, esr);
else
kvm_inject_undefined(vcpu);
return 1;
}
static exit_handle_fn arm_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec,
[ESR_ELx_EC_WFx] = kvm_handle_wfx,
@ -307,6 +383,7 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_CP14_LS] = kvm_handle_cp14_load_store,
[ESR_ELx_EC_CP10_ID] = kvm_handle_cp10_id,
[ESR_ELx_EC_CP14_64] = kvm_handle_cp14_64,
[ESR_ELx_EC_OTHER] = handle_other,
[ESR_ELx_EC_HVC32] = handle_hvc,
[ESR_ELx_EC_SMC32] = handle_smc,
[ESR_ELx_EC_HVC64] = handle_hvc,
@ -317,6 +394,7 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_ERET] = kvm_handle_eret,
[ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort,
[ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort,
[ESR_ELx_EC_DABT_CUR] = kvm_handle_vncr_abort,
[ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
[ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug,
[ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
@ -324,6 +402,7 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_BRK64] = kvm_handle_guest_debug,
[ESR_ELx_EC_FP_ASIMD] = kvm_handle_fpasimd,
[ESR_ELx_EC_PAC] = kvm_handle_ptrauth,
[ESR_ELx_EC_GCS] = kvm_handle_gcs,
};
static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
@ -474,6 +553,11 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
print_nvhe_hyp_panic("BUG", panic_addr);
} else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) {
kvm_nvhe_report_cfi_failure(panic_addr);
} else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) &&
ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
esr_is_ubsan_brk(esr)) {
print_nvhe_hyp_panic(report_ubsan_failure(esr & UBSAN_BRK_MASK),
panic_addr);
} else {
print_nvhe_hyp_panic("panic", panic_addr);
}


@ -65,12 +65,56 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
#define reg_to_fgt_masks(reg) \
({ \
struct fgt_masks *m; \
switch(reg) { \
case HFGRTR_EL2: \
m = &hfgrtr_masks; \
break; \
case HFGWTR_EL2: \
m = &hfgwtr_masks; \
break; \
case HFGITR_EL2: \
m = &hfgitr_masks; \
break; \
case HDFGRTR_EL2: \
m = &hdfgrtr_masks; \
break; \
case HDFGWTR_EL2: \
m = &hdfgwtr_masks; \
break; \
case HAFGRTR_EL2: \
m = &hafgrtr_masks; \
break; \
case HFGRTR2_EL2: \
m = &hfgrtr2_masks; \
break; \
case HFGWTR2_EL2: \
m = &hfgwtr2_masks; \
break; \
case HFGITR2_EL2: \
m = &hfgitr2_masks; \
break; \
case HDFGRTR2_EL2: \
m = &hdfgrtr2_masks; \
break; \
case HDFGWTR2_EL2: \
m = &hdfgwtr2_masks; \
break; \
default: \
BUILD_BUG_ON(1); \
} \
\
m; \
})
#define compute_clr_set(vcpu, reg, clr, set) \
do { \
u64 hfg; \
hfg = __vcpu_sys_reg(vcpu, reg) & ~__ ## reg ## _RES0; \
set |= hfg & __ ## reg ## _MASK; \
clr |= ~hfg & __ ## reg ## _nMASK; \
u64 hfg = __vcpu_sys_reg(vcpu, reg); \
struct fgt_masks *m = reg_to_fgt_masks(reg); \
set |= hfg & m->mask; \
clr |= ~hfg & m->nmask; \
} while(0)
#define reg_to_fgt_group_id(reg) \
@ -79,7 +123,7 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
switch(reg) { \
case HFGRTR_EL2: \
case HFGWTR_EL2: \
id = HFGxTR_GROUP; \
id = HFGRTR_GROUP; \
break; \
case HFGITR_EL2: \
id = HFGITR_GROUP; \
@ -91,6 +135,17 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
case HAFGRTR_EL2: \
id = HAFGRTR_GROUP; \
break; \
case HFGRTR2_EL2: \
case HFGWTR2_EL2: \
id = HFGRTR2_GROUP; \
break; \
case HFGITR2_EL2: \
id = HFGITR2_GROUP; \
break; \
case HDFGRTR2_EL2: \
case HDFGWTR2_EL2: \
id = HDFGRTR2_GROUP; \
break; \
default: \
BUILD_BUG_ON(1); \
} \
@ -101,13 +156,16 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
#define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \
do { \
u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \
set |= hfg & __ ## reg ## _MASK; \
clr |= hfg & __ ## reg ## _nMASK; \
struct fgt_masks *m = reg_to_fgt_masks(reg); \
set |= hfg & m->mask; \
clr |= hfg & m->nmask; \
} while(0)
#define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set) \
do { \
u64 c = 0, s = 0; \
struct fgt_masks *m = reg_to_fgt_masks(reg); \
u64 c = clr, s = set; \
u64 val; \
\
ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg); \
if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \
@ -115,30 +173,15 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
\
compute_undef_clr_set(vcpu, kvm, reg, c, s); \
\
s |= set; \
c |= clr; \
if (c || s) { \
u64 val = __ ## reg ## _nMASK; \
val |= s; \
val &= ~c; \
write_sysreg_s(val, SYS_ ## reg); \
} \
val = m->nmask; \
val |= s; \
val &= ~c; \
write_sysreg_s(val, SYS_ ## reg); \
} while(0)
#define update_fgt_traps(hctxt, vcpu, kvm, reg) \
update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0)
/*
* Validate the fine grain trap masks.
* Check that the masks do not overlap and that all bits are accounted for.
*/
#define CHECK_FGT_MASKS(reg) \
do { \
BUILD_BUG_ON((__ ## reg ## _MASK) & (__ ## reg ## _nMASK)); \
BUILD_BUG_ON(~((__ ## reg ## _RES0) ^ (__ ## reg ## _MASK) ^ \
(__ ## reg ## _nMASK))); \
} while(0)
static inline bool cpu_has_amu(void)
{
u64 pfr0 = read_sysreg_s(SYS_ID_AA64PFR0_EL1);
@ -152,56 +195,60 @@ static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
struct kvm *kvm = kern_hyp_va(vcpu->kvm);
CHECK_FGT_MASKS(HFGRTR_EL2);
CHECK_FGT_MASKS(HFGWTR_EL2);
CHECK_FGT_MASKS(HFGITR_EL2);
CHECK_FGT_MASKS(HDFGRTR_EL2);
CHECK_FGT_MASKS(HDFGWTR_EL2);
CHECK_FGT_MASKS(HAFGRTR_EL2);
CHECK_FGT_MASKS(HCRX_EL2);
if (!cpus_have_final_cap(ARM64_HAS_FGT))
return;
update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2);
update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0,
cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) ?
HFGxTR_EL2_TCR_EL1_MASK : 0);
HFGWTR_EL2_TCR_EL1_MASK : 0);
update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2);
update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2);
update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2);
if (cpu_has_amu())
update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2);
if (!cpus_have_final_cap(ARM64_HAS_FGT2))
return;
update_fgt_traps(hctxt, vcpu, kvm, HFGRTR2_EL2);
update_fgt_traps(hctxt, vcpu, kvm, HFGWTR2_EL2);
update_fgt_traps(hctxt, vcpu, kvm, HFGITR2_EL2);
update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR2_EL2);
update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR2_EL2);
}
#define __deactivate_fgt(htcxt, vcpu, kvm, reg) \
#define __deactivate_fgt(htcxt, vcpu, reg) \
do { \
if ((vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) || \
kvm->arch.fgu[reg_to_fgt_group_id(reg)]) \
write_sysreg_s(ctxt_sys_reg(hctxt, reg), \
SYS_ ## reg); \
write_sysreg_s(ctxt_sys_reg(hctxt, reg), \
SYS_ ## reg); \
} while(0)
static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt);
struct kvm *kvm = kern_hyp_va(vcpu->kvm);
if (!cpus_have_final_cap(ARM64_HAS_FGT))
return;
__deactivate_fgt(hctxt, vcpu, kvm, HFGRTR_EL2);
if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2);
else
__deactivate_fgt(hctxt, vcpu, kvm, HFGWTR_EL2);
__deactivate_fgt(hctxt, vcpu, kvm, HFGITR_EL2);
__deactivate_fgt(hctxt, vcpu, kvm, HDFGRTR_EL2);
__deactivate_fgt(hctxt, vcpu, kvm, HDFGWTR_EL2);
__deactivate_fgt(hctxt, vcpu, HFGRTR_EL2);
__deactivate_fgt(hctxt, vcpu, HFGWTR_EL2);
__deactivate_fgt(hctxt, vcpu, HFGITR_EL2);
__deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2);
__deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2);
if (cpu_has_amu())
__deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2);
__deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2);
if (!cpus_have_final_cap(ARM64_HAS_FGT2))
return;
__deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2);
__deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2);
__deactivate_fgt(hctxt, vcpu, HFGITR2_EL2);
__deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2);
__deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2);
}
static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
@ -260,12 +307,9 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
u64 hcrx = vcpu->arch.hcrx_el2;
if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
u64 clr = 0, set = 0;
compute_clr_set(vcpu, HCRX_EL2, clr, set);
hcrx |= set;
hcrx &= ~clr;
u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2);
hcrx |= val & __HCRX_EL2_MASK;
hcrx &= ~(~val & __HCRX_EL2_nMASK);
}
ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2);
@ -300,7 +344,7 @@ static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr)
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM))
hcr |= HCR_TVM;
write_sysreg(hcr, hcr_el2);
write_sysreg_hcr(hcr);
if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);


@ -39,12 +39,12 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu,
int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
enum kvm_pgtable_prot prot);
int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm);
int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm);
int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot);
int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm);
int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm);
int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *hyp_vm);
int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm);
int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu);
bool addr_is_memory(phys_addr_t phys);
@ -67,4 +67,10 @@ static __always_inline void __load_host_stage2(void)
else
write_sysreg(0, vttbr_el2);
}
#ifdef CONFIG_NVHE_EL2_DEBUG
void pkvm_ownership_selftest(void *base);
#else
static inline void pkvm_ownership_selftest(void *base) { }
#endif
#endif /* __KVM_NVHE_MEM_PROTECT__ */


@ -8,23 +8,30 @@
#include <linux/types.h>
/*
* Bits 0-1 are reserved to track the memory ownership state of each page:
* 00: The page is owned exclusively by the page-table owner.
* 01: The page is owned by the page-table owner, but is shared
* with another entity.
* 10: The page is shared with, but not owned by the page-table owner.
* 11: Reserved for future use (lending).
* Bits 0-1 are used to encode the memory ownership state of each page from the
* point of view of a pKVM "component" (host, hyp, guest, ... see enum
* pkvm_component_id):
* 00: The page is owned and exclusively accessible by the component;
* 01: The page is owned and accessible by the component, but is also
* accessible by another component;
* 10: The page is accessible but not owned by the component;
* The storage of this state depends on the component: either in the
* hyp_vmemmap for the host and hyp states or in PTE software bits for guests.
*/
enum pkvm_page_state {
PKVM_PAGE_OWNED = 0ULL,
PKVM_PAGE_SHARED_OWNED = BIT(0),
PKVM_PAGE_SHARED_BORROWED = BIT(1),
__PKVM_PAGE_RESERVED = BIT(0) | BIT(1),
/* Meta-states which aren't encoded directly in the PTE's SW bits */
PKVM_NOPAGE = BIT(2),
/*
* 'Meta-states' are not stored directly in PTE SW bits for guest
* states, but inferred from the context (e.g. invalid PTE entries).
* For the host and hyp, meta-states are stored directly in the
* struct hyp_page.
*/
PKVM_NOPAGE = BIT(0) | BIT(1),
};
#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED)
#define PKVM_PAGE_STATE_MASK (BIT(0) | BIT(1))
#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
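
For guests, this 2-bit state travels in the PTE software bits selected by PKVM_PAGE_STATE_PROT_MASK. The actual pkvm_mkstate()/pkvm_getstate() bodies are not visible in this hunk; a sketch of the packing they imply (illustrative names, assuming <linux/bitfield.h>):

	/* Sketch only: pack/unpack a pkvm_page_state into the PTE SW bits. */
	static inline enum kvm_pgtable_prot example_mkstate(enum kvm_pgtable_prot prot,
							    enum pkvm_page_state state)
	{
		prot &= ~PKVM_PAGE_STATE_PROT_MASK;
		prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state);
		return prot;
	}

	static inline enum pkvm_page_state example_getstate(enum kvm_pgtable_prot prot)
	{
		return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot);
	}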
static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
@ -44,8 +51,15 @@ struct hyp_page {
u16 refcount;
u8 order;
/* Host (non-meta) state. Guarded by the host stage-2 lock. */
enum pkvm_page_state host_state : 8;
/* Host state. Guarded by the host stage-2 lock. */
unsigned __host_state : 4;
/*
* Complement of the hyp state. Guarded by the hyp stage-1 lock. We use
* the complement so that the initial 0 in __hyp_state_comp (due to the
* entire vmemmap starting off zeroed) encodes PKVM_NOPAGE.
*/
unsigned __hyp_state_comp : 4;
u32 host_share_guest_count;
};
@ -82,6 +96,26 @@ static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys)
#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
static inline enum pkvm_page_state get_host_state(struct hyp_page *p)
{
return p->__host_state;
}
static inline void set_host_state(struct hyp_page *p, enum pkvm_page_state state)
{
p->__host_state = state;
}
static inline enum pkvm_page_state get_hyp_state(struct hyp_page *p)
{
return p->__hyp_state_comp ^ PKVM_PAGE_STATE_MASK;
}
static inline void set_hyp_state(struct hyp_page *p, enum pkvm_page_state state)
{
p->__hyp_state_comp = state ^ PKVM_PAGE_STATE_MASK;
}
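
A quick worked example of why storing the complement makes a zeroed vmemmap come up as PKVM_NOPAGE:

	/*
	 * Fresh entry:        __hyp_state_comp == 0
	 *   get_hyp_state():  0 ^ PKVM_PAGE_STATE_MASK == BIT(0) | BIT(1) == PKVM_NOPAGE
	 * set_hyp_state(p, PKVM_PAGE_OWNED):
	 *   stores PKVM_PAGE_OWNED ^ PKVM_PAGE_STATE_MASK == 0b11, and reading it
	 *   back XORs the mask away again, returning PKVM_PAGE_OWNED.
	 */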
/*
* Refcounting for 'struct hyp_page'.
* hyp_pool::lock must be held if atomic access to the refcount is required.


@ -13,9 +13,11 @@
extern struct kvm_pgtable pkvm_pgtable;
extern hyp_spinlock_t pkvm_pgd_lock;
int hyp_create_pcpu_fixmap(void);
int hyp_create_fixmap(void);
void *hyp_fixmap_map(phys_addr_t phys);
void hyp_fixmap_unmap(void);
void *hyp_fixblock_map(phys_addr_t phys, size_t *size);
void hyp_fixblock_unmap(void);
int hyp_create_idmap(u32 hyp_va_bits);
int hyp_map_vectors(void);


@ -99,3 +99,9 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS), $(KBUILD_CFLAG
# causes a build failure. Remove profile optimization flags.
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS))
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables
ifeq ($(CONFIG_UBSAN_KVM_EL2),y)
UBSAN_SANITIZE := y
# Always use brk and not hooks
ccflags-y += $(CFLAGS_UBSAN_TRAP)
endif


@ -124,7 +124,7 @@ SYM_FUNC_START(__hyp_do_panic)
/* Ensure host stage-2 is disabled */
mrs x0, hcr_el2
bic x0, x0, #HCR_VM
msr hcr_el2, x0
msr_hcr_el2 x0
isb
tlbi vmalls12e1
dsb nsh


@ -100,7 +100,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
msr mair_el2, x1
ldr x1, [x0, #NVHE_INIT_HCR_EL2]
msr hcr_el2, x1
msr_hcr_el2 x1
mov x2, #HCR_E2H
and x2, x1, x2
@ -262,7 +262,7 @@ reset:
alternative_if ARM64_KVM_PROTECTED_MODE
mov_q x5, HCR_HOST_NVHE_FLAGS
msr hcr_el2, x5
msr_hcr_el2 x5
alternative_else_nop_endif
/* Install stub vectors */


@ -123,10 +123,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt;
hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
/* Limit guest vector length to the maximum supported by the host. */
hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl);
hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE);
hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) &
@ -249,7 +245,8 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, pfn, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
DECLARE_REG(u64, nr_pages, host_ctxt, 3);
DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
struct pkvm_hyp_vcpu *hyp_vcpu;
int ret = -EINVAL;
@ -264,7 +261,7 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt)
if (ret)
goto out;
ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot);
ret = __pkvm_host_share_guest(pfn, gfn, nr_pages, hyp_vcpu, prot);
out:
cpu_reg(host_ctxt, 1) = ret;
}
@ -273,6 +270,7 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
DECLARE_REG(u64, nr_pages, host_ctxt, 3);
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
@ -283,7 +281,7 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt)
if (!hyp_vm)
goto out;
ret = __pkvm_host_unshare_guest(gfn, hyp_vm);
ret = __pkvm_host_unshare_guest(gfn, nr_pages, hyp_vm);
put_pkvm_hyp_vm(hyp_vm);
out:
cpu_reg(host_ctxt, 1) = ret;
@ -312,6 +310,7 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
DECLARE_REG(u64, nr_pages, host_ctxt, 3);
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
@ -322,7 +321,7 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt
if (!hyp_vm)
goto out;
ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm);
ret = __pkvm_host_wrprotect_guest(gfn, nr_pages, hyp_vm);
put_pkvm_hyp_vm(hyp_vm);
out:
cpu_reg(host_ctxt, 1) = ret;
@ -332,7 +331,8 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho
{
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
DECLARE_REG(u64, gfn, host_ctxt, 2);
DECLARE_REG(bool, mkold, host_ctxt, 3);
DECLARE_REG(u64, nr_pages, host_ctxt, 3);
DECLARE_REG(bool, mkold, host_ctxt, 4);
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
@ -343,7 +343,7 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho
if (!hyp_vm)
goto out;
ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm);
ret = __pkvm_host_test_clear_young_guest(gfn, nr_pages, mkold, hyp_vm);
put_pkvm_hyp_vm(hyp_vm);
out:
cpu_reg(host_ctxt, 1) = ret;


@ -25,5 +25,7 @@ SECTIONS {
BEGIN_HYP_SECTION(.data..percpu)
PERCPU_INPUT(L1_CACHE_BYTES)
END_HYP_SECTION
HYP_SECTION(.bss)
HYP_SECTION(.data)
}


@ -60,6 +60,11 @@ static void hyp_unlock_component(void)
hyp_spin_unlock(&pkvm_pgd_lock);
}
#define for_each_hyp_page(__p, __st, __sz) \
for (struct hyp_page *__p = hyp_phys_to_page(__st), \
*__e = __p + ((__sz) >> PAGE_SHIFT); \
__p < __e; __p++)
static void *host_s2_zalloc_pages_exact(size_t size)
{
void *addr = hyp_alloc_pages(&host_s2_pool, get_order(size));
@ -161,12 +166,6 @@ int kvm_host_prepare_stage2(void *pgt_pool_base)
return 0;
}
static bool guest_stage2_force_pte_cb(u64 addr, u64 end,
enum kvm_pgtable_prot prot)
{
return true;
}
static void *guest_s2_zalloc_pages_exact(size_t size)
{
void *addr = hyp_alloc_pages(&current_vm->pool, get_order(size));
@ -217,16 +216,42 @@ static void guest_s2_put_page(void *addr)
hyp_put_page(&current_vm->pool, addr);
}
static void __apply_guest_page(void *va, size_t size,
void (*func)(void *addr, size_t size))
{
size += va - PTR_ALIGN_DOWN(va, PAGE_SIZE);
va = PTR_ALIGN_DOWN(va, PAGE_SIZE);
size = PAGE_ALIGN(size);
while (size) {
size_t map_size = PAGE_SIZE;
void *map;
if (IS_ALIGNED((unsigned long)va, PMD_SIZE) && size >= PMD_SIZE)
map = hyp_fixblock_map(__hyp_pa(va), &map_size);
else
map = hyp_fixmap_map(__hyp_pa(va));
func(map, map_size);
if (map_size == PMD_SIZE)
hyp_fixblock_unmap();
else
hyp_fixmap_unmap();
size -= map_size;
va += map_size;
}
}
static void clean_dcache_guest_page(void *va, size_t size)
{
__clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
hyp_fixmap_unmap();
__apply_guest_page(va, size, __clean_dcache_guest_page);
}
static void invalidate_icache_guest_page(void *va, size_t size)
{
__invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size);
hyp_fixmap_unmap();
__apply_guest_page(va, size, __invalidate_icache_guest_page);
}
int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
@ -255,8 +280,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
};
guest_lock_component(vm);
ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0,
guest_stage2_force_pte_cb);
ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, NULL);
guest_unlock_component(vm);
if (ret)
return ret;
@ -309,7 +333,7 @@ int __pkvm_prot_finalize(void)
*/
kvm_flush_dcache_to_poc(params, sizeof(*params));
write_sysreg(params->hcr_el2, hcr_el2);
write_sysreg_hcr(params->hcr_el2);
__load_stage2(&host_mmu.arch.mmu, &host_mmu.arch);
/*
@ -467,7 +491,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
return -EAGAIN;
if (pte) {
WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE);
WARN_ON(addr_is_memory(addr) &&
get_host_state(hyp_phys_to_page(addr)) != PKVM_NOPAGE);
return -EPERM;
}
@ -493,10 +518,8 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state)
{
phys_addr_t end = addr + size;
for (; addr < end; addr += PAGE_SIZE)
hyp_phys_to_page(addr)->host_state = state;
for_each_hyp_page(page, addr, size)
set_host_state(page, state);
}
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
@ -618,16 +641,16 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
static int __host_check_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
u64 end = addr + size;
int ret;
ret = check_range_allowed_memory(addr, end);
ret = check_range_allowed_memory(addr, addr + size);
if (ret)
return ret;
hyp_assert_lock_held(&host_mmu.lock);
for (; addr < end; addr += PAGE_SIZE) {
if (hyp_phys_to_page(addr)->host_state != state)
for_each_hyp_page(page, addr, size) {
if (get_host_state(page) != state)
return -EPERM;
}
@ -637,7 +660,7 @@ static int __host_check_page_state_range(u64 addr, u64 size,
static int __host_set_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) {
if (get_host_state(hyp_phys_to_page(addr)) == PKVM_NOPAGE) {
int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
if (ret)
@ -649,24 +672,20 @@ static int __host_set_page_state_range(u64 addr, u64 size,
return 0;
}
static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr)
static void __hyp_set_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state)
{
if (!kvm_pte_valid(pte))
return PKVM_NOPAGE;
return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte));
for_each_hyp_page(page, phys, size)
set_hyp_state(page, state);
}
static int __hyp_check_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
static int __hyp_check_page_state_range(phys_addr_t phys, u64 size, enum pkvm_page_state state)
{
struct check_walk_data d = {
.desired = state,
.get_page_state = hyp_get_page_state,
};
for_each_hyp_page(page, phys, size) {
if (get_hyp_state(page) != state)
return -EPERM;
}
hyp_assert_lock_held(&pkvm_pgd_lock);
return check_page_state_range(&pkvm_pgtable, addr, size, &d);
return 0;
}
static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr)
@ -677,10 +696,9 @@ static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr)
return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
}
static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr,
static int __guest_check_page_state_range(struct pkvm_hyp_vm *vm, u64 addr,
u64 size, enum pkvm_page_state state)
{
struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
struct check_walk_data d = {
.desired = state,
.get_page_state = guest_get_page_state,
@ -693,8 +711,6 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr,
int __pkvm_host_share_hyp(u64 pfn)
{
u64 phys = hyp_pfn_to_phys(pfn);
void *virt = __hyp_va(phys);
enum kvm_pgtable_prot prot;
u64 size = PAGE_SIZE;
int ret;
@ -704,14 +720,11 @@ int __pkvm_host_share_hyp(u64 pfn)
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (ret)
goto unlock;
if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE);
if (ret)
goto unlock;
}
ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE);
if (ret)
goto unlock;
prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED);
WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot));
__hyp_set_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED));
unlock:
@ -734,7 +747,7 @@ int __pkvm_host_unshare_hyp(u64 pfn)
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (ret)
goto unlock;
ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED);
ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
if (ret)
goto unlock;
if (hyp_page_count((void *)virt)) {
@ -742,7 +755,7 @@ int __pkvm_host_unshare_hyp(u64 pfn)
goto unlock;
}
WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size);
__hyp_set_page_state_range(phys, size, PKVM_NOPAGE);
WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED));
unlock:
@ -757,7 +770,6 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
u64 phys = hyp_pfn_to_phys(pfn);
u64 size = PAGE_SIZE * nr_pages;
void *virt = __hyp_va(phys);
enum kvm_pgtable_prot prot;
int ret;
host_lock_component();
@ -766,14 +778,12 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (ret)
goto unlock;
if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE);
if (ret)
goto unlock;
}
ret = __hyp_check_page_state_range(phys, size, PKVM_NOPAGE);
if (ret)
goto unlock;
prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED);
WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot));
__hyp_set_page_state_range(phys, size, PKVM_PAGE_OWNED);
WARN_ON(pkvm_create_mappings_locked(virt, virt + size, PAGE_HYP));
WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP));
unlock:
@ -793,15 +803,14 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
host_lock_component();
hyp_lock_component();
ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED);
ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (ret)
goto unlock;
ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE);
if (ret)
goto unlock;
if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE);
if (ret)
goto unlock;
}
__hyp_set_page_state_range(phys, size, PKVM_NOPAGE);
WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size);
WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST));
@ -816,24 +825,30 @@ int hyp_pin_shared_mem(void *from, void *to)
{
u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
u64 end = PAGE_ALIGN((u64)to);
u64 phys = __hyp_pa(start);
u64 size = end - start;
struct hyp_page *p;
int ret;
host_lock_component();
hyp_lock_component();
ret = __host_check_page_state_range(__hyp_pa(start), size,
PKVM_PAGE_SHARED_OWNED);
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (ret)
goto unlock;
ret = __hyp_check_page_state_range(start, size,
PKVM_PAGE_SHARED_BORROWED);
ret = __hyp_check_page_state_range(phys, size, PKVM_PAGE_SHARED_BORROWED);
if (ret)
goto unlock;
for (cur = start; cur < end; cur += PAGE_SIZE)
hyp_page_ref_inc(hyp_virt_to_page(cur));
for (cur = start; cur < end; cur += PAGE_SIZE) {
p = hyp_virt_to_page(cur);
hyp_page_ref_inc(p);
if (p->refcount == 1)
WARN_ON(pkvm_create_mappings_locked((void *)cur,
(void *)cur + PAGE_SIZE,
PAGE_HYP));
}
unlock:
hyp_unlock_component();
@ -846,12 +861,17 @@ void hyp_unpin_shared_mem(void *from, void *to)
{
u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE);
u64 end = PAGE_ALIGN((u64)to);
struct hyp_page *p;
host_lock_component();
hyp_lock_component();
for (cur = start; cur < end; cur += PAGE_SIZE)
hyp_page_ref_dec(hyp_virt_to_page(cur));
for (cur = start; cur < end; cur += PAGE_SIZE) {
p = hyp_virt_to_page(cur);
if (p->refcount == 1)
WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, cur, PAGE_SIZE) != PAGE_SIZE);
hyp_page_ref_dec(p);
}
hyp_unlock_component();
host_unlock_component();
@ -887,49 +907,84 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
return ret;
}
int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu,
static int __guest_check_transition_size(u64 phys, u64 ipa, u64 nr_pages, u64 *size)
{
size_t block_size;
if (nr_pages == 1) {
*size = PAGE_SIZE;
return 0;
}
/* We only support huge mappings at the second-to-last level */
block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1);
if (nr_pages != block_size >> PAGE_SHIFT)
return -EINVAL;
if (!IS_ALIGNED(phys | ipa, block_size))
return -EINVAL;
*size = block_size;
return 0;
}
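
To make the constraint concrete, here is a worked example assuming a 4KiB granule (PAGE_SHIFT == 12, KVM_PGTABLE_LAST_LEVEL == 3), so the only block level accepted is level 2:

/*
 * Worked example, 4KiB granule assumed:
 *
 *   block_size = kvm_granule_size(KVM_PGTABLE_LAST_LEVEL - 1)
 *              = kvm_granule_size(2) = 2MiB
 *
 * so the only multi-page transition accepted is nr_pages == 512
 * (2MiB >> 12), with both phys and ipa 2MiB-aligned; anything else
 * returns -EINVAL and the caller has to fall back to single pages.
 */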
int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu *vcpu,
enum kvm_pgtable_prot prot)
{
struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
u64 phys = hyp_pfn_to_phys(pfn);
u64 ipa = hyp_pfn_to_phys(gfn);
struct hyp_page *page;
u64 size;
int ret;
if (prot & ~KVM_PGTABLE_PROT_RWX)
return -EINVAL;
ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
ret = __guest_check_transition_size(phys, ipa, nr_pages, &size);
if (ret)
return ret;
ret = check_range_allowed_memory(phys, phys + size);
if (ret)
return ret;
host_lock_component();
guest_lock_component(vm);
ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE);
ret = __guest_check_page_state_range(vm, ipa, size, PKVM_NOPAGE);
if (ret)
goto unlock;
page = hyp_phys_to_page(phys);
switch (page->host_state) {
case PKVM_PAGE_OWNED:
WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED));
break;
case PKVM_PAGE_SHARED_OWNED:
if (page->host_share_guest_count)
break;
/* Only host to np-guest multi-sharing is tolerated */
WARN_ON(1);
fallthrough;
default:
ret = -EPERM;
goto unlock;
for_each_hyp_page(page, phys, size) {
switch (get_host_state(page)) {
case PKVM_PAGE_OWNED:
continue;
case PKVM_PAGE_SHARED_OWNED:
if (page->host_share_guest_count == U32_MAX) {
ret = -EBUSY;
goto unlock;
}
/* Only host to np-guest multi-sharing is tolerated */
if (page->host_share_guest_count)
continue;
fallthrough;
default:
ret = -EPERM;
goto unlock;
}
}
WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys,
for_each_hyp_page(page, phys, size) {
set_host_state(page, PKVM_PAGE_SHARED_OWNED);
page->host_share_guest_count++;
}
WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, size, phys,
pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED),
&vcpu->vcpu.arch.pkvm_memcache, 0));
page->host_share_guest_count++;
unlock:
guest_unlock_component(vm);
@ -938,10 +993,9 @@ unlock:
return ret;
}
static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa)
static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa, u64 size)
{
enum pkvm_page_state state;
struct hyp_page *page;
kvm_pte_t pte;
u64 phys;
s8 level;
@ -952,7 +1006,7 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip
return ret;
if (!kvm_pte_valid(pte))
return -ENOENT;
if (level != KVM_PGTABLE_LAST_LEVEL)
if (kvm_granule_size(level) != size)
return -E2BIG;
state = guest_get_page_state(pte, ipa);
@ -960,43 +1014,49 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip
return -EPERM;
phys = kvm_pte_to_phys(pte);
ret = check_range_allowed_memory(phys, phys + PAGE_SIZE);
ret = check_range_allowed_memory(phys, phys + size);
if (WARN_ON(ret))
return ret;
page = hyp_phys_to_page(phys);
if (page->host_state != PKVM_PAGE_SHARED_OWNED)
return -EPERM;
if (WARN_ON(!page->host_share_guest_count))
return -EINVAL;
for_each_hyp_page(page, phys, size) {
if (get_host_state(page) != PKVM_PAGE_SHARED_OWNED)
return -EPERM;
if (WARN_ON(!page->host_share_guest_count))
return -EINVAL;
}
*__phys = phys;
return 0;
}
int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm)
int __pkvm_host_unshare_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm)
{
u64 ipa = hyp_pfn_to_phys(gfn);
struct hyp_page *page;
u64 phys;
u64 size, phys;
int ret;
ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
if (ret)
return ret;
host_lock_component();
guest_lock_component(vm);
ret = __check_host_shared_guest(vm, &phys, ipa);
ret = __check_host_shared_guest(vm, &phys, ipa, size);
if (ret)
goto unlock;
ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE);
ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, size);
if (ret)
goto unlock;
page = hyp_phys_to_page(phys);
page->host_share_guest_count--;
if (!page->host_share_guest_count)
WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED));
for_each_hyp_page(page, phys, size) {
/* __check_host_shared_guest() protects against underflow */
page->host_share_guest_count--;
if (!page->host_share_guest_count)
set_host_state(page, PKVM_PAGE_OWNED);
}
unlock:
guest_unlock_component(vm);
@ -1005,7 +1065,7 @@ unlock:
return ret;
}
static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa)
static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa, u64 size)
{
u64 phys;
int ret;
@ -1016,7 +1076,7 @@ static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa)
host_lock_component();
guest_lock_component(vm);
ret = __check_host_shared_guest(vm, &phys, ipa);
ret = __check_host_shared_guest(vm, &phys, ipa, size);
guest_unlock_component(vm);
host_unlock_component();
@ -1036,7 +1096,7 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_
if (prot & ~KVM_PGTABLE_PROT_RWX)
return -EINVAL;
assert_host_shared_guest(vm, ipa);
assert_host_shared_guest(vm, ipa, PAGE_SIZE);
guest_lock_component(vm);
ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0);
guest_unlock_component(vm);
@ -1044,33 +1104,41 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_
return ret;
}
int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm)
int __pkvm_host_wrprotect_guest(u64 gfn, u64 nr_pages, struct pkvm_hyp_vm *vm)
{
u64 ipa = hyp_pfn_to_phys(gfn);
u64 size, ipa = hyp_pfn_to_phys(gfn);
int ret;
if (pkvm_hyp_vm_is_protected(vm))
return -EPERM;
assert_host_shared_guest(vm, ipa);
ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
if (ret)
return ret;
assert_host_shared_guest(vm, ipa, size);
guest_lock_component(vm);
ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE);
ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, size);
guest_unlock_component(vm);
return ret;
}
int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm)
int __pkvm_host_test_clear_young_guest(u64 gfn, u64 nr_pages, bool mkold, struct pkvm_hyp_vm *vm)
{
u64 ipa = hyp_pfn_to_phys(gfn);
u64 size, ipa = hyp_pfn_to_phys(gfn);
int ret;
if (pkvm_hyp_vm_is_protected(vm))
return -EPERM;
assert_host_shared_guest(vm, ipa);
ret = __guest_check_transition_size(0, ipa, nr_pages, &size);
if (ret)
return ret;
assert_host_shared_guest(vm, ipa, size);
guest_lock_component(vm);
ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold);
ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, size, mkold);
guest_unlock_component(vm);
return ret;
@ -1084,10 +1152,210 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu)
if (pkvm_hyp_vm_is_protected(vm))
return -EPERM;
assert_host_shared_guest(vm, ipa);
assert_host_shared_guest(vm, ipa, PAGE_SIZE);
guest_lock_component(vm);
kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0);
guest_unlock_component(vm);
return 0;
}
#ifdef CONFIG_NVHE_EL2_DEBUG
struct pkvm_expected_state {
enum pkvm_page_state host;
enum pkvm_page_state hyp;
enum pkvm_page_state guest[2]; /* [ gfn, gfn + 1 ] */
};
static struct pkvm_expected_state selftest_state;
static struct hyp_page *selftest_page;
static struct pkvm_hyp_vm selftest_vm = {
.kvm = {
.arch = {
.mmu = {
.arch = &selftest_vm.kvm.arch,
.pgt = &selftest_vm.pgt,
},
},
},
};
static struct pkvm_hyp_vcpu selftest_vcpu = {
.vcpu = {
.arch = {
.hw_mmu = &selftest_vm.kvm.arch.mmu,
},
.kvm = &selftest_vm.kvm,
},
};
static void init_selftest_vm(void *virt)
{
struct hyp_page *p = hyp_virt_to_page(virt);
int i;
selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt));
for (i = 0; i < pkvm_selftest_pages(); i++) {
if (p[i].refcount)
continue;
p[i].refcount = 1;
hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i]));
}
}
static u64 selftest_ipa(void)
{
return BIT(selftest_vm.pgt.ia_bits - 1);
}
static void assert_page_state(void)
{
void *virt = hyp_page_to_virt(selftest_page);
u64 size = PAGE_SIZE << selftest_page->order;
struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
u64 phys = hyp_virt_to_phys(virt);
u64 ipa[2] = { selftest_ipa(), selftest_ipa() + PAGE_SIZE };
struct pkvm_hyp_vm *vm;
vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
host_lock_component();
WARN_ON(__host_check_page_state_range(phys, size, selftest_state.host));
host_unlock_component();
hyp_lock_component();
WARN_ON(__hyp_check_page_state_range(phys, size, selftest_state.hyp));
hyp_unlock_component();
guest_lock_component(&selftest_vm);
WARN_ON(__guest_check_page_state_range(vm, ipa[0], size, selftest_state.guest[0]));
WARN_ON(__guest_check_page_state_range(vm, ipa[1], size, selftest_state.guest[1]));
guest_unlock_component(&selftest_vm);
}
#define assert_transition_res(res, fn, ...) \
do { \
WARN_ON(fn(__VA_ARGS__) != res); \
assert_page_state(); \
} while (0)
void pkvm_ownership_selftest(void *base)
{
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_RWX;
void *virt = hyp_alloc_pages(&host_s2_pool, 0);
struct pkvm_hyp_vcpu *vcpu = &selftest_vcpu;
struct pkvm_hyp_vm *vm = &selftest_vm;
u64 phys, size, pfn, gfn;
WARN_ON(!virt);
selftest_page = hyp_virt_to_page(virt);
selftest_page->refcount = 0;
init_selftest_vm(base);
size = PAGE_SIZE << selftest_page->order;
phys = hyp_virt_to_phys(virt);
pfn = hyp_phys_to_pfn(phys);
gfn = hyp_phys_to_pfn(selftest_ipa());
selftest_state.host = PKVM_NOPAGE;
selftest_state.hyp = PKVM_PAGE_OWNED;
selftest_state.guest[0] = selftest_state.guest[1] = PKVM_NOPAGE;
assert_page_state();
assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
selftest_state.host = PKVM_PAGE_OWNED;
selftest_state.hyp = PKVM_NOPAGE;
assert_transition_res(0, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
selftest_state.host = PKVM_PAGE_SHARED_OWNED;
selftest_state.hyp = PKVM_PAGE_SHARED_BORROWED;
assert_transition_res(0, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
assert_transition_res(0, hyp_pin_shared_mem, virt, virt + size);
hyp_unpin_shared_mem(virt, virt + size);
WARN_ON(hyp_page_count(virt) != 1);
assert_transition_res(-EBUSY, __pkvm_host_unshare_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
hyp_unpin_shared_mem(virt, virt + size);
assert_page_state();
WARN_ON(hyp_page_count(virt));
selftest_state.host = PKVM_PAGE_OWNED;
selftest_state.hyp = PKVM_NOPAGE;
assert_transition_res(0, __pkvm_host_unshare_hyp, pfn);
selftest_state.host = PKVM_PAGE_SHARED_OWNED;
selftest_state.hyp = PKVM_NOPAGE;
assert_transition_res(0, __pkvm_host_share_ffa, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-ENOENT, __pkvm_host_unshare_guest, gfn, 1, vm);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
selftest_state.host = PKVM_PAGE_OWNED;
selftest_state.hyp = PKVM_NOPAGE;
assert_transition_res(0, __pkvm_host_unshare_ffa, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_unshare_ffa, pfn, 1);
selftest_state.host = PKVM_PAGE_SHARED_OWNED;
selftest_state.guest[0] = PKVM_PAGE_SHARED_BORROWED;
assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-EPERM, __pkvm_host_share_guest, pfn, gfn, 1, vcpu, prot);
assert_transition_res(-EPERM, __pkvm_host_share_ffa, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_donate_hyp, pfn, 1);
assert_transition_res(-EPERM, __pkvm_host_share_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_host_unshare_hyp, pfn);
assert_transition_res(-EPERM, __pkvm_hyp_donate_host, pfn, 1);
assert_transition_res(-EPERM, hyp_pin_shared_mem, virt, virt + size);
selftest_state.guest[1] = PKVM_PAGE_SHARED_BORROWED;
assert_transition_res(0, __pkvm_host_share_guest, pfn, gfn + 1, 1, vcpu, prot);
WARN_ON(hyp_virt_to_page(virt)->host_share_guest_count != 2);
selftest_state.guest[0] = PKVM_NOPAGE;
assert_transition_res(0, __pkvm_host_unshare_guest, gfn, 1, vm);
selftest_state.guest[1] = PKVM_NOPAGE;
selftest_state.host = PKVM_PAGE_OWNED;
assert_transition_res(0, __pkvm_host_unshare_guest, gfn + 1, 1, vm);
selftest_state.host = PKVM_NOPAGE;
selftest_state.hyp = PKVM_PAGE_OWNED;
assert_transition_res(0, __pkvm_host_donate_hyp, pfn, 1);
selftest_page->refcount = 1;
hyp_put_page(&host_s2_pool, virt);
}
#endif

View File

@ -229,9 +229,8 @@ int hyp_map_vectors(void)
return 0;
}
void *hyp_fixmap_map(phys_addr_t phys)
static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys)
{
struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
kvm_pte_t pte, *ptep = slot->ptep;
pte = *ptep;
@ -243,10 +242,21 @@ void *hyp_fixmap_map(phys_addr_t phys)
return (void *)slot->addr;
}
void *hyp_fixmap_map(phys_addr_t phys)
{
return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys);
}
static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
{
kvm_pte_t *ptep = slot->ptep;
u64 addr = slot->addr;
u32 level;
if (FIELD_GET(KVM_PTE_TYPE, *ptep) == KVM_PTE_TYPE_PAGE)
level = KVM_PGTABLE_LAST_LEVEL;
else
level = KVM_PGTABLE_LAST_LEVEL - 1; /* create_fixblock() guarantees PMD level */
WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
@ -260,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
* https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
*/
dsb(ishst);
__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL);
__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
dsb(ish);
isb();
}
@ -273,9 +283,9 @@ void hyp_fixmap_unmap(void)
static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg);
struct hyp_fixmap_slot *slot = (struct hyp_fixmap_slot *)ctx->arg;
if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL)
if (!kvm_pte_valid(ctx->old) || (ctx->end - ctx->start) != kvm_granule_size(ctx->level))
return -EINVAL;
slot->addr = ctx->addr;
@ -296,13 +306,84 @@ static int create_fixmap_slot(u64 addr, u64 cpu)
struct kvm_pgtable_walker walker = {
.cb = __create_fixmap_slot_cb,
.flags = KVM_PGTABLE_WALK_LEAF,
.arg = (void *)cpu,
.arg = per_cpu_ptr(&fixmap_slots, cpu),
};
return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker);
}
int hyp_create_pcpu_fixmap(void)
#if PAGE_SHIFT < 16
#define HAS_FIXBLOCK
static struct hyp_fixmap_slot hyp_fixblock_slot;
static DEFINE_HYP_SPINLOCK(hyp_fixblock_lock);
#endif
static int create_fixblock(void)
{
#ifdef HAS_FIXBLOCK
struct kvm_pgtable_walker walker = {
.cb = __create_fixmap_slot_cb,
.flags = KVM_PGTABLE_WALK_LEAF,
.arg = &hyp_fixblock_slot,
};
unsigned long addr;
phys_addr_t phys;
int ret, i;
/* Find a RAM phys address, PMD aligned */
for (i = 0; i < hyp_memblock_nr; i++) {
phys = ALIGN(hyp_memory[i].base, PMD_SIZE);
if (phys + PMD_SIZE < (hyp_memory[i].base + hyp_memory[i].size))
break;
}
if (i >= hyp_memblock_nr)
return -EINVAL;
hyp_spin_lock(&pkvm_pgd_lock);
addr = ALIGN(__io_map_base, PMD_SIZE);
ret = __pkvm_alloc_private_va_range(addr, PMD_SIZE);
if (ret)
goto unlock;
ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PMD_SIZE, phys, PAGE_HYP);
if (ret)
goto unlock;
ret = kvm_pgtable_walk(&pkvm_pgtable, addr, PMD_SIZE, &walker);
unlock:
hyp_spin_unlock(&pkvm_pgd_lock);
return ret;
#else
return 0;
#endif
}
void *hyp_fixblock_map(phys_addr_t phys, size_t *size)
{
#ifdef HAS_FIXBLOCK
*size = PMD_SIZE;
hyp_spin_lock(&hyp_fixblock_lock);
return fixmap_map_slot(&hyp_fixblock_slot, phys);
#else
*size = PAGE_SIZE;
return hyp_fixmap_map(phys);
#endif
}
void hyp_fixblock_unmap(void)
{
#ifdef HAS_FIXBLOCK
fixmap_clear_slot(&hyp_fixblock_slot);
hyp_spin_unlock(&hyp_fixblock_lock);
#else
hyp_fixmap_unmap();
#endif
}
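
As a usage illustration only (the real callers sit in the host stage-2 code and are outside this hunk; the function name and the memset are assumptions for the example), a consumer of the block-sized fixmap pairs the two helpers like this:

/* Hypothetical caller, for illustration only. */
static void __maybe_unused zero_through_fixblock(phys_addr_t phys)
{
	size_t size;
	void *va;

	/* phys assumed suitably aligned; size is PMD_SIZE, or PAGE_SIZE without HAS_FIXBLOCK */
	va = hyp_fixblock_map(phys, &size);
	memset(va, 0, size);
	hyp_fixblock_unmap();
}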
int hyp_create_fixmap(void)
{
unsigned long addr, i;
int ret;
@ -322,7 +403,7 @@ int hyp_create_pcpu_fixmap(void)
return ret;
}
return 0;
return create_fixblock();
}
int hyp_create_idmap(u32 hyp_va_bits)

View File

@ -372,6 +372,18 @@ static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu)
hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1);
}
static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu)
{
void *sve_state;
if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE))
return;
sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state);
hyp_unpin_shared_mem(sve_state,
sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu));
}
static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
unsigned int nr_vcpus)
{
@ -384,6 +396,7 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[],
continue;
unpin_host_vcpu(hyp_vcpu->host_vcpu);
unpin_host_sve_state(hyp_vcpu);
}
}
@ -398,12 +411,40 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
pkvm_init_features_from_host(hyp_vm, host_kvm);
}
static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
static int pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu)
{
struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu;
unsigned int sve_max_vl;
size_t sve_state_size;
void *sve_state;
int ret = 0;
if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE))
if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) {
vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED);
return 0;
}
/* Limit guest vector length to the maximum supported by the host. */
sve_max_vl = min(READ_ONCE(host_vcpu->arch.sve_max_vl), kvm_host_sve_max_vl);
sve_state_size = sve_state_size_from_vl(sve_max_vl);
sve_state = kern_hyp_va(READ_ONCE(host_vcpu->arch.sve_state));
if (!sve_state || !sve_state_size) {
ret = -EINVAL;
goto err;
}
ret = hyp_pin_shared_mem(sve_state, sve_state + sve_state_size);
if (ret)
goto err;
vcpu->arch.sve_state = sve_state;
vcpu->arch.sve_max_vl = sve_max_vl;
return 0;
err:
clear_bit(KVM_ARM_VCPU_SVE, vcpu->kvm->arch.vcpu_features);
return ret;
}
static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
@ -432,7 +473,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
if (ret)
goto done;
pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu);
ret = pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu);
done:
if (ret)
unpin_host_vcpu(host_vcpu);

View File

@ -28,6 +28,7 @@ static void *vmemmap_base;
static void *vm_table_base;
static void *hyp_pgt_base;
static void *host_s2_pgt_base;
static void *selftest_base;
static void *ffa_proxy_pages;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static struct hyp_pool hpool;
@ -38,6 +39,11 @@ static int divide_memory_pool(void *virt, unsigned long size)
hyp_early_alloc_init(virt, size);
nr_pages = pkvm_selftest_pages();
selftest_base = hyp_early_alloc_contig(nr_pages);
if (nr_pages && !selftest_base)
return -ENOMEM;
nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page));
vmemmap_base = hyp_early_alloc_contig(nr_pages);
if (!vmemmap_base)
@ -119,6 +125,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
if (ret)
return ret;
ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP);
if (ret)
return ret;
ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
if (ret)
return ret;
@ -180,6 +190,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
enum pkvm_page_state state;
struct hyp_page *page;
phys_addr_t phys;
if (!kvm_pte_valid(ctx->old))
@ -192,19 +203,25 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
if (!addr_is_memory(phys))
return -EINVAL;
page = hyp_phys_to_page(phys);
/*
* Adjust the host stage-2 mappings to match the ownership attributes
* configured in the hypervisor stage-1.
* configured in the hypervisor stage-1, and make sure to propagate them
* to the hyp_vmemmap state.
*/
state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(ctx->old));
switch (state) {
case PKVM_PAGE_OWNED:
set_hyp_state(page, PKVM_PAGE_OWNED);
return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
case PKVM_PAGE_SHARED_OWNED:
hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED;
set_hyp_state(page, PKVM_PAGE_SHARED_OWNED);
set_host_state(page, PKVM_PAGE_SHARED_BORROWED);
break;
case PKVM_PAGE_SHARED_BORROWED:
hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED;
set_hyp_state(page, PKVM_PAGE_SHARED_BORROWED);
set_host_state(page, PKVM_PAGE_SHARED_OWNED);
break;
default:
return -EINVAL;
@ -295,7 +312,7 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
ret = hyp_create_pcpu_fixmap();
ret = hyp_create_fixmap();
if (ret)
goto out;
@ -304,6 +321,8 @@ void __noreturn __pkvm_init_finalise(void)
goto out;
pkvm_hyp_vm_table_init(vm_table_base);
pkvm_ownership_selftest(selftest_base);
out:
/*
* We tail-called to here from handle___pkvm_init() and will not return,

View File

@ -33,6 +33,18 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
struct fgt_masks hfgrtr_masks;
struct fgt_masks hfgwtr_masks;
struct fgt_masks hfgitr_masks;
struct fgt_masks hdfgrtr_masks;
struct fgt_masks hdfgwtr_masks;
struct fgt_masks hafgrtr_masks;
struct fgt_masks hfgrtr2_masks;
struct fgt_masks hfgwtr2_masks;
struct fgt_masks hfgitr2_masks;
struct fgt_masks hdfgrtr2_masks;
struct fgt_masks hdfgwtr2_masks;
extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
@ -142,7 +154,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
__deactivate_traps_common(vcpu);
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
write_sysreg_hcr(this_cpu_ptr(&kvm_init_params)->hcr_el2);
__deactivate_cptr_traps(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);

View File

@ -11,12 +11,6 @@
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
#define KVM_PTE_TYPE BIT(1)
#define KVM_PTE_TYPE_BLOCK 0
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;

View File

@ -446,7 +446,7 @@ u64 __vgic_v3_get_gic_config(void)
if (has_vhe()) {
flags = local_daif_save();
} else {
sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO);
sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO);
isb();
}
@ -461,7 +461,7 @@ u64 __vgic_v3_get_gic_config(void)
if (has_vhe()) {
local_daif_restore(flags);
} else {
sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0);
sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0);
isb();
}
@ -1058,11 +1058,11 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
switch (sysreg) {
case SYS_ICC_IGRPEN0_EL1:
if (is_read &&
(__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
(__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1))
return true;
if (!is_read &&
(__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
(__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1))
return true;
fallthrough;
@ -1079,11 +1079,11 @@ static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
case SYS_ICC_IGRPEN1_EL1:
if (is_read &&
(__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
(__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1))
return true;
if (!is_read &&
(__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
(__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1))
return true;
fallthrough;

View File

@ -48,21 +48,46 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
static u64 __compute_hcr(struct kvm_vcpu *vcpu)
{
u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
u64 hcr = vcpu->arch.hcr_el2;
if (!vcpu_has_nv(vcpu))
return hcr;
/*
* We rely on the invariant that a vcpu entered from HYP
* context must also exit in the same context, as only an ERET
* instruction can kick us out of it, and we obviously trap
* that sucker. PSTATE.M will get fixed-up on exit.
*/
if (is_hyp_ctxt(vcpu)) {
host_data_set_flag(VCPU_IN_HYP_CONTEXT);
hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB;
if (!vcpu_el2_e2h_is_set(vcpu))
hcr |= HCR_NV1;
write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2);
} else {
host_data_clear_flag(VCPU_IN_HYP_CONTEXT);
if (guest_hcr & HCR_NV) {
u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id()));
/* Inherit the low bits from the actual register */
va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0);
write_sysreg_s(va, SYS_VNCR_EL2);
/* Force NV2 in case the guest is forgetful... */
guest_hcr |= HCR_NV2;
}
}
return hcr | (__vcpu_sys_reg(vcpu, HCR_EL2) & ~NV_HCR_GUEST_EXCLUDE);
BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) &&
host_data_test_flag(L1_VNCR_MAPPED));
return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE);
}
static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
@ -184,7 +209,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
___deactivate_traps(vcpu);
write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
if (has_cntpoff()) {
struct timer_map map;
@ -459,6 +484,14 @@ static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
if (ret)
return false;
/*
* If we have to check for any VNCR mapping being invalidated,
* go back to the slow path for further processing.
*/
if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) &&
atomic_read(&vcpu->kvm->arch.vncr_map_count))
return false;
__kvm_skip_instr(vcpu);
return true;
@ -568,9 +601,12 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
/*
* If we were in HYP context on entry, adjust the PSTATE view
* so that the usual helpers work correctly.
* so that the usual helpers work correctly. This enforces our
* invariant that the guest's HYP context status is preserved
* across a run.
*/
if (vcpu_has_nv(vcpu) && (read_sysreg(hcr_el2) & HCR_NV)) {
if (vcpu_has_nv(vcpu) &&
unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) {
u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
switch (mode) {
@ -586,6 +622,10 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
*vcpu_cpsr(vcpu) |= mode;
}
/* Apply extreme paranoia! */
BUG_ON(vcpu_has_nv(vcpu) &&
!!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu));
return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers);
}

View File

@ -63,7 +63,7 @@ static void enter_vmid_context(struct kvm_s2_mmu *mmu,
__load_stage2(mmu, mmu->arch);
val = read_sysreg(hcr_el2);
val &= ~HCR_TGE;
write_sysreg(val, hcr_el2);
write_sysreg_hcr(val);
isb();
}
@ -73,7 +73,7 @@ static void exit_vmid_context(struct tlb_inv_context *cxt)
* We're done with the TLB operation, let's restore the host's
* view of HCR_EL2.
*/
write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
isb();
/* ... and the stage-2 MMU context that we switched away from */

View File

@ -1304,6 +1304,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
if (map_size == PAGE_SIZE)
return true;
/* pKVM only supports PMD_SIZE huge-mappings */
if (is_protected_kvm_enabled() && map_size != PMD_SIZE)
return false;
size = memslot->npages * PAGE_SIZE;
gpa_start = memslot->base_gfn << PAGE_SHIFT;
@ -1540,7 +1544,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* logging_active is guaranteed to never be true for VM_PFNMAP
* memslots.
*/
if (logging_active || is_protected_kvm_enabled()) {
if (logging_active) {
force_pte = true;
vma_shift = PAGE_SHIFT;
} else {

View File

@ -8,6 +8,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/fixmap.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
@ -16,6 +17,24 @@
#include "sys_regs.h"
struct vncr_tlb {
/* The guest's VNCR_EL2 */
u64 gva;
struct s1_walk_info wi;
struct s1_walk_result wr;
u64 hpa;
/* -1 when not mapped on a CPU */
int cpu;
/*
* true if the TLB is valid. Can only be changed with the
* mmu_lock held.
*/
bool valid;
};
/*
* Ratio of live shadow S2 MMU per vcpu. This is a trade-off between
* memory usage and potential number of different sets of S2 PTs in
@ -28,6 +47,7 @@ void kvm_init_nested(struct kvm *kvm)
{
kvm->arch.nested_mmus = NULL;
kvm->arch.nested_mmus_size = 0;
atomic_set(&kvm->arch.vncr_map_count, 0);
}
static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
@ -55,6 +75,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
return -EINVAL;
if (!vcpu->arch.ctxt.vncr_array)
vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT |
__GFP_ZERO);
if (!vcpu->arch.ctxt.vncr_array)
return -ENOMEM;
/*
* Let's treat memory allocation failures as benign: If we fail to
* allocate anything, return an error and keep the allocated array
@ -85,6 +112,9 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);
free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
vcpu->arch.ctxt.vncr_array = NULL;
return ret;
}
@ -405,6 +435,30 @@ static unsigned int ttl_to_size(u8 ttl)
return max_size;
}
static u8 pgshift_level_to_ttl(u16 shift, u8 level)
{
u8 ttl;
switch(shift) {
case 12:
ttl = TLBI_TTL_TG_4K;
break;
case 14:
ttl = TLBI_TTL_TG_16K;
break;
case 16:
ttl = TLBI_TTL_TG_64K;
break;
default:
BUG();
}
ttl <<= 2;
ttl |= level & 3;
return ttl;
}
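
For reference, a quick worked example of the encoding above (the TG values are assumed from the architectural TTL format, where 0b01 denotes a 4KiB granule):

/*
 * Example: pgshift_level_to_ttl(12, 3)
 *   TG  = TLBI_TTL_TG_4K          (0b01, 4KiB granule)
 *   ttl = (0b01 << 2) | (3 & 3)   = 0b0111
 * i.e. a last-level 4KiB entry, which ttl_to_size() turns back into
 * PAGE_SIZE when sizing an invalidation.
 */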
/*
* Compute the equivalent of the TTL field by parsing the shadow PT. The
* granule size is extracted from the cached VTCR_EL2.TG0 while the level is
@ -676,23 +730,36 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
{
/*
* The vCPU kept its reference on the MMU after the last put, keep
* rolling with it.
* If the vCPU kept its reference on the MMU after the last put,
* keep rolling with it.
*/
if (vcpu->arch.hw_mmu)
return;
if (is_hyp_ctxt(vcpu)) {
vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
if (!vcpu->arch.hw_mmu)
vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
} else {
write_lock(&vcpu->kvm->mmu_lock);
vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
write_unlock(&vcpu->kvm->mmu_lock);
if (!vcpu->arch.hw_mmu) {
scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
}
if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV)
kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
}
}
void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
{
/* Unconditionally drop the VNCR mapping if we have one */
if (host_data_test_flag(L1_VNCR_MAPPED)) {
BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id());
BUG_ON(is_hyp_ctxt(vcpu));
clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu));
vcpu->arch.vncr_tlb->cpu = -1;
host_data_clear_flag(L1_VNCR_MAPPED);
atomic_dec(&vcpu->kvm->arch.vncr_map_count);
}
/*
* Keep a reference on the associated stage-2 MMU if the vCPU is
* scheduling out and not in WFI emulation, suggesting it is likely to
@ -743,6 +810,245 @@ int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
return kvm_inject_nested_sync(vcpu, esr_el2);
}
static void invalidate_vncr(struct vncr_tlb *vt)
{
vt->valid = false;
if (vt->cpu != -1)
clear_fixmap(vncr_fixmap(vt->cpu));
}
static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
{
struct kvm_vcpu *vcpu;
unsigned long i;
lockdep_assert_held_write(&kvm->mmu_lock);
if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
return;
kvm_for_each_vcpu(i, vcpu, kvm) {
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
u64 ipa_start, ipa_end, ipa_size;
/*
* Careful here: We end up here from an MMU notifier,
* and this can race against a vcpu not being onlined
* yet, without the pseudo-TLB being allocated.
*
* Skip those, as they obviously don't participate in
* the invalidation at this stage.
*/
if (!vt)
continue;
if (!vt->valid)
continue;
ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
vt->wr.level));
ipa_start = vt->wr.pa & (ipa_size - 1);
ipa_end = ipa_start + ipa_size;
if (ipa_end <= start || ipa_start >= end)
continue;
invalidate_vncr(vt);
}
}
struct s1e2_tlbi_scope {
enum {
TLBI_ALL,
TLBI_VA,
TLBI_VAA,
TLBI_ASID,
} type;
u16 asid;
u64 va;
u64 size;
};
static void invalidate_vncr_va(struct kvm *kvm,
struct s1e2_tlbi_scope *scope)
{
struct kvm_vcpu *vcpu;
unsigned long i;
lockdep_assert_held_write(&kvm->mmu_lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
u64 va_start, va_end, va_size;
if (!vt->valid)
continue;
va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
vt->wr.level));
va_start = vt->gva & (va_size - 1);
va_end = va_start + va_size;
switch (scope->type) {
case TLBI_ALL:
break;
case TLBI_VA:
if (va_end <= scope->va ||
va_start >= (scope->va + scope->size))
continue;
if (vt->wr.nG && vt->wr.asid != scope->asid)
continue;
break;
case TLBI_VAA:
if (va_end <= scope->va ||
va_start >= (scope->va + scope->size))
continue;
break;
case TLBI_ASID:
if (!vt->wr.nG || vt->wr.asid != scope->asid)
continue;
break;
}
invalidate_vncr(vt);
}
}
static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
struct s1e2_tlbi_scope *scope)
{
switch (inst) {
case OP_TLBI_ALLE2:
case OP_TLBI_ALLE2IS:
case OP_TLBI_ALLE2OS:
case OP_TLBI_VMALLE1:
case OP_TLBI_VMALLE1IS:
case OP_TLBI_VMALLE1OS:
case OP_TLBI_ALLE2NXS:
case OP_TLBI_ALLE2ISNXS:
case OP_TLBI_ALLE2OSNXS:
case OP_TLBI_VMALLE1NXS:
case OP_TLBI_VMALLE1ISNXS:
case OP_TLBI_VMALLE1OSNXS:
scope->type = TLBI_ALL;
break;
case OP_TLBI_VAE2:
case OP_TLBI_VAE2IS:
case OP_TLBI_VAE2OS:
case OP_TLBI_VAE1:
case OP_TLBI_VAE1IS:
case OP_TLBI_VAE1OS:
case OP_TLBI_VAE2NXS:
case OP_TLBI_VAE2ISNXS:
case OP_TLBI_VAE2OSNXS:
case OP_TLBI_VAE1NXS:
case OP_TLBI_VAE1ISNXS:
case OP_TLBI_VAE1OSNXS:
case OP_TLBI_VALE2:
case OP_TLBI_VALE2IS:
case OP_TLBI_VALE2OS:
case OP_TLBI_VALE1:
case OP_TLBI_VALE1IS:
case OP_TLBI_VALE1OS:
case OP_TLBI_VALE2NXS:
case OP_TLBI_VALE2ISNXS:
case OP_TLBI_VALE2OSNXS:
case OP_TLBI_VALE1NXS:
case OP_TLBI_VALE1ISNXS:
case OP_TLBI_VALE1OSNXS:
scope->type = TLBI_VA;
scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
if (!scope->size)
scope->size = SZ_1G;
scope->va = (val << 12) & ~(scope->size - 1);
scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
break;
case OP_TLBI_ASIDE1:
case OP_TLBI_ASIDE1IS:
case OP_TLBI_ASIDE1OS:
case OP_TLBI_ASIDE1NXS:
case OP_TLBI_ASIDE1ISNXS:
case OP_TLBI_ASIDE1OSNXS:
scope->type = TLBI_ASID;
scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
break;
case OP_TLBI_VAAE1:
case OP_TLBI_VAAE1IS:
case OP_TLBI_VAAE1OS:
case OP_TLBI_VAAE1NXS:
case OP_TLBI_VAAE1ISNXS:
case OP_TLBI_VAAE1OSNXS:
case OP_TLBI_VAALE1:
case OP_TLBI_VAALE1IS:
case OP_TLBI_VAALE1OS:
case OP_TLBI_VAALE1NXS:
case OP_TLBI_VAALE1ISNXS:
case OP_TLBI_VAALE1OSNXS:
scope->type = TLBI_VAA;
scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
if (!scope->size)
scope->size = SZ_1G;
scope->va = (val << 12) & ~(scope->size - 1);
break;
case OP_TLBI_RVAE2:
case OP_TLBI_RVAE2IS:
case OP_TLBI_RVAE2OS:
case OP_TLBI_RVAE1:
case OP_TLBI_RVAE1IS:
case OP_TLBI_RVAE1OS:
case OP_TLBI_RVAE2NXS:
case OP_TLBI_RVAE2ISNXS:
case OP_TLBI_RVAE2OSNXS:
case OP_TLBI_RVAE1NXS:
case OP_TLBI_RVAE1ISNXS:
case OP_TLBI_RVAE1OSNXS:
case OP_TLBI_RVALE2:
case OP_TLBI_RVALE2IS:
case OP_TLBI_RVALE2OS:
case OP_TLBI_RVALE1:
case OP_TLBI_RVALE1IS:
case OP_TLBI_RVALE1OS:
case OP_TLBI_RVALE2NXS:
case OP_TLBI_RVALE2ISNXS:
case OP_TLBI_RVALE2OSNXS:
case OP_TLBI_RVALE1NXS:
case OP_TLBI_RVALE1ISNXS:
case OP_TLBI_RVALE1OSNXS:
scope->type = TLBI_VA;
scope->va = decode_range_tlbi(val, &scope->size, &scope->asid);
break;
case OP_TLBI_RVAAE1:
case OP_TLBI_RVAAE1IS:
case OP_TLBI_RVAAE1OS:
case OP_TLBI_RVAAE1NXS:
case OP_TLBI_RVAAE1ISNXS:
case OP_TLBI_RVAAE1OSNXS:
case OP_TLBI_RVAALE1:
case OP_TLBI_RVAALE1IS:
case OP_TLBI_RVAALE1OS:
case OP_TLBI_RVAALE1NXS:
case OP_TLBI_RVAALE1ISNXS:
case OP_TLBI_RVAALE1OSNXS:
scope->type = TLBI_VAA;
scope->va = decode_range_tlbi(val, &scope->size, NULL);
break;
}
}
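
As an illustration of the non-range VA decoding above (the bit layout assumed here is the architectural TLBI VA payload: VA[55:12] in bits [43:0], TTL in [47:44], ASID in [63:48]):

/*
 * Example for a non-range TLBI VAE1IS:
 *   scope->size = ttl_to_size(TTL)    -- e.g. 4KiB for TTL 0b0111,
 *                                        or SZ_1G if TTL is zero/unknown
 *   scope->va   = (val << 12) & ~(scope->size - 1)
 *   scope->asid = val[63:48]
 * i.e. a zero TTL makes the invalidation conservatively cover the whole
 * 1GiB-aligned region containing the address.
 */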
void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val)
{
struct s1e2_tlbi_scope scope = {};
compute_s1_tlbi_range(vcpu, inst, val, &scope);
guard(write_lock)(&vcpu->kvm->mmu_lock);
invalidate_vncr_va(vcpu->kvm, &scope);
}
void kvm_nested_s2_wp(struct kvm *kvm)
{
int i;
@ -755,6 +1061,8 @@ void kvm_nested_s2_wp(struct kvm *kvm)
if (kvm_s2_mmu_valid(mmu))
kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
}
kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
}
void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
@ -769,6 +1077,8 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
if (kvm_s2_mmu_valid(mmu))
kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
}
kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
}
void kvm_nested_s2_flush(struct kvm *kvm)
@ -801,6 +1111,295 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
kvm_uninit_stage2_mmu(kvm);
}
/*
* Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter:
*
* - We introduce an internal representation of a vcpu-private TLB,
* representing the mapping between the guest VA contained in VNCR_EL2,
* the IPA the guest's EL2 PTs point to, and the actual PA this lives at.
*
* - On translation fault from a nested VNCR access, we create such a TLB.
* If there is no mapping to describe, the guest inherits the fault.
* Crucially, no actual mapping is done at this stage.
*
* - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above
* TLB exists, we map it in the fixmap for this CPU, and run with it. We
* have to respect the permissions dictated by the guest, but not the
* memory type (FWB is a must).
*
* - Note that we usually don't do a vcpu_load() on the back of a fault
* (unless we are preempted), so the resolution of a translation fault
* must go via a request that will map the VNCR page in the fixmap.
* vcpu_load() might as well use the same mechanism.
*
* - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was
* mapped, we unmap it. Yes it is that simple. The TLB still exists
* though, and may be reused at a later load.
*
* - On permission fault, we simply forward the fault to the guest's EL2.
* Get out of my way.
*
* - On any TLBI for the EL2&0 translation regime, we must find any TLB that
* intersects with the TLBI request, invalidate it, and unmap the page
* from the fixmap. Because we need to look at all the vcpu-private TLBs,
* this requires some wide-ranging locking to ensure that nothing races
* against it. This may require some refcounting to avoid the search when
* no such TLB is present.
*
* - On MMU notifiers, we must invalidate our TLB in a similar way, but
* looking at the IPA instead. The funny part is that there may not be a
* stage-2 mapping for this page if L1 hasn't accessed it using LD/ST
* instructions.
*/
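
Condensing the description above into the pieces of this file (a summary only, no additional behaviour):

/*
 * Rough lifecycle of the pseudo-TLB:
 *
 *   VNCR translation fault   -> kvm_translate_vncr() fills the pseudo-TLB
 *                               and raises KVM_REQ_MAP_L1_VNCR_EL2
 *   request / vcpu_load()    -> kvm_map_l1_vncr() installs the fixmap
 *                               mapping and sets L1_VNCR_MAPPED
 *   vcpu_put()               -> kvm_vcpu_put_hw_mmu() clears the fixmap
 *                               slot and drops L1_VNCR_MAPPED
 *   EL2&0 TLBI / MMU notifier -> invalidate_vncr_va() /
 *                               kvm_invalidate_vncr_ipa() invalidate the
 *                               pseudo-TLB and its mapping
 */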
int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu)
{
if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
return 0;
vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb),
GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.vncr_tlb)
return -ENOMEM;
return 0;
}
static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
{
return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
}
static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
{
bool write_fault, writable;
unsigned long mmu_seq;
struct vncr_tlb *vt;
struct page *page;
u64 va, pfn, gfn;
int ret;
vt = vcpu->arch.vncr_tlb;
/*
* If we're about to walk the EL2 S1 PTs, we must invalidate the
* current TLB, as it could be sampled from another vcpu doing a
* TLBI *IS. A real CPU wouldn't do that, but we only keep a single
* translation, so not much of a choice.
*
* We also prepare the next walk whilst we're at it.
*/
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
invalidate_vncr(vt);
vt->wi = (struct s1_walk_info) {
.regime = TR_EL20,
.as_el0 = false,
.pan = false,
};
vt->wr = (struct s1_walk_result){};
}
guard(srcu)(&vcpu->kvm->srcu);
va = read_vncr_el2(vcpu);
ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va);
if (ret)
return ret;
write_fault = kvm_is_write_fault(vcpu);
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
gfn = vt->wr.pa >> PAGE_SHIFT;
pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page);
if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
return -EFAULT;
scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
return -EAGAIN;
vt->gva = va;
vt->hpa = pfn << PAGE_SHIFT;
vt->valid = true;
vt->cpu = -1;
kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
}
if (vt->wr.pw)
mark_page_dirty(vcpu->kvm, gfn);
return 0;
}
static void inject_vncr_perm(struct kvm_vcpu *vcpu)
{
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
u64 esr = kvm_vcpu_get_esr(vcpu);
/* Adjust the fault level to reflect that of the guest's */
esr &= ~ESR_ELx_FSC;
esr |= FIELD_PREP(ESR_ELx_FSC,
ESR_ELx_FSC_PERM_L(vt->wr.level));
kvm_inject_nested_sync(vcpu, esr);
}
static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
{
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
lockdep_assert_held_read(&vcpu->kvm->mmu_lock);
if (!vt->valid)
return false;
if (read_vncr_el2(vcpu) != vt->gva)
return false;
if (vt->wr.nG) {
u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
u64 ttbr = ((tcr & TCR_A1) ?
vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
vcpu_read_sys_reg(vcpu, TTBR0_EL2));
u16 asid;
asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
!(tcr & TCR_ASID16))
asid &= GENMASK(7, 0);
return asid != vt->wr.asid;
}
return true;
}
int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
{
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
u64 esr = kvm_vcpu_get_esr(vcpu);
BUG_ON(!(esr & ESR_ELx_VNCR_SHIFT));
if (esr_fsc_is_permission_fault(esr)) {
inject_vncr_perm(vcpu);
} else if (esr_fsc_is_translation_fault(esr)) {
bool valid;
int ret;
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
valid = kvm_vncr_tlb_lookup(vcpu);
if (!valid)
ret = kvm_translate_vncr(vcpu);
else
ret = -EPERM;
switch (ret) {
case -EAGAIN:
case -ENOMEM:
/* Let's try again... */
break;
case -EFAULT:
case -EINVAL:
case -ENOENT:
case -EACCES:
/*
* Translation failed, inject the corresponding
* exception back to EL2.
*/
BUG_ON(!vt->wr.failed);
esr &= ~ESR_ELx_FSC;
esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst);
kvm_inject_nested_sync(vcpu, esr);
break;
case -EPERM:
/* Hack to deal with POE until we get kernel support */
inject_vncr_perm(vcpu);
break;
case 0:
break;
}
} else {
WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr);
}
return 1;
}
static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
{
struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
pgprot_t prot;
guard(preempt)();
guard(read_lock)(&vcpu->kvm->mmu_lock);
/*
* The request to map VNCR may have raced against some other
* event, such as an interrupt, and may not be valid anymore.
*/
if (is_hyp_ctxt(vcpu))
return;
/*
* Check that the pseudo-TLB is valid and that VNCR_EL2 still
* contains the expected value. If it doesn't, we simply bail out
* without a mapping -- a transformed MSR/MRS will generate the
* fault and allow us to populate the pseudo-TLB.
*/
if (!vt->valid)
return;
if (read_vncr_el2(vcpu) != vt->gva)
return;
if (vt->wr.nG) {
u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
u64 ttbr = ((tcr & TCR_A1) ?
vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
vcpu_read_sys_reg(vcpu, TTBR0_EL2));
u16 asid;
asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
!(tcr & TCR_ASID16))
asid &= GENMASK(7, 0);
if (asid != vt->wr.asid)
return;
}
vt->cpu = smp_processor_id();
if (vt->wr.pw && vt->wr.pr)
prot = PAGE_KERNEL;
else if (vt->wr.pr)
prot = PAGE_KERNEL_RO;
else
prot = PAGE_NONE;
/*
* We can't map write-only (or no permission at all) in the kernel,
* but the guest can do it if using POE, so we'll have to turn a
* translation fault into a permission fault at runtime.
* FIXME: WO doesn't work at all, need POE support in the kernel.
*/
if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) {
__set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot);
host_data_set_flag(L1_VNCR_MAPPED);
atomic_inc(&vcpu->kvm->arch.vncr_map_count);
}
}
/*
* Our emulated CPU doesn't support all the possible features. For the
* sake of simplicity (and probably mental sanity), wipe out a number
@ -1018,216 +1617,49 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1);
/* HCR_EL2 */
res0 = BIT(48);
res1 = HCR_RW;
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, TWED, IMP))
res0 |= GENMASK(63, 59);
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, MTE2))
res0 |= (HCR_TID5 | HCR_DCT | HCR_ATA);
if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, TTLBxS))
res0 |= (HCR_TTLBIS | HCR_TTLBOS);
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) &&
!kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2))
res0 |= HCR_ENSCXT;
if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, EVT, IMP))
res0 |= (HCR_TOCU | HCR_TICAB | HCR_TID4);
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1))
res0 |= HCR_AMVOFFEN;
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1))
res0 |= HCR_FIEN;
if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, FWB, IMP))
res0 |= HCR_FWB;
/* Implementation choice: NV2 is the only supported config */
if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
res0 |= (HCR_NV2 | HCR_NV | HCR_AT);
if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, NI))
res0 |= HCR_NV1;
if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
res0 |= (HCR_API | HCR_APK);
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP))
res0 |= BIT(39);
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP))
res0 |= (HCR_TEA | HCR_TERR);
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
res0 |= HCR_TLOR;
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
res0 |= HCR_E2H;
if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, E2H0, IMP))
res1 |= HCR_E2H;
get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1);
set_sysreg_masks(kvm, HCR_EL2, res0, res1);
/* HCRX_EL2 */
res0 = HCRX_EL2_RES0;
res1 = HCRX_EL2_RES1;
if (!kvm_has_feat(kvm, ID_AA64ISAR3_EL1, PACM, TRIVIAL_IMP))
res0 |= HCRX_EL2_PACMEn;
if (!kvm_has_feat(kvm, ID_AA64PFR2_EL1, FPMR, IMP))
res0 |= HCRX_EL2_EnFPM;
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP))
res0 |= HCRX_EL2_GCSEn;
if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, SYSREG_128, IMP))
res0 |= HCRX_EL2_EnIDCP128;
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, DEV_ASYNC))
res0 |= (HCRX_EL2_EnSDERR | HCRX_EL2_EnSNERR);
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, DF2, IMP))
res0 |= HCRX_EL2_TMEA;
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP))
res0 |= HCRX_EL2_D128En;
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
res0 |= HCRX_EL2_PTTWI;
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP))
res0 |= HCRX_EL2_SCTLR2En;
if (!kvm_has_tcr2(kvm))
res0 |= HCRX_EL2_TCR2En;
if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, CMOW, IMP))
res0 |= HCRX_EL2_CMOW;
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, NMI, IMP))
res0 |= (HCRX_EL2_VFNMI | HCRX_EL2_VINMI | HCRX_EL2_TALLINT);
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP) ||
!(read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS))
res0 |= HCRX_EL2_SMPME;
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
res0 |= (HCRX_EL2_FGTnXS | HCRX_EL2_FnXS);
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V))
res0 |= HCRX_EL2_EnASR;
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64))
res0 |= HCRX_EL2_EnALS;
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA))
res0 |= HCRX_EL2_EnAS0;
get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1);
set_sysreg_masks(kvm, HCRX_EL2, res0, res1);
/* HFG[RW]TR_EL2 */
res0 = res1 = 0;
if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) &&
kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey |
HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey |
HFGxTR_EL2_APIBKey);
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP))
res0 |= (HFGxTR_EL2_LORC_EL1 | HFGxTR_EL2_LOREA_EL1 |
HFGxTR_EL2_LORID_EL1 | HFGxTR_EL2_LORN_EL1 |
HFGxTR_EL2_LORSA_EL1);
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, CSV2, CSV2_2) &&
!kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2))
res0 |= (HFGxTR_EL2_SCXTNUM_EL1 | HFGxTR_EL2_SCXTNUM_EL0);
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP))
res0 |= HFGxTR_EL2_ICC_IGRPENn_EL1;
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP))
res0 |= (HFGxTR_EL2_ERRIDR_EL1 | HFGxTR_EL2_ERRSELR_EL1 |
HFGxTR_EL2_ERXFR_EL1 | HFGxTR_EL2_ERXCTLR_EL1 |
HFGxTR_EL2_ERXSTATUS_EL1 | HFGxTR_EL2_ERXMISCn_EL1 |
HFGxTR_EL2_ERXPFGF_EL1 | HFGxTR_EL2_ERXPFGCTL_EL1 |
HFGxTR_EL2_ERXPFGCDN_EL1 | HFGxTR_EL2_ERXADDR_EL1);
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA))
res0 |= HFGxTR_EL2_nACCDATA_EL1;
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP))
res0 |= (HFGxTR_EL2_nGCS_EL0 | HFGxTR_EL2_nGCS_EL1);
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP))
res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0);
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
res0 |= HFGxTR_EL2_nRCWMASK_EL1;
if (!kvm_has_s1pie(kvm))
res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1);
if (!kvm_has_s1poe(kvm))
res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1);
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
res0 |= HFGxTR_EL2_nS2POR_EL1;
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP))
res0 |= (HFGxTR_EL2_nMAIR2_EL1 | HFGxTR_EL2_nAMAIR2_EL1);
set_sysreg_masks(kvm, HFGRTR_EL2, res0 | __HFGRTR_EL2_RES0, res1);
set_sysreg_masks(kvm, HFGWTR_EL2, res0 | __HFGWTR_EL2_RES0, res1);
get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1);
set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1);
get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1);
set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1);
/* HDFG[RW]TR_EL2 */
res0 = res1 = 0;
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP))
res0 |= HDFGRTR_EL2_OSDLR_EL1;
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
res0 |= (HDFGRTR_EL2_PMEVCNTRn_EL0 | HDFGRTR_EL2_PMEVTYPERn_EL0 |
HDFGRTR_EL2_PMCCFILTR_EL0 | HDFGRTR_EL2_PMCCNTR_EL0 |
HDFGRTR_EL2_PMCNTEN | HDFGRTR_EL2_PMINTEN |
HDFGRTR_EL2_PMOVS | HDFGRTR_EL2_PMSELR_EL0 |
HDFGRTR_EL2_PMMIR_EL1 | HDFGRTR_EL2_PMUSERENR_EL0 |
HDFGRTR_EL2_PMCEIDn_EL0);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP))
res0 |= (HDFGRTR_EL2_PMBLIMITR_EL1 | HDFGRTR_EL2_PMBPTR_EL1 |
HDFGRTR_EL2_PMBSR_EL1 | HDFGRTR_EL2_PMSCR_EL1 |
HDFGRTR_EL2_PMSEVFR_EL1 | HDFGRTR_EL2_PMSFCR_EL1 |
HDFGRTR_EL2_PMSICR_EL1 | HDFGRTR_EL2_PMSIDR_EL1 |
HDFGRTR_EL2_PMSIRR_EL1 | HDFGRTR_EL2_PMSLATFR_EL1 |
HDFGRTR_EL2_PMBIDR_EL1);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP))
res0 |= (HDFGRTR_EL2_TRC | HDFGRTR_EL2_TRCAUTHSTATUS |
HDFGRTR_EL2_TRCAUXCTLR | HDFGRTR_EL2_TRCCLAIM |
HDFGRTR_EL2_TRCCNTVRn | HDFGRTR_EL2_TRCID |
HDFGRTR_EL2_TRCIMSPECn | HDFGRTR_EL2_TRCOSLSR |
HDFGRTR_EL2_TRCPRGCTLR | HDFGRTR_EL2_TRCSEQSTR |
HDFGRTR_EL2_TRCSSCSRn | HDFGRTR_EL2_TRCSTATR |
HDFGRTR_EL2_TRCVICTLR);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
res0 |= (HDFGRTR_EL2_TRBBASER_EL1 | HDFGRTR_EL2_TRBIDR_EL1 |
HDFGRTR_EL2_TRBLIMITR_EL1 | HDFGRTR_EL2_TRBMAR_EL1 |
HDFGRTR_EL2_TRBPTR_EL1 | HDFGRTR_EL2_TRBSR_EL1 |
HDFGRTR_EL2_TRBTRG_EL1);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP))
res0 |= (HDFGRTR_EL2_nBRBIDR | HDFGRTR_EL2_nBRBCTL |
HDFGRTR_EL2_nBRBDATA);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2))
res0 |= HDFGRTR_EL2_nPMSNEVFR_EL1;
set_sysreg_masks(kvm, HDFGRTR_EL2, res0 | HDFGRTR_EL2_RES0, res1);
/* Reuse the bits from the read-side and add the write-specific stuff */
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
res0 |= (HDFGWTR_EL2_PMCR_EL0 | HDFGWTR_EL2_PMSWINC_EL0);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP))
res0 |= HDFGWTR_EL2_TRCOSLAR;
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
res0 |= HDFGWTR_EL2_TRFCR_EL1;
set_sysreg_masks(kvm, HFGWTR_EL2, res0 | HDFGWTR_EL2_RES0, res1);
get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1);
set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1);
get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1);
set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1);
/* HFGITR_EL2 */
res0 = HFGITR_EL2_RES0;
res1 = HFGITR_EL2_RES1;
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, DPB, DPB2))
res0 |= HFGITR_EL2_DCCVADP;
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2))
res0 |= (HFGITR_EL2_ATS1E1RP | HFGITR_EL2_ATS1E1WP);
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
res0 |= (HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS |
HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS |
HFGITR_EL2_TLBIVAALE1OS | HFGITR_EL2_TLBIVALE1OS |
HFGITR_EL2_TLBIVAAE1OS | HFGITR_EL2_TLBIASIDE1OS |
HFGITR_EL2_TLBIVAE1OS | HFGITR_EL2_TLBIVMALLE1OS);
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
res0 |= (HFGITR_EL2_TLBIRVAALE1 | HFGITR_EL2_TLBIRVALE1 |
HFGITR_EL2_TLBIRVAAE1 | HFGITR_EL2_TLBIRVAE1 |
HFGITR_EL2_TLBIRVAALE1IS | HFGITR_EL2_TLBIRVALE1IS |
HFGITR_EL2_TLBIRVAAE1IS | HFGITR_EL2_TLBIRVAE1IS |
HFGITR_EL2_TLBIRVAALE1OS | HFGITR_EL2_TLBIRVALE1OS |
HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS);
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, IMP))
res0 |= (HFGITR_EL2_CFPRCTX | HFGITR_EL2_DVPRCTX |
HFGITR_EL2_CPPRCTX);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP))
res0 |= (HFGITR_EL2_nBRBINJ | HFGITR_EL2_nBRBIALL);
if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, GCS, IMP))
res0 |= (HFGITR_EL2_nGCSPUSHM_EL1 | HFGITR_EL2_nGCSSTR_EL1 |
HFGITR_EL2_nGCSEPP);
if (!kvm_has_feat(kvm, ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX))
res0 |= HFGITR_EL2_COSPRCTX;
if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP))
res0 |= HFGITR_EL2_ATS1E1A;
get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1);
set_sysreg_masks(kvm, HFGITR_EL2, res0, res1);
/* HAFGRTR_EL2 - not a lot to see here */
res0 = HAFGRTR_EL2_RES0;
res1 = HAFGRTR_EL2_RES1;
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1))
res0 |= ~(res0 | res1);
get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1);
set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);
/* HFG[RW]TR2_EL2 */
get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1);
set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1);
get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1);
set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1);
/* HDFG[RW]TR2_EL2 */
get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1);
set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1);
get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1);
set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1);
/* HFGITR2_EL2 */
get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1);
set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1);
/* TCR2_EL2 */
res0 = TCR2_EL2_RES0;
res1 = TCR2_EL2_RES1;
@ -1318,6 +1750,9 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1);
/* VNCR_EL2 */
set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1);
out:
for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
(void)__vcpu_sys_reg(vcpu, sr);
@ -1338,6 +1773,9 @@ void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
write_unlock(&vcpu->kvm->mmu_lock);
}
if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu))
kvm_map_l1_vncr(vcpu);
/* Must be last, as may switch context! */
if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
kvm_inject_nested_irq(vcpu);


@ -5,12 +5,12 @@
*/
#include <linux/init.h>
#include <linux/interval_tree_generic.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
#include <asm/kvm_mmu.h>
#include <linux/memblock.h>
#include <linux/mutex.h>
#include <linux/sort.h>
#include <asm/kvm_pkvm.h>
@ -24,23 +24,6 @@ static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;
static int cmp_hyp_memblock(const void *p1, const void *p2)
{
const struct memblock_region *r1 = p1;
const struct memblock_region *r2 = p2;
return r1->base < r2->base ? -1 : (r1->base > r2->base);
}
static void __init sort_memblock_regions(void)
{
sort(hyp_memory,
*hyp_memblock_nr_ptr,
sizeof(struct memblock_region),
cmp_hyp_memblock,
NULL);
}
static int __init register_memblock_regions(void)
{
struct memblock_region *reg;
@ -52,7 +35,6 @@ static int __init register_memblock_regions(void)
hyp_memory[*hyp_memblock_nr_ptr] = *reg;
(*hyp_memblock_nr_ptr)++;
}
sort_memblock_regions();
return 0;
}
@ -79,6 +61,7 @@ void __init kvm_hyp_reserve(void)
hyp_mem_pages += host_s2_pgtable_pages();
hyp_mem_pages += hyp_vm_table_pages();
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
hyp_mem_pages += pkvm_selftest_pages();
hyp_mem_pages += hyp_ffa_proxy_pages();
/*
@ -262,6 +245,7 @@ static int __init finalize_pkvm(void)
* at, which would end badly once inaccessible.
*/
kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start);
kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start);
kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
@ -273,80 +257,68 @@ static int __init finalize_pkvm(void)
}
device_initcall_sync(finalize_pkvm);
static int cmp_mappings(struct rb_node *node, const struct rb_node *parent)
static u64 __pkvm_mapping_start(struct pkvm_mapping *m)
{
struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node);
struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node);
if (a->gfn < b->gfn)
return -1;
if (a->gfn > b->gfn)
return 1;
return 0;
return m->gfn * PAGE_SIZE;
}
static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn)
static u64 __pkvm_mapping_end(struct pkvm_mapping *m)
{
struct rb_node *node = root->rb_node, *prev = NULL;
struct pkvm_mapping *mapping;
while (node) {
mapping = rb_entry(node, struct pkvm_mapping, node);
if (mapping->gfn == gfn)
return node;
prev = node;
node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right;
}
return prev;
return (m->gfn + m->nr_pages) * PAGE_SIZE - 1;
}
INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last,
__pkvm_mapping_start, __pkvm_mapping_end, static,
pkvm_mapping);
/*
* __tmp is updated to rb_next(__tmp) *before* entering the body of the loop to allow freeing
* of __map inline.
 * __tmp is advanced to the next mapping *before* entering the body of the loop to allow
 * freeing of __map inline.
*/
#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \
for (struct rb_node *__tmp = find_first_mapping_node(&(__pgt)->pkvm_mappings, \
((__start) >> PAGE_SHIFT)); \
for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings, \
__start, __end - 1); \
__tmp && ({ \
__map = rb_entry(__tmp, struct pkvm_mapping, node); \
__tmp = rb_next(__tmp); \
__map = __tmp; \
__tmp = pkvm_mapping_iter_next(__map, __start, __end - 1); \
true; \
}); \
) \
if (__map->gfn < ((__start) >> PAGE_SHIFT)) \
continue; \
else if (__map->gfn >= ((__end) >> PAGE_SHIFT)) \
break; \
else
)
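The deletion-safe trick described in the comment above is easier to see on a toy structure. A self-contained sketch with a plain singly-linked list standing in for the pkvm interval tree (all names below are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int val;
};

/* The next pointer is fetched in the loop condition, *before* the body
 * runs, so the body is free to release the current element. */
#define for_each_node_safe(head, n)					\
	for (struct node *__tmp = (head);				\
	     __tmp && ((n) = __tmp, __tmp = __tmp->next, 1); )

int main(void)
{
	struct node *head = NULL, *n;

	for (int i = 0; i < 3; i++) {
		n = malloc(sizeof(*n));
		n->val = i;
		n->next = head;
		head = n;
	}

	for_each_node_safe(head, n) {
		printf("%d\n", n->val);
		free(n);	/* safe: __tmp already points past n */
	}
	return 0;
}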
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops)
{
pgt->pkvm_mappings = RB_ROOT;
pgt->pkvm_mappings = RB_ROOT_CACHED;
pgt->mmu = mmu;
return 0;
}
static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end)
{
struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
pkvm_handle_t handle = kvm->arch.pkvm.handle;
struct pkvm_mapping *mapping;
int ret;
if (!handle)
return 0;
for_each_mapping_in_range_safe(pgt, start, end, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn,
mapping->nr_pages);
if (WARN_ON(ret))
return ret;
pkvm_mapping_remove(mapping, &pgt->pkvm_mappings);
kfree(mapping);
}
return 0;
}
void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
pkvm_handle_t handle = kvm->arch.pkvm.handle;
struct pkvm_mapping *mapping;
struct rb_node *node;
if (!handle)
return;
node = rb_first(&pgt->pkvm_mappings);
while (node) {
mapping = rb_entry(node, struct pkvm_mapping, node);
kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn);
node = rb_next(node);
rb_erase(&mapping->node, &pgt->pkvm_mappings);
kfree(mapping);
}
__pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
}
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
@ -360,42 +332,46 @@ int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
u64 pfn = phys >> PAGE_SHIFT;
int ret;
if (size != PAGE_SIZE)
if (size != PAGE_SIZE && size != PMD_SIZE)
return -EINVAL;
lockdep_assert_held_write(&kvm->mmu_lock);
ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot);
if (ret) {
/* Is the gfn already mapped due to a racing vCPU? */
if (ret == -EPERM)
/*
* Calling stage2_map() on top of existing mappings is either happening because of a race
* with another vCPU, or because we're changing between page and block mappings. As per
* user_mem_abort(), same-size permission faults are handled in the relax_perms() path.
*/
mapping = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1);
if (mapping) {
if (size == (mapping->nr_pages * PAGE_SIZE))
return -EAGAIN;
/* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */
ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
if (ret)
return ret;
mapping = NULL;
}
ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, size / PAGE_SIZE, prot);
if (WARN_ON(ret))
return ret;
swap(mapping, cache->mapping);
mapping->gfn = gfn;
mapping->pfn = pfn;
WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm_mappings, cmp_mappings));
mapping->nr_pages = size / PAGE_SIZE;
pkvm_mapping_insert(mapping, &pgt->pkvm_mappings);
return ret;
}
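The comment in the middle of this function carries the whole policy, so here is a deliberately tiny model of it (a single-slot toy, not the pkvm code): an identical overlap means a racing vCPU already installed the mapping, while any other overlap is torn down before the new page or block mapping goes in.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_mapping {
	uint64_t start;
	uint64_t size;
	bool valid;
};

static int toy_map_range(struct toy_mapping *m, uint64_t addr, uint64_t size)
{
	if (m->valid && addr < m->start + m->size && m->start < addr + size) {
		if (m->start == addr && m->size == size)
			return -EAGAIN;	/* a racing vCPU beat us to it */
		m->valid = false;	/* tear down the overlapping entry */
	}

	m->start = addr;
	m->size = size;
	m->valid = true;
	return 0;
}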
int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
pkvm_handle_t handle = kvm->arch.pkvm.handle;
struct pkvm_mapping *mapping;
int ret = 0;
lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(pgt->mmu)->mmu_lock);
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn);
if (WARN_ON(ret))
break;
rb_erase(&mapping->node, &pgt->pkvm_mappings);
kfree(mapping);
}
return ret;
return __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
}
int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
@ -407,7 +383,8 @@ int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) {
ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn);
ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn,
mapping->nr_pages);
if (WARN_ON(ret))
break;
}
@ -422,7 +399,8 @@ int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
__clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE);
__clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn),
PAGE_SIZE * mapping->nr_pages);
return 0;
}
@ -437,7 +415,7 @@ bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64
lockdep_assert_held(&kvm->mmu_lock);
for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping)
young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
mkold);
mapping->nr_pages, mkold);
return young;
}


@ -280,7 +280,7 @@ static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu)
return 0;
hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2));
n = vcpu->kvm->arch.pmcr_n;
n = vcpu->kvm->arch.nr_pmu_counters;
/*
* Programming HPMN to a value greater than PMCR_EL0.N is
@ -608,14 +608,12 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
if (val & ARMV8_PMU_PMCR_P) {
/*
* Unlike other PMU sysregs, the controls in PMCR_EL0 always apply
* to the 'guest' range of counters and never the 'hyp' range.
*/
unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) &
~kvm_pmu_hyp_counter_mask(vcpu) &
~BIT(ARMV8_PMU_CYCLE_IDX);
if (!vcpu_is_el2(vcpu))
mask &= ~kvm_pmu_hyp_counter_mask(vcpu);
for_each_set_bit(i, &mask, 32)
kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true);
}
@ -1027,12 +1025,30 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm)
return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS);
}
static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr)
{
kvm->arch.nr_pmu_counters = nr;
/* Reset MDCR_EL2.HPMN behind the vcpus' back... */
if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) {
struct kvm_vcpu *vcpu;
unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2);
val &= ~MDCR_EL2_HPMN;
val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters);
__vcpu_sys_reg(vcpu, MDCR_EL2) = val;
}
}
}
static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu)
{
lockdep_assert_held(&kvm->arch.config_lock);
kvm->arch.arm_pmu = arm_pmu;
kvm->arch.pmcr_n = kvm_arm_pmu_get_max_counters(kvm);
kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm));
}
/**
@ -1088,6 +1104,20 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
return ret;
}
static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n)
{
struct kvm *kvm = vcpu->kvm;
if (!kvm->arch.arm_pmu)
return -EINVAL;
if (n > kvm_arm_pmu_get_max_counters(kvm))
return -EINVAL;
kvm_arm_set_nr_counters(kvm, n);
return 0;
}
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
struct kvm *kvm = vcpu->kvm;
@ -1184,6 +1214,15 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id);
}
case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: {
unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr;
unsigned int n;
if (get_user(n, uaddr))
return -EFAULT;
return kvm_arm_pmu_v3_set_nr_counters(vcpu, n);
}
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
@ -1222,6 +1261,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
case KVM_ARM_VCPU_PMU_V3_INIT:
case KVM_ARM_VCPU_PMU_V3_FILTER:
case KVM_ARM_VCPU_PMU_V3_SET_PMU:
case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS:
if (kvm_vcpu_has_pmu(vcpu))
return 0;
}
@ -1260,8 +1300,12 @@ u8 kvm_arm_pmu_get_pmuver_limit(void)
u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu)
{
u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0);
u64 n = vcpu->kvm->arch.nr_pmu_counters;
return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N);
if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu))
n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2));
return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N);
}
void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu)


@ -158,6 +158,8 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
if (sve_state)
kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu));
kfree(sve_state);
free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
kfree(vcpu->arch.vncr_tlb);
kfree(vcpu->arch.ccsidr);
}


@ -785,7 +785,7 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 mask = BIT(ARMV8_PMU_CYCLE_IDX);
u8 n = vcpu->kvm->arch.pmcr_n;
u8 n = vcpu->kvm->arch.nr_pmu_counters;
if (n)
mask |= GENMASK(n - 1, 0);
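For a quick sanity check of what the mask above evaluates to: it is simply the cycle-counter bit plus the low n bits. A standalone sketch, assuming the cycle counter sits at bit 31 as ARMV8_PMU_CYCLE_IDX does:

#include <stdio.h>

static unsigned int counter_mask(unsigned int n)
{
	unsigned int mask = 1u << 31;				/* cycle counter */

	if (n)
		mask |= (n >= 32) ? ~0u : ((1u << n) - 1);	/* GENMASK(n - 1, 0) */
	return mask;
}

int main(void)
{
	printf("%#x\n", counter_mask(6));	/* prints 0x8000003f */
	return 0;
}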
@ -1216,8 +1216,9 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
* with the existing KVM behavior.
*/
if (!kvm_vm_has_ran_once(kvm) &&
!vcpu_has_nv(vcpu) &&
new_n <= kvm_arm_pmu_get_max_counters(kvm))
kvm->arch.pmcr_n = new_n;
kvm->arch.nr_pmu_counters = new_n;
mutex_unlock(&kvm->arch.config_lock);
@ -1600,13 +1601,14 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val = sanitise_id_aa64pfr0_el1(vcpu, val);
break;
case SYS_ID_AA64PFR1_EL1:
if (!kvm_has_mte(vcpu->kvm))
if (!kvm_has_mte(vcpu->kvm)) {
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac);
}
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE);
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX);
@ -1959,11 +1961,34 @@ static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
{
u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK;
u8 mte = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE, hw_val);
u8 user_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, user_val);
u8 hw_mte_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, hw_val);
/* See set_id_aa64pfr0_el1 for comment about MPAM */
if ((hw_val & mpam_mask) == (user_val & mpam_mask))
user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK;
/*
 * Previously MTE_frac was hidden from the guest. However, if the
 * hardware supports MTE2 but not MTE_ASYM_FAULT, then a value of 0
 * in this field indicates that the hardware supports MTE_ASYNC,
 * whereas 0xf indicates MTE_ASYNC is not supported.
 *
 * As KVM must accept the values provided by user-space, allow
 * user-space to set ID_AA64PFR1_EL1.MTE_frac to 0 when
 * ID_AA64PFR1_EL1.MTE is 2. However, ignore that value to avoid
 * incorrectly claiming hardware support for MTE_ASYNC in the
 * guest.
*/
if (mte == ID_AA64PFR1_EL1_MTE_MTE2 &&
hw_mte_frac == ID_AA64PFR1_EL1_MTE_frac_NI &&
user_mte_frac == ID_AA64PFR1_EL1_MTE_frac_ASYNC) {
user_val &= ~ID_AA64PFR1_EL1_MTE_frac_MASK;
user_val |= hw_val & ID_AA64PFR1_EL1_MTE_frac_MASK;
}
return set_id_reg(vcpu, rd, user_val);
}
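A compact way to see what the MTE_frac special case above does, with placeholder field offsets (the real shifts come from the generated sysreg definitions, not from this sketch): if the hardware reports MTE2 with MTE_frac == NI (0xf), a user request of MTE_frac == ASYNC (0) is quietly replaced by the hardware value so the guest never sees MTE_ASYNC support that is not there.

#include <stdint.h>

#define FIELD4(val, shift)	(((val) >> (shift)) & 0xf)

static uint64_t sanitise_mte_frac(uint64_t user_val, uint64_t hw_val,
				  unsigned int mte_shift, unsigned int frac_shift)
{
	uint64_t frac_mask = 0xfull << frac_shift;

	if (FIELD4(hw_val, mte_shift) == 2 &&		/* MTE2 */
	    FIELD4(hw_val, frac_shift) == 0xf &&	/* MTE_frac == NI */
	    FIELD4(user_val, frac_shift) == 0) {	/* user asks for ASYNC */
		user_val &= ~frac_mask;
		user_val |= hw_val & frac_mask;
	}
	return user_val;
}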
@ -2287,15 +2312,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
"trap of EL2 register redirected to EL1");
}
#define EL2_REG(name, acc, rst, v) { \
SYS_DESC(SYS_##name), \
.access = acc, \
.reset = rst, \
.reg = name, \
.visibility = el2_visibility, \
.val = v, \
}
#define EL2_REG_FILTERED(name, acc, rst, v, filter) { \
SYS_DESC(SYS_##name), \
.access = acc, \
@ -2305,6 +2321,9 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
.val = v, \
}
#define EL2_REG(name, acc, rst, v) \
EL2_REG_FILTERED(name, acc, rst, v, el2_visibility)
#define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v)
#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v)
@ -2452,6 +2471,16 @@ static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu,
return __el2_visibility(vcpu, rd, sve_visibility);
}
static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
if (el2_visibility(vcpu, rd) == 0 &&
kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
return 0;
return REG_HIDDEN;
}
static bool access_zcr_el2(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@ -2576,16 +2605,33 @@ static bool access_mdcr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
u64 old = __vcpu_sys_reg(vcpu, MDCR_EL2);
u64 hpmn, val, old = __vcpu_sys_reg(vcpu, MDCR_EL2);
if (!access_rw(vcpu, p, r))
return false;
if (!p->is_write) {
p->regval = old;
return true;
}
val = p->regval;
hpmn = FIELD_GET(MDCR_EL2_HPMN, val);
/*
* Request a reload of the PMU to enable/disable the counters affected
* by HPME.
* If HPMN is out of bounds, limit it to what we actually
* support. This matches the UNKNOWN definition of the field
* in that case, and keeps the emulation simple. Sort of.
*/
if ((old ^ __vcpu_sys_reg(vcpu, MDCR_EL2)) & MDCR_EL2_HPME)
if (hpmn > vcpu->kvm->arch.nr_pmu_counters) {
hpmn = vcpu->kvm->arch.nr_pmu_counters;
u64_replace_bits(val, hpmn, MDCR_EL2_HPMN);
}
__vcpu_sys_reg(vcpu, MDCR_EL2) = val;
/*
* Request a reload of the PMU to enable/disable the counters
* affected by HPME.
*/
if ((old ^ val) & MDCR_EL2_HPME)
kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
return true;
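The HPMN clamp above is a plain bit-field saturate; a minimal sketch, assuming for illustration that HPMN occupies the low five bits of MDCR_EL2:

#include <stdint.h>

#define TOY_HPMN_MASK	0x1fULL

static uint64_t clamp_hpmn(uint64_t mdcr, uint64_t nr_counters)
{
	uint64_t hpmn = mdcr & TOY_HPMN_MASK;

	if (hpmn > nr_counters) {
		mdcr &= ~TOY_HPMN_MASK;
		mdcr |= nr_counters & TOY_HPMN_MASK;
	}
	return mdcr;
}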
@ -2704,6 +2750,12 @@ static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
.set_user = set_imp_id_reg, \
.reset = reset_imp_id_reg, \
.val = mask, \
}
static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
__vcpu_sys_reg(vcpu, r->reg) = vcpu->kvm->arch.nr_pmu_counters;
return vcpu->kvm->arch.nr_pmu_counters;
}
/*
@ -3249,7 +3301,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
EL2_REG_VNCR(HCR_EL2, reset_hcr, 0),
EL2_REG(MDCR_EL2, access_mdcr, reset_val, 0),
EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0),
EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
EL2_REG_VNCR(HSTR_EL2, reset_val, 0),
EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0),
@ -3269,6 +3321,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
tcr2_el2_visibility),
EL2_REG_VNCR(VTTBR_EL2, reset_val, 0),
EL2_REG_VNCR(VTCR_EL2, reset_val, 0),
EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0,
vncr_el2_visibility),
{ SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 },
EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0),
@ -3552,8 +3606,7 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
u64 base, range, tg, num, scale;
int shift;
u64 base, range;
if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
return undef_access(vcpu, p, r);
@ -3563,26 +3616,7 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
* of the guest's S2 (different base granule size, for example), we
* decide to ignore TTL and only use the described range.
*/
tg = FIELD_GET(GENMASK(47, 46), p->regval);
scale = FIELD_GET(GENMASK(45, 44), p->regval);
num = FIELD_GET(GENMASK(43, 39), p->regval);
base = p->regval & GENMASK(36, 0);
switch(tg) {
case 1:
shift = 12;
break;
case 2:
shift = 14;
break;
case 3:
default: /* IMPDEF: handle tg==0 as 64k */
shift = 16;
break;
}
base <<= shift;
range = __TLBI_RANGE_PAGES(num, scale) << shift;
base = decode_range_tlbi(p->regval, &range, NULL);
kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
&(union tlbi_info) {
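The open-coded decoding removed above remains a useful reference for what decode_range_tlbi() has to compute. A userspace sketch of it, assuming the usual (NUM + 1) << (5 * SCALE + 1) definition of __TLBI_RANGE_PAGES():

#include <stdint.h>

static uint64_t decode_range(uint64_t regval, uint64_t *range)
{
	unsigned int tg = (regval >> 46) & 0x3;
	unsigned int scale = (regval >> 44) & 0x3;
	unsigned int num = (regval >> 39) & 0x1f;
	uint64_t base = regval & ((1ULL << 37) - 1);
	unsigned int shift;

	switch (tg) {
	case 1:  shift = 12; break;	/* 4kB granule */
	case 2:  shift = 14; break;	/* 16kB granule */
	default: shift = 16; break;	/* 64kB, tg == 0 treated as 64kB */
	}

	*range = ((uint64_t)(num + 1) << (5 * scale + 1)) << shift;
	return base << shift;
}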
@ -3648,11 +3682,22 @@ static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu,
WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding));
}
static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
if (!kvm_supported_tlbi_s1e2_op(vcpu, sys_encoding))
return undef_access(vcpu, p, r);
kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval);
return true;
}
static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
/*
* If we're here, this is because we've trapped on a EL1 TLBI
@ -3663,6 +3708,13 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
* - HCR_EL2.E2H == 0 : a non-VHE guest
* - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode
*
* Another possibility is that we are invalidating the EL2 context
* using EL1 instructions, but that we landed here because we need
* additional invalidation for structures that are not held in the
* CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In
* that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 }
* as we don't allow an NV-capable L1 in a nVHE configuration.
*
* We don't expect these helpers to ever be called when running
* in a vEL1 context.
*/
@ -3672,7 +3724,13 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding))
return undef_access(vcpu, p, r);
kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) {
kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval);
return true;
}
kvm_s2_mmu_iterate_by_vmid(vcpu->kvm,
get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)),
&(union tlbi_info) {
.va = {
.addr = p->regval,
@ -3794,16 +3852,21 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is),
SYS_INSN(TLBI_ALLE2OS, undef_access),
SYS_INSN(TLBI_VAE2OS, undef_access),
SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2),
SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE1OS, handle_alle1is),
SYS_INSN(TLBI_VALE2OS, undef_access),
SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2),
SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is),
SYS_INSN(TLBI_RVAE2IS, undef_access),
SYS_INSN(TLBI_RVALE2IS, undef_access),
SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2),
SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2),
SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE1IS, handle_alle1is),
SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2),
SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is),
SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is),
SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is),
@ -3813,11 +3876,17 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is),
SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is),
SYS_INSN(TLBI_RVAE2OS, undef_access),
SYS_INSN(TLBI_RVALE2OS, undef_access),
SYS_INSN(TLBI_RVAE2, undef_access),
SYS_INSN(TLBI_RVALE2, undef_access),
SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2),
SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2),
SYS_INSN(TLBI_RVAE2, handle_tlbi_el2),
SYS_INSN(TLBI_RVALE2, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE2, handle_tlbi_el2),
SYS_INSN(TLBI_VAE2, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE1, handle_alle1is),
SYS_INSN(TLBI_VALE2, handle_tlbi_el2),
SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is),
SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is),
@ -3825,19 +3894,19 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is),
SYS_INSN(TLBI_ALLE2OSNXS, undef_access),
SYS_INSN(TLBI_VAE2OSNXS, undef_access),
SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2),
SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is),
SYS_INSN(TLBI_VALE2OSNXS, undef_access),
SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2),
SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is),
SYS_INSN(TLBI_RVAE2ISNXS, undef_access),
SYS_INSN(TLBI_RVALE2ISNXS, undef_access),
SYS_INSN(TLBI_ALLE2ISNXS, undef_access),
SYS_INSN(TLBI_VAE2ISNXS, undef_access),
SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2),
SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2),
SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is),
SYS_INSN(TLBI_VALE2ISNXS, undef_access),
SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2),
SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is),
SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is),
SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is),
@ -3847,14 +3916,14 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is),
SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is),
SYS_INSN(TLBI_RVAE2OSNXS, undef_access),
SYS_INSN(TLBI_RVALE2OSNXS, undef_access),
SYS_INSN(TLBI_RVAE2NXS, undef_access),
SYS_INSN(TLBI_RVALE2NXS, undef_access),
SYS_INSN(TLBI_ALLE2NXS, undef_access),
SYS_INSN(TLBI_VAE2NXS, undef_access),
SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2),
SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2),
SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2),
SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2),
SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2),
SYS_INSN(TLBI_ALLE1NXS, handle_alle1is),
SYS_INSN(TLBI_VALE2NXS, undef_access),
SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2),
SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is),
};
@ -5153,65 +5222,13 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags))
goto out;
kvm->arch.fgu[HFGxTR_GROUP] = (HFGxTR_EL2_nAMAIR2_EL1 |
HFGxTR_EL2_nMAIR2_EL1 |
HFGxTR_EL2_nS2POR_EL1 |
HFGxTR_EL2_nACCDATA_EL1 |
HFGxTR_EL2_nSMPRI_EL1_MASK |
HFGxTR_EL2_nTPIDR2_EL0_MASK);
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1OS|
HFGITR_EL2_TLBIRVALE1OS |
HFGITR_EL2_TLBIRVAAE1OS |
HFGITR_EL2_TLBIRVAE1OS |
HFGITR_EL2_TLBIVAALE1OS |
HFGITR_EL2_TLBIVALE1OS |
HFGITR_EL2_TLBIVAAE1OS |
HFGITR_EL2_TLBIASIDE1OS |
HFGITR_EL2_TLBIVAE1OS |
HFGITR_EL2_TLBIVMALLE1OS);
if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_TLBIRVAALE1 |
HFGITR_EL2_TLBIRVALE1 |
HFGITR_EL2_TLBIRVAAE1 |
HFGITR_EL2_TLBIRVAE1 |
HFGITR_EL2_TLBIRVAALE1IS|
HFGITR_EL2_TLBIRVALE1IS |
HFGITR_EL2_TLBIRVAAE1IS |
HFGITR_EL2_TLBIRVAE1IS |
HFGITR_EL2_TLBIRVAALE1OS|
HFGITR_EL2_TLBIRVALE1OS |
HFGITR_EL2_TLBIRVAAE1OS |
HFGITR_EL2_TLBIRVAE1OS);
if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP))
kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A;
if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2))
kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP |
HFGITR_EL2_ATS1E1WP);
if (!kvm_has_s1pie(kvm))
kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 |
HFGxTR_EL2_nPIR_EL1);
if (!kvm_has_s1poe(kvm))
kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 |
HFGxTR_EL2_nPOR_EL0);
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP))
kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 |
HAFGRTR_EL2_RES1);
if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) {
kvm->arch.fgu[HDFGRTR_GROUP] |= (HDFGRTR_EL2_nBRBDATA |
HDFGRTR_EL2_nBRBCTL |
HDFGRTR_EL2_nBRBIDR);
kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_nBRBINJ |
HFGITR_EL2_nBRBIALL);
}
compute_fgu(kvm, HFGRTR_GROUP);
compute_fgu(kvm, HFGITR_GROUP);
compute_fgu(kvm, HDFGRTR_GROUP);
compute_fgu(kvm, HAFGRTR_GROUP);
compute_fgu(kvm, HFGRTR2_GROUP);
compute_fgu(kvm, HFGITR2_GROUP);
compute_fgu(kvm, HDFGRTR2_GROUP);
set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags);
out:
@ -5269,6 +5286,8 @@ int __init kvm_sys_reg_table_init(void)
ret = populate_nv_trap_config();
check_feature_map();
for (i = 0; !ret && i < ARRAY_SIZE(sys_reg_descs); i++)
ret = populate_sysreg_config(sys_reg_descs + i, i);


@ -176,7 +176,7 @@ TRACE_EVENT(kvm_set_way_flush,
),
TP_printk("S/W flush at 0x%016lx (cache %s)",
__entry->vcpu_pc, __entry->cache ? "on" : "off")
__entry->vcpu_pc, str_on_off(__entry->cache))
);
TRACE_EVENT(kvm_toggle_cache,
@ -196,8 +196,8 @@ TRACE_EVENT(kvm_toggle_cache,
),
TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
__entry->vcpu_pc, __entry->was ? "on" : "off",
__entry->now ? "on" : "off")
__entry->vcpu_pc, str_on_off(__entry->was),
str_on_off(__entry->now))
);
/*


@ -320,3 +320,227 @@ void vgic_debug_init(struct kvm *kvm)
void vgic_debug_destroy(struct kvm *kvm)
{
}
/**
* struct vgic_its_iter - Iterator for traversing VGIC ITS device tables.
* @dev: Pointer to the current its_device being processed.
* @ite: Pointer to the current its_ite within the device being processed.
*
* This structure is used to maintain the current position during iteration
* over the ITS device tables. It holds pointers to both the current device
* and the current ITE within that device.
*/
struct vgic_its_iter {
struct its_device *dev;
struct its_ite *ite;
};
/**
* end_of_iter - Checks if the iterator has reached the end.
* @iter: The iterator to check.
*
 * Once the iterator has processed the final ITE in the last device
 * table, it is marked as having reached the end of iteration by
 * setting its device and ITE pointers to NULL.
 * This function checks whether the iterator has been marked that way.
*
* Return: True if the iterator is marked as end, false otherwise.
*/
static inline bool end_of_iter(struct vgic_its_iter *iter)
{
return !iter->dev && !iter->ite;
}
/**
* vgic_its_iter_next - Advances the iterator to the next entry in the ITS tables.
* @its: The VGIC ITS structure.
* @iter: The iterator to advance.
*
* This function moves the iterator to the next ITE within the current device,
* or to the first ITE of the next device if the current ITE is the last in
* the device. If the current device is the last device, the iterator is set
* to indicate the end of iteration.
*/
static void vgic_its_iter_next(struct vgic_its *its, struct vgic_its_iter *iter)
{
struct its_device *dev = iter->dev;
struct its_ite *ite = iter->ite;
if (!ite || list_is_last(&ite->ite_list, &dev->itt_head)) {
if (list_is_last(&dev->dev_list, &its->device_list)) {
dev = NULL;
ite = NULL;
} else {
dev = list_next_entry(dev, dev_list);
ite = list_first_entry_or_null(&dev->itt_head,
struct its_ite,
ite_list);
}
} else {
ite = list_next_entry(ite, ite_list);
}
iter->dev = dev;
iter->ite = ite;
}
/**
* vgic_its_debug_start - Start function for the seq_file interface.
* @s: The seq_file structure.
* @pos: The starting position (offset).
*
* This function initializes the iterator to the beginning of the ITS tables
* and advances it to the specified position. It acquires the its_lock mutex
* to protect shared data.
*
* Return: An iterator pointer on success, NULL if no devices are found or
* the end of the list is reached, or ERR_PTR(-ENOMEM) on memory
* allocation failure.
*/
static void *vgic_its_debug_start(struct seq_file *s, loff_t *pos)
{
struct vgic_its *its = s->private;
struct vgic_its_iter *iter;
struct its_device *dev;
loff_t offset = *pos;
mutex_lock(&its->its_lock);
dev = list_first_entry_or_null(&its->device_list,
struct its_device, dev_list);
if (!dev)
return NULL;
iter = kmalloc(sizeof(*iter), GFP_KERNEL);
if (!iter)
return ERR_PTR(-ENOMEM);
iter->dev = dev;
iter->ite = list_first_entry_or_null(&dev->itt_head,
struct its_ite, ite_list);
while (!end_of_iter(iter) && offset--)
vgic_its_iter_next(its, iter);
if (end_of_iter(iter)) {
kfree(iter);
return NULL;
}
return iter;
}
/**
* vgic_its_debug_next - Next function for the seq_file interface.
* @s: The seq_file structure.
* @v: The current iterator.
* @pos: The current position (offset).
*
* This function advances the iterator to the next entry and increments the
* position.
*
* Return: An iterator pointer on success, or NULL if the end of the list is
* reached.
*/
static void *vgic_its_debug_next(struct seq_file *s, void *v, loff_t *pos)
{
struct vgic_its *its = s->private;
struct vgic_its_iter *iter = v;
++*pos;
vgic_its_iter_next(its, iter);
if (end_of_iter(iter)) {
kfree(iter);
return NULL;
}
return iter;
}
/**
* vgic_its_debug_stop - Stop function for the seq_file interface.
* @s: The seq_file structure.
* @v: The current iterator.
*
* This function frees the iterator and releases the its_lock mutex.
*/
static void vgic_its_debug_stop(struct seq_file *s, void *v)
{
struct vgic_its *its = s->private;
struct vgic_its_iter *iter = v;
if (!IS_ERR_OR_NULL(iter))
kfree(iter);
mutex_unlock(&its->its_lock);
}
/**
* vgic_its_debug_show - Show function for the seq_file interface.
* @s: The seq_file structure.
* @v: The current iterator.
*
* This function formats and prints the ITS table entry information to the
* seq_file output.
*
* Return: 0 on success.
*/
static int vgic_its_debug_show(struct seq_file *s, void *v)
{
struct vgic_its_iter *iter = v;
struct its_device *dev = iter->dev;
struct its_ite *ite = iter->ite;
if (list_is_first(&ite->ite_list, &dev->itt_head)) {
seq_printf(s, "\n");
seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n",
dev->device_id, BIT_ULL(dev->num_eventid_bits) - 1);
seq_printf(s, "EVENT_ID INTID HWINTID TARGET COL_ID HW\n");
seq_printf(s, "-----------------------------------------------\n");
}
if (ite && ite->irq && ite->collection) {
seq_printf(s, "%8u %8u %8u %8u %8u %2d\n",
ite->event_id, ite->irq->intid, ite->irq->hwintid,
ite->collection->target_addr,
ite->collection->collection_id, ite->irq->hw);
}
return 0;
}
static const struct seq_operations vgic_its_debug_sops = {
.start = vgic_its_debug_start,
.next = vgic_its_debug_next,
.stop = vgic_its_debug_stop,
.show = vgic_its_debug_show
};
DEFINE_SEQ_ATTRIBUTE(vgic_its_debug);
/**
* vgic_its_debug_init - Initializes the debugfs interface for VGIC ITS.
* @dev: The KVM device structure.
*
* This function creates a debugfs file named "vgic-its-state@%its_base"
* to expose the ITS table information.
*
* Return: 0 on success.
*/
int vgic_its_debug_init(struct kvm_device *dev)
{
struct vgic_its *its = dev->private;
char *name;
name = kasprintf(GFP_KERNEL, "vgic-its-state@%llx", (u64)its->vgic_its_base);
if (!name)
return -ENOMEM;
debugfs_create_file(name, 0444, dev->kvm->debugfs_dentry, its, &vgic_its_debug_fops);
kfree(name);
return 0;
}
void vgic_its_debug_destroy(struct kvm_device *dev)
{
}


@ -154,36 +154,6 @@ out_unlock:
return irq;
}
struct its_device {
struct list_head dev_list;
/* the head for the list of ITTEs */
struct list_head itt_head;
u32 num_eventid_bits;
gpa_t itt_addr;
u32 device_id;
};
#define COLLECTION_NOT_MAPPED ((u32)~0)
struct its_collection {
struct list_head coll_list;
u32 collection_id;
u32 target_addr;
};
#define its_is_collection_mapped(coll) ((coll) && \
((coll)->target_addr != COLLECTION_NOT_MAPPED))
struct its_ite {
struct list_head ite_list;
struct vgic_irq *irq;
struct its_collection *collection;
u32 event_id;
};
/**
* struct vgic_its_abi - ITS abi ops and settings
* @cte_esz: collection table entry size
@ -1938,6 +1908,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev)
mutex_lock(&its->its_lock);
vgic_its_debug_destroy(kvm_dev);
vgic_its_free_device_list(kvm, its);
vgic_its_free_collection_list(kvm, its);
vgic_its_invalidate_cache(its);
@ -2771,7 +2743,12 @@ static int vgic_its_set_attr(struct kvm_device *dev,
if (ret)
return ret;
return vgic_register_its_iodev(dev->kvm, its, addr);
ret = vgic_register_its_iodev(dev->kvm, its, addr);
if (ret)
return ret;
return vgic_its_debug_init(dev);
}
case KVM_DEV_ARM_VGIC_GRP_CTRL:
return vgic_its_ctrl(dev->kvm, its, attr->attr);


@ -240,9 +240,6 @@ static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
goto next;
}
/* It is illegal to have the EOI bit set with HW */
lr &= ~ICH_LR_EOI;
/* Translate the virtual mapping to the real one */
lr &= ~ICH_LR_PHYS_ID_MASK;
lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);


@ -172,6 +172,36 @@ struct vgic_reg_attr {
gpa_t addr;
};
struct its_device {
struct list_head dev_list;
/* the head for the list of ITTEs */
struct list_head itt_head;
u32 num_eventid_bits;
gpa_t itt_addr;
u32 device_id;
};
#define COLLECTION_NOT_MAPPED ((u32)~0)
struct its_collection {
struct list_head coll_list;
u32 collection_id;
u32 target_addr;
};
#define its_is_collection_mapped(coll) ((coll) && \
((coll)->target_addr != COLLECTION_NOT_MAPPED))
struct its_ite {
struct list_head ite_list;
struct vgic_irq *irq;
struct its_collection *collection;
u32 event_id;
};
int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
struct vgic_reg_attr *reg_attr);
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
@ -359,4 +389,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
int vgic_its_debug_init(struct kvm_device *dev);
void vgic_its_debug_destroy(struct kvm_device *dev);
#endif


@ -28,6 +28,7 @@ HAS_EPAN
HAS_EVT
HAS_FPMR
HAS_FGT
HAS_FGT2
HAS_FPSIMD
HAS_GCS
HAS_GENERIC_AUTH
@ -94,6 +95,7 @@ WORKAROUND_2457168
WORKAROUND_2645198
WORKAROUND_2658417
WORKAROUND_AMPERE_AC03_CPU_38
WORKAROUND_AMPERE_AC04_CPU_23
WORKAROUND_TRBE_OVERWRITE_FILL_MODE
WORKAROUND_TSB_FLUSH_FAILURE
WORKAROUND_TRBE_WRITE_OUT_OF_RANGE

File diff suppressed because it is too large


@ -301,7 +301,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
/* MMU handling */
void kvm_flush_tlb_all(void);
void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa);
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write);
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write, int ecode);
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable);
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);


@ -37,7 +37,7 @@
#define KVM_LOONGSON_IRQ_NUM_MASK 0xffff
typedef union loongarch_instruction larch_inst;
typedef int (*exit_handle_fn)(struct kvm_vcpu *);
typedef int (*exit_handle_fn)(struct kvm_vcpu *, int);
int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst);
int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst);


@ -341,7 +341,7 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
* 2) Execute CACOP/IDLE instructions;
* 3) Access to unimplemented CSRs/IOCSRs.
*/
static int kvm_handle_gspr(struct kvm_vcpu *vcpu)
static int kvm_handle_gspr(struct kvm_vcpu *vcpu, int ecode)
{
int ret = RESUME_GUEST;
enum emulation_result er = EMULATE_DONE;
@ -661,7 +661,7 @@ int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst)
return ret;
}
static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write)
static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write, int ecode)
{
int ret;
larch_inst inst;
@ -675,7 +675,7 @@ static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write)
return RESUME_GUEST;
}
ret = kvm_handle_mm_fault(vcpu, badv, write);
ret = kvm_handle_mm_fault(vcpu, badv, write, ecode);
if (ret) {
/* Treat as MMIO */
inst.word = vcpu->arch.badi;
@ -705,14 +705,14 @@ static int kvm_handle_rdwr_fault(struct kvm_vcpu *vcpu, bool write)
return ret;
}
static int kvm_handle_read_fault(struct kvm_vcpu *vcpu)
static int kvm_handle_read_fault(struct kvm_vcpu *vcpu, int ecode)
{
return kvm_handle_rdwr_fault(vcpu, false);
return kvm_handle_rdwr_fault(vcpu, false, ecode);
}
static int kvm_handle_write_fault(struct kvm_vcpu *vcpu)
static int kvm_handle_write_fault(struct kvm_vcpu *vcpu, int ecode)
{
return kvm_handle_rdwr_fault(vcpu, true);
return kvm_handle_rdwr_fault(vcpu, true, ecode);
}
int kvm_complete_user_service(struct kvm_vcpu *vcpu, struct kvm_run *run)
@ -726,11 +726,12 @@ int kvm_complete_user_service(struct kvm_vcpu *vcpu, struct kvm_run *run)
/**
* kvm_handle_fpu_disabled() - Guest used fpu however it is disabled at host
* @vcpu: Virtual CPU context.
* @ecode: Exception code.
*
* Handle when the guest attempts to use fpu which hasn't been allowed
* by the root context.
*/
static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu)
static int kvm_handle_fpu_disabled(struct kvm_vcpu *vcpu, int ecode)
{
struct kvm_run *run = vcpu->run;
@ -783,11 +784,12 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu)
/*
* kvm_handle_lsx_disabled() - Guest used LSX while disabled in root.
* @vcpu: Virtual CPU context.
* @ecode: Exception code.
*
* Handle when the guest attempts to use LSX when it is disabled in the root
* context.
*/
static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu)
static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu, int ecode)
{
if (kvm_own_lsx(vcpu))
kvm_queue_exception(vcpu, EXCCODE_INE, 0);
@ -798,11 +800,12 @@ static int kvm_handle_lsx_disabled(struct kvm_vcpu *vcpu)
/*
* kvm_handle_lasx_disabled() - Guest used LASX while disabled in root.
* @vcpu: Virtual CPU context.
* @ecode: Exception code.
*
* Handle when the guest attempts to use LASX when it is disabled in the root
* context.
*/
static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu)
static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu, int ecode)
{
if (kvm_own_lasx(vcpu))
kvm_queue_exception(vcpu, EXCCODE_INE, 0);
@ -810,7 +813,7 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu)
static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu, int ecode)
{
if (kvm_own_lbt(vcpu))
kvm_queue_exception(vcpu, EXCCODE_INE, 0);
@ -872,7 +875,7 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu)
kvm_write_reg(vcpu, LOONGARCH_GPR_A0, ret);
}
static int kvm_handle_hypercall(struct kvm_vcpu *vcpu)
static int kvm_handle_hypercall(struct kvm_vcpu *vcpu, int ecode)
{
int ret;
larch_inst inst;
@ -932,16 +935,14 @@ static int kvm_handle_hypercall(struct kvm_vcpu *vcpu)
/*
* LoongArch KVM callback handling for unimplemented guest exiting
*/
static int kvm_fault_ni(struct kvm_vcpu *vcpu)
static int kvm_fault_ni(struct kvm_vcpu *vcpu, int ecode)
{
unsigned int ecode, inst;
unsigned long estat, badv;
unsigned int inst;
unsigned long badv;
/* Fetch the instruction */
inst = vcpu->arch.badi;
badv = vcpu->arch.badv;
estat = vcpu->arch.host_estat;
ecode = (estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT;
kvm_err("ECode: %d PC=%#lx Inst=0x%08x BadVaddr=%#lx ESTAT=%#lx\n",
ecode, vcpu->arch.pc, inst, badv, read_gcsr_estat());
kvm_arch_vcpu_dump_regs(vcpu);
@ -966,5 +967,5 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = {
int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault)
{
return kvm_fault_tables[fault](vcpu);
return kvm_fault_tables[fault](vcpu, fault);
}


@ -912,7 +912,7 @@ out:
return err;
}
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write, int ecode)
{
int ret;
@ -921,8 +921,17 @@ int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
return ret;
/* Invalidate this entry in the TLB */
vcpu->arch.flush_gpa = gpa;
kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);
if (!cpu_has_ptw || (ecode == EXCCODE_TLBM)) {
/*
 * With HW PTW, an invalid TLB entry is not added on a page fault.
 * But for an EXCCODE_TLBM exception, a stale TLB entry may exist
 * because of an earlier read access.
 *
 * With SW PTW, the invalid TLB entry is added in the TLB refill
 * exception.
*/
vcpu->arch.flush_gpa = gpa;
kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);
}
return 0;
}


@ -63,9 +63,6 @@ struct kvm_vcpu_aia {
/* CPU AIA CSR context of Guest VCPU */
struct kvm_vcpu_aia_csr guest_csr;
/* CPU AIA CSR context upon Guest VCPU reset */
struct kvm_vcpu_aia_csr guest_reset_csr;
/* Guest physical address of IMSIC for this VCPU */
gpa_t imsic_addr;


@ -119,6 +119,9 @@ struct kvm_arch {
/* AIA Guest/VM context */
struct kvm_aia aia;
/* KVM_CAP_RISCV_MP_STATE_RESET */
bool mp_state_reset;
};
struct kvm_cpu_trap {
@ -193,6 +196,12 @@ struct kvm_vcpu_smstateen_csr {
unsigned long sstateen0;
};
struct kvm_vcpu_reset_state {
spinlock_t lock;
unsigned long pc;
unsigned long a1;
};
struct kvm_vcpu_arch {
/* VCPU ran at least once */
bool ran_atleast_once;
@ -227,12 +236,8 @@ struct kvm_vcpu_arch {
/* CPU Smstateen CSR context of Guest VCPU */
struct kvm_vcpu_smstateen_csr smstateen_csr;
/* CPU context upon Guest VCPU reset */
struct kvm_cpu_context guest_reset_context;
spinlock_t reset_cntx_lock;
/* CPU CSR context upon Guest VCPU reset */
struct kvm_vcpu_csr guest_reset_csr;
/* CPU reset state of Guest VCPU */
struct kvm_vcpu_reset_state reset_state;
/*
* VCPU interrupts


@ -55,6 +55,9 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run);
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
struct kvm_run *run,
u32 type, u64 flags);
void kvm_riscv_vcpu_sbi_request_reset(struct kvm_vcpu *vcpu,
unsigned long pc, unsigned long a1);
void kvm_riscv_vcpu_sbi_load_reset_state(struct kvm_vcpu *vcpu);
int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
int kvm_riscv_vcpu_set_reg_sbi_ext(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg);


@ -33,8 +33,7 @@ void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx,
unsigned long *isa);
void kvm_riscv_vcpu_host_vector_save(struct kvm_cpu_context *cntx);
void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx);
int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu,
struct kvm_cpu_context *cntx);
int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu);
void kvm_riscv_vcpu_free_vector_context(struct kvm_vcpu *vcpu);
#else
@ -62,8 +61,7 @@ static inline void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cn
{
}
static inline int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu,
struct kvm_cpu_context *cntx)
static inline int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu)
{
return 0;
}


@ -131,6 +131,12 @@ secondary_start_sbi:
csrw CSR_IE, zero
csrw CSR_IP, zero
#ifndef CONFIG_RISCV_M_MODE
/* Enable time CSR */
li t0, 0x2
csrw CSR_SCOUNTEREN, t0
#endif
/* Load the global pointer */
load_global_pointer
@ -226,6 +232,10 @@ SYM_CODE_START(_start_kernel)
* to hand it to us.
*/
csrr a0, CSR_MHARTID
#else
/* Enable time CSR */
li t0, 0x2
csrw CSR_SCOUNTEREN, t0
#endif /* CONFIG_RISCV_M_MODE */
/* Load the global pointer */


@ -18,7 +18,7 @@ menuconfig VIRTUALIZATION
if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
tristate "Kernel-based Virtual Machine (KVM) support"
depends on RISCV_SBI && MMU
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING


@ -526,12 +526,10 @@ int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu)
void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
struct kvm_vcpu_aia_csr *reset_csr =
&vcpu->arch.aia_context.guest_reset_csr;
if (!kvm_riscv_aia_available())
return;
memcpy(csr, reset_csr, sizeof(*csr));
memset(csr, 0, sizeof(*csr));
/* Proceed only if AIA was initialized successfully */
if (!kvm_riscv_aia_initialized(vcpu->kvm))


@ -51,12 +51,33 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
static void kvm_riscv_vcpu_context_reset(struct kvm_vcpu *vcpu,
bool kvm_sbi_reset)
{
struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr;
struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
struct kvm_cpu_context *reset_cntx = &vcpu->arch.guest_reset_context;
void *vector_datap = cntx->vector.datap;
memset(cntx, 0, sizeof(*cntx));
memset(csr, 0, sizeof(*csr));
memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr));
/* Restore datap as it's not a part of the guest context. */
cntx->vector.datap = vector_datap;
if (kvm_sbi_reset)
kvm_riscv_vcpu_sbi_load_reset_state(vcpu);
/* Setup reset state of shadow SSTATUS and HSTATUS CSRs */
cntx->sstatus = SR_SPP | SR_SPIE;
cntx->hstatus |= HSTATUS_VTW;
cntx->hstatus |= HSTATUS_SPVP;
cntx->hstatus |= HSTATUS_SPV;
}
static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu, bool kvm_sbi_reset)
{
bool loaded;
/**
@ -71,13 +92,7 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.last_exit_cpu = -1;
memcpy(csr, reset_csr, sizeof(*csr));
spin_lock(&vcpu->arch.reset_cntx_lock);
memcpy(cntx, reset_cntx, sizeof(*cntx));
spin_unlock(&vcpu->arch.reset_cntx_lock);
memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr));
kvm_riscv_vcpu_context_reset(vcpu, kvm_sbi_reset);
kvm_riscv_vcpu_fp_reset(vcpu);
@ -112,8 +127,6 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
int rc;
struct kvm_cpu_context *cntx;
struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr;
spin_lock_init(&vcpu->arch.mp_state_lock);
@ -133,24 +146,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
/* Setup VCPU hfence queue */
spin_lock_init(&vcpu->arch.hfence_lock);
/* Setup reset state of shadow SSTATUS and HSTATUS CSRs */
spin_lock_init(&vcpu->arch.reset_cntx_lock);
spin_lock_init(&vcpu->arch.reset_state.lock);
spin_lock(&vcpu->arch.reset_cntx_lock);
cntx = &vcpu->arch.guest_reset_context;
cntx->sstatus = SR_SPP | SR_SPIE;
cntx->hstatus = 0;
cntx->hstatus |= HSTATUS_VTW;
cntx->hstatus |= HSTATUS_SPVP;
cntx->hstatus |= HSTATUS_SPV;
spin_unlock(&vcpu->arch.reset_cntx_lock);
if (kvm_riscv_vcpu_alloc_vector_context(vcpu, cntx))
if (kvm_riscv_vcpu_alloc_vector_context(vcpu))
return -ENOMEM;
/* By default, make CY, TM, and IR counters accessible in VU mode */
reset_csr->scounteren = 0x7;
/* Setup VCPU timer */
kvm_riscv_vcpu_timer_init(vcpu);
@ -169,7 +169,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_riscv_vcpu_sbi_init(vcpu);
/* Reset VCPU */
kvm_riscv_reset_vcpu(vcpu);
kvm_riscv_reset_vcpu(vcpu, false);
return 0;
}
@ -518,6 +518,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
case KVM_MP_STATE_STOPPED:
__kvm_riscv_vcpu_power_off(vcpu);
break;
case KVM_MP_STATE_INIT_RECEIVED:
if (vcpu->kvm->arch.mp_state_reset)
kvm_riscv_reset_vcpu(vcpu, false);
else
ret = -EINVAL;
break;
default:
ret = -EINVAL;
}
@ -706,7 +712,7 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
kvm_riscv_reset_vcpu(vcpu);
kvm_riscv_reset_vcpu(vcpu, true);
if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu))
kvm_riscv_gstage_update_hgatp(vcpu);


@ -143,9 +143,9 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
struct kvm_vcpu *tmp;
kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
spin_lock(&vcpu->arch.mp_state_lock);
spin_lock(&tmp->arch.mp_state_lock);
WRITE_ONCE(tmp->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
spin_unlock(&vcpu->arch.mp_state_lock);
spin_unlock(&tmp->arch.mp_state_lock);
}
kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
@ -156,6 +156,34 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
}
void kvm_riscv_vcpu_sbi_request_reset(struct kvm_vcpu *vcpu,
unsigned long pc, unsigned long a1)
{
spin_lock(&vcpu->arch.reset_state.lock);
vcpu->arch.reset_state.pc = pc;
vcpu->arch.reset_state.a1 = a1;
spin_unlock(&vcpu->arch.reset_state.lock);
kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
}
void kvm_riscv_vcpu_sbi_load_reset_state(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr;
struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
struct kvm_vcpu_reset_state *reset_state = &vcpu->arch.reset_state;
cntx->a0 = vcpu->vcpu_id;
spin_lock(&vcpu->arch.reset_state.lock);
cntx->sepc = reset_state->pc;
cntx->a1 = reset_state->a1;
spin_unlock(&vcpu->arch.reset_state.lock);
cntx->sstatus &= ~SR_SIE;
csr->vsatp = 0;
}
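The request/load pair above is a small producer-consumer hand-off guarded by the reset_state lock. A toy userspace model of the same shape (not the KVM code; the lock must be initialised by the caller):

#include <pthread.h>
#include <stdbool.h>

struct toy_reset_state {
	pthread_mutex_t lock;		/* e.g. PTHREAD_MUTEX_INITIALIZER */
	unsigned long pc, a1;
	bool pending;
};

static void toy_request_reset(struct toy_reset_state *rs,
			      unsigned long pc, unsigned long a1)
{
	pthread_mutex_lock(&rs->lock);
	rs->pc = pc;
	rs->a1 = a1;
	rs->pending = true;		/* stands in for KVM_REQ_VCPU_RESET */
	pthread_mutex_unlock(&rs->lock);
}

static bool toy_load_reset_state(struct toy_reset_state *rs,
				 unsigned long *pc, unsigned long *a1)
{
	bool had_request;

	pthread_mutex_lock(&rs->lock);
	had_request = rs->pending;
	if (had_request) {
		*pc = rs->pc;
		*a1 = rs->a1;
		rs->pending = false;
	}
	pthread_mutex_unlock(&rs->lock);
	return had_request;
}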
int kvm_riscv_vcpu_sbi_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;


@ -15,7 +15,6 @@
static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *reset_cntx;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
struct kvm_vcpu *target_vcpu;
unsigned long target_vcpuid = cp->a0;
@ -32,17 +31,7 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu)
goto out;
}
spin_lock(&target_vcpu->arch.reset_cntx_lock);
reset_cntx = &target_vcpu->arch.guest_reset_context;
/* start address */
reset_cntx->sepc = cp->a1;
/* target vcpu id to start */
reset_cntx->a0 = target_vcpuid;
/* private data passed from kernel */
reset_cntx->a1 = cp->a2;
spin_unlock(&target_vcpu->arch.reset_cntx_lock);
kvm_make_request(KVM_REQ_VCPU_RESET, target_vcpu);
kvm_riscv_vcpu_sbi_request_reset(target_vcpu, cp->a1, cp->a2);
__kvm_riscv_vcpu_power_on(target_vcpu);


@ -13,7 +13,6 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
struct kvm_vcpu_sbi_return *retdata)
{
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
struct kvm_cpu_context *reset_cntx;
unsigned long funcid = cp->a6;
unsigned long hva, i;
struct kvm_vcpu *tmp;
@ -45,14 +44,7 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
}
}
spin_lock(&vcpu->arch.reset_cntx_lock);
reset_cntx = &vcpu->arch.guest_reset_context;
reset_cntx->sepc = cp->a1;
reset_cntx->a0 = vcpu->vcpu_id;
reset_cntx->a1 = cp->a2;
spin_unlock(&vcpu->arch.reset_cntx_lock);
kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
kvm_riscv_vcpu_sbi_request_reset(vcpu, cp->a1, cp->a2);
/* userspace provides the suspend implementation */
kvm_riscv_vcpu_sbi_forward(vcpu, run);

View File

@ -22,6 +22,9 @@ void kvm_riscv_vcpu_vector_reset(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *cntx = &vcpu->arch.guest_context;
cntx->sstatus &= ~SR_VS;
cntx->vector.vlenb = riscv_v_vsize / 32;
if (riscv_isa_extension_available(isa, v)) {
cntx->sstatus |= SR_VS_INITIAL;
WARN_ON(!cntx->vector.datap);
@ -70,13 +73,11 @@ void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx)
__kvm_riscv_vector_restore(cntx);
}
int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu,
struct kvm_cpu_context *cntx)
int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu)
{
cntx->vector.datap = kmalloc(riscv_v_vsize, GFP_KERNEL);
if (!cntx->vector.datap)
vcpu->arch.guest_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL);
if (!vcpu->arch.guest_context.vector.datap)
return -ENOMEM;
cntx->vector.vlenb = riscv_v_vsize / 32;
vcpu->arch.host_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL);
if (!vcpu->arch.host_context.vector.datap)
@ -87,7 +88,7 @@ int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu,
void kvm_riscv_vcpu_free_vector_context(struct kvm_vcpu *vcpu)
{
kfree(vcpu->arch.guest_reset_context.vector.datap);
kfree(vcpu->arch.guest_context.vector.datap);
kfree(vcpu->arch.host_context.vector.datap);
}
#endif

View File

@ -209,6 +209,19 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
return r;
}
int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
{
switch (cap->cap) {
case KVM_CAP_RISCV_MP_STATE_RESET:
if (cap->flags)
return -EINVAL;
kvm->arch.mp_state_reset = true;
return 0;
default:
return -EINVAL;
}
}
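
As a rough userspace sketch of the new reset path (not part of this diff; vm_fd and vcpu_fd are assumed to be already-open KVM VM and vCPU file descriptors), the helper below enables KVM_CAP_RISCV_MP_STATE_RESET and then resets a vCPU by setting its MP state to KVM_MP_STATE_INIT_RECEIVED:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: opt in to the capability, then reset the vCPU from userspace. */
static int reset_vcpu_from_userspace(int vm_fd, int vcpu_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_RISCV_MP_STATE_RESET };
	struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_INIT_RECEIVED };

	/* cap.flags stays zero, as required by the enable_cap handler above. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		return -1;

	/* With the capability enabled, this takes the kvm_riscv_reset_vcpu() path. */
	return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
}

Without the capability enabled, the same KVM_SET_MP_STATE call returns -EINVAL, matching the set_mpstate handling earlier in this diff.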
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
return -EINVAL;

View File

@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
KVM_X86_OP(vm_init)
KVM_X86_OP_OPTIONAL(vm_destroy)
KVM_X86_OP_OPTIONAL(vm_pre_destroy)
KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
KVM_X86_OP(vcpu_create)
KVM_X86_OP(vcpu_free)
@ -115,6 +116,7 @@ KVM_X86_OP_OPTIONAL(pi_start_assignment)
KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
KVM_X86_OP_OPTIONAL(protected_apic_has_interrupt)
KVM_X86_OP_OPTIONAL(set_hv_timer)
KVM_X86_OP_OPTIONAL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
@ -125,7 +127,8 @@ KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
#endif
KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)

View File

@ -609,8 +609,15 @@ struct kvm_pmu {
struct kvm_pmu_ops;
enum {
KVM_DEBUGREG_BP_ENABLED = 1,
KVM_DEBUGREG_WONT_EXIT = 2,
KVM_DEBUGREG_BP_ENABLED = BIT(0),
KVM_DEBUGREG_WONT_EXIT = BIT(1),
/*
* Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by
* hardware on exit from or entry to the guest. KVM needn't switch them.
* DR0-3, DR6 and DR7 are set to their architectural INIT value on VM
* exit, so host values need to be restored.
*/
KVM_DEBUGREG_AUTO_SWITCH = BIT(2),
};
struct kvm_mtrr {
@ -1571,6 +1578,13 @@ struct kvm_arch {
struct kvm_mmu_memory_cache split_desc_cache;
gfn_t gfn_direct_bits;
/*
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
* value indicates that CPU dirty logging is unsupported or disabled
* for the current VM.
*/
int cpu_dirty_log_size;
};
struct kvm_vm_stat {
@ -1674,6 +1688,7 @@ struct kvm_x86_ops {
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
void (*vm_pre_destroy)(struct kvm *kvm);
/* Create, but do not attach this VCPU */
int (*vcpu_precreate)(struct kvm *kvm);
@ -1823,11 +1838,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
/*
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
* value indicates CPU dirty logging is unsupported or disabled.
*/
int cpu_dirty_log_size;
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
const struct kvm_x86_nested_ops *nested_ops;
@ -1841,6 +1851,7 @@ struct kvm_x86_ops {
void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu);
int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
bool *expired);
@ -1857,6 +1868,7 @@ struct kvm_x86_ops {
int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
@ -2333,6 +2345,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
int kvm_add_user_return_msr(u32 msr);
int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
void kvm_user_return_msr_update_cache(unsigned int index, u64 val);
static inline bool kvm_is_supported_user_return_msr(u32 msr)
{
@ -2416,7 +2429,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
KVM_X86_QUIRK_SLOT_ZAP_ALL | \
KVM_X86_QUIRK_STUFF_FEATURE_MSRS)
KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
KVM_X86_QUIRK_IGNORE_GUEST_PAT)
#define KVM_X86_CONDITIONAL_QUIRKS \
(KVM_X86_QUIRK_CD_NW_CLEARED | \
KVM_X86_QUIRK_IGNORE_GUEST_PAT)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was

View File

@ -81,6 +81,11 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
}
static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc)
{
return test_bit(vector, (unsigned long *)pi_desc->pir);
}
/* Non-atomic helpers */
static inline void __pi_set_sn(struct pi_desc *pi_desc)
{

View File

@ -67,11 +67,18 @@
#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)
/* TDX hypercall Leaf IDs */
#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000
#define TDVMCALL_MAP_GPA 0x10001
#define TDVMCALL_GET_QUOTE 0x10002
#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
#define TDVMCALL_STATUS_RETRY 1
/*
* TDG.VP.VMCALL Status Codes (returned in R10)
*/
#define TDVMCALL_STATUS_SUCCESS 0x0000000000000000ULL
#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL
#define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL
#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL
/*
* Bitmasks of exposed registers (with VMM).

View File

@ -5,6 +5,7 @@
#include <linux/init.h>
#include <linux/bits.h>
#include <linux/mmzone.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
@ -18,6 +19,7 @@
* TDX module.
*/
#define TDX_ERROR _BITUL(63)
#define TDX_NON_RECOVERABLE _BITUL(62)
#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40))
#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000))
@ -33,6 +35,8 @@
#ifndef __ASSEMBLER__
#include <uapi/asm/mce.h>
#include <asm/tdx_global_metadata.h>
#include <linux/pgtable.h>
/*
* Used by the #VE exception handler to gather the #VE exception
@ -119,11 +123,82 @@ static inline u64 sc_retry(sc_func_t func, u64 fn,
int tdx_cpu_enable(void);
int tdx_enable(void);
const char *tdx_dump_mce_info(struct mce *m);
const struct tdx_sys_info *tdx_get_sysinfo(void);
int tdx_guest_keyid_alloc(void);
u32 tdx_get_nr_guest_keyids(void);
void tdx_guest_keyid_free(unsigned int keyid);
struct tdx_td {
/* TD root structure: */
struct page *tdr_page;
int tdcs_nr_pages;
/* TD control structure: */
struct page **tdcs_pages;
/* Size of `tdcx_pages` in struct tdx_vp */
int tdcx_nr_pages;
};
struct tdx_vp {
/* TDVP root page */
struct page *tdvpr_page;
/* TD vCPU control structure: */
struct page **tdcx_pages;
};
static inline u64 mk_keyed_paddr(u16 hkid, struct page *page)
{
u64 ret;
ret = page_to_phys(page);
/* KeyID bits are just above the physical address bits: */
ret |= (u64)hkid << boot_cpu_data.x86_phys_bits;
return ret;
}
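
To make the KeyID placement concrete, here is a standalone, userspace-style sketch (not kernel code; the 52-bit physical-address width and the KeyID value are illustrative assumptions) mirroring what mk_keyed_paddr() computes:

#include <stdint.h>
#include <stdio.h>

/* Standalone illustration of placing the host KeyID just above the physical address bits. */
int main(void)
{
	uint64_t paddr = 0x123456000ULL;	/* example page physical address */
	unsigned int phys_bits = 52;		/* assumed boot_cpu_data.x86_phys_bits */
	uint16_t hkid = 3;			/* example host KeyID */

	uint64_t keyed = paddr | ((uint64_t)hkid << phys_bits);

	printf("keyed paddr = %#llx\n", (unsigned long long)keyed);
	return 0;
}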
static inline int pg_level_to_tdx_sept_level(enum pg_level level)
{
WARN_ON_ONCE(level == PG_LEVEL_NONE);
return level - 1;
}
u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args);
u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page);
u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2);
u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page);
u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2);
u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2);
u64 tdh_mng_key_config(struct tdx_td *td);
u64 tdh_mng_create(struct tdx_td *td, u16 hkid);
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp);
u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data);
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2);
u64 tdh_mr_finalize(struct tdx_td *td);
u64 tdh_vp_flush(struct tdx_vp *vp);
u64 tdh_mng_vpflushdone(struct tdx_td *td);
u64 tdh_mng_key_freeid(struct tdx_td *td);
u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err);
u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid);
u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data);
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask);
u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size);
u64 tdh_mem_track(struct tdx_td *tdr);
u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2);
u64 tdh_phymem_cache_wb(bool resume);
u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td);
u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page);
#else
static inline void tdx_init(void) { }
static inline int tdx_cpu_enable(void) { return -ENODEV; }
static inline int tdx_enable(void) { return -ENODEV; }
static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
#endif /* CONFIG_INTEL_TDX_HOST */
#endif /* !__ASSEMBLER__ */

View File

@ -17,9 +17,28 @@ struct tdx_sys_info_tdmr {
u16 pamt_1g_entry_size;
};
struct tdx_sys_info_td_ctrl {
u16 tdr_base_size;
u16 tdcs_base_size;
u16 tdvps_base_size;
};
struct tdx_sys_info_td_conf {
u64 attributes_fixed0;
u64 attributes_fixed1;
u64 xfam_fixed0;
u64 xfam_fixed1;
u16 num_cpuid_config;
u16 max_vcpus_per_td;
u64 cpuid_config_leaves[128];
u64 cpuid_config_values[128][2];
};
struct tdx_sys_info {
struct tdx_sys_info_features features;
struct tdx_sys_info_tdmr tdmr;
struct tdx_sys_info_td_ctrl td_ctrl;
struct tdx_sys_info_td_conf td_conf;
};
#endif

View File

@ -256,6 +256,7 @@ enum vmcs_field {
TSC_MULTIPLIER_HIGH = 0x00002033,
TERTIARY_VM_EXEC_CONTROL = 0x00002034,
TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035,
SHARED_EPT_POINTER = 0x0000203C,
PID_POINTER_TABLE = 0x00002042,
PID_POINTER_TABLE_HIGH = 0x00002043,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
@ -586,6 +587,7 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_PROT_READ BIT(3)
#define EPT_VIOLATION_PROT_WRITE BIT(4)
#define EPT_VIOLATION_PROT_EXEC BIT(5)
#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
EPT_VIOLATION_PROT_WRITE | \
EPT_VIOLATION_PROT_EXEC)

View File

@ -441,6 +441,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8)
#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@ -930,4 +931,74 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_SNP_VM 4
#define KVM_X86_TDX_VM 5
/* Trust Domain eXtension sub-ioctl() commands. */
enum kvm_tdx_cmd_id {
KVM_TDX_CAPABILITIES = 0,
KVM_TDX_INIT_VM,
KVM_TDX_INIT_VCPU,
KVM_TDX_INIT_MEM_REGION,
KVM_TDX_FINALIZE_VM,
KVM_TDX_GET_CPUID,
KVM_TDX_CMD_NR_MAX,
};
struct kvm_tdx_cmd {
/* enum kvm_tdx_cmd_id */
__u32 id;
/* flags for the sub-command. If the sub-command doesn't use this, set it to zero. */
__u32 flags;
/*
* data for each sub-command: either an immediate value or a pointer to
* the actual data in process virtual address space. If the sub-command
* doesn't use it, set it to zero.
*/
__u64 data;
/*
* Auxiliary error code. The sub-command may return a TDX SEAMCALL
* status code in addition to -Exxx.
*/
__u64 hw_error;
};
struct kvm_tdx_capabilities {
__u64 supported_attrs;
__u64 supported_xfam;
__u64 reserved[254];
/* Configurable CPUID bits for userspace */
struct kvm_cpuid2 cpuid;
};
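
A hedged userspace sketch of driving these sub-commands, assuming the vm ioctl path is KVM_MEMORY_ENCRYPT_OP (consistent with the mem_enc_ioctl hook changes elsewhere in this diff) and that vm_fd refers to a VM created as KVM_X86_TDX_VM:

#include <linux/kvm.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hypothetical helper: fetch TDX capabilities via the KVM_TDX_CAPABILITIES sub-command. */
static struct kvm_tdx_capabilities *tdx_query_caps(int vm_fd, __u32 nr_cpuid_entries)
{
	struct kvm_tdx_capabilities *caps;
	struct kvm_tdx_cmd cmd;

	caps = calloc(1, sizeof(*caps) +
			 nr_cpuid_entries * sizeof(struct kvm_cpuid_entry2));
	if (!caps)
		return NULL;
	caps->cpuid.nent = nr_cpuid_entries;

	memset(&cmd, 0, sizeof(cmd));
	cmd.id = KVM_TDX_CAPABILITIES;
	cmd.data = (__u64)(unsigned long)caps;	/* pointer passed as a u64 */

	if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd) < 0) {
		/* On failure, cmd.hw_error may hold a TDX SEAMCALL status code. */
		free(caps);
		return NULL;
	}
	return caps;
}

If the reserved cpuid.nent turns out to be too small, the ioctl is expected to fail and the caller can retry with a larger buffer.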
struct kvm_tdx_init_vm {
__u64 attributes;
__u64 xfam;
__u64 mrconfigid[6]; /* sha384 digest */
__u64 mrowner[6]; /* sha384 digest */
__u64 mrownerconfig[6]; /* sha384 digest */
/* The total space for TD_PARAMS before the CPUIDs is 256 bytes */
__u64 reserved[12];
/*
* Call KVM_TDX_INIT_VM before vcpu creation, and thus before
* KVM_SET_CPUID2.
* This configuration supersedes KVM_SET_CPUID2 for the vCPUs because
* the TDX module virtualizes those CPUID leaves directly, without the
* VMM. The userspace VMM, e.g. QEMU, should make KVM_SET_CPUID2
* consistent with these values; if it doesn't, KVM may have a wrong
* idea of the guest's CPUIDs and may wrongly emulate CPUIDs or MSRs
* that the TDX module doesn't virtualize.
*/
struct kvm_cpuid2 cpuid;
};
#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0)
struct kvm_tdx_init_mem_region {
__u64 source_addr;
__u64 gpa;
__u64 nr_pages;
};
#endif /* _ASM_X86_KVM_H */

View File

@ -34,6 +34,7 @@
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT_SIGNAL 3
#define EXIT_REASON_SIPI_SIGNAL 4
#define EXIT_REASON_OTHER_SMI 6
#define EXIT_REASON_INTERRUPT_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
@ -92,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
#define EXIT_REASON_TDCALL 77
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@ -155,7 +157,8 @@
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
{ EXIT_REASON_TPAUSE, "TPAUSE" }, \
{ EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
{ EXIT_REASON_NOTIFY, "NOTIFY" }
{ EXIT_REASON_NOTIFY, "NOTIFY" }, \
{ EXIT_REASON_TDCALL, "TDCALL" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }

View File

@ -352,7 +352,7 @@ static noinstr bool handle_bug(struct pt_regs *regs)
case BUG_UD1_UBSAN:
if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
pr_crit("%s at %pS\n",
report_ubsan_failure(regs, ud_imm),
report_ubsan_failure(ud_imm),
(void *)regs->ip);
}
break;

View File

@ -95,6 +95,8 @@ config KVM_SW_PROTECTED_VM
config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL
select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
help
Provides support for KVM on processors equipped with Intel's VT
extensions, a.k.a. Virtual Machine Extensions (VMX).
@ -129,6 +131,16 @@ config X86_SGX_KVM
If unsure, say N.
config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
help
Provides support for launching Intel Trust Domain Extensions (TDX)
confidential VMs on Intel processors.
If unsure, say N.
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON)

View File

@ -20,6 +20,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o

View File

@ -81,17 +81,8 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
/*
* Magic value used by KVM when querying userspace-provided CPUID entries and
* doesn't care about the CPUID index because the index of the function in
* question is not significant. Note, this magic value must have at least one
* bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
* to avoid false positives when processing guest CPUID input.
*/
#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu,
u32 function, u64 index)
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
{
struct kvm_cpuid_entry2 *e;
int i;
@ -108,8 +99,8 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu,
*/
lockdep_assert_irqs_enabled();
for (i = 0; i < vcpu->arch.cpuid_nent; i++) {
e = &vcpu->arch.cpuid_entries[i];
for (i = 0; i < nent; i++) {
e = &entries[i];
if (e->function != function)
continue;
@ -140,26 +131,7 @@ static struct kvm_cpuid_entry2 *cpuid_entry2_find(struct kvm_vcpu *vcpu,
return NULL;
}
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
u32 function, u32 index)
{
return cpuid_entry2_find(vcpu, function, index);
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function)
{
return cpuid_entry2_find(vcpu, function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
* cpuid_entry2_find() and KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used
* directly outside of kvm_find_cpuid_entry() and kvm_find_cpuid_entry_index().
*/
#undef KVM_CPUID_INDEX_NOT_SIGNIFICANT
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2);
static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
{
@ -492,6 +464,20 @@ not_found:
return 36;
}
int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000000);
if (!best || best->eax < 0x80000008)
goto not_found;
best = kvm_find_cpuid_entry(vcpu, 0x80000008);
if (best)
return (best->eax >> 16) & 0xff;
not_found:
return 0;
}
/*
* This "raw" version returns the reserved GPA bits without any adjustments for
* encryption technologies that usurp bits. The raw mask should be used if and

View File

@ -11,10 +11,34 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
u32 function, u32 index);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
int nent, u32 function, u64 index);
/*
* Magic value used by KVM when querying userspace-provided CPUID entries and
* doesn't care about the CPUID index because the index of the function in
* question is not significant. Note, this magic value must have at least one
* bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2()
* to avoid false positives when processing guest CPUID input.
*
* KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of
* kvm_find_cpuid_entry2() and kvm_find_cpuid_entry().
*/
#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
u32 function, u32 index)
{
return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
function, index);
}
static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function)
{
return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
}
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries,
unsigned int type);
@ -34,6 +58,7 @@ void __init kvm_init_xstate_sizes(void);
u32 xstate_required_size(u64 xstate_bv, bool compacted);
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu);
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)

View File

@ -100,6 +100,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
if (kvm_cpu_has_extint(v))
return 1;
if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected)
return kvm_x86_call(protected_apic_has_interrupt)(v);
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
}
EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);

View File

@ -1790,8 +1790,17 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT);
u32 reg;
/*
* Assume a timer IRQ was "injected" if the APIC is protected. KVM's
* copy of the vIRR is bogus; it's the responsibility of the caller to
* precisely check whether or not a timer IRQ is pending.
*/
if (apic->guest_apic_protected)
return true;
reg = kvm_lapic_get_reg(apic, APIC_LVTT);
if (kvm_apic_hw_enabled(apic)) {
int vec = reg & APIC_VECTOR_MASK;
void *bitmap = apic->regs + APIC_ISR;
@ -2650,6 +2659,7 @@ int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_apic_set_base);
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
@ -2958,6 +2968,9 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
if (!kvm_apic_present(vcpu))
return -1;
if (apic->guest_apic_protected)
return -1;
__apic_update_ppr(apic, &ppr);
return apic_has_interrupt_for_ppr(apic, ppr);
}

View File

@ -65,6 +65,8 @@ struct kvm_lapic {
bool sw_enabled;
bool irr_pending;
bool lvt0_in_nmi_mode;
/* Select registers in the vAPIC cannot be read/written. */
bool guest_apic_protected;
/* Number of bits set in ISR. */
s16 isr_count;
/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */

View File

@ -79,6 +79,7 @@ static inline gfn_t kvm_mmu_max_gfn(void)
u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
@ -234,7 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
bool kvm_mmu_may_ignore_guest_pat(void);
bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
@ -256,6 +257,9 @@ extern bool tdp_mmu_enabled;
#define tdp_mmu_enabled false
#endif
bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);

View File

@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed;
#ifdef CONFIG_X86_64
bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
EXPORT_SYMBOL_GPL(tdp_mmu_enabled);
#endif
static int max_huge_page_level __read_mostly;
@ -1456,15 +1457,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
* enabled but it chooses between clearing the Dirty bit and Writeable
* bit based on the context.
*/
if (kvm_x86_ops.cpu_dirty_log_size)
if (kvm->arch.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
int kvm_cpu_dirty_log_size(void)
int kvm_cpu_dirty_log_size(struct kvm *kvm)
{
return kvm_x86_ops.cpu_dirty_log_size;
return kvm->arch.cpu_dirty_log_size;
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
@ -4835,19 +4836,6 @@ out_unlock:
}
#endif
bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
* When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
* has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
* honor the memtype from the guest's PAT so that guest accesses to
* memory that is DMA'd aren't cached against the guest's wishes. As a
* result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
* KVM _always_ ignores guest PAT (when EPT is enabled).
*/
return shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
#ifdef CONFIG_X86_64
@ -4858,8 +4846,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
u8 *level)
int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
{
int r;
@ -4873,6 +4860,10 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
do {
if (signal_pending(current))
return -EINTR;
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
return -EIO;
cond_resched();
r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
} while (r == RET_PF_RETRY);
@ -4897,6 +4888,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
return -EIO;
}
}
EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
@ -5589,12 +5581,19 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
int maxpa;
if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM)
maxpa = cpuid_query_maxguestphyaddr(vcpu);
else
maxpa = cpuid_maxphyaddr(vcpu);
/* tdp_root_level is architecture forced level, use it if nonzero */
if (tdp_root_level)
return tdp_root_level;
/* Use 5-level TDP if and only if it's useful/necessary. */
if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
if (max_tdp_level == 5 && maxpa <= 48)
return 4;
return max_tdp_level;
@ -5913,6 +5912,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@ -7239,6 +7239,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm,
.start = slot->base_gfn,
.end = slot->base_gfn + slot->npages,
.may_block = true,
.attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED,
};
bool flush;

View File

@ -187,7 +187,8 @@ static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mm
return kvm_gfn_direct_bits(kvm);
}
static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
/*
* When using the EPT page-modification log, the GPAs in the CPU dirty
@ -197,7 +198,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
* being enabled is mandatory as the bits used to denote WP-only SPTEs
* are reserved for PAE paging (32-bit KVM).
*/
return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode;
}
static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)

Some files were not shown because too many files have changed in this diff.