aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe14
-rw-r--r--Documentation/devicetree/bindings/arm/ete.yaml75
-rw-r--r--Documentation/devicetree/bindings/arm/trbe.yaml49
-rw-r--r--Documentation/trace/coresight/coresight-trbe.rst38
-rw-r--r--Documentation/virt/kvm/amd-memory-encryption.rst143
-rw-r--r--Documentation/virt/kvm/api.rst214
-rw-r--r--Documentation/virt/kvm/arm/index.rst1
-rw-r--r--Documentation/virt/kvm/arm/ptp_kvm.rst25
-rw-r--r--Documentation/virt/kvm/devices/arm-vgic-its.rst2
-rw-r--r--Documentation/virt/kvm/devices/arm-vgic-v3.rst2
-rw-r--r--Documentation/virt/kvm/locking.rst49
-rw-r--r--Documentation/virt/kvm/s390-diag.rst33
-rw-r--r--MAINTAINERS6
-rw-r--r--arch/arm/include/asm/hypervisor.h3
-rw-r--r--arch/arm64/include/asm/assembler.h27
-rw-r--r--arch/arm64/include/asm/barrier.h1
-rw-r--r--arch/arm64/include/asm/el2_setup.h13
-rw-r--r--arch/arm64/include/asm/fpsimd.h11
-rw-r--r--arch/arm64/include/asm/fpsimdmacros.h10
-rw-r--r--arch/arm64/include/asm/hyp_image.h7
-rw-r--r--arch/arm64/include/asm/hypervisor.h3
-rw-r--r--arch/arm64/include/asm/kvm_arm.h2
-rw-r--r--arch/arm64/include/asm/kvm_asm.h9
-rw-r--r--arch/arm64/include/asm/kvm_host.h55
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h14
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h25
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h164
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h4
-rw-r--r--arch/arm64/include/asm/sections.h1
-rw-r--r--arch/arm64/include/asm/sysreg.h59
-rw-r--r--arch/arm64/kernel/asm-offsets.c3
-rw-r--r--arch/arm64/kernel/cpu-reset.S5
-rw-r--r--arch/arm64/kernel/hyp-stub.S3
-rw-r--r--arch/arm64/kernel/image-vars.h34
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S74
-rw-r--r--arch/arm64/kvm/arm.c220
-rw-r--r--arch/arm64/kvm/debug.c118
-rw-r--r--arch/arm64/kvm/fpsimd.c26
-rw-r--r--arch/arm64/kvm/guest.c11
-rw-r--r--arch/arm64/kvm/handle_exit.c45
-rw-r--r--arch/arm64/kvm/hyp/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/fpsimd.S10
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h107
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/early_alloc.h14
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/gfp.h68
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mem_protect.h36
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/memory.h51
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mm.h96
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/spinlock.h92
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/cache.S13
-rw-r--r--arch/arm64/kvm/hyp/nvhe/debug-sr.c56
-rw-r--r--arch/arm64/kvm/hyp/nvhe/early_alloc.c54
-rw-r--r--arch/arm64/kvm/hyp/nvhe/gen-hyprel.c18
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S18
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S54
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c75
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-smp.c6
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S1
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c279
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mm.c173
-rw-r--r--arch/arm64/kvm/hyp/nvhe/page_alloc.c195
-rw-r--r--arch/arm64/kvm/hyp/nvhe/psci-relay.c4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c214
-rw-r--r--arch/arm64/kvm/hyp/nvhe/stub.c22
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c26
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c4
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c410
-rw-r--r--arch/arm64/kvm/hyp/reserved_mem.c113
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c4
-rw-r--r--arch/arm64/kvm/hypercalls.c80
-rw-r--r--arch/arm64/kvm/mmu.c254
-rw-r--r--arch/arm64/kvm/perf.c7
-rw-r--r--arch/arm64/kvm/pmu-emul.c2
-rw-r--r--arch/arm64/kvm/pmu.c8
-rw-r--r--arch/arm64/kvm/reset.c51
-rw-r--r--arch/arm64/kvm/sys_regs.c16
-rw-r--r--arch/arm64/kvm/trace_arm.h66
-rw-r--r--arch/arm64/kvm/va_layout.c7
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c12
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c6
-rw-r--r--arch/arm64/kvm/vgic/vgic-kvm-device.c7
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c81
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c10
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c66
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c38
-rw-r--r--arch/arm64/kvm/vgic/vgic.h2
-rw-r--r--arch/arm64/lib/clear_page.S4
-rw-r--r--arch/arm64/lib/copy_page.S4
-rw-r--r--arch/arm64/mm/init.c3
-rw-r--r--arch/mips/include/asm/kvm_host.h17
-rw-r--r--arch/mips/kvm/mips.c21
-rw-r--r--arch/mips/kvm/mmu.c100
-rw-r--r--arch/mips/kvm/vz.c19
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h12
-rw-r--r--arch/powerpc/include/asm/kvm_host.h7
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h9
-rw-r--r--arch/powerpc/kvm/book3s.c18
-rw-r--r--arch/powerpc/kvm/book3s.h10
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c98
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c25
-rw-r--r--arch/powerpc/kvm/book3s_hv.c12
-rw-r--r--arch/powerpc/kvm/book3s_pr.c56
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c29
-rw-r--r--arch/powerpc/kvm/trace_booke.h15
-rw-r--r--arch/s390/include/asm/kvm_host.h5
-rw-r--r--arch/s390/include/asm/smp.h1
-rw-r--r--arch/s390/kernel/smp.c1
-rw-r--r--arch/s390/kvm/diag.c31
-rw-r--r--arch/s390/kvm/gaccess.c30
-rw-r--r--arch/s390/kvm/gaccess.h60
-rw-r--r--arch/s390/kvm/kvm-s390.c15
-rw-r--r--arch/s390/kvm/kvm-s390.h8
-rw-r--r--arch/s390/kvm/vsie.c109
-rw-r--r--arch/sh/kernel/perf_event.c18
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h67
-rw-r--r--arch/x86/include/asm/mem_encrypt.h1
-rw-r--r--arch/x86/include/asm/svm.h4
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--arch/x86/kernel/kvm.c128
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c98
-rw-r--r--arch/x86/kvm/cpuid.h155
-rw-r--r--arch/x86/kvm/emulate.c80
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h19
-rw-r--r--arch/x86/kvm/lapic.c8
-rw-r--r--arch/x86/kvm/mmu.h23
-rw-r--r--arch/x86/kvm/mmu/mmu.c637
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c2
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h44
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h3
-rw-r--r--arch/x86/kvm/mmu/spte.c159
-rw-r--r--arch/x86/kvm/mmu/spte.h141
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c740
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h51
-rw-r--r--arch/x86/kvm/reverse_cpuid.h186
-rw-r--r--arch/x86/kvm/svm/avic.c24
-rw-r--r--arch/x86/kvm/svm/nested.c573
-rw-r--r--arch/x86/kvm/svm/sev.c922
-rw-r--r--arch/x86/kvm/svm/svm.c1107
-rw-r--r--arch/x86/kvm/svm/svm.h91
-rw-r--r--arch/x86/kvm/svm/vmenter.S47
-rw-r--r--arch/x86/kvm/vmx/nested.c83
-rw-r--r--arch/x86/kvm/vmx/nested.h5
-rw-r--r--arch/x86/kvm/vmx/sgx.c502
-rw-r--r--arch/x86/kvm/vmx/sgx.h34
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c1
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h4
-rw-r--r--arch/x86/kvm/vmx/vmx.c432
-rw-r--r--arch/x86/kvm/vmx/vmx.h39
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h4
-rw-r--r--arch/x86/kvm/x86.c214
-rw-r--r--arch/x86/kvm/x86.h18
-rw-r--r--arch/x86/mm/mem_encrypt.c10
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c1
-rw-r--r--drivers/clocksource/arm_arch_timer.c36
-rw-r--r--drivers/crypto/ccp/sev-dev.c193
-rw-r--r--drivers/crypto/ccp/sev-dev.h4
-rw-r--r--drivers/firmware/psci/psci.c2
-rw-r--r--drivers/firmware/smccc/Makefile2
-rw-r--r--drivers/firmware/smccc/kvm_guest.c50
-rw-r--r--drivers/firmware/smccc/smccc.c1
-rw-r--r--drivers/hwtracing/coresight/Kconfig24
-rw-r--r--drivers/hwtracing/coresight/Makefile1
-rw-r--r--drivers/hwtracing/coresight/coresight-core.c29
-rw-r--r--drivers/hwtracing/coresight/coresight-etm-perf.c119
-rw-r--r--drivers/hwtracing/coresight/coresight-etm4x-core.c161
-rw-r--r--drivers/hwtracing/coresight/coresight-etm4x-sysfs.c19
-rw-r--r--drivers/hwtracing/coresight/coresight-etm4x.h83
-rw-r--r--drivers/hwtracing/coresight/coresight-platform.c6
-rw-r--r--drivers/hwtracing/coresight/coresight-priv.h3
-rw-r--r--drivers/hwtracing/coresight/coresight-trbe.c1157
-rw-r--r--drivers/hwtracing/coresight/coresight-trbe.h152
-rw-r--r--drivers/irqchip/irq-gic-v3-its.c18
-rw-r--r--drivers/perf/arm_pmu.c30
-rw-r--r--drivers/ptp/Kconfig2
-rw-r--r--drivers/ptp/Makefile2
-rw-r--r--drivers/ptp/ptp_kvm_arm.c28
-rw-r--r--drivers/ptp/ptp_kvm_common.c (renamed from drivers/ptp/ptp_kvm.c)85
-rw-r--r--drivers/ptp/ptp_kvm_x86.c97
-rw-r--r--include/kvm/arm_pmu.h4
-rw-r--r--include/kvm/arm_vgic.h1
-rw-r--r--include/linux/arm-smccc.h41
-rw-r--r--include/linux/bug.h10
-rw-r--r--include/linux/clocksource.h6
-rw-r--r--include/linux/clocksource_ids.h12
-rw-r--r--include/linux/coresight.h13
-rw-r--r--include/linux/kvm_host.h24
-rw-r--r--include/linux/perf_event.h2
-rw-r--r--include/linux/psp-sev.h18
-rw-r--r--include/linux/ptp_kvm.h19
-rw-r--r--include/linux/timekeeping.h12
-rw-r--r--include/trace/events/kvm.h90
-rw-r--r--include/uapi/linux/kvm.h45
-rw-r--r--include/uapi/linux/perf_event.h13
-rw-r--r--kernel/events/core.c5
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/timekeeping.c1
-rw-r--r--lib/bug.c54
-rw-r--r--tools/include/asm-generic/hugetlb_encode.h3
-rw-r--r--tools/testing/selftests/kvm/.gitignore2
-rw-r--r--tools/testing/selftests/kvm/Makefile4
-rw-r--r--tools/testing/selftests/kvm/aarch64/vgic_init.c551
-rw-r--r--tools/testing/selftests/kvm/dirty_log_test.c69
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h13
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h21
-rw-r--r--tools/testing/selftests/kvm/kvm_page_table_test.c506
-rw-r--r--tools/testing/selftests/kvm/lib/assert.c4
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c138
-rw-r--r--tools/testing/selftests/kvm/lib/test_util.c163
-rw-r--r--tools/testing/selftests/kvm/set_memory_region_test.c61
-rw-r--r--tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c2
-rw-r--r--virt/kvm/coalesced_mmio.c19
-rw-r--r--virt/kvm/kvm_main.c303
216 files changed, 12447 insertions, 4022 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe b/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe
new file mode 100644
index 000000000000..ad3bbc6fa751
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe
@@ -0,0 +1,14 @@
+What: /sys/bus/coresight/devices/trbe<cpu>/align
+Date: March 2021
+KernelVersion: 5.13
+Contact: Anshuman Khandual <anshuman.khandual@arm.com>
+Description: (Read) Shows the TRBE write pointer alignment. This value
+ is fetched from the TRBIDR register.
+
+What: /sys/bus/coresight/devices/trbe<cpu>/flag
+Date: March 2021
+KernelVersion: 5.13
+Contact: Anshuman Khandual <anshuman.khandual@arm.com>
+Description: (Read) Shows if TRBE updates in the memory are with access
+ and dirty flag updates as well. This value is fetched from
+ the TRBIDR register.
diff --git a/Documentation/devicetree/bindings/arm/ete.yaml b/Documentation/devicetree/bindings/arm/ete.yaml
new file mode 100644
index 000000000000..7f9b2d1e1147
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/ete.yaml
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/ete.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Embedded Trace Extensions
+
+maintainers:
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+
+description: |
+ Arm Embedded Trace Extension(ETE) is a per CPU trace component that
+ allows tracing the CPU execution. It overlaps with the CoreSight ETMv4
+ architecture and has extended support for future architecture changes.
+ The trace generated by the ETE could be stored via legacy CoreSight
+ components (e.g, TMC-ETR) or other means (e.g, using a per CPU buffer
+ Arm Trace Buffer Extension (TRBE)). Since the ETE can be connected to
+ legacy CoreSight components, a node must be listed per instance, along
+ with any optional connection graph as per the coresight bindings.
+ See bindings/arm/coresight.txt.
+
+properties:
+ $nodename:
+ pattern: "^ete([0-9a-f]+)$"
+ compatible:
+ items:
+ - const: arm,embedded-trace-extension
+
+ cpu:
+ description: |
+ Handle to the cpu this ETE is bound to.
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ out-ports:
+ description: |
+ Output connections from the ETE to legacy CoreSight trace bus.
+ $ref: /schemas/graph.yaml#/properties/ports
+ properties:
+ port:
+ description: Output connection from the ETE to legacy CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - cpu
+
+additionalProperties: false
+
+examples:
+
+# An ETE node without legacy CoreSight connections
+ - |
+ ete0 {
+ compatible = "arm,embedded-trace-extension";
+ cpu = <&cpu_0>;
+ };
+# An ETE node with legacy CoreSight connections
+ - |
+ ete1 {
+ compatible = "arm,embedded-trace-extension";
+ cpu = <&cpu_1>;
+
+ out-ports { /* legacy coresight connection */
+ port {
+ ete1_out_port: endpoint {
+ remote-endpoint = <&funnel_in_port0>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/trbe.yaml b/Documentation/devicetree/bindings/arm/trbe.yaml
new file mode 100644
index 000000000000..4402d7bfd1fc
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/trbe.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/trbe.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Trace Buffer Extensions
+
+maintainers:
+ - Anshuman Khandual <anshuman.khandual@arm.com>
+
+description: |
+ Arm Trace Buffer Extension (TRBE) is a per CPU component
+ for storing trace generated on the CPU to memory. It is
+ accessed via CPU system registers. The software can verify
+ if it is permitted to use the component by checking the
+ TRBIDR register.
+
+properties:
+ $nodename:
+ const: "trbe"
+ compatible:
+ items:
+ - const: arm,trace-buffer-extension
+
+ interrupts:
+ description: |
+ Exactly 1 PPI must be listed. For heterogeneous systems where
+ TRBE is only supported on a subset of the CPUs, please consult
+ the arm,gic-v3 binding for details on describing a PPI partition.
+ maxItems: 1
+
+required:
+ - compatible
+ - interrupts
+
+additionalProperties: false
+
+examples:
+
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ trbe {
+ compatible = "arm,trace-buffer-extension";
+ interrupts = <GIC_PPI 15 IRQ_TYPE_LEVEL_HIGH>;
+ };
+...
diff --git a/Documentation/trace/coresight/coresight-trbe.rst b/Documentation/trace/coresight/coresight-trbe.rst
new file mode 100644
index 000000000000..b9928ef148da
--- /dev/null
+++ b/Documentation/trace/coresight/coresight-trbe.rst
@@ -0,0 +1,38 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Trace Buffer Extension (TRBE).
+==============================
+
+ :Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ :Date: November 2020
+
+Hardware Description
+--------------------
+
+Trace Buffer Extension (TRBE) is a percpu hardware which captures in system
+memory, CPU traces generated from a corresponding percpu tracing unit. This
+gets plugged in as a coresight sink device because the corresponding trace
+generators (ETE), are plugged in as source device.
+
+The TRBE is not compliant to CoreSight architecture specifications, but is
+driven via the CoreSight driver framework to support the ETE (which is
+CoreSight compliant) integration.
+
+Sysfs files and directories
+---------------------------
+
+The TRBE devices appear on the existing coresight bus alongside the other
+coresight devices::
+
+ >$ ls /sys/bus/coresight/devices
+ trbe0 trbe1 trbe2 trbe3
+
+The ``trbe<N>`` named TRBEs are associated with a CPU.::
+
+ >$ ls /sys/bus/coresight/devices/trbe0/
+ align flag
+
+*Key file items are:-*
+ * ``align``: TRBE write pointer alignment
+ * ``flag``: TRBE updates memory with access and dirty flags
diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 469a6308765b..5ec8a1902e15 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -148,6 +148,9 @@ measurement. Since the guest owner knows the initial contents of the guest at
boot, the measurement can be verified by comparing it to what the guest owner
expects.
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
Parameters (in): struct kvm_sev_launch_measure
Returns: 0 on success, -negative on error
@@ -271,6 +274,9 @@ report containing the SHA-256 digest of the guest memory and VMSA passed through
commands and signed with the PEK. The digest returned by the command should match the digest
used by the guest owner with the KVM_SEV_LAUNCH_MEASURE.
+If len is zero on entry, the measurement blob length is written to len and
+uaddr is unused.
+
Parameters (in): struct kvm_sev_attestation
Returns: 0 on success, -negative on error
@@ -284,6 +290,143 @@ Returns: 0 on success, -negative on error
__u32 len;
};
+11. KVM_SEV_SEND_START
+----------------------
+
+The KVM_SEV_SEND_START command can be used by the hypervisor to create an
+outgoing guest encryption context.
+
+If session_len is zero on entry, the length of the guest session information is
+written to session_len and all other fields are not used.
+
+Parameters (in): struct kvm_sev_send_start
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_send_start {
+ __u32 policy; /* guest policy */
+
+ __u64 pdh_cert_uaddr; /* platform Diffie-Hellman certificate */
+ __u32 pdh_cert_len;
+
+ __u64 plat_certs_uaddr; /* platform certificate chain */
+ __u32 plat_certs_len;
+
+ __u64 amd_certs_uaddr; /* AMD certificate */
+ __u32 amd_certs_len;
+
+ __u64 session_uaddr; /* Guest session information */
+ __u32 session_len;
+ };
+
+12. KVM_SEV_SEND_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_SEND_UPDATE_DATA command can be used by the hypervisor to encrypt the
+outgoing guest memory region with the encryption context creating using
+KVM_SEV_SEND_START.
+
+If hdr_len or trans_len are zero on entry, the length of the packet header and
+transport region are written to hdr_len and trans_len respectively, and all
+other fields are not used.
+
+Parameters (in): struct kvm_sev_send_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_launch_send_update_data {
+ __u64 hdr_uaddr; /* userspace address containing the packet header */
+ __u32 hdr_len;
+
+ __u64 guest_uaddr; /* the source memory region to be encrypted */
+ __u32 guest_len;
+
+ __u64 trans_uaddr; /* the destination memory region */
+ __u32 trans_len;
+ };
+
+13. KVM_SEV_SEND_FINISH
+------------------------
+
+After completion of the migration flow, the KVM_SEV_SEND_FINISH command can be
+issued by the hypervisor to delete the encryption context.
+
+Returns: 0 on success, -negative on error
+
+14. KVM_SEV_SEND_CANCEL
+------------------------
+
+After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the
+SEND_CANCEL command to stop a migration. This is necessary so that a cancelled
+migration can restart with a new target later.
+
+Returns: 0 on success, -negative on error
+
+15. KVM_SEV_RECEIVE_START
+-------------------------
+
+The KVM_SEV_RECEIVE_START command is used for creating the memory encryption
+context for an incoming SEV guest. To create the encryption context, the user must
+provide a guest policy, the platform public Diffie-Hellman (PDH) key and session
+information.
+
+Parameters: struct kvm_sev_receive_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_receive_start {
+ __u32 handle; /* if zero then firmware creates a new handle */
+ __u32 policy; /* guest's policy */
+
+ __u64 pdh_uaddr; /* userspace address pointing to the PDH key */
+ __u32 pdh_len;
+
+ __u64 session_uaddr; /* userspace address which points to the guest session information */
+ __u32 session_len;
+ };
+
+On success, the 'handle' field contains a new handle and on error, a negative value.
+
+For more details, see SEV spec Section 6.12.
+
+16. KVM_SEV_RECEIVE_UPDATE_DATA
+-------------------------------
+
+The KVM_SEV_RECEIVE_UPDATE_DATA command can be used by the hypervisor to copy
+the incoming buffers into the guest memory region with encryption context
+created during the KVM_SEV_RECEIVE_START.
+
+Parameters (in): struct kvm_sev_receive_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+ struct kvm_sev_launch_receive_update_data {
+ __u64 hdr_uaddr; /* userspace address containing the packet header */
+ __u32 hdr_len;
+
+ __u64 guest_uaddr; /* the destination guest memory region */
+ __u32 guest_len;
+
+ __u64 trans_uaddr; /* the incoming buffer memory region */
+ __u32 trans_len;
+ };
+
+17. KVM_SEV_RECEIVE_FINISH
+--------------------------
+
+After completion of the migration flow, the KVM_SEV_RECEIVE_FINISH command can be
+issued by the hypervisor to make the guest ready for execution.
+
+Returns: 0 on success, -negative on error
+
References
==========
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 245d80581f15..22d077562149 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -204,7 +204,7 @@ Errors:
====== ============================================================
EFAULT the msr index list cannot be read from or written to
- E2BIG the msr index list is to be to fit in the array specified by
+ E2BIG the msr index list is too big to fit in the array specified by
the user.
====== ============================================================
@@ -3116,6 +3116,18 @@ optional features it should have.  This will cause a reset of the cpu
registers to their initial values.  If this is not called, KVM_RUN will
return ENOEXEC for that vcpu.
+The initial values are defined as:
+ - Processor state:
+ * AArch64: EL1h, D, A, I and F bits set. All other bits
+ are cleared.
+ * AArch32: SVC, A, I and F bits set. All other bits are
+ cleared.
+ - General Purpose registers, including PC and SP: set to 0
+ - FPSIMD/NEON registers: set to 0
+ - SVE registers: set to 0
+ - System registers: Reset to their architecturally defined
+ values as for a warm reset to EL1 (resp. SVC)
+
Note that because some registers reflect machine topology, all vcpus
should be created before this ioctl is invoked.
@@ -3335,7 +3347,8 @@ The top 16 bits of the control field are architecture specific control
flags which can include the following:
- KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64]
- - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64]
+ - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390]
+ - KVM_GUESTDBG_USE_HW: using hardware debug events [arm64]
- KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86]
- KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86]
- KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390]
@@ -3358,6 +3371,9 @@ indicating the number of supported registers.
For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether
the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported.
+Also when supported, KVM_CAP_SET_GUEST_DEBUG2 capability indicates the
+supported KVM_GUESTDBG_* bits in the control field.
+
When debug events exit the main run loop with the reason
KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
structure containing architecture specific debug information.
@@ -3690,31 +3706,105 @@ which is the maximum number of possibly pending cpu-local interrupts.
Queues an SMI on the thread's vcpu.
-4.97 KVM_CAP_PPC_MULTITCE
--------------------------
+4.97 KVM_X86_SET_MSR_FILTER
+----------------------------
-:Capability: KVM_CAP_PPC_MULTITCE
-:Architectures: ppc
-:Type: vm
+:Capability: KVM_X86_SET_MSR_FILTER
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_filter
+:Returns: 0 on success, < 0 on error
-This capability means the kernel is capable of handling hypercalls
-H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
-space. This significantly accelerates DMA operations for PPC KVM guests.
-User space should expect that its handlers for these hypercalls
-are not going to be called if user space previously registered LIOBN
-in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+::
-In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
-user space might have to advertise it for the guest. For example,
-IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
-present in the "ibm,hypertas-functions" device-tree property.
+ struct kvm_msr_filter_range {
+ #define KVM_MSR_FILTER_READ (1 << 0)
+ #define KVM_MSR_FILTER_WRITE (1 << 1)
+ __u32 flags;
+ __u32 nmsrs; /* number of msrs in bitmap */
+ __u32 base; /* MSR index the bitmap starts at */
+ __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */
+ };
-The hypercalls mentioned above may or may not be processed successfully
-in the kernel based fast path. If they can not be handled by the kernel,
-they will get passed on to user space. So user space still has to have
-an implementation for these despite the in kernel acceleration.
+ #define KVM_MSR_FILTER_MAX_RANGES 16
+ struct kvm_msr_filter {
+ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+ #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+ __u32 flags;
+ struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
+ };
-This capability is always enabled.
+flags values for ``struct kvm_msr_filter_range``:
+
+``KVM_MSR_FILTER_READ``
+
+ Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a read should immediately fail, while a 1 indicates that
+ a read for a particular MSR should be handled regardless of the default
+ filter action.
+
+``KVM_MSR_FILTER_WRITE``
+
+ Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap
+ indicates that a write should immediately fail, while a 1 indicates that
+ a write for a particular MSR should be handled regardless of the default
+ filter action.
+
+``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE``
+
+ Filter both read and write accesses to MSRs using the given bitmap. A 0
+ in the bitmap indicates that both reads and writes should immediately fail,
+ while a 1 indicates that reads and writes for a particular MSR are not
+ filtered by this range.
+
+flags values for ``struct kvm_msr_filter``:
+
+``KVM_MSR_FILTER_DEFAULT_ALLOW``
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to allowing access to the MSR.
+
+``KVM_MSR_FILTER_DEFAULT_DENY``
+
+ If no filter range matches an MSR index that is getting accessed, KVM will
+ fall back to rejecting access to the MSR. In this mode, all MSRs that should
+ be processed by KVM need to explicitly be marked as allowed in the bitmaps.
+
+This ioctl allows user space to define up to 16 bitmaps of MSR ranges to
+specify whether a certain MSR access should be explicitly filtered for or not.
+
+If this ioctl has never been invoked, MSR accesses are not guarded and the
+default KVM in-kernel emulation behavior is fully preserved.
+
+Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR
+filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
+an error.
+
+As soon as the filtering is in place, every MSR access is processed through
+the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff);
+x2APIC MSRs are always allowed, independent of the ``default_allow`` setting,
+and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base
+register.
+
+If a bit is within one of the defined ranges, read and write accesses are
+guarded by the bitmap's value for the MSR index if the kind of access
+is included in the ``struct kvm_msr_filter_range`` flags. If no range
+cover this particular access, the behavior is determined by the flags
+field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW``
+and ``KVM_MSR_FILTER_DEFAULT_DENY``.
+
+Each bitmap range specifies a range of MSRs to potentially allow access on.
+The range goes from MSR index [base .. base+nmsrs]. The flags field
+indicates whether reads, writes or both reads and writes are filtered
+by setting a 1 bit in the bitmap for the corresponding MSR index.
+
+If an MSR access is not permitted through the filtering, it generates a
+#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that
+allows user space to deflect and potentially handle various MSR accesses
+into user space.
+
+If a vCPU is in running state while this ioctl is invoked, the vCPU may
+experience inconsistent filtering behavior on MSR accesses.
4.98 KVM_CREATE_SPAPR_TCE_64
----------------------------
@@ -4855,7 +4945,7 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
Sets the exception vector used to deliver Xen event channel upcalls.
-4.128 KVM_XEN_HVM_GET_ATTR
+4.127 KVM_XEN_HVM_GET_ATTR
--------------------------
:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4867,7 +4957,7 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
Allows Xen VM attributes to be read. For the structure and types,
see KVM_XEN_HVM_SET_ATTR above.
-4.129 KVM_XEN_VCPU_SET_ATTR
+4.128 KVM_XEN_VCPU_SET_ATTR
---------------------------
:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -4929,7 +5019,7 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
or RUNSTATE_offline) to set the current accounted state as of the
adjusted state_entry_time.
-4.130 KVM_XEN_VCPU_GET_ATTR
+4.129 KVM_XEN_VCPU_GET_ATTR
---------------------------
:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO
@@ -6233,6 +6323,45 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
This capability can be used to check / enable 2nd DAWR feature provided
by POWER10 processor.
+7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
+-------------------------------------
+
+Architectures: x86 SEV enabled
+Type: vm
+Parameters: args[0] is the fd of the source vm
+Returns: 0 on success; ENOTTY on error
+
+This capability enables userspace to copy encryption context from the vm
+indicated by the fd to the vm this is called on.
+
+This is intended to support in-guest workloads scheduled by the host. This
+allows the in-guest workload to maintain its own NPTs and keeps the two vms
+from accidentally clobbering each other with interrupts and the like (separate
+APIC/MSRs/etc).
+
+7.25 KVM_CAP_SGX_ATTRIBUTE
+--------------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+ attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more priveleged enclave attributes. args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint. To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
8. Other capabilities.
======================
@@ -6727,3 +6856,38 @@ vcpu_info is set.
The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+
+8.31 KVM_CAP_PPC_MULTITCE
+-------------------------
+
+:Capability: KVM_CAP_PPC_MULTITCE
+:Architectures: ppc
+:Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
+8.32 KVM_CAP_PTP_KVM
+--------------------
+
+:Architectures: arm64
+
+This capability indicates that the KVM virtual PTP service is
+supported in the host. A VMM can check whether the service is
+available to the guest on migration.
diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst
index 3e2b2aba90fc..78a9b670aafe 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -10,3 +10,4 @@ ARM
hyp-abi
psci
pvtime
+ ptp_kvm
diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644
index 000000000000..aecdc80ddcd8
--- /dev/null
+++ b/Documentation/virt/kvm/arm/ptp_kvm.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm/arm64
+=============================
+
+PTP_KVM is used for high precision time sync between host and guests.
+It relies on transferring the wall clock and counter value from the
+host to the guest using a KVM-specific hypercall.
+
+* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001
+
+This hypercall uses the SMC32/HVC32 calling convention:
+
+ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID
+ ============== ======== =====================================
+ Function ID: (uint32) 0x86000001
+ Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0)
+ KVM_PTP_PHYS_COUNTER(1)
+ Return Values: (int32) NOT_SUPPORTED(-1) on error, or
+ (uint32) Upper 32 bits of wall clock time (r0)
+ (uint32) Lower 32 bits of wall clock time (r1)
+ (uint32) Upper 32 bits of counter (r2)
+ (uint32) Lower 32 bits of counter (r3)
+ Endianness: No Restrictions.
+ ============== ======== =====================================
diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst
index 6c304fd2b1b4..d257eddbae29 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-its.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst
@@ -80,7 +80,7 @@ KVM_DEV_ARM_VGIC_GRP_CTRL
-EFAULT Invalid guest ram access
-EBUSY One or more VCPUS are running
-EACCES The virtual ITS is backed by a physical GICv4 ITS, and the
- state is not available
+ state is not available without GICv4.1
======= ==========================================================
KVM_DEV_ARM_VGIC_GRP_ITS_REGS
diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
index 5dd3bff51978..51e5e5762571 100644
--- a/Documentation/virt/kvm/devices/arm-vgic-v3.rst
+++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst
@@ -228,7 +228,7 @@ Groups:
KVM_DEV_ARM_VGIC_CTRL_INIT
request the initialization of the VGIC, no additional parameter in
- kvm_device_attr.addr.
+ kvm_device_attr.addr. Must be called after all VCPUs have been created.
KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
save all LPI pending bits into guest RAM pending tables.
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 0aa4817b466d..1fc860c007a3 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -38,25 +38,24 @@ the mmu-lock on x86. Currently, the page fault can be fast in one of the
following two cases:
1. Access Tracking: The SPTE is not present, but it is marked for access
- tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
- restore the saved R/X bits. This is described in more detail later below.
+ tracking. That means we need to restore the saved R/X bits. This is
+ described in more detail later below.
-2. Write-Protection: The SPTE is present and the fault is
- caused by write-protect. That means we just need to change the W bit of
- the spte.
+2. Write-Protection: The SPTE is present and the fault is caused by
+ write-protect. That means we just need to change the W bit of the spte.
-What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
-SPTE_MMU_WRITEABLE bit on the spte:
+What we use to avoid all the race is the Host-writable bit and MMU-writable bit
+on the spte:
-- SPTE_HOST_WRITEABLE means the gfn is writable on host.
-- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when
- the gfn is writable on guest mmu and it is not write-protected by shadow
- page write-protection.
+- Host-writable means the gfn is writable in the host kernel page tables and in
+ its KVM memslot.
+- MMU-writable means the gfn is writable in the guest's mmu and it is not
+ write-protected by shadow page write-protection.
On fast page fault path, we will use cmpxchg to atomically set the spte W
-bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
-restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
-is safe because whenever changing these bits can be detected by cmpxchg.
+bit if spte.HOST_WRITEABLE = 1 and spte.WRITE_PROTECT = 1, to restore the saved
+R/X bits if for an access-traced spte, or both. This is safe because whenever
+changing these bits can be detected by cmpxchg.
But we need carefully check these cases:
@@ -185,17 +184,17 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
Lockless Access Tracking:
This is used for Intel CPUs that are using EPT but do not support the EPT A/D
-bits. In this case, when the KVM MMU notifier is called to track accesses to a
-page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
-by clearing the RWX bits in the PTE and storing the original R & X bits in
-some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
-PTE (using the ignored bit 62). When the VM tries to access the page later on,
-a fault is generated and the fast page fault mechanism described above is used
-to atomically restore the PTE to a Present state. The W bit is not saved when
-the PTE is marked for access tracking and during restoration to the Present
-state, the W bit is set depending on whether or not it was a write access. If
-it wasn't, then the W bit will remain clear until a write access happens, at
-which time it will be set using the Dirty tracking mechanism described above.
+bits. In this case, PTEs are tagged as A/D disabled (using ignored bits), and
+when the KVM MMU notifier is called to track accesses to a page (via
+kvm_mmu_notifier_clear_flush_young), it marks the PTE not-present in hardware
+by clearing the RWX bits in the PTE and storing the original R & X bits in more
+unused/ignored bits. When the VM tries to access the page later on, a fault is
+generated and the fast page fault mechanism described above is used to
+atomically restore the PTE to a Present state. The W bit is not saved when the
+PTE is marked for access tracking and during restoration to the Present state,
+the W bit is set depending on whether or not it was a write access. If it
+wasn't, then the W bit will remain clear until a write access happens, at which
+time it will be set using the Dirty tracking mechanism described above.
3. Reference
------------
diff --git a/Documentation/virt/kvm/s390-diag.rst b/Documentation/virt/kvm/s390-diag.rst
index eaac4864d3d6..ca85f030eb0b 100644
--- a/Documentation/virt/kvm/s390-diag.rst
+++ b/Documentation/virt/kvm/s390-diag.rst
@@ -84,3 +84,36 @@ If the function code specifies 0x501, breakpoint functions may be performed.
This function code is handled by userspace.
This diagnose function code has no subfunctions and uses no parameters.
+
+
+DIAGNOSE function code 'X'9C - Voluntary Time Slice Yield
+---------------------------------------------------------
+
+General register 1 contains the target CPU address.
+
+In a guest of a hypervisor like LPAR, KVM or z/VM using shared host CPUs,
+DIAGNOSE with function code 0x9c may improve system performance by
+yielding the host CPU on which the guest CPU is running to be assigned
+to another guest CPU, preferably the logical CPU containing the specified
+target CPU.
+
+
+DIAG 'X'9C forwarding
++++++++++++++++++++++
+
+The guest may send a DIAGNOSE 0x9c in order to yield to a certain
+other vcpu. An example is a Linux guest that tries to yield to the vcpu
+that is currently holding a spinlock, but not running.
+
+However, on the host the real cpu backing the vcpu may itself not be
+running.
+Forwarding the DIAGNOSE 0x9c initially sent by the guest to yield to
+the backing cpu will hopefully cause that cpu, and thus subsequently
+the guest's vcpu, to be scheduled.
+
+
+diag9c_forwarding_hz
+ KVM kernel parameter allowing to specify the maximum number of DIAGNOSE
+ 0x9c forwarding per second in the purpose of avoiding a DIAGNOSE 0x9c
+ forwarding storm.
+ A value of 0 turns the forwarding off.
diff --git a/MAINTAINERS b/MAINTAINERS
index 0204c82b765c..44e09c2641ad 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1782,6 +1782,8 @@ F: Documentation/ABI/testing/sysfs-bus-coresight-devices-*
F: Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
F: Documentation/devicetree/bindings/arm/coresight-cti.yaml
F: Documentation/devicetree/bindings/arm/coresight.txt
+F: Documentation/devicetree/bindings/arm/ete.yaml
+F: Documentation/devicetree/bindings/arm/trbe.yaml
F: Documentation/trace/coresight/*
F: drivers/hwtracing/coresight/*
F: include/dt-bindings/arm/coresight-cti-dt.h
@@ -9949,10 +9951,10 @@ F: virt/kvm/*
KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
M: Marc Zyngier <maz@kernel.org>
R: James Morse <james.morse@arm.com>
-R: Julien Thierry <julien.thierry.kdev@gmail.com>
+R: Alexandru Elisei <alexandru.elisei@arm.com>
R: Suzuki K Poulose <suzuki.poulose@arm.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-L: kvmarm@lists.cs.columbia.edu
+L: kvmarm@lists.cs.columbia.edu (moderated for non-subscribers)
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
F: arch/arm64/include/asm/kvm*
diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h
index df8524365637..bd61502b9715 100644
--- a/arch/arm/include/asm/hypervisor.h
+++ b/arch/arm/include/asm/hypervisor.h
@@ -4,4 +4,7 @@
#include <asm/xen/hypervisor.h>
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
#endif
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index ab569b0b45fc..8418c1bd8f04 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -16,6 +16,7 @@
#include <asm/asm-offsets.h>
#include <asm/alternative.h>
+#include <asm/asm-bug.h>
#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/debug-monitors.h>
@@ -279,12 +280,24 @@ alternative_endif
* provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
*/
.macro read_ctr, reg
+#ifndef __KVM_NVHE_HYPERVISOR__
alternative_if_not ARM64_MISMATCHED_CACHE_TYPE
mrs \reg, ctr_el0 // read CTR
nop
alternative_else
ldr_l \reg, arm64_ftr_reg_ctrel0 + ARM64_FTR_SYSVAL
alternative_endif
+#else
+alternative_if_not ARM64_KVM_PROTECTED_MODE
+ ASM_BUG()
+alternative_else_nop_endif
+alternative_cb kvm_compute_final_ctr_el0
+ movz \reg, #0
+ movk \reg, #0, lsl #16
+ movk \reg, #0, lsl #32
+ movk \reg, #0, lsl #48
+alternative_cb_end
+#endif
.endm
@@ -685,11 +698,11 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU
.endm
/*
- * Set SCTLR_EL1 to the passed value, and invalidate the local icache
+ * Set SCTLR_ELx to the @reg value, and invalidate the local icache
* in the process. This is called when setting the MMU on.
*/
-.macro set_sctlr_el1, reg
- msr sctlr_el1, \reg
+.macro set_sctlr, sreg, reg
+ msr \sreg, \reg
isb
/*
* Invalidate the local I-cache so that any instructions fetched
@@ -701,6 +714,14 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU
isb
.endm
+.macro set_sctlr_el1, reg
+ set_sctlr sctlr_el1, \reg
+.endm
+
+.macro set_sctlr_el2, reg
+ set_sctlr sctlr_el2, \reg
+.endm
+
/*
* Check whether preempt/bh-disabled asm code should yield as soon as
* it is able. This is the case if we are currently running in task
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 065ba482daf0..2175ec0004ed 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -23,6 +23,7 @@
#define dsb(opt) asm volatile("dsb " #opt : : : "memory")
#define psb_csync() asm volatile("hint #17" : : : "memory")
+#define tsb_csync() asm volatile("hint #18" : : : "memory")
#define csdb() asm volatile("hint #20" : : : "memory")
#ifdef CONFIG_ARM64_PSEUDO_NMI
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index b3f2d3bb0938..21fa330f498d 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -65,6 +65,19 @@
// use EL1&0 translation.
.Lskip_spe_\@:
+ /* Trace buffer */
+ ubfx x0, x1, #ID_AA64DFR0_TRBE_SHIFT, #4
+ cbz x0, .Lskip_trace_\@ // Skip if TraceBuffer is not present
+
+ mrs_s x0, SYS_TRBIDR_EL1
+ and x0, x0, TRBIDR_PROG
+ cbnz x0, .Lskip_trace_\@ // If TRBE is available at EL2
+
+ mov x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
+ orr x2, x2, x0 // allow the EL1&0 translation
+ // to own it.
+
+.Lskip_trace_\@:
msr mdcr_el2, x2 // Configure debug traps
.endm
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index ebb263b2d3b1..2599504674b5 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -131,6 +131,15 @@ static inline void sve_user_enable(void)
sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN);
}
+#define sve_cond_update_zcr_vq(val, reg) \
+ do { \
+ u64 __zcr = read_sysreg_s((reg)); \
+ u64 __new = __zcr & ~ZCR_ELx_LEN_MASK; \
+ __new |= (val) & ZCR_ELx_LEN_MASK; \
+ if (__zcr != __new) \
+ write_sysreg_s(__new, (reg)); \
+ } while (0)
+
/*
* Probing and setup functions.
* Calls to these functions must be serialised with one another.
@@ -160,6 +169,8 @@ static inline int sve_get_current_vl(void)
static inline void sve_user_disable(void) { BUILD_BUG(); }
static inline void sve_user_enable(void) { BUILD_BUG(); }
+#define sve_cond_update_zcr_vq(val, reg) do { } while (0)
+
static inline void sve_init_vq_map(void) { }
static inline void sve_update_vq_map(void) { }
static inline int sve_verify_vq_map(void) { return 0; }
diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
index af43367534c7..a2563992d2dc 100644
--- a/arch/arm64/include/asm/fpsimdmacros.h
+++ b/arch/arm64/include/asm/fpsimdmacros.h
@@ -6,6 +6,8 @@
* Author: Catalin Marinas <catalin.marinas@arm.com>
*/
+#include <asm/assembler.h>
+
.macro fpsimd_save state, tmpnr
stp q0, q1, [\state, #16 * 0]
stp q2, q3, [\state, #16 * 2]
@@ -230,8 +232,7 @@
str w\nxtmp, [\xpfpsr, #4]
.endm
-.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
- sve_load_vq \xvqminus1, x\nxtmp, \xtmp2
+.macro __sve_load nxbase, xpfpsr, nxtmp
_for n, 0, 31, _sve_ldr_v \n, \nxbase, \n - 34
_sve_ldr_p 0, \nxbase
_sve_wrffr 0
@@ -242,3 +243,8 @@
ldr w\nxtmp, [\xpfpsr, #4]
msr fpcr, x\nxtmp
.endm
+
+.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
+ sve_load_vq \xvqminus1, x\nxtmp, \xtmp2
+ __sve_load \nxbase, \xpfpsr, \nxtmp
+.endm
diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h
index 737ded6b6d0d..b4b3076a76fb 100644
--- a/arch/arm64/include/asm/hyp_image.h
+++ b/arch/arm64/include/asm/hyp_image.h
@@ -10,11 +10,15 @@
#define __HYP_CONCAT(a, b) a ## b
#define HYP_CONCAT(a, b) __HYP_CONCAT(a, b)
+#ifndef __KVM_NVHE_HYPERVISOR__
/*
* KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
* to separate it from the kernel proper.
*/
#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym
+#else
+#define kvm_nvhe_sym(sym) sym
+#endif
#ifdef LINKER_SCRIPT
@@ -56,6 +60,9 @@
*/
#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym;
+/* Defines a linker script alias for KVM nVHE hyp symbols */
+#define KVM_NVHE_ALIAS_HYP(first, sec) kvm_nvhe_sym(first) = kvm_nvhe_sym(sec);
+
#endif /* LINKER_SCRIPT */
#endif /* __ARM64_HYP_IMAGE_H__ */
diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h
index f9cc1d021791..0ae427f352c8 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -4,4 +4,7 @@
#include <asm/xen/hypervisor.h>
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
#endif
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 94d4025acc0b..692c9049befa 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -278,6 +278,8 @@
#define CPTR_EL2_DEFAULT CPTR_EL2_RES1
/* Hyp Debug Configuration Register bits */
+#define MDCR_EL2_E2TB_MASK (UL(0x3))
+#define MDCR_EL2_E2TB_SHIFT (UL(24))
#define MDCR_EL2_TTRF (1 << 19)
#define MDCR_EL2_TPMS (1 << 14)
#define MDCR_EL2_E2PB_MASK (UL(0x3))
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a7ab84f781f7..cf8df032b9c3 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -57,6 +57,12 @@
#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13
#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14
+#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16
+#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17
+#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18
+#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19
+#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20
#ifndef __ASSEMBLY__
@@ -154,6 +160,9 @@ struct kvm_nvhe_init_params {
unsigned long tpidr_el2;
unsigned long stack_hyp_va;
phys_addr_t pgd_pa;
+ unsigned long hcr_el2;
+ unsigned long vttbr;
+ unsigned long vtcr;
};
/* Translate a kernel address @ptr into its equivalent linear mapping */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3d10e6527f7d..7cd7d5c8c4bc 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -94,7 +94,7 @@ struct kvm_s2_mmu {
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
- struct kvm *kvm;
+ struct kvm_arch *arch;
};
struct kvm_arch_memory_slot {
@@ -315,6 +315,8 @@ struct kvm_vcpu_arch {
struct kvm_guest_debug_arch regs;
/* Statistical profiling extension */
u64 pmscr_el1;
+ /* Self-hosted trace */
+ u64 trfcr_el1;
} host_debug_state;
/* VGIC state */
@@ -372,8 +374,10 @@ struct kvm_vcpu_arch {
};
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
-#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
- sve_ffr_offset((vcpu)->arch.sve_max_vl)))
+#define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \
+ sve_ffr_offset((vcpu)->arch.sve_max_vl))
+
+#define vcpu_sve_max_vq(vcpu) sve_vq_from_vl((vcpu)->arch.sve_max_vl)
#define vcpu_sve_state_size(vcpu) ({ \
size_t __size_ret; \
@@ -382,7 +386,7 @@ struct kvm_vcpu_arch {
if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \
__size_ret = 0; \
} else { \
- __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl); \
+ __vcpu_vq = vcpu_sve_max_vq(vcpu); \
__size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \
} \
\
@@ -400,7 +404,13 @@ struct kvm_vcpu_arch {
#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */
#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */
#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */
+#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */
+#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_USE_HW | \
+ KVM_GUESTDBG_SINGLESTEP)
/*
* When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can
* take the following values:
@@ -582,15 +592,11 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events);
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);
+#ifndef __KVM_NVHE_HYPERVISOR__
#define kvm_call_hyp_nvhe(f, ...) \
({ \
struct arm_smccc_res res; \
@@ -630,9 +636,13 @@ void kvm_arm_resume_guest(struct kvm *kvm);
\
ret; \
})
+#else /* __KVM_NVHE_HYPERVISOR__ */
+#define kvm_call_hyp(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__)
+#define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__)
+#endif /* __KVM_NVHE_HYPERVISOR__ */
void force_vm_exit(const cpumask_t *mask);
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
@@ -692,19 +702,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr();
}
-static inline bool kvm_arch_requires_vhe(void)
-{
- /*
- * The Arm architecture specifies that implementation of SVE
- * requires VHE also to be implemented. The KVM code for arm64
- * relies on this when SVE is present:
- */
- if (system_supports_sve())
- return true;
-
- return false;
-}
-
void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
static inline void kvm_arch_hardware_unsetup(void) {}
@@ -713,6 +710,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
void kvm_arm_init_debug(void);
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);
void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
@@ -734,6 +732,10 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
return (!has_vhe() && attr->exclude_host);
}
+/* Flags for host debug state */
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu);
+
#ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */
static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
{
@@ -771,5 +773,12 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
int kvm_trng_call(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM
+extern phys_addr_t hyp_mem_base;
+extern phys_addr_t hyp_mem_size;
+void __init kvm_hyp_reserve(void);
+#else
+static inline void kvm_hyp_reserve(void) { }
+#endif
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 32ae676236b6..9d60b3006efc 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -90,6 +90,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu);
void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+void __sve_save_state(void *sve_pffr, u32 *fpsr);
+void __sve_restore_state(void *sve_pffr, u32 *fpsr);
#ifndef __KVM_NVHE_HYPERVISOR__
void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
@@ -100,10 +102,20 @@ u64 __guest_enter(struct kvm_vcpu *vcpu);
bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
-void __noreturn hyp_panic(void);
#ifdef __KVM_NVHE_HYPERVISOR__
void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
u64 elr, u64 par);
#endif
+#ifdef __KVM_NVHE_HYPERVISOR__
+void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size,
+ phys_addr_t pgd, void *sp, void *cont_fn);
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+ unsigned long *per_cpu_base, u32 hyp_va_bits);
+void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+#endif
+
+extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
+extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
+
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 90873851f677..25ed956f9af1 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -121,6 +121,8 @@ void kvm_update_va_mask(struct alt_instr *alt,
void kvm_compute_layout(void);
void kvm_apply_hyp_relocations(void);
+#define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset)
+
static __always_inline unsigned long __kern_hyp_va(unsigned long v)
{
asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
@@ -166,7 +168,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
-int kvm_mmu_init(void);
+int kvm_mmu_init(u32 *hyp_va_bits);
+
+static inline void *__kvm_vector_slot2addr(void *base,
+ enum arm64_hyp_spectre_vector slot)
+{
+ int idx = slot - (slot != HYP_VECTOR_DIRECT);
+
+ return base + (idx * SZ_2K);
+}
struct kvm;
@@ -262,9 +272,9 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
* Must be called from hyp code running at EL2 with an updated VTTBR
* and interrupts disabled.
*/
-static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr)
{
- write_sysreg(kern_hyp_va(mmu->kvm)->arch.vtcr, vtcr_el2);
+ write_sysreg(vtcr, vtcr_el2);
write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
/*
@@ -275,5 +285,14 @@ static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
}
+static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu)
+{
+ __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr);
+}
+
+static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
+{
+ return container_of(mmu->arch, struct kvm, arch);
+}
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 8886d43cfb11..c3674c47d48c 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -11,22 +11,79 @@
#include <linux/kvm_host.h>
#include <linux/types.h>
+#define KVM_PGTABLE_MAX_LEVELS 4U
+
+static inline u64 kvm_get_parange(u64 mmfr0)
+{
+ u64 parange = cpuid_feature_extract_unsigned_field(mmfr0,
+ ID_AA64MMFR0_PARANGE_SHIFT);
+ if (parange > ID_AA64MMFR0_PARANGE_MAX)
+ parange = ID_AA64MMFR0_PARANGE_MAX;
+
+ return parange;
+}
+
typedef u64 kvm_pte_t;
/**
+ * struct kvm_pgtable_mm_ops - Memory management callbacks.
+ * @zalloc_page: Allocate a single zeroed memory page. The @arg parameter
+ * can be used by the walker to pass a memcache. The
+ * initial refcount of the page is 1.
+ * @zalloc_pages_exact: Allocate an exact number of zeroed memory pages. The
+ * @size parameter is in bytes, and is rounded-up to the
+ * next page boundary. The resulting allocation is
+ * physically contiguous.
+ * @free_pages_exact: Free an exact number of memory pages previously
+ * allocated by zalloc_pages_exact.
+ * @get_page: Increment the refcount on a page.
+ * @put_page: Decrement the refcount on a page. When the refcount
+ * reaches 0 the page is automatically freed.
+ * @page_count: Return the refcount of a page.
+ * @phys_to_virt: Convert a physical address into a virtual address mapped
+ * in the current context.
+ * @virt_to_phys: Convert a virtual address mapped in the current context
+ * into a physical address.
+ */
+struct kvm_pgtable_mm_ops {
+ void* (*zalloc_page)(void *arg);
+ void* (*zalloc_pages_exact)(size_t size);
+ void (*free_pages_exact)(void *addr, size_t size);
+ void (*get_page)(void *addr);
+ void (*put_page)(void *addr);
+ int (*page_count)(void *addr);
+ void* (*phys_to_virt)(phys_addr_t phys);
+ phys_addr_t (*virt_to_phys)(void *addr);
+};
+
+/**
+ * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags.
+ * @KVM_PGTABLE_S2_NOFWB: Don't enforce Normal-WB even if the CPUs have
+ * ARM64_HAS_STAGE2_FWB.
+ * @KVM_PGTABLE_S2_IDMAP: Only use identity mappings.
+ */
+enum kvm_pgtable_stage2_flags {
+ KVM_PGTABLE_S2_NOFWB = BIT(0),
+ KVM_PGTABLE_S2_IDMAP = BIT(1),
+};
+
+/**
* struct kvm_pgtable - KVM page-table.
* @ia_bits: Maximum input address size, in bits.
* @start_level: Level at which the page-table walk starts.
* @pgd: Pointer to the first top-level entry of the page-table.
+ * @mm_ops: Memory management callbacks.
* @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
*/
struct kvm_pgtable {
u32 ia_bits;
u32 start_level;
kvm_pte_t *pgd;
+ struct kvm_pgtable_mm_ops *mm_ops;
/* Stage-2 only */
struct kvm_s2_mmu *mmu;
+ enum kvm_pgtable_stage2_flags flags;
};
/**
@@ -50,6 +107,16 @@ enum kvm_pgtable_prot {
#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
/**
+ * struct kvm_mem_range - Range of Intermediate Physical Addresses
+ * @start: Start of the range.
+ * @end: End of the range.
+ */
+struct kvm_mem_range {
+ u64 start;
+ u64 end;
+};
+
+/**
* enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
* @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid
* entries.
@@ -86,10 +153,12 @@ struct kvm_pgtable_walker {
* kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
* @pgt: Uninitialised page-table structure to initialise.
* @va_bits: Maximum virtual address bits.
+ * @mm_ops: Memory management callbacks.
*
* Return: 0 on success, negative error code on failure.
*/
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+ struct kvm_pgtable_mm_ops *mm_ops);
/**
* kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
@@ -123,17 +192,41 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
enum kvm_pgtable_prot prot);
/**
- * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * kvm_get_vtcr() - Helper to construct VTCR_EL2
+ * @mmfr0: Sanitized value of SYS_ID_AA64MMFR0_EL1 register.
+ * @mmfr1: Sanitized value of SYS_ID_AA64MMFR1_EL1 register.
+ * @phys_shfit: Value to set in VTCR_EL2.T0SZ.
+ *
+ * The VTCR value is common across all the physical CPUs on the system.
+ * We use system wide sanitised values to fill in different fields,
+ * except for Hardware Management of Access Flags. HA Flag is set
+ * unconditionally on all CPUs, as it is safe to run with or without
+ * the feature and the bit is RES0 on CPUs that don't support it.
+ *
+ * Return: VTCR_EL2 value
+ */
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);
+
+/**
+ * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table.
* @pgt: Uninitialised page-table structure to initialise.
- * @kvm: KVM structure representing the guest virtual machine.
+ * @arch: Arch-specific KVM structure representing the guest virtual
+ * machine.
+ * @mm_ops: Memory management callbacks.
+ * @flags: Stage-2 configuration flags.
*
* Return: 0 on success, negative error code on failure.
*/
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+ struct kvm_pgtable_mm_ops *mm_ops,
+ enum kvm_pgtable_stage2_flags flags);
+
+#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
+ kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
/**
* kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
*
* The page-table is assumed to be unreachable by any hardware walkers prior
* to freeing and therefore no TLB invalidation is performed.
@@ -142,13 +235,13 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
/**
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address at which to place the mapping.
* @size: Size of the mapping.
* @phys: Physical address of the memory to map.
* @prot: Permissions and attributes for the mapping.
- * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to
- * allocate page-table pages.
+ * @mc: Cache of pre-allocated and zeroed memory from which to allocate
+ * page-table pages.
*
* The offset of @addr within a page is ignored, @size is rounded-up to
* the next page boundary and @phys is rounded-down to the previous page
@@ -170,11 +263,31 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
*/
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
u64 phys, enum kvm_pgtable_prot prot,
- struct kvm_mmu_memory_cache *mc);
+ void *mc);
+
+/**
+ * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
+ * track ownership.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr: Base intermediate physical address to annotate.
+ * @size: Size of the annotated range.
+ * @mc: Cache of pre-allocated and zeroed memory from which to allocate
+ * page-table pages.
+ * @owner_id: Unique identifier for the owner of the page.
+ *
+ * By default, all page-tables are owned by identifier 0. This function can be
+ * used to mark portions of the IPA space as owned by other entities. When a
+ * stage 2 is used with identity-mappings, these annotations allow to use the
+ * page-table data structure as a simple rmap.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ void *mc, u8 owner_id);
/**
* kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address from which to remove the mapping.
* @size: Size of the mapping.
*
@@ -194,7 +307,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
/**
* kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
* without TLB invalidation.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address from which to write-protect,
* @size: Size of the range.
*
@@ -211,7 +324,7 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
/**
* kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
*
* The offset of @addr within a page is ignored.
@@ -225,7 +338,7 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
/**
* kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
*
* The offset of @addr within a page is ignored.
@@ -244,7 +357,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
/**
* kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
* page-table entry.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
* @prot: Additional permissions to grant for the mapping.
*
@@ -263,7 +376,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
/**
* kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
* access flag set.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address to identify the page-table entry.
*
* The offset of @addr within a page is ignored.
@@ -276,7 +389,7 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
* kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
* of Coherency for guest stage-2 address
* range.
- * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @addr: Intermediate physical address from which to flush.
* @size: Size of the range.
*
@@ -311,4 +424,23 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_pgtable_walker *walker);
+/**
+ * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical
+ * Addresses with compatible permission
+ * attributes.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @addr: Address that must be covered by the range.
+ * @prot: Protection attributes that the range must be compatible with.
+ * @range: Range structure used to limit the search space at call time and
+ * that will hold the result.
+ *
+ * The offset of @addr within a page is ignored. An IPA is compatible with @prot
+ * iff its corresponding stage-2 page-table entry has default ownership and, if
+ * valid, is mapped with protection attributes identical to @prot.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot,
+ struct kvm_mem_range *range);
#endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index fab2f573f7a4..938092df76cf 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -71,10 +71,10 @@ extern bool arm64_use_ng_mappings;
#define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN)
#define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
-#define PAGE_S2_MEMATTR(attr) \
+#define PAGE_S2_MEMATTR(attr, has_fwb) \
({ \
u64 __val; \
- if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) \
+ if (has_fwb) \
__val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \
else \
__val = PTE_S2_MEMATTR(MT_S2_ ## attr); \
diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h
index 2f36b16a5b5d..e4ad9db53af1 100644
--- a/arch/arm64/include/asm/sections.h
+++ b/arch/arm64/include/asm/sections.h
@@ -13,6 +13,7 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
extern char __hyp_text_start[], __hyp_text_end[];
extern char __hyp_rodata_start[], __hyp_rodata_end[];
extern char __hyp_reloc_begin[], __hyp_reloc_end[];
+extern char __hyp_bss_start[], __hyp_bss_end[];
extern char __idmap_text_start[], __idmap_text_end[];
extern char __initdata_begin[], __initdata_end[];
extern char __inittext_begin[], __inittext_end[];
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 012a0b8c0a27..65d15700a168 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -283,6 +283,8 @@
#define SYS_PMSIRR_EL1_INTERVAL_MASK 0xffffffUL
/* Filtering controls */
+#define SYS_PMSNEVFR_EL1 sys_reg(3, 0, 9, 9, 1)
+
#define SYS_PMSFCR_EL1 sys_reg(3, 0, 9, 9, 4)
#define SYS_PMSFCR_EL1_FE_SHIFT 0
#define SYS_PMSFCR_EL1_FT_SHIFT 1
@@ -333,6 +335,55 @@
/*** End of Statistical Profiling Extension ***/
+/*
+ * TRBE Registers
+ */
+#define SYS_TRBLIMITR_EL1 sys_reg(3, 0, 9, 11, 0)
+#define SYS_TRBPTR_EL1 sys_reg(3, 0, 9, 11, 1)
+#define SYS_TRBBASER_EL1 sys_reg(3, 0, 9, 11, 2)
+#define SYS_TRBSR_EL1 sys_reg(3, 0, 9, 11, 3)
+#define SYS_TRBMAR_EL1 sys_reg(3, 0, 9, 11, 4)
+#define SYS_TRBTRG_EL1 sys_reg(3, 0, 9, 11, 6)
+#define SYS_TRBIDR_EL1 sys_reg(3, 0, 9, 11, 7)
+
+#define TRBLIMITR_LIMIT_MASK GENMASK_ULL(51, 0)
+#define TRBLIMITR_LIMIT_SHIFT 12
+#define TRBLIMITR_NVM BIT(5)
+#define TRBLIMITR_TRIG_MODE_MASK GENMASK(1, 0)
+#define TRBLIMITR_TRIG_MODE_SHIFT 3
+#define TRBLIMITR_FILL_MODE_MASK GENMASK(1, 0)
+#define TRBLIMITR_FILL_MODE_SHIFT 1
+#define TRBLIMITR_ENABLE BIT(0)
+#define TRBPTR_PTR_MASK GENMASK_ULL(63, 0)
+#define TRBPTR_PTR_SHIFT 0
+#define TRBBASER_BASE_MASK GENMASK_ULL(51, 0)
+#define TRBBASER_BASE_SHIFT 12
+#define TRBSR_EC_MASK GENMASK(5, 0)
+#define TRBSR_EC_SHIFT 26
+#define TRBSR_IRQ BIT(22)
+#define TRBSR_TRG BIT(21)
+#define TRBSR_WRAP BIT(20)
+#define TRBSR_ABORT BIT(18)
+#define TRBSR_STOP BIT(17)
+#define TRBSR_MSS_MASK GENMASK(15, 0)
+#define TRBSR_MSS_SHIFT 0
+#define TRBSR_BSC_MASK GENMASK(5, 0)
+#define TRBSR_BSC_SHIFT 0
+#define TRBSR_FSC_MASK GENMASK(5, 0)
+#define TRBSR_FSC_SHIFT 0
+#define TRBMAR_SHARE_MASK GENMASK(1, 0)
+#define TRBMAR_SHARE_SHIFT 8
+#define TRBMAR_OUTER_MASK GENMASK(3, 0)
+#define TRBMAR_OUTER_SHIFT 4
+#define TRBMAR_INNER_MASK GENMASK(3, 0)
+#define TRBMAR_INNER_SHIFT 0
+#define TRBTRG_TRG_MASK GENMASK(31, 0)
+#define TRBTRG_TRG_SHIFT 0
+#define TRBIDR_FLAG BIT(5)
+#define TRBIDR_PROG BIT(4)
+#define TRBIDR_ALIGN_MASK GENMASK(3, 0)
+#define TRBIDR_ALIGN_SHIFT 0
+
#define SYS_PMINTENSET_EL1 sys_reg(3, 0, 9, 14, 1)
#define SYS_PMINTENCLR_EL1 sys_reg(3, 0, 9, 14, 2)
@@ -587,9 +638,6 @@
#define SCTLR_ELx_A (BIT(1))
#define SCTLR_ELx_M (BIT(0))
-#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
- SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB)
-
/* SCTLR_EL2 specific flags. */
#define SCTLR_EL2_RES1 ((BIT(4)) | (BIT(5)) | (BIT(11)) | (BIT(16)) | \
(BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
@@ -601,6 +649,10 @@
#define ENDIAN_SET_EL2 0
#endif
+#define INIT_SCTLR_EL2_MMU_ON \
+ (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \
+ SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1)
+
#define INIT_SCTLR_EL2_MMU_OFF \
(SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
@@ -849,6 +901,7 @@
#define ID_AA64MMFR2_CNP_SHIFT 0
/* id_aa64dfr0 */
+#define ID_AA64DFR0_TRBE_SHIFT 44
#define ID_AA64DFR0_TRACE_FILT_SHIFT 40
#define ID_AA64DFR0_DOUBLELOCK_SHIFT 36
#define ID_AA64DFR0_PMSVER_SHIFT 32
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index e797603e55b7..0cb34ccb6e73 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -123,6 +123,9 @@ int main(void)
DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2));
DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va));
DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa));
+ DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2));
+ DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr));
+ DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr));
#endif
#ifdef CONFIG_CPU_PM
DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S
index 37721eb6f9a1..d47ff63a5b66 100644
--- a/arch/arm64/kernel/cpu-reset.S
+++ b/arch/arm64/kernel/cpu-reset.S
@@ -30,10 +30,7 @@
* flat identity mapping.
*/
SYM_CODE_START(__cpu_soft_restart)
- /* Clear sctlr_el1 flags. */
- mrs x12, sctlr_el1
- mov_q x13, SCTLR_ELx_FLAGS
- bic x12, x12, x13
+ mov_q x12, INIT_SCTLR_EL1_MMU_OFF
pre_disable_mmu_workaround
/*
* either disable EL1&0 translation regime or disable EL2&0 translation
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 74ad3db061d1..43d212618834 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -115,9 +115,10 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
mrs_s x0, SYS_VBAR_EL12
msr vbar_el1, x0
- // Use EL2 translations for SPE and disable access from EL1
+ // Use EL2 translations for SPE & TRBE and disable access from EL1
mrs x0, mdcr_el2
bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT)
+ bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
msr mdcr_el2, x0
// Transfer the MM state from EL1 to EL2
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 5aa9ed1e9ec6..bcf3c2755370 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -65,13 +65,13 @@ __efistub__ctype = _ctype;
KVM_NVHE_ALIAS(kvm_patch_vector_branch);
KVM_NVHE_ALIAS(kvm_update_va_mask);
KVM_NVHE_ALIAS(kvm_get_kimage_voffset);
+KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0);
/* Global kernel state accessed by nVHE hyp code. */
KVM_NVHE_ALIAS(kvm_vgic_global_state);
/* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */
-KVM_NVHE_ALIAS(__hyp_panic_string);
-KVM_NVHE_ALIAS(panic);
+KVM_NVHE_ALIAS(nvhe_hyp_panic_handler);
/* Vectors installed by hyp-init on reset HVC. */
KVM_NVHE_ALIAS(__hyp_stub_vectors);
@@ -104,6 +104,36 @@ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base);
/* PMU available static key */
KVM_NVHE_ALIAS(kvm_arm_pmu_available);
+/* Position-independent library routines */
+KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page);
+KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page);
+KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(memset, __pi_memset);
+
+#ifdef CONFIG_KASAN
+KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy);
+KVM_NVHE_ALIAS_HYP(__memset, __pi_memset);
+#endif
+
+/* Kernel memory sections */
+KVM_NVHE_ALIAS(__start_rodata);
+KVM_NVHE_ALIAS(__end_rodata);
+KVM_NVHE_ALIAS(__bss_start);
+KVM_NVHE_ALIAS(__bss_stop);
+
+/* Hyp memory sections */
+KVM_NVHE_ALIAS(__hyp_idmap_text_start);
+KVM_NVHE_ALIAS(__hyp_idmap_text_end);
+KVM_NVHE_ALIAS(__hyp_text_start);
+KVM_NVHE_ALIAS(__hyp_text_end);
+KVM_NVHE_ALIAS(__hyp_bss_start);
+KVM_NVHE_ALIAS(__hyp_bss_end);
+KVM_NVHE_ALIAS(__hyp_rodata_start);
+KVM_NVHE_ALIAS(__hyp_rodata_end);
+
+/* pKVM static key */
+KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
+
#endif /* CONFIG_KVM */
#endif /* __ARM64_KERNEL_IMAGE_VARS_H */
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 7eea7888bb02..709d2c433c5e 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -5,24 +5,7 @@
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
*/
-#define RO_EXCEPTION_TABLE_ALIGN 8
-#define RUNTIME_DISCARD_EXIT
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/cache.h>
#include <asm/hyp_image.h>
-#include <asm/kernel-pgtable.h>
-#include <asm/memory.h>
-#include <asm/page.h>
-
-#include "image.h"
-
-OUTPUT_ARCH(aarch64)
-ENTRY(_text)
-
-jiffies = jiffies_64;
-
-
#ifdef CONFIG_KVM
#define HYPERVISOR_EXTABLE \
. = ALIGN(SZ_8); \
@@ -32,9 +15,11 @@ jiffies = jiffies_64;
#define HYPERVISOR_DATA_SECTIONS \
HYP_SECTION_NAME(.rodata) : { \
+ . = ALIGN(PAGE_SIZE); \
__hyp_rodata_start = .; \
*(HYP_SECTION_NAME(.data..ro_after_init)) \
*(HYP_SECTION_NAME(.rodata)) \
+ . = ALIGN(PAGE_SIZE); \
__hyp_rodata_end = .; \
}
@@ -51,29 +36,52 @@ jiffies = jiffies_64;
__hyp_reloc_end = .; \
}
+#define BSS_FIRST_SECTIONS \
+ __hyp_bss_start = .; \
+ *(HYP_SECTION_NAME(.bss)) \
+ . = ALIGN(PAGE_SIZE); \
+ __hyp_bss_end = .;
+
+/*
+ * We require that __hyp_bss_start and __bss_start are aligned, and enforce it
+ * with an assertion. But the BSS_SECTION macro places an empty .sbss section
+ * between them, which can in some cases cause the linker to misalign them. To
+ * work around the issue, force a page alignment for __bss_start.
+ */
+#define SBSS_ALIGN PAGE_SIZE
#else /* CONFIG_KVM */
#define HYPERVISOR_EXTABLE
#define HYPERVISOR_DATA_SECTIONS
#define HYPERVISOR_PERCPU_SECTION
#define HYPERVISOR_RELOC_SECTION
+#define SBSS_ALIGN 0
#endif
+#define RO_EXCEPTION_TABLE_ALIGN 8
+#define RUNTIME_DISCARD_EXIT
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+
+#include "image.h"
+
+OUTPUT_ARCH(aarch64)
+ENTRY(_text)
+
+jiffies = jiffies_64;
+
#define HYPERVISOR_TEXT \
- /* \
- * Align to 4 KB so that \
- * a) the HYP vector table is at its minimum \
- * alignment of 2048 bytes \
- * b) the HYP init code will not cross a page \
- * boundary if its size does not exceed \
- * 4 KB (see related ASSERT() below) \
- */ \
- . = ALIGN(SZ_4K); \
+ . = ALIGN(PAGE_SIZE); \
__hyp_idmap_text_start = .; \
*(.hyp.idmap.text) \
__hyp_idmap_text_end = .; \
__hyp_text_start = .; \
*(.hyp.text) \
HYPERVISOR_EXTABLE \
+ . = ALIGN(PAGE_SIZE); \
__hyp_text_end = .;
#define IDMAP_TEXT \
@@ -276,7 +284,7 @@ SECTIONS
__pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin);
_edata = .;
- BSS_SECTION(0, 0, 0)
+ BSS_SECTION(SBSS_ALIGN, 0, 0)
. = ALIGN(PAGE_SIZE);
init_pg_dir = .;
@@ -309,11 +317,12 @@ SECTIONS
#include "image-vars.h"
/*
- * The HYP init code and ID map text can't be longer than a page each,
- * and should not cross a page boundary.
+ * The HYP init code and ID map text can't be longer than a page each. The
+ * former is page-aligned, but the latter may not be with 16K or 64K pages, so
+ * it should also not cross a page boundary.
*/
-ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
- "HYP init code too big or misaligned")
+ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE,
+ "HYP init code too big")
ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"ID map text too big or misaligned")
#ifdef CONFIG_HIBERNATION
@@ -324,6 +333,9 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE,
"Entry trampoline text too big")
#endif
+#ifdef CONFIG_KVM
+ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned")
+#endif
/*
* If padding is applied before .head.text, virt<->phys conversions will fail.
*/
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 7f06ba76698d..1cb39c0803a4 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -206,8 +206,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_INJECT_EXT_DABT:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_VCPU_ATTRIBUTES:
+ case KVM_CAP_PTP_KVM:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ return KVM_GUESTDBG_VALID_MASK;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
r = 1;
break;
@@ -416,10 +419,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (vcpu_has_ptrauth(vcpu))
vcpu_ptrauth_disable(vcpu);
+ kvm_arch_vcpu_load_debug_state_flags(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ kvm_arch_vcpu_put_debug_state_flags(vcpu);
kvm_arch_vcpu_put_fp(vcpu);
if (has_vhe())
kvm_vcpu_put_sysregs_vhe(vcpu);
@@ -580,6 +585,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
vcpu->arch.has_run_once = true;
+ kvm_arm_vcpu_init_debug(vcpu);
+
if (likely(irqchip_in_kernel(kvm))) {
/*
* Map the VGIC hardware resources before running a vcpu the
@@ -1268,7 +1275,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
kvm_flush_remote_tlbs(kvm);
}
@@ -1350,16 +1357,9 @@ static unsigned long nvhe_percpu_order(void)
/* A lookup table holding the hypervisor VA for each vector slot */
static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
-static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot)
-{
- return slot - (slot != HYP_VECTOR_DIRECT);
-}
-
static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
{
- int idx = __kvm_vector_slot2idx(slot);
-
- hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K);
+ hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
}
static int kvm_init_vector_slots(void)
@@ -1388,22 +1388,18 @@ static int kvm_init_vector_slots(void)
return 0;
}
-static void cpu_init_hyp_mode(void)
+static void cpu_prepare_hyp_mode(int cpu)
{
- struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params);
- struct arm_smccc_res res;
+ struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
unsigned long tcr;
- /* Switch from the HYP stub to our own HYP init vector */
- __hyp_set_vectors(kvm_get_idmap_vector());
-
/*
* Calculate the raw per-cpu offset without a translation from the
* kernel's mapping to the linear mapping, and store it in tpidr_el2
* so that we can use adr_l to access per-cpu variables in EL2.
* Also drop the KASAN tag which gets in the way...
*/
- params->tpidr_el2 = (unsigned long)kasan_reset_tag(this_cpu_ptr_nvhe_sym(__per_cpu_start)) -
+ params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
(unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
params->mair_el2 = read_sysreg(mair_el1);
@@ -1427,14 +1423,28 @@ static void cpu_init_hyp_mode(void)
tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
params->tcr_el2 = tcr;
- params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE);
+ params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
params->pgd_pa = kvm_mmu_get_httbr();
+ if (is_protected_kvm_enabled())
+ params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
+ else
+ params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
+ params->vttbr = params->vtcr = 0;
/*
* Flush the init params from the data cache because the struct will
* be read while the MMU is off.
*/
kvm_flush_dcache_to_poc(params, sizeof(*params));
+}
+
+static void hyp_install_host_vector(void)
+{
+ struct kvm_nvhe_init_params *params;
+ struct arm_smccc_res res;
+
+ /* Switch from the HYP stub to our own HYP init vector */
+ __hyp_set_vectors(kvm_get_idmap_vector());
/*
* Call initialization code, and switch to the full blown HYP code.
@@ -1443,8 +1453,14 @@ static void cpu_init_hyp_mode(void)
* cpus_have_const_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
+ params = this_cpu_ptr_nvhe_sym(kvm_init_params);
arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
+}
+
+static void cpu_init_hyp_mode(void)
+{
+ hyp_install_host_vector();
/*
* Disabling SSBD on a non-VHE system requires us to enable SSBS
@@ -1487,7 +1503,10 @@ static void cpu_set_hyp_vector(void)
struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
void *vector = hyp_spectre_vector_selector[data->slot];
- *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+ if (!is_protected_kvm_enabled())
+ *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
+ else
+ kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
}
static void cpu_hyp_reinit(void)
@@ -1495,13 +1514,14 @@ static void cpu_hyp_reinit(void)
kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
cpu_hyp_reset();
- cpu_set_hyp_vector();
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
else
cpu_init_hyp_mode();
+ cpu_set_hyp_vector();
+
kvm_arm_init_debug();
if (vgic_present)
@@ -1697,18 +1717,62 @@ static void teardown_hyp_mode(void)
}
}
+static int do_pkvm_init(u32 hyp_va_bits)
+{
+ void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
+ int ret;
+
+ preempt_disable();
+ hyp_install_host_vector();
+ ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
+ num_possible_cpus(), kern_hyp_va(per_cpu_base),
+ hyp_va_bits);
+ preempt_enable();
+
+ return ret;
+}
+
+static int kvm_hyp_init_protection(u32 hyp_va_bits)
+{
+ void *addr = phys_to_virt(hyp_mem_base);
+ int ret;
+
+ kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+
+ ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = do_pkvm_init(hyp_va_bits);
+ if (ret)
+ return ret;
+
+ free_hyp_pgds();
+
+ return 0;
+}
+
/**
* Inits Hyp-mode on all online CPUs
*/
static int init_hyp_mode(void)
{
+ u32 hyp_va_bits;
int cpu;
- int err = 0;
+ int err = -ENOMEM;
+
+ /*
+ * The protected Hyp-mode cannot be initialized if the memory pool
+ * allocation has failed.
+ */
+ if (is_protected_kvm_enabled() && !hyp_mem_base)
+ goto out_err;
/*
* Allocate Hyp PGD and setup Hyp identity mapping
*/
- err = kvm_mmu_init();
+ err = kvm_mmu_init(&hyp_va_bits);
if (err)
goto out_err;
@@ -1769,7 +1833,19 @@ static int init_hyp_mode(void)
goto out_err;
}
- err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+ /*
+ * .hyp.bss is guaranteed to be placed at the beginning of the .bss
+ * section thanks to an assertion in the linker script. Map it RW and
+ * the rest of .bss RO.
+ */
+ err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
+ kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
+ if (err) {
+ kvm_err("Cannot map hyp bss section: %d\n", err);
+ goto out_err;
+ }
+
+ err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
if (err) {
kvm_err("Cannot map bss section\n");
@@ -1790,26 +1866,36 @@ static int init_hyp_mode(void)
}
}
- /*
- * Map Hyp percpu pages
- */
for_each_possible_cpu(cpu) {
char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
char *percpu_end = percpu_begin + nvhe_percpu_size();
+ /* Map Hyp percpu pages */
err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
-
if (err) {
kvm_err("Cannot map hyp percpu region\n");
goto out_err;
}
+
+ /* Prepare the CPU initialization parameters */
+ cpu_prepare_hyp_mode(cpu);
}
if (is_protected_kvm_enabled()) {
init_cpu_logical_map();
- if (!init_psci_relay())
+ if (!init_psci_relay()) {
+ err = -ENODEV;
+ goto out_err;
+ }
+ }
+
+ if (is_protected_kvm_enabled()) {
+ err = kvm_hyp_init_protection(hyp_va_bits);
+ if (err) {
+ kvm_err("Failed to init hyp memory protection\n");
goto out_err;
+ }
}
return 0;
@@ -1820,6 +1906,72 @@ out_err:
return err;
}
+static void _kvm_host_prot_finalize(void *discard)
+{
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize));
+}
+
+static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+ return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end);
+}
+
+#define pkvm_mark_hyp_section(__section) \
+ pkvm_mark_hyp(__pa_symbol(__section##_start), \
+ __pa_symbol(__section##_end))
+
+static int finalize_hyp_mode(void)
+{
+ int cpu, ret;
+
+ if (!is_protected_kvm_enabled())
+ return 0;
+
+ ret = pkvm_mark_hyp_section(__hyp_idmap_text);
+ if (ret)
+ return ret;
+
+ ret = pkvm_mark_hyp_section(__hyp_text);
+ if (ret)
+ return ret;
+
+ ret = pkvm_mark_hyp_section(__hyp_rodata);
+ if (ret)
+ return ret;
+
+ ret = pkvm_mark_hyp_section(__hyp_bss);
+ if (ret)
+ return ret;
+
+ ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size);
+ if (ret)
+ return ret;
+
+ for_each_possible_cpu(cpu) {
+ phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]);
+ phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order());
+
+ ret = pkvm_mark_hyp(start, end);
+ if (ret)
+ return ret;
+
+ start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu));
+ end = start + PAGE_SIZE;
+ ret = pkvm_mark_hyp(start, end);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Flip the static key upfront as that may no longer be possible
+ * once the host stage 2 is installed.
+ */
+ static_branch_enable(&kvm_protected_mode_initialized);
+ on_each_cpu(_kvm_host_prot_finalize, NULL, 1);
+
+ return 0;
+}
+
static void check_kvm_target_cpu(void *ret)
{
*(int *)ret = kvm_target_cpu();
@@ -1894,11 +2046,6 @@ int kvm_arch_init(void *opaque)
in_hyp_mode = is_kernel_in_hyp_mode();
- if (!in_hyp_mode && kvm_arch_requires_vhe()) {
- kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
- return -ENODEV;
- }
-
if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
cpus_have_final_cap(ARM64_WORKAROUND_1508412))
kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
@@ -1936,8 +2083,15 @@ int kvm_arch_init(void *opaque)
if (err)
goto out_hyp;
+ if (!in_hyp_mode) {
+ err = finalize_hyp_mode();
+ if (err) {
+ kvm_err("Failed to finalize Hyp protection\n");
+ goto out_hyp;
+ }
+ }
+
if (is_protected_kvm_enabled()) {
- static_branch_enable(&kvm_protected_mode_initialized);
kvm_info("Protected nVHE mode initialized successfully\n");
} else if (in_hyp_mode) {
kvm_info("VHE mode initialized successfully\n");
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index dbc890511631..d5e79d7ee6e9 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -69,6 +69,65 @@ void kvm_arm_init_debug(void)
}
/**
+ * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value
+ *
+ * @vcpu: the vcpu pointer
+ *
+ * This ensures we will trap access to:
+ * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ * - Debug ROM Address (MDCR_EL2_TDRA)
+ * - OS related registers (MDCR_EL2_TDOSA)
+ * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
+ * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ * - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
+ */
+static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
+{
+ /*
+ * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK
+ * to disable guest access to the profiling and trace buffers
+ */
+ vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+ vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+ MDCR_EL2_TPMS |
+ MDCR_EL2_TTRF |
+ MDCR_EL2_TPMCR |
+ MDCR_EL2_TDRA |
+ MDCR_EL2_TDOSA);
+
+ /* Is the VM being debugged by userspace? */
+ if (vcpu->guest_debug)
+ /* Route all software debug exceptions to EL2 */
+ vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+ /*
+ * Trap debug register access when one of the following is true:
+ * - Userspace is using the hardware to debug the guest
+ * (KVM_GUESTDBG_USE_HW is set).
+ * - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear).
+ */
+ if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) ||
+ !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
+ vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+ trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+}
+
+/**
+ * kvm_arm_vcpu_init_debug - setup vcpu debug traps
+ *
+ * @vcpu: the vcpu pointer
+ *
+ * Set vcpu initial mdcr_el2 value.
+ */
+void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ kvm_arm_setup_mdcr_el2(vcpu);
+ preempt_enable();
+}
+
+/**
* kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
*/
@@ -83,13 +142,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
* @vcpu: the vcpu pointer
*
* This is called before each entry into the hypervisor to setup any
- * debug related registers. Currently this just ensures we will trap
- * access to:
- * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
- * - Debug ROM Address (MDCR_EL2_TDRA)
- * - OS related registers (MDCR_EL2_TDOSA)
- * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
- * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
+ * debug related registers.
*
* Additionally, KVM only traps guest accesses to the debug registers if
* the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
@@ -101,28 +154,14 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
{
- bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY);
unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2;
trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
- /*
- * This also clears MDCR_EL2_E2PB_MASK to disable guest access
- * to the profiling buffer.
- */
- vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
- vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
- MDCR_EL2_TPMS |
- MDCR_EL2_TTRF |
- MDCR_EL2_TPMCR |
- MDCR_EL2_TDRA |
- MDCR_EL2_TDOSA);
+ kvm_arm_setup_mdcr_el2(vcpu);
/* Is Guest debugging in effect? */
if (vcpu->guest_debug) {
- /* Route all software debug exceptions to EL2 */
- vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
-
/* Save guest debug state */
save_guest_debug_regs(vcpu);
@@ -176,7 +215,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
- trap_debug = true;
trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
&vcpu->arch.debug_ptr->dbg_bcr[0],
@@ -191,10 +229,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
BUG_ON(!vcpu->guest_debug &&
vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
- /* Trap debug register access */
- if (trap_debug)
- vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
-
/* If KDE or MDE are set, perform a full save/restore cycle. */
if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY;
@@ -203,7 +237,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2)
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
- trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
}
@@ -231,3 +264,32 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
}
}
}
+
+void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+ u64 dfr0;
+
+ /* For VHE, there is nothing to do */
+ if (has_vhe())
+ return;
+
+ dfr0 = read_sysreg(id_aa64dfr0_el1);
+ /*
+ * If SPE is present on this CPU and is available at current EL,
+ * we may need to check if the host state needs to be saved.
+ */
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) &&
+ !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT)))
+ vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE;
+
+ /* Check if we have TRBE implemented and available at the host */
+ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) &&
+ !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG))
+ vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE;
+}
+
+void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE |
+ KVM_ARM64_DEBUG_STATE_SAVE_TRBE);
+}
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 3e081d556e81..5621020b28de 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -11,6 +11,7 @@
#include <linux/kvm_host.h>
#include <asm/fpsimd.h>
#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/sysreg.h>
@@ -42,6 +43,17 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
if (ret)
goto error;
+ if (vcpu->arch.sve_state) {
+ void *sve_end;
+
+ sve_end = vcpu->arch.sve_state + vcpu_sve_state_size(vcpu);
+
+ ret = create_hyp_mappings(vcpu->arch.sve_state, sve_end,
+ PAGE_HYP);
+ if (ret)
+ goto error;
+ }
+
vcpu->arch.host_thread_info = kern_hyp_va(ti);
vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
error:
@@ -109,11 +121,17 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
local_irq_save(flags);
if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
- fpsimd_save_and_flush_cpu_state();
+ if (guest_has_sve) {
+ __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+
+ /* Restore the VL that was saved when bound to the CPU */
+ if (!has_vhe())
+ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1,
+ SYS_ZCR_EL1);
+ }
- if (guest_has_sve)
- __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_s(SYS_ZCR_EL12);
- } else if (host_has_sve) {
+ fpsimd_save_and_flush_cpu_state();
+ } else if (has_vhe() && host_has_sve) {
/*
* The FPSIMD/SVE state in the CPU has not been touched, and we
* have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 9bbd30e62799..5cb4a1cd5603 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -299,7 +299,7 @@ static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
memset(vqs, 0, sizeof(vqs));
- max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+ max_vq = vcpu_sve_max_vq(vcpu);
for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
if (sve_vq_available(vq))
vqs[vq_word(vq)] |= vq_mask(vq);
@@ -427,7 +427,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
return -ENOENT;
- vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+ vq = vcpu_sve_max_vq(vcpu);
reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) -
SVE_SIG_REGS_OFFSET;
@@ -437,7 +437,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
return -ENOENT;
- vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
+ vq = vcpu_sve_max_vq(vcpu);
reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) -
SVE_SIG_REGS_OFFSET;
@@ -888,11 +888,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
return -EINVAL;
}
-#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
- KVM_GUESTDBG_USE_SW_BP | \
- KVM_GUESTDBG_USE_HW | \
- KVM_GUESTDBG_SINGLESTEP)
-
/**
* kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
* @kvm: pointer to the KVM struct
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index cebe39f3b1b6..6f48336b1d86 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -291,3 +291,48 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
if (exception_index == ARM_EXCEPTION_EL1_SERROR)
kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
}
+
+void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr,
+ u64 par, uintptr_t vcpu,
+ u64 far, u64 hpfar) {
+ u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr));
+ u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr;
+ u64 mode = spsr & PSR_MODE_MASK;
+
+ /*
+ * The nVHE hyp symbols are not included by kallsyms to avoid issues
+ * with aliasing. That means that the symbols cannot be printed with the
+ * "%pS" format specifier, so fall back to the vmlinux address if
+ * there's no better option.
+ */
+ if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
+ kvm_err("Invalid host exception to nVHE hyp!\n");
+ } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
+ (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) {
+ struct bug_entry *bug = find_bug(elr_in_kimg);
+ const char *file = NULL;
+ unsigned int line = 0;
+
+ /* All hyp bugs, including warnings, are treated as fatal. */
+ if (bug)
+ bug_get_file_line(bug, &file, &line);
+
+ if (file)
+ kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
+ else
+ kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset);
+ } else {
+ kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset);
+ }
+
+ /*
+ * Hyp has panicked and we're going to handle that by panicking the
+ * kernel. The kernel offset will be revealed in the panic so we're
+ * also safe to reveal the hyp offset as a debugging aid for translating
+ * hyp VAs to vmlinux addresses.
+ */
+ kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);
+
+ panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
+ spsr, elr, esr, far, hpfar, par, vcpu);
+}
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 687598e41b21..b726332eec49 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o
diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S
index 01f114aa47b0..3c635929771a 100644
--- a/arch/arm64/kvm/hyp/fpsimd.S
+++ b/arch/arm64/kvm/hyp/fpsimd.S
@@ -19,3 +19,13 @@ SYM_FUNC_START(__fpsimd_restore_state)
fpsimd_restore x0, 1
ret
SYM_FUNC_END(__fpsimd_restore_state)
+
+SYM_FUNC_START(__sve_restore_state)
+ __sve_load 0, x1, 2
+ ret
+SYM_FUNC_END(__sve_restore_state)
+
+SYM_FUNC_START(__sve_save_state)
+ sve_save 0, x1, 2
+ ret
+SYM_FUNC_END(__sve_save_state)
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 6c1f51f25eb3..e4a2f295a394 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -30,8 +30,6 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
-extern const char __hyp_panic_string[];
-
extern struct exception_table_entry __start___kvm_ex_table;
extern struct exception_table_entry __stop___kvm_ex_table;
@@ -160,18 +158,10 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
return true;
}
-static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
{
- u8 ec;
- u64 esr;
u64 hpfar, far;
- esr = vcpu->arch.fault.esr_el2;
- ec = ESR_ELx_EC(esr);
-
- if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
- return true;
-
far = read_sysreg_el2(SYS_FAR);
/*
@@ -194,33 +184,59 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
hpfar = read_sysreg(hpfar_el2);
}
- vcpu->arch.fault.far_el2 = far;
- vcpu->arch.fault.hpfar_el2 = hpfar;
+ fault->far_el2 = far;
+ fault->hpfar_el2 = hpfar;
return true;
}
+static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
+{
+ u8 ec;
+ u64 esr;
+
+ esr = vcpu->arch.fault.esr_el2;
+ ec = ESR_ELx_EC(esr);
+
+ if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
+ return true;
+
+ return __get_fault_info(esr, &vcpu->arch.fault);
+}
+
+static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu)
+{
+ struct thread_struct *thread;
+
+ thread = container_of(vcpu->arch.host_fpsimd_state, struct thread_struct,
+ uw.fpsimd_state);
+
+ __sve_save_state(sve_pffr(thread), &vcpu->arch.host_fpsimd_state->fpsr);
+}
+
+static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
+{
+ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
+ __sve_restore_state(vcpu_sve_pffr(vcpu),
+ &vcpu->arch.ctxt.fp_regs.fpsr);
+ write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR);
+}
+
/* Check for an FPSIMD/SVE trap and handle as appropriate */
static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
{
- bool vhe, sve_guest, sve_host;
+ bool sve_guest, sve_host;
u8 esr_ec;
+ u64 reg;
if (!system_supports_fpsimd())
return false;
- /*
- * Currently system_supports_sve() currently implies has_vhe(),
- * so the check is redundant. However, has_vhe() can be determined
- * statically and helps the compiler remove dead code.
- */
- if (has_vhe() && system_supports_sve()) {
+ if (system_supports_sve()) {
sve_guest = vcpu_has_sve(vcpu);
sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE;
- vhe = true;
} else {
sve_guest = false;
sve_host = false;
- vhe = has_vhe();
}
esr_ec = kvm_vcpu_trap_get_class(vcpu);
@@ -229,53 +245,38 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
return false;
/* Don't handle SVE traps for non-SVE vcpus here: */
- if (!sve_guest)
- if (esr_ec != ESR_ELx_EC_FP_ASIMD)
- return false;
+ if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD)
+ return false;
/* Valid trap. Switch the context: */
-
- if (vhe) {
- u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN;
-
+ if (has_vhe()) {
+ reg = CPACR_EL1_FPEN;
if (sve_guest)
reg |= CPACR_EL1_ZEN;
- write_sysreg(reg, cpacr_el1);
+ sysreg_clear_set(cpacr_el1, 0, reg);
} else {
- write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP,
- cptr_el2);
- }
+ reg = CPTR_EL2_TFP;
+ if (sve_guest)
+ reg |= CPTR_EL2_TZ;
+ sysreg_clear_set(cptr_el2, reg, 0);
+ }
isb();
if (vcpu->arch.flags & KVM_ARM64_FP_HOST) {
- /*
- * In the SVE case, VHE is assumed: it is enforced by
- * Kconfig and kvm_arch_init().
- */
- if (sve_host) {
- struct thread_struct *thread = container_of(
- vcpu->arch.host_fpsimd_state,
- struct thread_struct, uw.fpsimd_state);
-
- sve_save_state(sve_pffr(thread),
- &vcpu->arch.host_fpsimd_state->fpsr);
- } else {
+ if (sve_host)
+ __hyp_sve_save_host(vcpu);
+ else
__fpsimd_save_state(vcpu->arch.host_fpsimd_state);
- }
vcpu->arch.flags &= ~KVM_ARM64_FP_HOST;
}
- if (sve_guest) {
- sve_load_state(vcpu_sve_pffr(vcpu),
- &vcpu->arch.ctxt.fp_regs.fpsr,
- sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1);
- write_sysreg_s(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR_EL12);
- } else {
+ if (sve_guest)
+ __hyp_sve_restore_guest(vcpu);
+ else
__fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
- }
/* Skip restoring fpexc32 for AArch64 guests */
if (!(read_sysreg(hcr_el2) & HCR_RW))
diff --git a/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h
new file mode 100644
index 000000000000..dc61aaa56f31
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_EARLY_ALLOC_H
+#define __KVM_HYP_EARLY_ALLOC_H
+
+#include <asm/kvm_pgtable.h>
+
+void hyp_early_alloc_init(void *virt, unsigned long size);
+unsigned long hyp_early_alloc_nr_used_pages(void);
+void *hyp_early_alloc_page(void *arg);
+void *hyp_early_alloc_contig(unsigned int nr_pages);
+
+extern struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+
+#endif /* __KVM_HYP_EARLY_ALLOC_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
new file mode 100644
index 000000000000..18a4494337bd
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_GFP_H
+#define __KVM_HYP_GFP_H
+
+#include <linux/list.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_NO_ORDER UINT_MAX
+
+struct hyp_pool {
+ /*
+ * Spinlock protecting concurrent changes to the memory pool as well as
+ * the struct hyp_page of the pool's pages until we have a proper atomic
+ * API at EL2.
+ */
+ hyp_spinlock_t lock;
+ struct list_head free_area[MAX_ORDER];
+ phys_addr_t range_start;
+ phys_addr_t range_end;
+ unsigned int max_order;
+};
+
+static inline void hyp_page_ref_inc(struct hyp_page *p)
+{
+ struct hyp_pool *pool = hyp_page_to_pool(p);
+
+ hyp_spin_lock(&pool->lock);
+ p->refcount++;
+ hyp_spin_unlock(&pool->lock);
+}
+
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+ struct hyp_pool *pool = hyp_page_to_pool(p);
+ int ret;
+
+ hyp_spin_lock(&pool->lock);
+ p->refcount--;
+ ret = (p->refcount == 0);
+ hyp_spin_unlock(&pool->lock);
+
+ return ret;
+}
+
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+ struct hyp_pool *pool = hyp_page_to_pool(p);
+
+ hyp_spin_lock(&pool->lock);
+ if (p->refcount) {
+ hyp_spin_unlock(&pool->lock);
+ BUG();
+ }
+ p->refcount = 1;
+ hyp_spin_unlock(&pool->lock);
+}
+
+/* Allocation */
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
+void hyp_get_page(void *addr);
+void hyp_put_page(void *addr);
+
+/* Used pages cannot be freed */
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+ unsigned int reserved_pages);
+#endif /* __KVM_HYP_GFP_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
new file mode 100644
index 000000000000..42d81ec739fa
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#ifndef __KVM_NVHE_MEM_PROTECT__
+#define __KVM_NVHE_MEM_PROTECT__
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/virt.h>
+#include <nvhe/spinlock.h>
+
+struct host_kvm {
+ struct kvm_arch arch;
+ struct kvm_pgtable pgt;
+ struct kvm_pgtable_mm_ops mm_ops;
+ hyp_spinlock_t lock;
+};
+extern struct host_kvm host_kvm;
+
+int __pkvm_prot_finalize(void);
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
+
+static __always_inline void __load_host_stage2(void)
+{
+ if (static_branch_likely(&kvm_protected_mode_initialized))
+ __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+ else
+ write_sysreg(0, vttbr_el2);
+}
+#endif /* __KVM_NVHE_MEM_PROTECT__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
new file mode 100644
index 000000000000..fd78bde939ee
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MEMORY_H
+#define __KVM_HYP_MEMORY_H
+
+#include <asm/kvm_mmu.h>
+#include <asm/page.h>
+
+#include <linux/types.h>
+
+struct hyp_pool;
+struct hyp_page {
+ unsigned int refcount;
+ unsigned int order;
+ struct hyp_pool *pool;
+ struct list_head node;
+};
+
+extern u64 __hyp_vmemmap;
+#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap)
+
+#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset))
+
+static inline void *hyp_phys_to_virt(phys_addr_t phys)
+{
+ return __hyp_va(phys);
+}
+
+static inline phys_addr_t hyp_virt_to_phys(void *addr)
+{
+ return __hyp_pa(addr);
+}
+
+#define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT)
+#define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT))
+#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)])
+#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt))
+#define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt))
+
+#define hyp_page_to_pfn(page) ((struct hyp_page *)(page) - hyp_vmemmap)
+#define hyp_page_to_phys(page) hyp_pfn_to_phys((hyp_page_to_pfn(page)))
+#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page))
+#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool)
+
+static inline int hyp_page_count(void *addr)
+{
+ struct hyp_page *p = hyp_virt_to_page(addr);
+
+ return p->refcount;
+}
+
+#endif /* __KVM_HYP_MEMORY_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
new file mode 100644
index 000000000000..0095f6289742
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_HYP_MM_H
+#define __KVM_HYP_MM_H
+
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+#include <linux/memblock.h>
+#include <linux/types.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/spinlock.h>
+
+#define HYP_MEMBLOCK_REGIONS 128
+extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
+extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);
+extern struct kvm_pgtable pkvm_pgtable;
+extern hyp_spinlock_t pkvm_pgd_lock;
+extern struct hyp_pool hpool;
+extern u64 __io_map_base;
+
+int hyp_create_idmap(u32 hyp_va_bits);
+int hyp_map_vectors(void);
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back);
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot);
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot);
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+ enum kvm_pgtable_prot prot);
+
+static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct hyp_page *p = hyp_phys_to_page(phys);
+
+ *start = (unsigned long)p;
+ *end = *start + nr_pages * sizeof(struct hyp_page);
+ *start = ALIGN_DOWN(*start, PAGE_SIZE);
+ *end = ALIGN(*end, PAGE_SIZE);
+}
+
+static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
+{
+ unsigned long total = 0, i;
+
+ /* Provision the worst case scenario */
+ for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) {
+ nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE);
+ total += nr_pages;
+ }
+
+ return total;
+}
+
+static inline unsigned long __hyp_pgtable_total_pages(void)
+{
+ unsigned long res = 0, i;
+
+ /* Cover all of memory with page-granularity */
+ for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) {
+ struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i];
+ res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT);
+ }
+
+ return res;
+}
+
+static inline unsigned long hyp_s1_pgtable_pages(void)
+{
+ unsigned long res;
+
+ res = __hyp_pgtable_total_pages();
+
+ /* Allow 1 GiB for private mappings */
+ res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+
+ return res;
+}
+
+static inline unsigned long host_s2_mem_pgtable_pages(void)
+{
+ /*
+ * Include an extra 16 pages to safely upper-bound the worst case of
+ * concatenated pgds.
+ */
+ return __hyp_pgtable_total_pages() + 16;
+}
+
+static inline unsigned long host_s2_dev_pgtable_pages(void)
+{
+ /* Allow 1 GiB for MMIO mappings */
+ return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+}
+
+#endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
new file mode 100644
index 000000000000..76b537f8d1c6
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * A stand-alone ticket spinlock implementation for use by the non-VHE
+ * KVM hypervisor code running at EL2.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ *
+ * Heavily based on the implementation removed by c11090474d70 which was:
+ * Copyright (C) 2012 ARM Ltd.
+ */
+
+#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__
+#define __ARM64_KVM_NVHE_SPINLOCK_H__
+
+#include <asm/alternative.h>
+#include <asm/lse.h>
+
+typedef union hyp_spinlock {
+ u32 __val;
+ struct {
+#ifdef __AARCH64EB__
+ u16 next, owner;
+#else
+ u16 owner, next;
+#endif
+ };
+} hyp_spinlock_t;
+
+#define hyp_spin_lock_init(l) \
+do { \
+ *(l) = (hyp_spinlock_t){ .__val = 0 }; \
+} while (0)
+
+static inline void hyp_spin_lock(hyp_spinlock_t *lock)
+{
+ u32 tmp;
+ hyp_spinlock_t lockval, newval;
+
+ asm volatile(
+ /* Atomically increment the next ticket. */
+ ARM64_LSE_ATOMIC_INSN(
+ /* LL/SC */
+" prfm pstl1strm, %3\n"
+"1: ldaxr %w0, %3\n"
+" add %w1, %w0, #(1 << 16)\n"
+" stxr %w2, %w1, %3\n"
+" cbnz %w2, 1b\n",
+ /* LSE atomics */
+" mov %w2, #(1 << 16)\n"
+" ldadda %w2, %w0, %3\n"
+ __nops(3))
+
+ /* Did we get the lock? */
+" eor %w1, %w0, %w0, ror #16\n"
+" cbz %w1, 3f\n"
+ /*
+ * No: spin on the owner. Send a local event to avoid missing an
+ * unlock before the exclusive load.
+ */
+" sevl\n"
+"2: wfe\n"
+" ldaxrh %w2, %4\n"
+" eor %w1, %w2, %w0, lsr #16\n"
+" cbnz %w1, 2b\n"
+ /* We got the lock. Critical section starts here. */
+"3:"
+ : "=&r" (lockval), "=&r" (newval), "=&r" (tmp), "+Q" (*lock)
+ : "Q" (lock->owner)
+ : "memory");
+}
+
+static inline void hyp_spin_unlock(hyp_spinlock_t *lock)
+{
+ u64 tmp;
+
+ asm volatile(
+ ARM64_LSE_ATOMIC_INSN(
+ /* LL/SC */
+ " ldrh %w1, %0\n"
+ " add %w1, %w1, #1\n"
+ " stlrh %w1, %0",
+ /* LSE atomics */
+ " mov %w1, #1\n"
+ " staddlh %w1, %0\n"
+ __nops(1))
+ : "=Q" (lock->owner), "=&r" (tmp)
+ :
+ : "memory");
+}
+
+#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index fb24a0f022ad..5df6193fc430 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -9,10 +9,15 @@ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
hostprogs := gen-hyprel
HOST_EXTRACFLAGS += -I$(objtree)/include
+lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := $(addprefix ../../../lib/, $(lib-objs))
+
obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
- hyp-main.o hyp-smp.o psci-relay.o
+ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \
+ cache.o setup.o mm.o mem_protect.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
- ../fpsimd.o ../hyp-entry.o ../exception.o
+ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
+obj-y += $(lib-objs)
##
## Build rules for compiling nVHE hyp code
diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
new file mode 100644
index 000000000000..36cef6915428
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/cache.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Code copied from arch/arm64/mm/cache.S.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/alternative.h>
+
+SYM_FUNC_START_PI(__flush_dcache_area)
+ dcache_by_line_op civac, sy, x0, x1, x2, x3
+ ret
+SYM_FUNC_END_PI(__flush_dcache_area)
diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
index f401724f12ef..7d3f25868cae 100644
--- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c
@@ -21,17 +21,11 @@ static void __debug_save_spe(u64 *pmscr_el1)
/* Clear pmscr in case of early return */
*pmscr_el1 = 0;
- /* SPE present on this CPU? */
- if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
- ID_AA64DFR0_PMSVER_SHIFT))
- return;
-
- /* Yes; is it owned by EL3? */
- reg = read_sysreg_s(SYS_PMBIDR_EL1);
- if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
- return;
-
- /* No; is the host actually using the thing? */
+ /*
+ * At this point, we know that this CPU implements
+ * SPE and is available to the host.
+ * Check if the host is actually using it ?
+ */
reg = read_sysreg_s(SYS_PMBLIMITR_EL1);
if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT)))
return;
@@ -58,10 +52,43 @@ static void __debug_restore_spe(u64 pmscr_el1)
write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
}
+static void __debug_save_trace(u64 *trfcr_el1)
+{
+ *trfcr_el1 = 0;
+
+ /* Check if the TRBE is enabled */
+ if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE))
+ return;
+ /*
+ * Prohibit trace generation while we are in guest.
+ * Since access to TRFCR_EL1 is trapped, the guest can't
+ * modify the filtering set by the host.
+ */
+ *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1);
+ write_sysreg_s(0, SYS_TRFCR_EL1);
+ isb();
+ /* Drain the trace buffer to memory */
+ tsb_csync();
+ dsb(nsh);
+}
+
+static void __debug_restore_trace(u64 trfcr_el1)
+{
+ if (!trfcr_el1)
+ return;
+
+ /* Restore trace filter controls */
+ write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1);
+}
+
void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
/* Disable and flush SPE data generation */
- __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+ if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+ __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1);
+ /* Disable and flush Self-Hosted Trace generation */
+ if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+ __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1);
}
void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
@@ -71,7 +98,10 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu)
{
- __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+ if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE)
+ __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1);
+ if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE)
+ __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1);
}
void __debug_switch_to_host(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c
new file mode 100644
index 000000000000..1306c430ab87
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/memory.h>
+
+struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops;
+s64 __ro_after_init hyp_physvirt_offset;
+
+static unsigned long base;
+static unsigned long end;
+static unsigned long cur;
+
+unsigned long hyp_early_alloc_nr_used_pages(void)
+{
+ return (cur - base) >> PAGE_SHIFT;
+}
+
+void *hyp_early_alloc_contig(unsigned int nr_pages)
+{
+ unsigned long size = (nr_pages << PAGE_SHIFT);
+ void *ret = (void *)cur;
+
+ if (!nr_pages)
+ return NULL;
+
+ if (end - cur < size)
+ return NULL;
+
+ cur += size;
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+void *hyp_early_alloc_page(void *arg)
+{
+ return hyp_early_alloc_contig(1);
+}
+
+void hyp_early_alloc_init(void *virt, unsigned long size)
+{
+ base = cur = (unsigned long)virt;
+ end = base + size;
+
+ hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page;
+ hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt;
+ hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
index ead02c6a7628..6bc88a756cb7 100644
--- a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
+++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
@@ -50,6 +50,18 @@
#ifndef R_AARCH64_ABS64
#define R_AARCH64_ABS64 257
#endif
+#ifndef R_AARCH64_PREL64
+#define R_AARCH64_PREL64 260
+#endif
+#ifndef R_AARCH64_PREL32
+#define R_AARCH64_PREL32 261
+#endif
+#ifndef R_AARCH64_PREL16
+#define R_AARCH64_PREL16 262
+#endif
+#ifndef R_AARCH64_PLT32
+#define R_AARCH64_PLT32 314
+#endif
#ifndef R_AARCH64_LD_PREL_LO19
#define R_AARCH64_LD_PREL_LO19 273
#endif
@@ -371,6 +383,12 @@ static void emit_rela_section(Elf64_Shdr *sh_rela)
case R_AARCH64_ABS64:
emit_rela_abs64(rela, sh_orig_name);
break;
+ /* Allow position-relative data relocations. */
+ case R_AARCH64_PREL64:
+ case R_AARCH64_PREL32:
+ case R_AARCH64_PREL16:
+ case R_AARCH64_PLT32:
+ break;
/* Allow relocations to generate PC-relative addressing. */
case R_AARCH64_LD_PREL_LO19:
case R_AARCH64_ADR_PREL_LO21:
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 5d94584840cc..2b23400e0fb3 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -79,22 +79,18 @@ SYM_FUNC_START(__hyp_do_panic)
mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
PSR_MODE_EL1h)
msr spsr_el2, lr
- ldr lr, =panic
+ ldr lr, =nvhe_hyp_panic_handler
hyp_kimg_va lr, x6
msr elr_el2, lr
mov x29, x0
- /* Load the format string into x0 and arguments into x1-7 */
- ldr x0, =__hyp_panic_string
- hyp_kimg_va x0, x6
-
- /* Load the format arguments into x1-7. */
- mov x6, x3
- get_vcpu_ptr x7, x3
- mrs x3, esr_el2
- mrs x4, far_el2
- mrs x5, hpfar_el2
+ /* Load the panic arguments into x0-7 */
+ mrs x0, esr_el2
+ get_vcpu_ptr x4, x5
+ mrs x5, far_el2
+ mrs x6, hpfar_el2
+ mov x7, xzr // Unused argument
/* Enter the host, conditionally restoring the host context. */
cbz x29, __host_enter_without_restoring
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index c631e29fb001..c953fb4b9a13 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -83,11 +83,6 @@ SYM_CODE_END(__kvm_hyp_init)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START_LOCAL(___kvm_hyp_init)
-alternative_if ARM64_KVM_PROTECTED_MODE
- mov_q x1, HCR_HOST_NVHE_PROTECTED_FLAGS
- msr hcr_el2, x1
-alternative_else_nop_endif
-
ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
msr tpidr_el2, x1
@@ -97,6 +92,15 @@ alternative_else_nop_endif
ldr x1, [x0, #NVHE_INIT_MAIR_EL2]
msr mair_el2, x1
+ ldr x1, [x0, #NVHE_INIT_HCR_EL2]
+ msr hcr_el2, x1
+
+ ldr x1, [x0, #NVHE_INIT_VTTBR]
+ msr vttbr_el2, x1
+
+ ldr x1, [x0, #NVHE_INIT_VTCR]
+ msr vtcr_el2, x1
+
ldr x1, [x0, #NVHE_INIT_PGD_PA]
phys_to_ttbr x2, x1
alternative_if ARM64_HAS_CNP
@@ -115,15 +119,10 @@ alternative_else_nop_endif
/* Invalidate the stale TLBs from Bootloader */
tlbi alle2
+ tlbi vmalls12e1
dsb sy
- /*
- * Preserve all the RES1 bits while setting the default flags,
- * as well as the EE bit on BE. Drop the A flag since the compiler
- * is allowed to generate unaligned accesses.
- */
- mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE( orr x0, x0, #SCTLR_ELx_EE)
+ mov_q x0, INIT_SCTLR_EL2_MMU_ON
alternative_if ARM64_HAS_ADDRESS_AUTH
mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
@@ -221,9 +220,7 @@ SYM_CODE_START(__kvm_handle_stub_hvc)
mov x0, xzr
reset:
/* Reset kvm back to the hyp stub. */
- mrs x5, sctlr_el2
- mov_q x6, SCTLR_ELx_FLAGS
- bic x5, x5, x6 // Clear SCTL_M and etc
+ mov_q x5, INIT_SCTLR_EL2_MMU_OFF
pre_disable_mmu_workaround
msr sctlr_el2, x5
isb
@@ -244,4 +241,31 @@ alternative_else_nop_endif
SYM_CODE_END(__kvm_handle_stub_hvc)
+SYM_FUNC_START(__pkvm_init_switch_pgd)
+ /* Turn the MMU off */
+ pre_disable_mmu_workaround
+ mrs x2, sctlr_el2
+ bic x3, x2, #SCTLR_ELx_M
+ msr sctlr_el2, x3
+ isb
+
+ tlbi alle2
+
+ /* Install the new pgtables */
+ ldr x3, [x0, #NVHE_INIT_PGD_PA]
+ phys_to_ttbr x4, x3
+alternative_if ARM64_HAS_CNP
+ orr x4, x4, #TTBR_CNP_BIT
+alternative_else_nop_endif
+ msr ttbr0_el2, x4
+
+ /* Set the new stack pointer */
+ ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA]
+ mov sp, x0
+
+ /* And turn the MMU back on! */
+ set_sctlr_el2 x2
+ ret x1
+SYM_FUNC_END(__pkvm_init_switch_pgd)
+
.popsection
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 936328207bde..f36420a80474 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -6,12 +6,15 @@
#include <hyp/switch.h>
+#include <asm/pgtable-types.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
#include <nvhe/trap_handler.h>
DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -106,6 +109,61 @@ static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
__vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
}
+static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+ DECLARE_REG(unsigned long, size, host_ctxt, 2);
+ DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
+ DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
+ DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+
+ /*
+ * __pkvm_init() will return only if an error occurred, otherwise it
+ * will tail-call in __pkvm_init_finalise() which will have to deal
+ * with the host context directly.
+ */
+ cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
+ hyp_va_bits);
+}
+
+static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1);
+
+ cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot);
+}
+
+static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(unsigned long, start, host_ctxt, 1);
+ DECLARE_REG(unsigned long, size, host_ctxt, 2);
+ DECLARE_REG(unsigned long, phys, host_ctxt, 3);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot);
+}
+
+static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
+ DECLARE_REG(size_t, size, host_ctxt, 2);
+ DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot);
+}
+
+static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt)
+{
+ cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize();
+}
+
+static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(phys_addr_t, start, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, end, host_ctxt, 2);
+
+ cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end);
+}
typedef void (*hcall_t)(struct kvm_cpu_context *);
#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -125,6 +183,12 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_get_mdcr_el2),
HANDLE_FUNC(__vgic_v3_save_aprs),
HANDLE_FUNC(__vgic_v3_restore_aprs),
+ HANDLE_FUNC(__pkvm_init),
+ HANDLE_FUNC(__pkvm_cpu_set_vector),
+ HANDLE_FUNC(__pkvm_create_mappings),
+ HANDLE_FUNC(__pkvm_create_private_mapping),
+ HANDLE_FUNC(__pkvm_prot_finalize),
+ HANDLE_FUNC(__pkvm_mark_hyp),
};
static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
@@ -177,7 +241,16 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
case ESR_ELx_EC_SMC64:
handle_host_smc(host_ctxt);
break;
+ case ESR_ELx_EC_SVE:
+ sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
+ isb();
+ sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
+ break;
+ case ESR_ELx_EC_IABT_LOW:
+ case ESR_ELx_EC_DABT_LOW:
+ handle_host_mem_abort(host_ctxt);
+ break;
default:
- hyp_panic();
+ BUG();
}
}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
index 879559057dee..9f54833af400 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c
@@ -18,8 +18,7 @@ u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID
u64 cpu_logical_map(unsigned int cpu)
{
- if (cpu >= ARRAY_SIZE(hyp_cpu_logical_map))
- hyp_panic();
+ BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map));
return hyp_cpu_logical_map[cpu];
}
@@ -30,8 +29,7 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu)
unsigned long this_cpu_base;
unsigned long elf_base;
- if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base))
- hyp_panic();
+ BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base));
cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base;
this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index cd119d82d8e3..f4562f417d3f 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -25,4 +25,5 @@ SECTIONS {
BEGIN_HYP_SECTION(.data..percpu)
PERCPU_INPUT(L1_CACHE_BYTES)
END_HYP_SECTION
+ HYP_SECTION(.bss)
}
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
new file mode 100644
index 000000000000..e342f7f4f4fb
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/stage2_pgtable.h>
+
+#include <hyp/switch.h>
+
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+
+#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP)
+
+extern unsigned long hyp_nr_cpus;
+struct host_kvm host_kvm;
+
+struct hyp_pool host_s2_mem;
+struct hyp_pool host_s2_dev;
+
+/*
+ * Copies of the host's CPU features registers holding sanitized values.
+ */
+u64 id_aa64mmfr0_el1_sys_val;
+u64 id_aa64mmfr1_el1_sys_val;
+
+static const u8 pkvm_hyp_id = 1;
+
+static void *host_s2_zalloc_pages_exact(size_t size)
+{
+ return hyp_alloc_pages(&host_s2_mem, get_order(size));
+}
+
+static void *host_s2_zalloc_page(void *pool)
+{
+ return hyp_alloc_pages(pool, 0);
+}
+
+static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+ unsigned long nr_pages, pfn;
+ int ret;
+
+ pfn = hyp_virt_to_pfn(mem_pgt_pool);
+ nr_pages = host_s2_mem_pgtable_pages();
+ ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
+ if (ret)
+ return ret;
+
+ pfn = hyp_virt_to_pfn(dev_pgt_pool);
+ nr_pages = host_s2_dev_pgtable_pages();
+ ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
+ if (ret)
+ return ret;
+
+ host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) {
+ .zalloc_pages_exact = host_s2_zalloc_pages_exact,
+ .zalloc_page = host_s2_zalloc_page,
+ .phys_to_virt = hyp_phys_to_virt,
+ .virt_to_phys = hyp_virt_to_phys,
+ .page_count = hyp_page_count,
+ .get_page = hyp_get_page,
+ .put_page = hyp_put_page,
+ };
+
+ return 0;
+}
+
+static void prepare_host_vtcr(void)
+{
+ u32 parange, phys_shift;
+
+ /* The host stage 2 is id-mapped, so use parange for T0SZ */
+ parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
+ phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+ host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+ id_aa64mmfr1_el1_sys_val, phys_shift);
+}
+
+int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+{
+ struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+ int ret;
+
+ prepare_host_vtcr();
+ hyp_spin_lock_init(&host_kvm.lock);
+
+ ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch,
+ &host_kvm.mm_ops, KVM_HOST_S2_FLAGS);
+ if (ret)
+ return ret;
+
+ mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd);
+ mmu->arch = &host_kvm.arch;
+ mmu->pgt = &host_kvm.pgt;
+ mmu->vmid.vmid_gen = 0;
+ mmu->vmid.vmid = 0;
+
+ return 0;
+}
+
+int __pkvm_prot_finalize(void)
+{
+ struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
+ struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+ params->vttbr = kvm_get_vttbr(mmu);
+ params->vtcr = host_kvm.arch.vtcr;
+ params->hcr_el2 |= HCR_VM;
+ kvm_flush_dcache_to_poc(params, sizeof(*params));
+
+ write_sysreg(params->hcr_el2, hcr_el2);
+ __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr);
+
+ /*
+ * Make sure to have an ISB before the TLB maintenance below but only
+ * when __load_stage2() doesn't include one already.
+ */
+ asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+ /* Invalidate stale HCR bits that may be cached in TLBs */
+ __tlbi(vmalls12e1);
+ dsb(nsh);
+ isb();
+
+ return 0;
+}
+
+static int host_stage2_unmap_dev_all(void)
+{
+ struct kvm_pgtable *pgt = &host_kvm.pgt;
+ struct memblock_region *reg;
+ u64 addr = 0;
+ int i, ret;
+
+ /* Unmap all non-memory regions to recycle the pages */
+ for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) {
+ reg = &hyp_memory[i];
+ ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr);
+ if (ret)
+ return ret;
+ }
+ return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
+}
+
+static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range)
+{
+ int cur, left = 0, right = hyp_memblock_nr;
+ struct memblock_region *reg;
+ phys_addr_t end;
+
+ range->start = 0;
+ range->end = ULONG_MAX;
+
+ /* The list of memblock regions is sorted, binary search it */
+ while (left < right) {
+ cur = (left + right) >> 1;
+ reg = &hyp_memory[cur];
+ end = reg->base + reg->size;
+ if (addr < reg->base) {
+ right = cur;
+ range->end = reg->base;
+ } else if (addr >= end) {
+ left = cur + 1;
+ range->start = end;
+ } else {
+ range->start = reg->base;
+ range->end = end;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool range_is_memory(u64 start, u64 end)
+{
+ struct kvm_mem_range r1, r2;
+
+ if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2))
+ return false;
+ if (r1.start != r2.start)
+ return false;
+
+ return true;
+}
+
+static inline int __host_stage2_idmap(u64 start, u64 end,
+ enum kvm_pgtable_prot prot,
+ struct hyp_pool *pool)
+{
+ return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
+ prot, pool);
+}
+
+static int host_stage2_idmap(u64 addr)
+{
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
+ struct kvm_mem_range range;
+ bool is_memory = find_mem_range(addr, &range);
+ struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
+ int ret;
+
+ if (is_memory)
+ prot |= KVM_PGTABLE_PROT_X;
+
+ hyp_spin_lock(&host_kvm.lock);
+ ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range);
+ if (ret)
+ goto unlock;
+
+ ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+ if (is_memory || ret != -ENOMEM)
+ goto unlock;
+
+ /*
+ * host_s2_mem has been provided with enough pages to cover all of
+ * memory with page granularity, so we should never hit the ENOMEM case.
+ * However, it is difficult to know how much of the MMIO range we will
+ * need to cover upfront, so we may need to 'recycle' the pages if we
+ * run out.
+ */
+ ret = host_stage2_unmap_dev_all();
+ if (ret)
+ goto unlock;
+
+ ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+
+unlock:
+ hyp_spin_unlock(&host_kvm.lock);
+
+ return ret;
+}
+
+int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
+{
+ int ret;
+
+ /*
+ * host_stage2_unmap_dev_all() currently relies on MMIO mappings being
+ * non-persistent, so don't allow changing page ownership in MMIO range.
+ */
+ if (!range_is_memory(start, end))
+ return -EINVAL;
+
+ hyp_spin_lock(&host_kvm.lock);
+ ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
+ &host_s2_mem, pkvm_hyp_id);
+ hyp_spin_unlock(&host_kvm.lock);
+
+ return ret != -EAGAIN ? ret : 0;
+}
+
+void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
+{
+ struct kvm_vcpu_fault_info fault;
+ u64 esr, addr;
+ int ret = 0;
+
+ esr = read_sysreg_el2(SYS_ESR);
+ BUG_ON(!__get_fault_info(esr, &fault));
+
+ addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
+ ret = host_stage2_idmap(addr);
+ BUG_ON(ret && ret != -EAGAIN);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
new file mode 100644
index 000000000000..a8efdf0f9003
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/spectre.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+#include <nvhe/spinlock.h>
+
+struct kvm_pgtable pkvm_pgtable;
+hyp_spinlock_t pkvm_pgd_lock;
+u64 __io_map_base;
+
+struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS];
+unsigned int hyp_memblock_nr;
+
+int __pkvm_create_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot)
+{
+ int err;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+ err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot);
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ return err;
+}
+
+unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
+ enum kvm_pgtable_prot prot)
+{
+ unsigned long addr;
+ int err;
+
+ hyp_spin_lock(&pkvm_pgd_lock);
+
+ size = PAGE_ALIGN(size + offset_in_page(phys));
+ addr = __io_map_base;
+ __io_map_base += size;
+
+ /* Are we overflowing on the vmemmap ? */
+ if (__io_map_base > __hyp_vmemmap) {
+ __io_map_base -= size;
+ addr = (unsigned long)ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot);
+ if (err) {
+ addr = (unsigned long)ERR_PTR(err);
+ goto out;
+ }
+
+ addr = addr + offset_in_page(phys);
+out:
+ hyp_spin_unlock(&pkvm_pgd_lock);
+
+ return addr;
+}
+
+int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
+{
+ unsigned long start = (unsigned long)from;
+ unsigned long end = (unsigned long)to;
+ unsigned long virt_addr;
+ phys_addr_t phys;
+
+ start = start & PAGE_MASK;
+ end = PAGE_ALIGN(end);
+
+ for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+ int err;
+
+ phys = hyp_virt_to_phys((void *)virt_addr);
+ err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back)
+{
+ unsigned long start, end;
+
+ hyp_vmemmap_range(phys, size, &start, &end);
+
+ return __pkvm_create_mappings(start, end - start, back, PAGE_HYP);
+}
+
+static void *__hyp_bp_vect_base;
+int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot)
+{
+ void *vector;
+
+ switch (slot) {
+ case HYP_VECTOR_DIRECT: {
+ vector = __kvm_hyp_vector;
+ break;
+ }
+ case HYP_VECTOR_SPECTRE_DIRECT: {
+ vector = __bp_harden_hyp_vecs;
+ break;
+ }
+ case HYP_VECTOR_INDIRECT:
+ case HYP_VECTOR_SPECTRE_INDIRECT: {
+ vector = (void *)__hyp_bp_vect_base;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ vector = __kvm_vector_slot2addr(vector, slot);
+ *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)vector;
+
+ return 0;
+}
+
+int hyp_map_vectors(void)
+{
+ phys_addr_t phys;
+ void *bp_base;
+
+ if (!cpus_have_const_cap(ARM64_SPECTRE_V3A))
+ return 0;
+
+ phys = __hyp_pa(__bp_harden_hyp_vecs);
+ bp_base = (void *)__pkvm_create_private_mapping(phys,
+ __BP_HARDEN_HYP_VECS_SZ,
+ PAGE_HYP_EXEC);
+ if (IS_ERR_OR_NULL(bp_base))
+ return PTR_ERR(bp_base);
+
+ __hyp_bp_vect_base = bp_base;
+
+ return 0;
+}
+
+int hyp_create_idmap(u32 hyp_va_bits)
+{
+ unsigned long start, end;
+
+ start = hyp_virt_to_phys((void *)__hyp_idmap_text_start);
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+
+ end = hyp_virt_to_phys((void *)__hyp_idmap_text_end);
+ end = ALIGN(end, PAGE_SIZE);
+
+ /*
+ * One half of the VA space is reserved to linearly map portions of
+ * memory -- see va_layout.c for more details. The other half of the VA
+ * space contains the trampoline page, and needs some care. Split that
+ * second half in two and find the quarter of VA space not conflicting
+ * with the idmap to place the IOs and the vmemmap. IOs use the lower
+ * half of the quarter and the vmemmap the upper half.
+ */
+ __io_map_base = start & BIT(hyp_va_bits - 2);
+ __io_map_base ^= BIT(hyp_va_bits - 2);
+ __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3);
+
+ return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
new file mode 100644
index 000000000000..237e03bf0cb1
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <asm/kvm_hyp.h>
+#include <nvhe/gfp.h>
+
+u64 __hyp_vmemmap;
+
+/*
+ * Index the hyp_vmemmap to find a potential buddy page, but make no assumption
+ * about its current state.
+ *
+ * Example buddy-tree for a 4-pages physically contiguous pool:
+ *
+ * o : Page 3
+ * /
+ * o-o : Page 2
+ * /
+ * / o : Page 1
+ * / /
+ * o---o-o : Page 0
+ * Order 2 1 0
+ *
+ * Example of requests on this pool:
+ * __find_buddy_nocheck(pool, page 0, order 0) => page 1
+ * __find_buddy_nocheck(pool, page 0, order 1) => page 2
+ * __find_buddy_nocheck(pool, page 1, order 0) => page 0
+ * __find_buddy_nocheck(pool, page 2, order 0) => page 3
+ */
+static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
+ struct hyp_page *p,
+ unsigned int order)
+{
+ phys_addr_t addr = hyp_page_to_phys(p);
+
+ addr ^= (PAGE_SIZE << order);
+
+ /*
+ * Don't return a page outside the pool range -- it belongs to
+ * something else and may not be mapped in hyp_vmemmap.
+ */
+ if (addr < pool->range_start || addr >= pool->range_end)
+ return NULL;
+
+ return hyp_phys_to_page(addr);
+}
+
+/* Find a buddy page currently available for allocation */
+static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
+ struct hyp_page *p,
+ unsigned int order)
+{
+ struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
+
+ if (!buddy || buddy->order != order || list_empty(&buddy->node))
+ return NULL;
+
+ return buddy;
+
+}
+
+static void __hyp_attach_page(struct hyp_pool *pool,
+ struct hyp_page *p)
+{
+ unsigned int order = p->order;
+ struct hyp_page *buddy;
+
+ memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
+
+ /*
+ * Only the first struct hyp_page of a high-order page (otherwise known
+ * as the 'head') should have p->order set. The non-head pages should
+ * have p->order = HYP_NO_ORDER. Here @p may no longer be the head
+ * after coallescing, so make sure to mark it HYP_NO_ORDER proactively.
+ */
+ p->order = HYP_NO_ORDER;
+ for (; (order + 1) < pool->max_order; order++) {
+ buddy = __find_buddy_avail(pool, p, order);
+ if (!buddy)
+ break;
+
+ /* Take the buddy out of its list, and coallesce with @p */
+ list_del_init(&buddy->node);
+ buddy->order = HYP_NO_ORDER;
+ p = min(p, buddy);
+ }
+
+ /* Mark the new head, and insert it */
+ p->order = order;
+ list_add_tail(&p->node, &pool->free_area[order]);
+}
+
+static void hyp_attach_page(struct hyp_page *p)
+{
+ struct hyp_pool *pool = hyp_page_to_pool(p);
+
+ hyp_spin_lock(&pool->lock);
+ __hyp_attach_page(pool, p);
+ hyp_spin_unlock(&pool->lock);
+}
+
+static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
+ struct hyp_page *p,
+ unsigned int order)
+{
+ struct hyp_page *buddy;
+
+ list_del_init(&p->node);
+ while (p->order > order) {
+ /*
+ * The buddy of order n - 1 currently has HYP_NO_ORDER as it
+ * is covered by a higher-level page (whose head is @p). Use
+ * __find_buddy_nocheck() to find it and inject it in the
+ * free_list[n - 1], effectively splitting @p in half.
+ */
+ p->order--;
+ buddy = __find_buddy_nocheck(pool, p, p->order);
+ buddy->order = p->order;
+ list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
+ }
+
+ return p;
+}
+
+void hyp_put_page(void *addr)
+{
+ struct hyp_page *p = hyp_virt_to_page(addr);
+
+ if (hyp_page_ref_dec_and_test(p))
+ hyp_attach_page(p);
+}
+
+void hyp_get_page(void *addr)
+{
+ struct hyp_page *p = hyp_virt_to_page(addr);
+
+ hyp_page_ref_inc(p);
+}
+
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
+{
+ unsigned int i = order;
+ struct hyp_page *p;
+
+ hyp_spin_lock(&pool->lock);
+
+ /* Look for a high-enough-order page */
+ while (i < pool->max_order && list_empty(&pool->free_area[i]))
+ i++;
+ if (i >= pool->max_order) {
+ hyp_spin_unlock(&pool->lock);
+ return NULL;
+ }
+
+ /* Extract it from the tree at the right order */
+ p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
+ p = __hyp_extract_page(pool, p, order);
+
+ hyp_spin_unlock(&pool->lock);
+ hyp_set_page_refcounted(p);
+
+ return hyp_page_to_virt(p);
+}
+
+int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
+ unsigned int reserved_pages)
+{
+ phys_addr_t phys = hyp_pfn_to_phys(pfn);
+ struct hyp_page *p;
+ int i;
+
+ hyp_spin_lock_init(&pool->lock);
+ pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT));
+ for (i = 0; i < pool->max_order; i++)
+ INIT_LIST_HEAD(&pool->free_area[i]);
+ pool->range_start = phys;
+ pool->range_end = phys + (nr_pages << PAGE_SHIFT);
+
+ /* Init the vmemmap portion */
+ p = hyp_phys_to_page(phys);
+ memset(p, 0, sizeof(*p) * nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ p[i].pool = pool;
+ INIT_LIST_HEAD(&p[i].node);
+ }
+
+ /* Attach the unused pages to the buddy tree */
+ for (i = reserved_pages; i < nr_pages; i++)
+ __hyp_attach_page(pool, &p[i]);
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index 63de71c0481e..08508783ec3d 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -11,6 +11,7 @@
#include <linux/kvm_host.h>
#include <uapi/linux/psci.h>
+#include <nvhe/memory.h>
#include <nvhe/trap_handler.h>
void kvm_hyp_cpu_entry(unsigned long r0);
@@ -20,9 +21,6 @@ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
/* Config options set by the host. */
struct kvm_host_psci_config __ro_after_init kvm_host_psci_config;
-s64 __ro_after_init hyp_physvirt_offset;
-
-#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset)
#define INVALID_CPU_ID UINT_MAX
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
new file mode 100644
index 000000000000..7488f53b0aa2
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+
+#include <nvhe/early_alloc.h>
+#include <nvhe/gfp.h>
+#include <nvhe/memory.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/trap_handler.h>
+
+struct hyp_pool hpool;
+struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
+unsigned long hyp_nr_cpus;
+
+#define hyp_percpu_size ((unsigned long)__per_cpu_end - \
+ (unsigned long)__per_cpu_start)
+
+static void *vmemmap_base;
+static void *hyp_pgt_base;
+static void *host_s2_mem_pgt_base;
+static void *host_s2_dev_pgt_base;
+
+static int divide_memory_pool(void *virt, unsigned long size)
+{
+ unsigned long vstart, vend, nr_pages;
+
+ hyp_early_alloc_init(virt, size);
+
+ hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend);
+ nr_pages = (vend - vstart) >> PAGE_SHIFT;
+ vmemmap_base = hyp_early_alloc_contig(nr_pages);
+ if (!vmemmap_base)
+ return -ENOMEM;
+
+ nr_pages = hyp_s1_pgtable_pages();
+ hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
+ if (!hyp_pgt_base)
+ return -ENOMEM;
+
+ nr_pages = host_s2_mem_pgtable_pages();
+ host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages);
+ if (!host_s2_mem_pgt_base)
+ return -ENOMEM;
+
+ nr_pages = host_s2_dev_pgtable_pages();
+ host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages);
+ if (!host_s2_dev_pgt_base)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
+ unsigned long *per_cpu_base,
+ u32 hyp_va_bits)
+{
+ void *start, *end, *virt = hyp_phys_to_virt(phys);
+ unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT;
+ int ret, i;
+
+ /* Recreate the hyp page-table using the early page allocator */
+ hyp_early_alloc_init(hyp_pgt_base, pgt_size);
+ ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits,
+ &hyp_early_alloc_mm_ops);
+ if (ret)
+ return ret;
+
+ ret = hyp_create_idmap(hyp_va_bits);
+ if (ret)
+ return ret;
+
+ ret = hyp_map_vectors();
+ if (ret)
+ return ret;
+
+ ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base));
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO);
+ if (ret)
+ return ret;
+
+ ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ start = (void *)kern_hyp_va(per_cpu_base[i]);
+ end = start + PAGE_ALIGN(hyp_percpu_size);
+ ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ if (ret)
+ return ret;
+
+ end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va;
+ start = end - PAGE_SIZE;
+ ret = pkvm_create_mappings(start, end, PAGE_HYP);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void update_nvhe_init_params(void)
+{
+ struct kvm_nvhe_init_params *params;
+ unsigned long i;
+
+ for (i = 0; i < hyp_nr_cpus; i++) {
+ params = per_cpu_ptr(&kvm_init_params, i);
+ params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd);
+ __flush_dcache_area(params, sizeof(*params));
+ }
+}
+
+static void *hyp_zalloc_hyp_page(void *arg)
+{
+ return hyp_alloc_pages(&hpool, 0);
+}
+
+void __noreturn __pkvm_init_finalise(void)
+{
+ struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
+ struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt;
+ unsigned long nr_pages, reserved_pages, pfn;
+ int ret;
+
+ /* Now that the vmemmap is backed, install the full-fledged allocator */
+ pfn = hyp_virt_to_pfn(hyp_pgt_base);
+ nr_pages = hyp_s1_pgtable_pages();
+ reserved_pages = hyp_early_alloc_nr_used_pages();
+ ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages);
+ if (ret)
+ goto out;
+
+ ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
+ if (ret)
+ goto out;
+
+ pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
+ .zalloc_page = hyp_zalloc_hyp_page,
+ .phys_to_virt = hyp_phys_to_virt,
+ .virt_to_phys = hyp_virt_to_phys,
+ .get_page = hyp_get_page,
+ .put_page = hyp_put_page,
+ };
+ pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
+
+out:
+ /*
+ * We tail-called to here from handle___pkvm_init() and will not return,
+ * so make sure to propagate the return value to the host.
+ */
+ cpu_reg(host_ctxt, 1) = ret;
+
+ __host_enter(host_ctxt);
+}
+
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+ unsigned long *per_cpu_base, u32 hyp_va_bits)
+{
+ struct kvm_nvhe_init_params *params;
+ void *virt = hyp_phys_to_virt(phys);
+ void (*fn)(phys_addr_t params_pa, void *finalize_fn_va);
+ int ret;
+
+ if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
+ return -EINVAL;
+
+ hyp_spin_lock_init(&pkvm_pgd_lock);
+ hyp_nr_cpus = nr_cpus;
+
+ ret = divide_memory_pool(virt, size);
+ if (ret)
+ return ret;
+
+ ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits);
+ if (ret)
+ return ret;
+
+ update_nvhe_init_params();
+
+ /* Jump in the idmap page to switch to the new page-tables */
+ params = this_cpu_ptr(&kvm_init_params);
+ fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd);
+ fn(__hyp_pa(params), __pkvm_init_finalise);
+
+ unreachable();
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/stub.c b/arch/arm64/kvm/hyp/nvhe/stub.c
new file mode 100644
index 000000000000..c0aa6bbfd79d
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/stub.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stubs for out-of-line function calls caused by re-using kernel
+ * infrastructure at EL2.
+ *
+ * Copyright (C) 2020 - Google LLC
+ */
+
+#include <linux/list.h>
+
+#ifdef CONFIG_DEBUG_LIST
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
+{
+ return true;
+}
+
+bool __list_del_entry_valid(struct list_head *entry)
+{
+ return true;
+}
+#endif
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 68ab6b4d5141..e9f6ea704d07 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -28,6 +28,8 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
+#include <nvhe/mem_protect.h>
+
/* Non-VHE specific context */
DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -41,9 +43,9 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
__activate_traps_common(vcpu);
val = CPTR_EL2_DEFAULT;
- val |= CPTR_EL2_TTA | CPTR_EL2_TZ | CPTR_EL2_TAM;
+ val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
if (!update_fp_enabled(vcpu)) {
- val |= CPTR_EL2_TFP;
+ val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
__activate_traps_fpsimd32(vcpu);
}
@@ -68,7 +70,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
extern char __kvm_hyp_host_vector[];
- u64 mdcr_el2;
+ u64 mdcr_el2, cptr;
___deactivate_traps(vcpu);
@@ -95,19 +97,17 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
mdcr_el2 &= MDCR_EL2_HPMN_MASK;
mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
+ mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT;
write_sysreg(mdcr_el2, mdcr_el2);
- if (is_protected_kvm_enabled())
- write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2);
- else
- write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
- write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
- write_sysreg(__kvm_hyp_host_vector, vbar_el2);
-}
+ write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
-static void __load_host_stage2(void)
-{
- write_sysreg(0, vttbr_el2);
+ cptr = CPTR_EL2_DEFAULT;
+ if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED))
+ cptr |= CPTR_EL2_TZ;
+
+ write_sysreg(cptr, cptr_el2);
+ write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
/* Save VGICv3 state on non-VHE systems */
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 229b06748c20..83dc3b271bc5 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -8,6 +8,8 @@
#include <asm/kvm_mmu.h>
#include <asm/tlbflush.h>
+#include <nvhe/mem_protect.h>
+
struct tlb_inv_context {
u64 tcr;
};
@@ -43,7 +45,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
{
- write_sysreg(0, vttbr_el2);
+ __load_host_stage2();
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
/* Ensure write of the host VMID */
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 926fc07074f5..c37c1dc4feaf 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -9,8 +9,7 @@
#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>
-
-#define KVM_PGTABLE_MAX_LEVELS 4U
+#include <asm/stage2_pgtable.h>
#define KVM_PTE_VALID BIT(0)
@@ -49,6 +48,11 @@
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
+#define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55)
+
+#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56)
+#define KVM_MAX_OWNER_ID 1
+
struct kvm_pgtable_walk_data {
struct kvm_pgtable *pgt;
struct kvm_pgtable_walker *walker;
@@ -68,21 +72,36 @@ static u64 kvm_granule_size(u32 level)
return BIT(kvm_granule_shift(level));
}
-static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+#define KVM_PHYS_INVALID (-1ULL)
+
+static bool kvm_phys_is_valid(u64 phys)
{
- u64 granule = kvm_granule_size(level);
+ return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX));
+}
+static bool kvm_level_supports_block_mapping(u32 level)
+{
/*
* Reject invalid block mappings and don't bother with 4TB mappings for
* 52-bit PAs.
*/
- if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+ return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+ u64 granule = kvm_granule_size(level);
+
+ if (!kvm_level_supports_block_mapping(level))
return false;
if (granule > (end - addr))
return false;
- return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+ if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
+ return false;
+
+ return IS_ALIGNED(addr, granule);
}
static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
@@ -152,20 +171,20 @@ static kvm_pte_t kvm_phys_to_pte(u64 pa)
return pte;
}
-static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
- return __va(kvm_pte_to_phys(pte));
+ return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
}
-static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+static void kvm_clear_pte(kvm_pte_t *ptep)
{
- kvm_pte_t pte = *ptep;
- WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+ WRITE_ONCE(*ptep, 0);
}
-static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp,
+ struct kvm_pgtable_mm_ops *mm_ops)
{
- kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
pte |= KVM_PTE_VALID;
@@ -187,6 +206,11 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
return pte;
}
+static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
+{
+ return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
+}
+
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag)
@@ -228,7 +252,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
goto out;
}
- childp = kvm_pte_follow(pte);
+ childp = kvm_pte_follow(pte, data->pgt->mm_ops);
ret = __kvm_pgtable_walk(data, childp, level + 1);
if (ret)
goto out;
@@ -303,12 +327,12 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
}
struct hyp_map_data {
- u64 phys;
- kvm_pte_t attr;
+ u64 phys;
+ kvm_pte_t attr;
+ struct kvm_pgtable_mm_ops *mm_ops;
};
-static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
- struct hyp_map_data *data)
+static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
bool device = prot & KVM_PGTABLE_PROT_DEVICE;
u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
@@ -333,7 +357,8 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
- data->attr = attr;
+ *ptep = attr;
+
return 0;
}
@@ -359,6 +384,8 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag, void * const arg)
{
kvm_pte_t *childp;
+ struct hyp_map_data *data = arg;
+ struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
return 0;
@@ -366,11 +393,11 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
- childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
if (!childp)
return -ENOMEM;
- kvm_set_table_pte(ptep, childp);
+ kvm_set_table_pte(ptep, childp, mm_ops);
return 0;
}
@@ -380,6 +407,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
int ret;
struct hyp_map_data map_data = {
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ .mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = hyp_map_walker,
@@ -387,7 +415,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
.arg = &map_data,
};
- ret = hyp_map_set_prot_attr(prot, &map_data);
+ ret = hyp_set_prot_attr(prot, &map_data.attr);
if (ret)
return ret;
@@ -397,16 +425,18 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
return ret;
}
-int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
+ struct kvm_pgtable_mm_ops *mm_ops)
{
u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
- pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
if (!pgt->pgd)
return -ENOMEM;
pgt->ia_bits = va_bits;
pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
+ pgt->mm_ops = mm_ops;
pgt->mmu = NULL;
return 0;
}
@@ -414,7 +444,9 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag, void * const arg)
{
- free_page((unsigned long)kvm_pte_follow(*ptep));
+ struct kvm_pgtable_mm_ops *mm_ops = arg;
+
+ mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops));
return 0;
}
@@ -423,29 +455,75 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
struct kvm_pgtable_walker walker = {
.cb = hyp_free_walker,
.flags = KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = pgt->mm_ops,
};
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
- free_page((unsigned long)pgt->pgd);
+ pgt->mm_ops->put_page(pgt->pgd);
pgt->pgd = NULL;
}
struct stage2_map_data {
u64 phys;
kvm_pte_t attr;
+ u8 owner_id;
kvm_pte_t *anchor;
+ kvm_pte_t *childp;
struct kvm_s2_mmu *mmu;
- struct kvm_mmu_memory_cache *memcache;
+ void *memcache;
+
+ struct kvm_pgtable_mm_ops *mm_ops;
};
-static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
- struct stage2_map_data *data)
+u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
+{
+ u64 vtcr = VTCR_EL2_FLAGS;
+ u8 lvls;
+
+ vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
+ vtcr |= VTCR_EL2_T0SZ(phys_shift);
+ /*
+ * Use a minimum 2 level page table to prevent splitting
+ * host PMD huge pages at stage2.
+ */
+ lvls = stage2_pgtable_levels(phys_shift);
+ if (lvls < 2)
+ lvls = 2;
+ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+ /*
+ * Enable the Hardware Access Flag management, unconditionally
+ * on all CPUs. The features is RES0 on CPUs without the support
+ * and must be ignored by the CPUs.
+ */
+ vtcr |= VTCR_EL2_HA;
+
+ /* Set the vmid bits */
+ vtcr |= (get_vmid_bits(mmfr1) == 16) ?
+ VTCR_EL2_VS_16BIT :
+ VTCR_EL2_VS_8BIT;
+
+ return vtcr;
+}
+
+static bool stage2_has_fwb(struct kvm_pgtable *pgt)
+{
+ if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return false;
+
+ return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
+}
+
+#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
+
+static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
+ kvm_pte_t *ptep)
{
bool device = prot & KVM_PGTABLE_PROT_DEVICE;
- kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
- PAGE_S2_MEMATTR(NORMAL);
+ kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
+ KVM_S2_MEMATTR(pgt, NORMAL);
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
if (!(prot & KVM_PGTABLE_PROT_X))
@@ -461,44 +539,78 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
- data->attr = attr;
+ *ptep = attr;
+
return 0;
}
+static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
+{
+ if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
+ return true;
+
+ return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
+}
+
+static bool stage2_pte_is_counted(kvm_pte_t pte)
+{
+ /*
+ * The refcount tracks valid entries as well as invalid entries if they
+ * encode ownership of a page to another entity than the page-table
+ * owner, whose id is 0.
+ */
+ return !!pte;
+}
+
+static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
+ u32 level, struct kvm_pgtable_mm_ops *mm_ops)
+{
+ /*
+ * Clear the existing PTE, and perform break-before-make with
+ * TLB maintenance if it was valid.
+ */
+ if (kvm_pte_valid(*ptep)) {
+ kvm_clear_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+ }
+
+ mm_ops->put_page(ptep);
+}
+
static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
{
kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
- struct page *page = virt_to_page(ptep);
+ struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
if (!kvm_block_mapping_supported(addr, end, phys, level))
return -E2BIG;
- new = kvm_init_valid_leaf_pte(phys, data->attr, level);
- if (kvm_pte_valid(old)) {
+ if (kvm_phys_is_valid(phys))
+ new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+ else
+ new = kvm_init_invalid_leaf_owner(data->owner_id);
+
+ if (stage2_pte_is_counted(old)) {
/*
* Skip updating the PTE if we are trying to recreate the exact
* same mapping or only change the access permissions. Instead,
* the vCPU will exit one more time from guest if still needed
* and then go through the path of relaxing permissions.
*/
- if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+ if (!stage2_pte_needs_update(old, new))
return -EAGAIN;
- /*
- * There's an existing different valid leaf entry, so perform
- * break-before-make.
- */
- kvm_set_invalid_pte(ptep);
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
- put_page(page);
+ stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
}
smp_store_release(ptep, new);
- get_page(page);
- data->phys += granule;
+ if (stage2_pte_is_counted(new))
+ mm_ops->get_page(ptep);
+ if (kvm_phys_is_valid(phys))
+ data->phys += granule;
return 0;
}
@@ -512,7 +624,8 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
if (!kvm_block_mapping_supported(addr, end, data->phys, level))
return 0;
- kvm_set_invalid_pte(ptep);
+ data->childp = kvm_pte_follow(*ptep, data->mm_ops);
+ kvm_clear_pte(ptep);
/*
* Invalidate the whole stage-2, as we may have numerous leaf
@@ -527,13 +640,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
struct stage2_map_data *data)
{
- int ret;
+ struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
kvm_pte_t *childp, pte = *ptep;
- struct page *page = virt_to_page(ptep);
+ int ret;
if (data->anchor) {
- if (kvm_pte_valid(pte))
- put_page(page);
+ if (stage2_pte_is_counted(pte))
+ mm_ops->put_page(ptep);
return 0;
}
@@ -548,7 +661,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
if (!data->memcache)
return -ENOMEM;
- childp = kvm_mmu_memory_cache_alloc(data->memcache);
+ childp = mm_ops->zalloc_page(data->memcache);
if (!childp)
return -ENOMEM;
@@ -557,14 +670,11 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
* a table. Accesses beyond 'end' that fall within the new table
* will be mapped lazily.
*/
- if (kvm_pte_valid(pte)) {
- kvm_set_invalid_pte(ptep);
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
- put_page(page);
- }
+ if (stage2_pte_is_counted(pte))
+ stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
- kvm_set_table_pte(ptep, childp);
- get_page(page);
+ kvm_set_table_pte(ptep, childp, mm_ops);
+ mm_ops->get_page(ptep);
return 0;
}
@@ -573,19 +683,25 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
{
+ struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+ kvm_pte_t *childp;
int ret = 0;
if (!data->anchor)
return 0;
- free_page((unsigned long)kvm_pte_follow(*ptep));
- put_page(virt_to_page(ptep));
-
if (data->anchor == ptep) {
+ childp = data->childp;
data->anchor = NULL;
+ data->childp = NULL;
ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+ } else {
+ childp = kvm_pte_follow(*ptep, mm_ops);
}
+ mm_ops->put_page(childp);
+ mm_ops->put_page(ptep);
+
return ret;
}
@@ -627,13 +743,14 @@ static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
u64 phys, enum kvm_pgtable_prot prot,
- struct kvm_mmu_memory_cache *mc)
+ void *mc)
{
int ret;
struct stage2_map_data map_data = {
.phys = ALIGN_DOWN(phys, PAGE_SIZE),
.mmu = pgt->mmu,
.memcache = mc,
+ .mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
@@ -643,7 +760,10 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
.arg = &map_data,
};
- ret = stage2_map_set_prot_attr(prot, &map_data);
+ if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
+ return -EINVAL;
+
+ ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
if (ret)
return ret;
@@ -652,38 +772,63 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
}
-static void stage2_flush_dcache(void *addr, u64 size)
+int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ void *mc, u8 owner_id)
{
- if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
- return;
+ int ret;
+ struct stage2_map_data map_data = {
+ .phys = KVM_PHYS_INVALID,
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ .mm_ops = pgt->mm_ops,
+ .owner_id = owner_id,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_PRE |
+ KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = &map_data,
+ };
+
+ if (owner_id > KVM_MAX_OWNER_ID)
+ return -EINVAL;
- __flush_dcache_area(addr, size);
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ return ret;
}
-static bool stage2_pte_cacheable(kvm_pte_t pte)
+static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
- return memattr == PAGE_S2_MEMATTR(NORMAL);
+ return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}
static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
- struct kvm_s2_mmu *mmu = arg;
+ struct kvm_pgtable *pgt = arg;
+ struct kvm_s2_mmu *mmu = pgt->mmu;
+ struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
kvm_pte_t pte = *ptep, *childp = NULL;
bool need_flush = false;
- if (!kvm_pte_valid(pte))
+ if (!kvm_pte_valid(pte)) {
+ if (stage2_pte_is_counted(pte)) {
+ kvm_clear_pte(ptep);
+ mm_ops->put_page(ptep);
+ }
return 0;
+ }
if (kvm_pte_table(pte, level)) {
- childp = kvm_pte_follow(pte);
+ childp = kvm_pte_follow(pte, mm_ops);
- if (page_count(virt_to_page(childp)) != 1)
+ if (mm_ops->page_count(childp) != 1)
return 0;
- } else if (stage2_pte_cacheable(pte)) {
- need_flush = true;
+ } else if (stage2_pte_cacheable(pgt, pte)) {
+ need_flush = !stage2_has_fwb(pgt);
}
/*
@@ -691,17 +836,15 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
* block entry and rely on the remaining portions being faulted
* back lazily.
*/
- kvm_set_invalid_pte(ptep);
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
- put_page(virt_to_page(ptep));
+ stage2_put_pte(ptep, mmu, addr, level, mm_ops);
if (need_flush) {
- stage2_flush_dcache(kvm_pte_follow(pte),
+ __flush_dcache_area(kvm_pte_follow(pte, mm_ops),
kvm_granule_size(level));
}
if (childp)
- free_page((unsigned long)childp);
+ mm_ops->put_page(childp);
return 0;
}
@@ -710,7 +853,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
struct kvm_pgtable_walker walker = {
.cb = stage2_unmap_walker,
- .arg = pgt->mmu,
+ .arg = pgt,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
};
@@ -842,12 +985,14 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
+ struct kvm_pgtable *pgt = arg;
+ struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
kvm_pte_t pte = *ptep;
- if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+ if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
return 0;
- stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+ __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level));
return 0;
}
@@ -856,30 +1001,35 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
struct kvm_pgtable_walker walker = {
.cb = stage2_flush_walker,
.flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = pgt,
};
- if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ if (stage2_has_fwb(pgt))
return 0;
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
-int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch,
+ struct kvm_pgtable_mm_ops *mm_ops,
+ enum kvm_pgtable_stage2_flags flags)
{
size_t pgd_sz;
- u64 vtcr = kvm->arch.vtcr;
+ u64 vtcr = arch->vtcr;
u32 ia_bits = VTCR_EL2_IPA(vtcr);
u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
- pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz);
if (!pgt->pgd)
return -ENOMEM;
pgt->ia_bits = ia_bits;
pgt->start_level = start_level;
- pgt->mmu = &kvm->arch.mmu;
+ pgt->mm_ops = mm_ops;
+ pgt->mmu = &arch->mmu;
+ pgt->flags = flags;
/* Ensure zeroed PGD pages are visible to the hardware walker */
dsb(ishst);
@@ -890,15 +1040,16 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
+ struct kvm_pgtable_mm_ops *mm_ops = arg;
kvm_pte_t pte = *ptep;
- if (!kvm_pte_valid(pte))
+ if (!stage2_pte_is_counted(pte))
return 0;
- put_page(virt_to_page(ptep));
+ mm_ops->put_page(ptep);
if (kvm_pte_table(pte, level))
- free_page((unsigned long)kvm_pte_follow(pte));
+ mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
return 0;
}
@@ -910,10 +1061,85 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = pgt->mm_ops,
};
WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
- free_pages_exact(pgt->pgd, pgd_sz);
+ pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz);
pgt->pgd = NULL;
}
+
+#define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \
+ KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \
+ KVM_PTE_LEAF_ATTR_S2_IGNORED)
+
+static int stage2_check_permission_walker(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t old_attr, pte = *ptep, *new_attr = arg;
+
+ /*
+ * Compatible mappings are either invalid and owned by the page-table
+ * owner (whose id is 0), or valid with matching permission attributes.
+ */
+ if (kvm_pte_valid(pte)) {
+ old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK;
+ if (old_attr != *new_attr)
+ return -EEXIST;
+ } else if (pte) {
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot,
+ struct kvm_mem_range *range)
+{
+ kvm_pte_t attr;
+ struct kvm_pgtable_walker check_perm_walker = {
+ .cb = stage2_check_permission_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &attr,
+ };
+ u64 granule, start, end;
+ u32 level;
+ int ret;
+
+ ret = stage2_set_prot_attr(pgt, prot, &attr);
+ if (ret)
+ return ret;
+ attr &= KVM_PTE_LEAF_S2_COMPAT_MASK;
+
+ for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) {
+ granule = kvm_granule_size(level);
+ start = ALIGN_DOWN(addr, granule);
+ end = start + granule;
+
+ if (!kvm_level_supports_block_mapping(level))
+ continue;
+
+ if (start < range->start || range->end < end)
+ continue;
+
+ /*
+ * Check the presence of existing mappings with incompatible
+ * permissions within the current block range, and try one level
+ * deeper if one is found.
+ */
+ ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker);
+ if (ret != -EEXIST)
+ break;
+ }
+
+ if (!ret) {
+ range->start = start;
+ range->end = end;
+ }
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
new file mode 100644
index 000000000000..83ca23ac259b
--- /dev/null
+++ b/arch/arm64/kvm/hyp/reserved_mem.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 - Google LLC
+ * Author: Quentin Perret <qperret@google.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/memblock.h>
+#include <linux/sort.h>
+
+#include <asm/kvm_host.h>
+
+#include <nvhe/memory.h>
+#include <nvhe/mm.h>
+
+static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
+static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
+
+phys_addr_t hyp_mem_base;
+phys_addr_t hyp_mem_size;
+
+static int cmp_hyp_memblock(const void *p1, const void *p2)
+{
+ const struct memblock_region *r1 = p1;
+ const struct memblock_region *r2 = p2;
+
+ return r1->base < r2->base ? -1 : (r1->base > r2->base);
+}
+
+static void __init sort_memblock_regions(void)
+{
+ sort(hyp_memory,
+ *hyp_memblock_nr_ptr,
+ sizeof(struct memblock_region),
+ cmp_hyp_memblock,
+ NULL);
+}
+
+static int __init register_memblock_regions(void)
+{
+ struct memblock_region *reg;
+
+ for_each_mem_region(reg) {
+ if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
+ return -ENOMEM;
+
+ hyp_memory[*hyp_memblock_nr_ptr] = *reg;
+ (*hyp_memblock_nr_ptr)++;
+ }
+ sort_memblock_regions();
+
+ return 0;
+}
+
+void __init kvm_hyp_reserve(void)
+{
+ u64 nr_pages, prev, hyp_mem_pages = 0;
+ int ret;
+
+ if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
+ return;
+
+ if (kvm_get_mode() != KVM_MODE_PROTECTED)
+ return;
+
+ ret = register_memblock_regions();
+ if (ret) {
+ *hyp_memblock_nr_ptr = 0;
+ kvm_err("Failed to register hyp memblocks: %d\n", ret);
+ return;
+ }
+
+ hyp_mem_pages += hyp_s1_pgtable_pages();
+ hyp_mem_pages += host_s2_mem_pgtable_pages();
+ hyp_mem_pages += host_s2_dev_pgtable_pages();
+
+ /*
+ * The hyp_vmemmap needs to be backed by pages, but these pages
+ * themselves need to be present in the vmemmap, so compute the number
+ * of pages needed by looking for a fixed point.
+ */
+ nr_pages = 0;
+ do {
+ prev = nr_pages;
+ nr_pages = hyp_mem_pages + prev;
+ nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE);
+ nr_pages += __hyp_pgtable_max_pages(nr_pages);
+ } while (nr_pages != prev);
+ hyp_mem_pages += nr_pages;
+
+ /*
+ * Try to allocate a PMD-aligned region to reduce TLB pressure once
+ * this is unmapped from the host stage-2, and fallback to PAGE_SIZE.
+ */
+ hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
+ hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+ ALIGN(hyp_mem_size, PMD_SIZE),
+ PMD_SIZE);
+ if (!hyp_mem_base)
+ hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
+ hyp_mem_size, PAGE_SIZE);
+ else
+ hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
+
+ if (!hyp_mem_base) {
+ kvm_err("Failed to reserve hyp memory\n");
+ return;
+ }
+ memblock_reserve(hyp_mem_base, hyp_mem_size);
+
+ kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
+ hyp_mem_base);
+}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index af8e940d0f03..7b8f7db5c1ed 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -27,8 +27,6 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
-const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
-
/* VHE specific context */
DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
@@ -207,7 +205,7 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
__deactivate_traps(vcpu);
sysreg_restore_host_state_vhe(host_ctxt);
- panic(__hyp_panic_string,
+ panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n",
spsr, elr,
read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR),
read_sysreg(hpfar_el2), par, vcpu);
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index ead21b98b620..30da78f72b3b 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -9,16 +9,65 @@
#include <kvm/arm_hypercalls.h>
#include <kvm/arm_psci.h>
+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+ struct system_time_snapshot systime_snapshot;
+ u64 cycles = ~0UL;
+ u32 feature;
+
+ /*
+ * system time and counter value must captured at the same
+ * time to keep consistency and precision.
+ */
+ ktime_get_snapshot(&systime_snapshot);
+
+ /*
+ * This is only valid if the current clocksource is the
+ * architected counter, as this is the only one the guest
+ * can see.
+ */
+ if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+ return;
+
+ /*
+ * The guest selects one of the two reference counters
+ * (virtual or physical) with the first argument of the SMCCC
+ * call. In case the identifier is not supported, error out.
+ */
+ feature = smccc_get_arg1(vcpu);
+ switch (feature) {
+ case KVM_PTP_VIRT_COUNTER:
+ cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+ break;
+ case KVM_PTP_PHYS_COUNTER:
+ cycles = systime_snapshot.cycles;
+ break;
+ default:
+ return;
+ }
+
+ /*
+ * This relies on the top bit of val[0] never being set for
+ * valid values of system time, because that is *really* far
+ * in the future (about 292 years from 1970, and at that stage
+ * nobody will give a damn about it).
+ */
+ val[0] = upper_32_bits(systime_snapshot.real);
+ val[1] = lower_32_bits(systime_snapshot.real);
+ val[2] = upper_32_bits(cycles);
+ val[3] = lower_32_bits(cycles);
+}
+
int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
{
u32 func_id = smccc_get_function(vcpu);
- long val = SMCCC_RET_NOT_SUPPORTED;
+ u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
u32 feature;
gpa_t gpa;
switch (func_id) {
case ARM_SMCCC_VERSION_FUNC_ID:
- val = ARM_SMCCC_VERSION_1_1;
+ val[0] = ARM_SMCCC_VERSION_1_1;
break;
case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
feature = smccc_get_arg1(vcpu);
@@ -28,10 +77,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
case SPECTRE_VULNERABLE:
break;
case SPECTRE_MITIGATED:
- val = SMCCC_RET_SUCCESS;
+ val[0] = SMCCC_RET_SUCCESS;
break;
case SPECTRE_UNAFFECTED:
- val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
+ val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
break;
}
break;
@@ -54,22 +103,35 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
break;
fallthrough;
case SPECTRE_UNAFFECTED:
- val = SMCCC_RET_NOT_REQUIRED;
+ val[0] = SMCCC_RET_NOT_REQUIRED;
break;
}
break;
case ARM_SMCCC_HV_PV_TIME_FEATURES:
- val = SMCCC_RET_SUCCESS;
+ val[0] = SMCCC_RET_SUCCESS;
break;
}
break;
case ARM_SMCCC_HV_PV_TIME_FEATURES:
- val = kvm_hypercall_pv_features(vcpu);
+ val[0] = kvm_hypercall_pv_features(vcpu);
break;
case ARM_SMCCC_HV_PV_TIME_ST:
gpa = kvm_init_stolen_time(vcpu);
if (gpa != GPA_INVALID)
- val = gpa;
+ val[0] = gpa;
+ break;
+ case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
+ val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
+ val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
+ val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
+ val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+ val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+ val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+ break;
+ case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+ kvm_ptp_get_time(vcpu, val);
break;
case ARM_SMCCC_TRNG_VERSION:
case ARM_SMCCC_TRNG_FEATURES:
@@ -81,6 +143,6 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
return kvm_psci_call(vcpu);
}
- smccc_set_retval(vcpu, val, 0, 0, 0);
+ smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
return 1;
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8711894db8c2..c5d1f3c87dbd 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -88,6 +88,44 @@ static bool kvm_is_device_pfn(unsigned long pfn)
return !pfn_valid(pfn);
}
+static void *stage2_memcache_zalloc_page(void *arg)
+{
+ struct kvm_mmu_memory_cache *mc = arg;
+
+ /* Allocated with __GFP_ZERO, so no need to zero */
+ return kvm_mmu_memory_cache_alloc(mc);
+}
+
+static void *kvm_host_zalloc_pages_exact(size_t size)
+{
+ return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+}
+
+static void kvm_host_get_page(void *addr)
+{
+ get_page(virt_to_page(addr));
+}
+
+static void kvm_host_put_page(void *addr)
+{
+ put_page(virt_to_page(addr));
+}
+
+static int kvm_host_page_count(void *addr)
+{
+ return page_count(virt_to_page(addr));
+}
+
+static phys_addr_t kvm_host_pa(void *addr)
+{
+ return __pa(addr);
+}
+
+static void *kvm_host_va(phys_addr_t phys)
+{
+ return __va(phys);
+}
+
/*
* Unmapping vs dcache management:
*
@@ -127,7 +165,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
bool may_block)
{
- struct kvm *kvm = mmu->kvm;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
phys_addr_t end = start + size;
assert_spin_locked(&kvm->mmu_lock);
@@ -183,15 +221,39 @@ void free_hyp_pgds(void)
if (hyp_pgtable) {
kvm_pgtable_hyp_destroy(hyp_pgtable);
kfree(hyp_pgtable);
+ hyp_pgtable = NULL;
}
mutex_unlock(&kvm_hyp_pgd_mutex);
}
+static bool kvm_host_owns_hyp_mappings(void)
+{
+ if (static_branch_likely(&kvm_protected_mode_initialized))
+ return false;
+
+ /*
+ * This can happen at boot time when __create_hyp_mappings() is called
+ * after the hyp protection has been enabled, but the static key has
+ * not been flipped yet.
+ */
+ if (!hyp_pgtable && is_protected_kvm_enabled())
+ return false;
+
+ WARN_ON(!hyp_pgtable);
+
+ return true;
+}
+
static int __create_hyp_mappings(unsigned long start, unsigned long size,
unsigned long phys, enum kvm_pgtable_prot prot)
{
int err;
+ if (!kvm_host_owns_hyp_mappings()) {
+ return kvm_call_hyp_nvhe(__pkvm_create_mappings,
+ start, size, phys, prot);
+ }
+
mutex_lock(&kvm_hyp_pgd_mutex);
err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
mutex_unlock(&kvm_hyp_pgd_mutex);
@@ -253,6 +315,16 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
unsigned long base;
int ret = 0;
+ if (!kvm_host_owns_hyp_mappings()) {
+ base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
+ phys_addr, size, prot);
+ if (IS_ERR_OR_NULL((void *)base))
+ return PTR_ERR((void *)base);
+ *haddr = base;
+
+ return 0;
+ }
+
mutex_lock(&kvm_hyp_pgd_mutex);
/*
@@ -351,6 +423,17 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
return 0;
}
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
+ .zalloc_page = stage2_memcache_zalloc_page,
+ .zalloc_pages_exact = kvm_host_zalloc_pages_exact,
+ .free_pages_exact = free_pages_exact,
+ .get_page = kvm_host_get_page,
+ .put_page = kvm_host_put_page,
+ .page_count = kvm_host_page_count,
+ .phys_to_virt = kvm_host_va,
+ .virt_to_phys = kvm_host_pa,
+};
+
/**
* kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
* @kvm: The pointer to the KVM structure
@@ -374,7 +457,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
if (!pgt)
return -ENOMEM;
- err = kvm_pgtable_stage2_init(pgt, kvm);
+ err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops);
if (err)
goto out_free_pgtable;
@@ -387,7 +470,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
- mmu->kvm = kvm;
+ mmu->arch = &kvm->arch;
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
mmu->vmid.vmid_gen = 0;
@@ -421,10 +504,11 @@ static void stage2_unmap_memslot(struct kvm *kvm,
* +--------------------------------------------+
*/
do {
- struct vm_area_struct *vma = find_vma(current->mm, hva);
+ struct vm_area_struct *vma;
hva_t vm_start, vm_end;
- if (!vma || vma->vm_start >= reg_end)
+ vma = find_vma_intersection(current->mm, hva, reg_end);
+ if (!vma)
break;
/*
@@ -469,7 +553,7 @@ void stage2_unmap_vm(struct kvm *kvm)
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
- struct kvm *kvm = mmu->kvm;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
struct kvm_pgtable *pgt = NULL;
spin_lock(&kvm->mmu_lock);
@@ -538,7 +622,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- struct kvm *kvm = mmu->kvm;
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}
@@ -555,7 +639,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_
* Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
* serializing operations for VM memory regions.
*/
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
@@ -839,13 +923,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
* the page we just got a reference to gets unmapped before we have a
* chance to grab the mmu_lock, which ensure that if the page gets
- * unmapped afterwards, the call to kvm_unmap_hva will take it away
+ * unmapped afterwards, the call to kvm_unmap_gfn will take it away
* from us again properly. This smp_rmb() interacts with the smp_wmb()
* in kvm_mmu_notifier_invalidate_<page|range_end>.
+ *
+ * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
+ * used to avoid unnecessary overhead introduced to locate the memory
+ * slot because it's always fixed even @gfn is adjusted for huge pages.
*/
smp_rmb();
- pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+ write_fault, &writable, NULL);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -911,7 +1000,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
kvm_set_pfn_dirty(pfn);
- mark_page_dirty(kvm, gfn);
+ mark_page_dirty_in_slot(kvm, memslot, gfn);
}
out_unlock:
@@ -1064,126 +1153,70 @@ out_unlock:
return ret;
}
-static int handle_hva_to_gpa(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- int (*handler)(struct kvm *kvm,
- gpa_t gpa, u64 size,
- void *data),
- void *data)
-{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int ret = 0;
-
- slots = kvm_memslots(kvm);
-
- /* we only care about the pages that the guest sees */
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gpa;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
-
- gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
- ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
- }
-
- return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- unsigned flags = *(unsigned *)data;
- bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
-
- __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
- return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_unmap_hva_range(start, end);
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
- return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- kvm_pfn_t *pfn = (kvm_pfn_t *)data;
-
- WARN_ON(size != PAGE_SIZE);
+ __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
+ (range->end - range->start) << PAGE_SHIFT,
+ range->may_block);
- /*
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
- __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- unsigned long end = hva + PAGE_SIZE;
- kvm_pfn_t pfn = pte_pfn(pte);
+ kvm_pfn_t pfn = pte_pfn(range->pte);
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_set_spte_hva(hva);
+ WARN_ON(range->end - range->start != 1);
/*
* We've moved a page around, probably through CoW, so let's treat it
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
+
+ /*
+ * The MMU notifiers will have unmapped a huge PMD before calling
+ * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
+ * therefore we never need to clear out a huge PMD through this
+ * calling path and a memcache is not required.
+ */
+ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
+ PAGE_SIZE, __pfn_to_phys(pfn),
+ KVM_PGTABLE_PROT_R, NULL);
+
return 0;
}
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- pte_t pte;
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
kvm_pte_t kpte;
+ pte_t pte;
+
+ if (!kvm->arch.mmu.pgt)
+ return 0;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+
+ kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT);
pte = __pte(kpte);
return pte_valid(pte) && pte_young(pte);
}
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
if (!kvm->arch.mmu.pgt)
return 0;
- trace_kvm_age_hva(start, end);
- return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- if (!kvm->arch.mmu.pgt)
- return 0;
- trace_kvm_test_age_hva(hva);
- return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
- kvm_test_age_hva_handler, NULL);
+ return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT);
}
phys_addr_t kvm_mmu_get_httbr(void)
@@ -1208,10 +1241,22 @@ static int kvm_map_idmap_text(void)
return err;
}
-int kvm_mmu_init(void)
+static void *kvm_hyp_zalloc_page(void *arg)
+{
+ return (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
+ .zalloc_page = kvm_hyp_zalloc_page,
+ .get_page = kvm_host_get_page,
+ .put_page = kvm_host_put_page,
+ .phys_to_virt = kvm_host_va,
+ .virt_to_phys = kvm_host_pa,
+};
+
+int kvm_mmu_init(u32 *hyp_va_bits)
{
int err;
- u32 hyp_va_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1225,8 +1270,8 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
- hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
- kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
+ *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
kern_hyp_va(PAGE_OFFSET),
@@ -1251,7 +1296,7 @@ int kvm_mmu_init(void)
goto out;
}
- err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+ err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
if (err)
goto out_free_pgtable;
@@ -1329,10 +1374,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* +--------------------------------------------+
*/
do {
- struct vm_area_struct *vma = find_vma(current->mm, hva);
+ struct vm_area_struct *vma;
hva_t vm_start, vm_end;
- if (!vma || vma->vm_start >= reg_end)
+ vma = find_vma_intersection(current->mm, hva, reg_end);
+ if (!vma)
break;
/*
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
index 739164324afe..151c31fb9860 100644
--- a/arch/arm64/kvm/perf.c
+++ b/arch/arm64/kvm/perf.c
@@ -50,12 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
int kvm_perf_init(void)
{
- /*
- * Check if HW_PERF_EVENTS are supported by checking the number of
- * hardware performance counters. This could ensure the presence of
- * a physical PMU and CONFIG_PERF_EVENT is selected.
- */
- if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0)
+ if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled())
static_branch_enable(&kvm_arm_pmu_available);
return perf_register_guest_info_callbacks(&kvm_guest_cbs);
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index e32c6e139a09..fd167d4f4215 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -739,7 +739,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
kvm_pmu_create_perf_event(vcpu, select_idx);
}
-static int kvm_pmu_probe_pmuver(void)
+int kvm_pmu_probe_pmuver(void)
{
struct perf_event_attr attr = { };
struct perf_event *event;
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index faf32a44ba04..03a6c1f4a09a 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
{
struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
- if (!ctx || !kvm_pmu_switch_needed(attr))
+ if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
@@ -49,7 +49,7 @@ void kvm_clr_pmu_events(u32 clr)
{
struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
- if (!ctx)
+ if (!kvm_arm_support_pmu_v3() || !ctx)
return;
ctx->pmu_events.events_host &= ~clr;
@@ -172,7 +172,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
struct kvm_host_data *host;
u32 events_guest, events_host;
- if (!has_vhe())
+ if (!kvm_arm_support_pmu_v3() || !has_vhe())
return;
preempt_disable();
@@ -193,7 +193,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
struct kvm_host_data *host;
u32 events_guest, events_host;
- if (!has_vhe())
+ if (!kvm_arm_support_pmu_v3() || !has_vhe())
return;
host = this_cpu_ptr_hyp_sym(kvm_host_data);
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index bd354cd45d28..956cdc240148 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -74,10 +74,6 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
if (!system_supports_sve())
return -EINVAL;
- /* Verify that KVM startup enforced this when SVE was detected: */
- if (WARN_ON(!has_vhe()))
- return -EINVAL;
-
vcpu->arch.sve_max_vl = kvm_sve_max_vl;
/*
@@ -242,6 +238,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
/* Reset core registers */
memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
+ memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
+ vcpu->arch.ctxt.spsr_abt = 0;
+ vcpu->arch.ctxt.spsr_und = 0;
+ vcpu->arch.ctxt.spsr_irq = 0;
+ vcpu->arch.ctxt.spsr_fiq = 0;
vcpu_gp_regs(vcpu)->pstate = pstate;
/* Reset system registers */
@@ -333,19 +334,10 @@ int kvm_set_ipa_limit(void)
return 0;
}
-/*
- * Configure the VTCR_EL2 for this VM. The VTCR value is common
- * across all the physical CPUs on the system. We use system wide
- * sanitised values to fill in different fields, except for Hardware
- * Management of Access Flags. HA Flag is set unconditionally on
- * all CPUs, as it is safe to run with or without the feature and
- * the bit is RES0 on CPUs that don't support it.
- */
int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
{
- u64 vtcr = VTCR_EL2_FLAGS, mmfr0;
- u32 parange, phys_shift;
- u8 lvls;
+ u64 mmfr0, mmfr1;
+ u32 phys_shift;
if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
return -EINVAL;
@@ -365,33 +357,8 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
}
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
- parange = cpuid_feature_extract_unsigned_field(mmfr0,
- ID_AA64MMFR0_PARANGE_SHIFT);
- if (parange > ID_AA64MMFR0_PARANGE_MAX)
- parange = ID_AA64MMFR0_PARANGE_MAX;
- vtcr |= parange << VTCR_EL2_PS_SHIFT;
-
- vtcr |= VTCR_EL2_T0SZ(phys_shift);
- /*
- * Use a minimum 2 level page table to prevent splitting
- * host PMD huge pages at stage2.
- */
- lvls = stage2_pgtable_levels(phys_shift);
- if (lvls < 2)
- lvls = 2;
- vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
-
- /*
- * Enable the Hardware Access Flag management, unconditionally
- * on all CPUs. The features is RES0 on CPUs without the support
- * and must be ignored by the CPUs.
- */
- vtcr |= VTCR_EL2_HA;
+ mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+ kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
- /* Set the vmid bits */
- vtcr |= (kvm_get_vmid_bits() == 16) ?
- VTCR_EL2_VS_16BIT :
- VTCR_EL2_VS_8BIT;
- kvm->arch.vtcr = vtcr;
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 4f2f1e3145de..76ea2800c33e 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1063,6 +1063,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
val = cpuid_feature_cap_perfmon_field(val,
ID_AA64DFR0_PMUVER_SHIFT,
kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0);
+ /* Hide SPE from guests */
+ val &= ~FEATURE(ID_AA64DFR0_PMSVER);
break;
case SYS_ID_DFR0_EL1:
/* Limit guests to PMUv3 for ARMv8.4 */
@@ -1472,6 +1474,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_GCR_EL1), undef_access },
{ SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
+ { SYS_DESC(SYS_TRFCR_EL1), undef_access },
{ SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
{ SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
{ SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
@@ -1501,6 +1504,19 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
+ { SYS_DESC(SYS_PMSCR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSICR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSIRR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSFCR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSEVFR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSLATFR_EL1), undef_access },
+ { SYS_DESC(SYS_PMSIDR_EL1), undef_access },
+ { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access },
+ { SYS_DESC(SYS_PMBPTR_EL1), undef_access },
+ { SYS_DESC(SYS_PMBSR_EL1), undef_access },
+ /* PMBIDR_EL1 is not trapped */
+
{ PMU_SYS_REG(SYS_PMINTENSET_EL1),
.access = access_pminten, .reg = PMINTENSET_EL1 },
{ PMU_SYS_REG(SYS_PMINTENCLR_EL1),
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index ff0444352bba..33e4e7dd2719 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -135,72 +135,6 @@ TRACE_EVENT(kvm_mmio_emulate,
__entry->vcpu_pc, __entry->instr, __entry->cpsr)
);
-TRACE_EVENT(kvm_unmap_hva_range,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end),
-
- TP_STRUCT__entry(
- __field( unsigned long, start )
- __field( unsigned long, end )
- ),
-
- TP_fast_assign(
- __entry->start = start;
- __entry->end = end;
- ),
-
- TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
- __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end),
-
- TP_STRUCT__entry(
- __field( unsigned long, start )
- __field( unsigned long, end )
- ),
-
- TP_fast_assign(
- __entry->start = start;
- __entry->end = end;
- ),
-
- TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
- __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
-);
-
TRACE_EVENT(kvm_set_way_flush,
TP_PROTO(unsigned long vcpu_pc, bool cache),
TP_ARGS(vcpu_pc, cache),
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
index 978301392d67..acdb7b3cc97d 100644
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -288,3 +288,10 @@ void kvm_get_kimage_voffset(struct alt_instr *alt,
{
generate_mov_q(kimage_voffset, origptr, updptr, nr_inst);
}
+
+void kvm_compute_final_ctr_el0(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst)
+{
+ generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0),
+ origptr, updptr, nr_inst);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 052917deb149..58cbda00e56d 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -335,13 +335,14 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
kfree(dist->spis);
dist->spis = NULL;
dist->nr_spis = 0;
+ dist->vgic_dist_base = VGIC_ADDR_UNDEF;
- if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
- list_del(&rdreg->list);
- kfree(rdreg);
- }
+ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
+ vgic_v3_free_redist_region(rdreg);
INIT_LIST_HEAD(&dist->rd_regions);
+ } else {
+ dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
}
if (vgic_has_its(kvm))
@@ -362,6 +363,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
vgic_flush_pending_lpis(vcpu);
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+ vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
}
/* To be called with kvm->lock held */
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index b9518f94bd43..61728c543eb9 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
/*
* If an LPI carries the HW bit, this means that this
* interrupt is controlled by GICv4, and we do not
- * have direct access to that state. Let's simply fail
- * the save operation...
+ * have direct access to that state without GICv4.1.
+ * Let's simply fail the save operation...
*/
- if (ite->irq->hw)
+ if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
return -EACCES;
ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 44419679f91a..7740995de982 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -87,8 +87,8 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
goto out;
}
- rdreg = list_first_entry(&vgic->rd_regions,
- struct vgic_redist_region, list);
+ rdreg = list_first_entry_or_null(&vgic->rd_regions,
+ struct vgic_redist_region, list);
if (!rdreg)
addr_ptr = &undef_value;
else
@@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev,
u64 addr;
unsigned long type = (unsigned long)attr->attr;
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
r = kvm_vgic_addr(dev->kvm, type, &addr, false);
if (r)
return (r == -ENODEV) ? -ENXIO : r;
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 2f1b156021a6..a09cdc0b953c 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -251,30 +251,35 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
vgic_enable_lpis(vcpu);
}
-static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
+static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu)
{
- unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+ struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
- int target_vcpu_id = vcpu->vcpu_id;
- gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
- (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
- u64 value;
+ struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg;
- value = (u64)(mpidr & GENMASK(23, 0)) << 32;
- value |= ((target_vcpu_id & 0xffff) << 8);
+ if (!rdreg)
+ return false;
- if (addr == last_rdist_typer)
- value |= GICR_TYPER_LAST;
- if (vgic_has_its(vcpu->kvm))
- value |= GICR_TYPER_PLPIS;
+ if (vgic_cpu->rdreg_index < rdreg->free_index - 1) {
+ return false;
+ } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) {
+ struct list_head *rd_regions = &vgic->rd_regions;
+ gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
- return extract_bytes(value, addr & 7, len);
+ /*
+ * the rdist is the last one of the redist region,
+ * check whether there is no other contiguous rdist region
+ */
+ list_for_each_entry(iter, rd_regions, list) {
+ if (iter->base == end && iter->free_index > 0)
+ return false;
+ }
+ }
+ return true;
}
-static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
{
unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
int target_vcpu_id = vcpu->vcpu_id;
@@ -286,7 +291,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
if (vgic_has_its(vcpu->kvm))
value |= GICR_TYPER_PLPIS;
- /* reporting of the Last bit is not supported for userspace */
+ if (vgic_mmio_vcpu_rdist_is_last(vcpu))
+ value |= GICR_TYPER_LAST;
+
return extract_bytes(value, addr & 7, len);
}
@@ -612,7 +619,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER,
vgic_mmio_read_v3r_typer, vgic_mmio_write_wi,
- vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8,
+ NULL, vgic_mmio_uaccess_write_wi, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
@@ -714,6 +721,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
return -EINVAL;
vgic_cpu->rdreg = rdreg;
+ vgic_cpu->rdreg_index = rdreg->free_index;
rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
@@ -768,7 +776,7 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
}
/**
- * vgic_v3_insert_redist_region - Insert a new redistributor region
+ * vgic_v3_alloc_redist_region - Allocate a new redistributor region
*
* Performs various checks before inserting the rdist region in the list.
* Those tests depend on whether the size of the rdist region is known
@@ -782,8 +790,8 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm)
*
* Return 0 on success, < 0 otherwise
*/
-static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
- gpa_t base, uint32_t count)
+static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index,
+ gpa_t base, uint32_t count)
{
struct vgic_dist *d = &kvm->arch.vgic;
struct vgic_redist_region *rdreg;
@@ -791,10 +799,6 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
int ret;
- /* single rdist region already set ?*/
- if (!count && !list_empty(rd_regions))
- return -EINVAL;
-
/* cross the end of memory ? */
if (base + size < base)
return -EINVAL;
@@ -805,11 +809,15 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
} else {
rdreg = list_last_entry(rd_regions,
struct vgic_redist_region, list);
- if (index != rdreg->index + 1)
+
+ /* Don't mix single region and discrete redist regions */
+ if (!count && rdreg->count)
return -EINVAL;
- /* Cannot add an explicitly sized regions after legacy region */
- if (!rdreg->count)
+ if (!count)
+ return -EEXIST;
+
+ if (index != rdreg->index + 1)
return -EINVAL;
}
@@ -848,11 +856,17 @@ free:
return ret;
}
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg)
+{
+ list_del(&rdreg->list);
+ kfree(rdreg);
+}
+
int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
{
int ret;
- ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
+ ret = vgic_v3_alloc_redist_region(kvm, index, addr, count);
if (ret)
return ret;
@@ -861,8 +875,13 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
* afterwards will register the iodevs when needed.
*/
ret = vgic_register_all_redist_iodevs(kvm);
- if (ret)
+ if (ret) {
+ struct vgic_redist_region *rdreg;
+
+ rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+ vgic_v3_free_redist_region(rdreg);
return ret;
+ }
return 0;
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index b2d73fc0d1ef..48c6067fc5ec 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
return region;
}
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
gpa_t addr, u32 *val)
{
- struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
struct kvm_vcpu *r_vcpu;
@@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
return 0;
}
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
gpa_t addr, const u32 *val)
{
- struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
struct kvm_vcpu *r_vcpu;
@@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
bool is_write, int offset, u32 *val)
{
if (is_write)
- return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+ return vgic_uaccess_write(vcpu, dev, offset, val);
else
- return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+ return vgic_uaccess_read(vcpu, dev, offset, val);
}
static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 6f530925a231..41ecf219c333 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
@@ -356,6 +358,32 @@ retry:
return 0;
}
+/*
+ * The deactivation of the doorbell interrupt will trigger the
+ * unmapping of the associated vPE.
+ */
+static void unmap_all_vpes(struct vgic_dist *dist)
+{
+ struct irq_desc *desc;
+ int i;
+
+ for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+ desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+ irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+ }
+}
+
+static void map_all_vpes(struct vgic_dist *dist)
+{
+ struct irq_desc *desc;
+ int i;
+
+ for (i = 0; i < dist->its_vm.nr_vpes; i++) {
+ desc = irq_to_desc(dist->its_vm.vpes[i]->irq);
+ irq_domain_activate_irq(irq_desc_get_irq_data(desc), false);
+ }
+}
+
/**
* vgic_v3_save_pending_tables - Save the pending tables into guest RAM
* kvm lock and all vcpu lock must be held
@@ -365,13 +393,28 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq;
gpa_t last_ptr = ~(gpa_t)0;
- int ret;
+ bool vlpi_avail = false;
+ int ret = 0;
u8 val;
+ if (unlikely(!vgic_initialized(kvm)))
+ return -ENXIO;
+
+ /*
+ * A preparation for getting any VLPI states.
+ * The above vgic initialized check also ensures that the allocation
+ * and enabling of the doorbells have already been done.
+ */
+ if (kvm_vgic_global_state.has_gicv4_1) {
+ unmap_all_vpes(dist);
+ vlpi_avail = true;
+ }
+
list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
int byte_offset, bit_nr;
struct kvm_vcpu *vcpu;
gpa_t pendbase, ptr;
+ bool is_pending;
bool stored;
vcpu = irq->target_vcpu;
@@ -387,24 +430,35 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
if (ptr != last_ptr) {
ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
if (ret)
- return ret;
+ goto out;
last_ptr = ptr;
}
stored = val & (1U << bit_nr);
- if (stored == irq->pending_latch)
+
+ is_pending = irq->pending_latch;
+
+ if (irq->hw && vlpi_avail)
+ vgic_v4_get_vlpi_state(irq, &is_pending);
+
+ if (stored == is_pending)
continue;
- if (irq->pending_latch)
+ if (is_pending)
val |= 1 << bit_nr;
else
val &= ~(1 << bit_nr);
ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
- return ret;
+ goto out;
}
- return 0;
+
+out:
+ if (vlpi_avail)
+ map_all_vpes(dist);
+
+ return ret;
}
/**
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 66508b03094f..c1845d8f5f7e 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm)
kvm_arm_resume_guest(kvm);
}
+/*
+ * Must be called with GICv4.1 and the vPE unmapped, which
+ * indicates the invalidation of any VPT caches associated
+ * with the vPE, thus we can get the VLPI state by peeking
+ * at the VPT.
+ */
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
+{
+ struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ int mask = BIT(irq->intid % BITS_PER_BYTE);
+ void *va;
+ u8 *ptr;
+
+ va = page_address(vpe->vpt_page);
+ ptr = va + irq->intid / BITS_PER_BYTE;
+
+ *val = !!(*ptr & mask);
+}
+
/**
* vgic_v4_init - Initialize the GICv4 data structures
* @kvm: Pointer to the VM being initialized
@@ -385,6 +404,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
struct vgic_its *its;
struct vgic_irq *irq;
struct its_vlpi_map map;
+ unsigned long flags;
int ret;
if (!vgic_supports_direct_msis(kvm))
@@ -430,6 +450,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
irq->host_irq = virq;
atomic_inc(&map.vpe->vlpi_count);
+ /* Transfer pending state */
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->pending_latch) {
+ ret = irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ irq->pending_latch);
+ WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+
+ /*
+ * Clear pending_latch and communicate this state
+ * change via vgic_queue_irq_unlock.
+ */
+ irq->pending_latch = false;
+ vgic_queue_irq_unlock(kvm, irq, flags);
+ } else {
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+
out:
mutex_unlock(&its->its_lock);
return ret;
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 64fcd7511110..dc1f3d1657ee 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -293,6 +293,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
u32 index);
+void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg);
bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
@@ -317,5 +318,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm);
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
void vgic_v4_configure_vsgis(struct kvm *kvm);
+void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
#endif
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index 073acbf02a7c..b84b179edba3 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -14,7 +14,7 @@
* Parameters:
* x0 - dest
*/
-SYM_FUNC_START(clear_page)
+SYM_FUNC_START_PI(clear_page)
mrs x1, dczid_el0
and w1, w1, #0xf
mov x2, #4
@@ -25,5 +25,5 @@ SYM_FUNC_START(clear_page)
tst x0, #(PAGE_SIZE - 1)
b.ne 1b
ret
-SYM_FUNC_END(clear_page)
+SYM_FUNC_END_PI(clear_page)
EXPORT_SYMBOL(clear_page)
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index e7a793961408..29144f4cd449 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -17,7 +17,7 @@
* x0 - dest
* x1 - src
*/
-SYM_FUNC_START(copy_page)
+SYM_FUNC_START_PI(copy_page)
alternative_if ARM64_HAS_NO_HW_PREFETCH
// Prefetch three cache lines ahead.
prfm pldl1strm, [x1, #128]
@@ -75,5 +75,5 @@ alternative_else_nop_endif
stnp x16, x17, [x0, #112 - 256]
ret
-SYM_FUNC_END(copy_page)
+SYM_FUNC_END_PI(copy_page)
EXPORT_SYMBOL(copy_page)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index ef031511ce29..0696a459ea95 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -35,6 +35,7 @@
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
+#include <asm/kvm_host.h>
#include <asm/memory.h>
#include <asm/numa.h>
#include <asm/sections.h>
@@ -452,6 +453,8 @@ void __init bootmem_init(void)
dma_pernuma_cma_reserve();
+ kvm_hyp_reserve();
+
/*
* sparse_init() tries to allocate memory from memblock, so must be
* done after the fixed reservations
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 603ad562d101..fca4547d580f 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -740,14 +740,7 @@ struct kvm_mips_callbacks {
int (*vcpu_init)(struct kvm_vcpu *vcpu);
void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
int (*vcpu_setup)(struct kvm_vcpu *vcpu);
- void (*flush_shadow_all)(struct kvm *kvm);
- /*
- * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
- * VZ root TLB, or T&E GVA page tables and corresponding root TLB
- * mappings).
- */
- void (*flush_shadow_memslot)(struct kvm *kvm,
- const struct kvm_memory_slot *slot);
+ void (*prepare_flush_shadow)(struct kvm *kvm);
gpa_t (*gva_to_gpa)(gva_t gva);
void (*queue_timer_int)(struct kvm_vcpu *vcpu);
void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -824,11 +817,6 @@ pgd_t *kvm_pgd_alloc(void);
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
/* Emulation */
enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
@@ -916,4 +904,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+int kvm_arch_flush_remote_tlb(struct kvm *kvm);
+
#endif /* __MIPS_KVM_HOST_H__ */
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 29d37ba1bea2..4d4af97dcc88 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -197,9 +197,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
/* Flush whole GPA */
kvm_mips_flush_gpa_pt(kvm, 0, ~0);
-
- /* Let implementation do the rest */
- kvm_mips_callbacks->flush_shadow_all(kvm);
+ kvm_flush_remote_tlbs(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -214,8 +212,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
/* Flush slot from GPA */
kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
slot->base_gfn + slot->npages - 1);
- /* Let implementation do the rest */
- kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
spin_unlock(&kvm->mmu_lock);
}
@@ -255,9 +252,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
/* Write protect GPA page table entries */
needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
new->base_gfn + new->npages - 1);
- /* Let implementation do the rest */
if (needs_flush)
- kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+ kvm_arch_flush_remote_tlbs_memslot(kvm, new);
spin_unlock(&kvm->mmu_lock);
}
}
@@ -972,11 +968,16 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
}
+int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+ kvm_mips_callbacks->prepare_flush_shadow(kvm);
+ return 1;
+}
+
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
- /* Let implementation handle TLB/GVA invalidation */
- kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
+ kvm_flush_remote_tlbs(kvm);
}
long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 190ca2451851..6d1f68cf4edf 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -439,85 +439,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
end_gfn << PAGE_SHIFT);
}
-static int handle_hva_to_gpa(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- int (*handler)(struct kvm *kvm, gfn_t gfn,
- gpa_t gfn_end,
- struct kvm_memory_slot *memslot,
- void *data),
- void *data)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int ret = 0;
-
- slots = kvm_memslots(kvm);
-
- /* we only care about the pages that the guest sees */
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
-
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- ret |= handler(kvm, gfn, gfn_end, memslot, data);
- }
-
- return ret;
-}
-
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
-{
- kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+ kvm_mips_flush_gpa_pt(kvm, range->start, range->end);
return 1;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
-{
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-
- kvm_mips_callbacks->flush_shadow_all(kvm);
- return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- gpa_t gpa = gfn << PAGE_SHIFT;
- pte_t hva_pte = *(pte_t *)data;
+ gpa_t gpa = range->start << PAGE_SHIFT;
+ pte_t hva_pte = range->pte;
pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
pte_t old_pte;
if (!gpa_pte)
- return 0;
+ return false;
/* Mapping may need adjusting depending on memslot flags */
old_pte = *gpa_pte;
- if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+ if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
hva_pte = pte_mkclean(hva_pte);
- else if (memslot->flags & KVM_MEM_READONLY)
+ else if (range->slot->flags & KVM_MEM_READONLY)
hva_pte = pte_wrprotect(hva_pte);
set_pte(gpa_pte, hva_pte);
/* Replacing an absent or old page doesn't need flushes */
if (!pte_present(old_pte) || !pte_young(old_pte))
- return 0;
+ return false;
/* Pages swapped, aged, moved, or cleaned require flushes */
return !pte_present(hva_pte) ||
@@ -526,27 +475,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
(pte_dirty(old_pte) && !pte_dirty(hva_pte));
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- unsigned long end = hva + PAGE_SIZE;
- int ret;
-
- ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
- if (ret)
- kvm_mips_callbacks->flush_shadow_all(kvm);
- return 0;
+ return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
}
-static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
-}
-
-static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
- struct kvm_memory_slot *memslot, void *data)
-{
- gpa_t gpa = gfn << PAGE_SHIFT;
+ gpa_t gpa = range->start << PAGE_SHIFT;
pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
if (!gpa_pte)
@@ -554,16 +490,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
return pte_young(*gpa_pte);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
- return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
/**
* _kvm_mips_map_page_fast() - Fast path GPA fault handler.
* @vcpu: VCPU pointer.
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index d0d03bddbbba..43cad10b877d 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -3210,32 +3210,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
}
-static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+static void kvm_vz_prepare_flush_shadow(struct kvm *kvm)
{
- if (cpu_has_guestid) {
- /* Flush GuestID for each VCPU individually */
- kvm_flush_remote_tlbs(kvm);
- } else {
+ if (!cpu_has_guestid) {
/*
* For each CPU there is a single GPA ASID used by all VCPUs in
* the VM, so it doesn't make sense for the VCPUs to handle
* invalidation of these ASIDs individually.
*
* Instead mark all CPUs as needing ASID invalidation in
- * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+ * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will
* kick any running VCPUs so they check asid_flush_mask.
*/
cpumask_setall(&kvm->arch.asid_flush_mask);
- kvm_flush_remote_tlbs(kvm);
}
}
-static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
-{
- kvm_vz_flush_shadow_all(kvm);
-}
-
static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
@@ -3291,8 +3281,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
.vcpu_init = kvm_vz_vcpu_init,
.vcpu_uninit = kvm_vz_vcpu_uninit,
.vcpu_setup = kvm_vz_vcpu_setup,
- .flush_shadow_all = kvm_vz_flush_shadow_all,
- .flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+ .prepare_flush_shadow = kvm_vz_prepare_flush_shadow,
.gva_to_gpa = kvm_vz_gva_to_gpa_cb,
.queue_timer_int = kvm_vz_queue_timer_int_cb,
.dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index c58121508157..a6e9a5585e61 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
unsigned int lpid);
extern int kvmppc_radix_init(void);
extern void kvmppc_radix_exit(void);
-extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
+extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
+extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
+extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn);
extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 05fb00d37609..1e83359f286b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -56,13 +56,6 @@
#define KVM_ARCH_WANT_MMU_NOTIFIER
-extern int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end,
- unsigned flags);
-extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-
#define HPTEG_CACHE_NUM (1 << 15)
#define HPTEG_HASH_BITS_PTE 13
#define HPTEG_HASH_BITS_PTE_LONG 12
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 9531b1c1b190..5bf8ae9bb2cc 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -281,11 +281,10 @@ struct kvmppc_ops {
const struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change);
- int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
- unsigned long end);
- int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
- int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
- void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
+ bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
+ bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
void (*free_memslot)(struct kvm_memory_slot *slot);
int (*init_vm)(struct kvm *kvm);
void (*destroy_vm)(struct kvm *kvm);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 44bf567b6589..2b691f4d1f26 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+ return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->age_hva(kvm, start, end);
+ return kvm->arch.kvm_ops->age_gfn(kvm, range);
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+ return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
- return 0;
+ return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
}
int kvmppc_core_init_vm(struct kvm *kvm)
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 9b6323ec8e60..740e51def5a5 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -9,12 +9,10 @@
extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
- unsigned long end);
-extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
- unsigned long end);
-extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
-extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
+extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bb6773594cf8..b7bd9ca040b8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -752,51 +752,6 @@ void kvmppc_rmap_reset(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
-typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn);
-
-static int kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- hva_handler_fn handler)
-{
- int ret;
- int retval = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
-
- slots = kvm_memslots(kvm);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn, gfn+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for (; gfn < gfn_end; ++gfn) {
- ret = handler(kvm, memslot, gfn);
- retval |= ret;
- }
- }
-
- return retval;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- hva_handler_fn handler)
-{
- return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
-}
-
/* Must be called with both HPTE and rmap locked */
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
struct kvm_memory_slot *memslot,
@@ -840,8 +795,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
}
}
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
unsigned long i;
__be64 *hptep;
@@ -874,16 +829,15 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
- return 0;
+ return false;
}
-int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ return kvm_unmap_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
- kvm_handle_hva_range(kvm, start, end, handler);
- return 0;
+ return kvm_unmap_rmapp(kvm, range->slot, range->start);
}
void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
@@ -913,8 +867,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
}
}
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
@@ -968,26 +922,26 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
return ret;
}
-int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ kvm_age_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
- return kvm_handle_hva_range(kvm, start, end, handler);
+ return kvm_age_rmapp(kvm, range->slot, range->start);
}
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
unsigned long *hp;
- int ret = 1;
+ bool ret = true;
unsigned long *rmapp;
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
if (*rmapp & KVMPPC_RMAP_REFERENCED)
- return 1;
+ return true;
lock_rmap(rmapp);
if (*rmapp & KVMPPC_RMAP_REFERENCED)
@@ -1002,27 +956,27 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
goto out;
} while ((i = j) != head);
}
- ret = 0;
+ ret = false;
out:
unlock_rmap(rmapp);
return ret;
}
-int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ kvm_test_age_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
- return kvm_handle_hva(kvm, hva, handler);
+ return kvm_test_age_rmapp(kvm, range->slot, range->start);
}
-void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
{
- hva_handler_fn handler;
+ if (kvm_is_radix(kvm))
+ return kvm_unmap_radix(kvm, range->slot, range->start);
- handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
- kvm_handle_hva(kvm, hva, handler);
+ return kvm_unmap_rmapp(kvm, range->slot, range->start);
}
static int vcpus_running(struct kvm *kvm)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index e603de7ade52..ec4f58fa9f5a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -993,8 +993,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
}
/* Called with kvm->mmu_lock held */
-int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
@@ -1002,24 +1002,24 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
- return 0;
+ return false;
}
ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
kvm->arch.lpid);
- return 0;
+ return false;
}
/* Called with kvm->mmu_lock held */
-int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
- int ref = 0;
+ bool ref = false;
unsigned long old, *rmapp;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
@@ -1035,26 +1035,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
old & PTE_RPN_MASK,
1UL << shift);
- ref = 1;
+ ref = true;
}
return ref;
}
/* Called with kvm->mmu_lock held */
-int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn)
+bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long gfn)
+
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
- int ref = 0;
+ bool ref = false;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ref;
ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep))
- ref = 1;
+ ref = true;
return ref;
}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 4a532410e128..28a80d240b76 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4812,7 +4812,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
kvmhv_release_all_nested(kvm);
kvmppc_rmap_reset(kvm);
kvm->arch.process_table = 0;
- /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 0;
spin_unlock(&kvm->mmu_lock);
@@ -4834,7 +4834,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
if (err)
return err;
kvmppc_rmap_reset(kvm);
- /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 1;
spin_unlock(&kvm->mmu_lock);
@@ -5699,10 +5699,10 @@ static struct kvmppc_ops kvm_ops_hv = {
.flush_memslot = kvmppc_core_flush_memslot_hv,
.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
.commit_memory_region = kvmppc_core_commit_memory_region_hv,
- .unmap_hva_range = kvm_unmap_hva_range_hv,
- .age_hva = kvm_age_hva_hv,
- .test_age_hva = kvm_test_age_hva_hv,
- .set_spte_hva = kvm_set_spte_hva_hv,
+ .unmap_gfn_range = kvm_unmap_gfn_range_hv,
+ .age_gfn = kvm_age_gfn_hv,
+ .test_age_gfn = kvm_test_age_gfn_hv,
+ .set_spte_gfn = kvm_set_spte_gfn_hv,
.free_memslot = kvmppc_core_free_memslot_hv,
.init_vm = kvmppc_core_init_vm_hv,
.destroy_vm = kvmppc_core_destroy_vm_hv,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 913944dc3620..d7733b07f489 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
}
/************* MMU Notifiers *************/
-static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
long i;
struct kvm_vcpu *vcpu;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- slots = kvm_memslots(kvm);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn, gfn_end;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT,
+ range->end << PAGE_SHIFT);
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn, gfn+1, ..., gfn_end-1}.
- */
- gfn = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
- gfn_end << PAGE_SHIFT);
- }
+ return false;
}
-static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
- do_kvm_unmap_hva(kvm, start, end);
-
- return 0;
+ return do_kvm_unmap_gfn(kvm, range);
}
-static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
+static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
+static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* The page will get remapped properly on its next fault */
- do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+ return do_kvm_unmap_gfn(kvm, range);
}
/*****************************************/
@@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = {
.flush_memslot = kvmppc_core_flush_memslot_pr,
.prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
.commit_memory_region = kvmppc_core_commit_memory_region_pr,
- .unmap_hva_range = kvm_unmap_hva_range_pr,
- .age_hva = kvm_age_hva_pr,
- .test_age_hva = kvm_test_age_hva_pr,
- .set_spte_hva = kvm_set_spte_hva_pr,
+ .unmap_gfn_range = kvm_unmap_gfn_range_pr,
+ .age_gfn = kvm_age_gfn_pr,
+ .test_age_gfn = kvm_test_age_gfn_pr,
+ .set_spte_gfn = kvm_set_spte_gfn_pr,
.free_memslot = kvmppc_core_free_memslot_pr,
.init_vm = kvmppc_core_init_vm_pr,
.destroy_vm = kvmppc_core_destroy_vm_pr,
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index ed0c9c43d0cf..7f16afc331ef 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -721,45 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
/************* MMU Notifiers *************/
-static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- trace_kvm_unmap_hva(hva);
-
/*
* Flush all shadow tlb entries everywhere. This is slow, but
* we are 100% sure that we catch the to be unmapped page
*/
- kvm_flush_remote_tlbs(kvm);
-
- return 0;
+ return true;
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- /* kvm_unmap_hva flushes everything anyways */
- kvm_unmap_hva(kvm, start);
-
- return 0;
+ return kvm_e500_mmu_unmap_gfn(kvm, range);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* XXX could be more clever ;) */
- return 0;
+ return false;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
/* The page will get remapped properly on its next fault */
- kvm_unmap_hva(kvm, hva);
- return 0;
+ return kvm_e500_mmu_unmap_gfn(kvm, range);
}
/*****************************************/
diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h
index 3837842986aa..eff6e82dbcd4 100644
--- a/arch/powerpc/kvm/trace_booke.h
+++ b/arch/powerpc/kvm/trace_booke.h
@@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit,
)
);
-TRACE_EVENT(kvm_unmap_hva,
- TP_PROTO(unsigned long hva),
- TP_ARGS(hva),
-
- TP_STRUCT__entry(
- __field( unsigned long, hva )
- ),
-
- TP_fast_assign(
- __entry->hva = hva;
- ),
-
- TP_printk("unmap hva 0x%lx\n", __entry->hva)
-);
-
TRACE_EVENT(kvm_booke206_stlb_write,
TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 6bcfc5614bbc..8925f3969478 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -454,6 +454,7 @@ struct kvm_vcpu_stat {
u64 diagnose_44;
u64 diagnose_9c;
u64 diagnose_9c_ignored;
+ u64 diagnose_9c_forward;
u64 diagnose_258;
u64 diagnose_308;
u64 diagnose_500;
@@ -700,6 +701,10 @@ struct kvm_hw_bp_info_arch {
#define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \
(vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING))
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\
+ KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING)
+
struct kvm_guestdbg_info_arch {
unsigned long cr0;
unsigned long cr9;
diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index 01e360004481..e317fd4866c1 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void);
extern void __cpu_die(unsigned int cpu);
extern int __cpu_disable(void);
extern void schedule_mcck_handler(void);
+void notrace smp_yield_cpu(int cpu);
#endif /* __ASM_SMP_H */
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 58c8afa3da65..2fec2b80d35d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu)
asm volatile("diag %0,0,0x9c"
: : "d" (pcpu_devices[cpu].address));
}
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
/*
* Send cpus emergency shutdown signal. This gives the cpus the
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 5b8ec1c447e1..02c146f9e5cd 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
return 0;
}
+static int forward_cnt;
+static unsigned long cur_slice;
+
+static int diag9c_forwarding_overrun(void)
+{
+ /* Reset the count on a new slice */
+ if (time_after(jiffies, cur_slice)) {
+ cur_slice = jiffies;
+ forward_cnt = diag9c_forwarding_hz / HZ;
+ }
+ return forward_cnt-- <= 0 ? 1 : 0;
+}
+
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
@@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
if (!tcpu)
goto no_yield;
- /* target already running */
- if (READ_ONCE(tcpu->cpu) >= 0)
- goto no_yield;
+ /* target guest VCPU already running */
+ if (READ_ONCE(tcpu->cpu) >= 0) {
+ if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
+ goto no_yield;
+
+ /* target host CPU already running */
+ if (!vcpu_is_preempted(tcpu->cpu))
+ goto no_yield;
+ smp_yield_cpu(tcpu->cpu);
+ VCPU_EVENT(vcpu, 5,
+ "diag time slice end directed to %d: yield forwarded",
+ tid);
+ vcpu->stat.diagnose_9c_forward++;
+ return 0;
+ }
if (kvm_vcpu_yield_to(tcpu) <= 0)
goto no_yield;
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 6d6b57059493..b9f85b2dc053 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
* kvm_s390_shadow_tables - walk the guest page table and create shadow tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the page table address result
+ * @pgt: pointer to the beginning of the page table for the given address if
+ * successful (return value 0), or to the first invalid DAT entry in
+ * case of exceptions (return value > 0)
* @fake: pgt references contiguous guest memory block, not a pgtable
*/
static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rfte.val = ptr;
goto shadow_r2t;
}
+ *pgt = ptr + vaddr.rfx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
if (rc)
return rc;
@@ -1060,6 +1063,7 @@ shadow_r2t:
rste.val = ptr;
goto shadow_r3t;
}
+ *pgt = ptr + vaddr.rsx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
if (rc)
return rc;
@@ -1087,6 +1091,7 @@ shadow_r3t:
rtte.val = ptr;
goto shadow_sgt;
}
+ *pgt = ptr + vaddr.rtx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
if (rc)
return rc;
@@ -1123,6 +1128,7 @@ shadow_sgt:
ste.val = ptr;
goto shadow_pgt;
}
+ *pgt = ptr + vaddr.sx * 8;
rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
if (rc)
return rc;
@@ -1157,6 +1163,8 @@ shadow_pgt:
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
+ * @datptr: will contain the address of the faulting DAT table entry, or of
+ * the valid leaf, plus some flags
*
* Returns: - 0 if the shadow fault was successfully resolved
* - > 0 (pgm exception code) on exceptions while faulting
@@ -1165,11 +1173,11 @@ shadow_pgt:
* - -ENOMEM if out of memory
*/
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr)
+ unsigned long saddr, unsigned long *datptr)
{
union vaddress vaddr;
union page_table_entry pte;
- unsigned long pgt;
+ unsigned long pgt = 0;
int dat_protection, fake;
int rc;
@@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
pte.val = pgt + vaddr.px * PAGE_SIZE;
goto shadow_page;
}
- if (!rc)
- rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+
+ switch (rc) {
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_FIRST_TRANS:
+ pgt |= PEI_NOT_PTE;
+ break;
+ case 0:
+ pgt += vaddr.px * 8;
+ rc = gmap_read_table(sg->parent, pgt, &pte.val);
+ }
+ if (datptr)
+ *datptr = pgt | dat_protection * PEI_DAT_PROT;
if (!rc && pte.i)
rc = PGM_PAGE_TRANSLATION;
if (!rc && pte.z)
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index f4c51756c462..7c72a5e3449f 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -18,17 +18,14 @@
/**
* kvm_s390_real_to_abs - convert guest real address to guest absolute address
- * @vcpu - guest virtual cpu
+ * @prefix - guest prefix
* @gra - guest real address
*
* Returns the guest absolute address that corresponds to the passed guest real
- * address @gra of a virtual guest cpu by applying its prefix.
+ * address @gra of by applying the given prefix.
*/
-static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
- unsigned long gra)
+static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
{
- unsigned long prefix = kvm_s390_get_prefix(vcpu);
-
if (gra < 2 * PAGE_SIZE)
gra += prefix;
else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
@@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
}
/**
+ * kvm_s390_real_to_abs - convert guest real address to guest absolute address
+ * @vcpu - guest virtual cpu
+ * @gra - guest real address
+ *
+ * Returns the guest absolute address that corresponds to the passed guest real
+ * address @gra of a virtual guest cpu by applying its prefix.
+ */
+static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
+ unsigned long gra)
+{
+ return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
+}
+
+/**
+ * _kvm_s390_logical_to_effective - convert guest logical to effective address
+ * @psw: psw of the guest
+ * @ga: guest logical address
+ *
+ * Convert a guest logical address to an effective address by applying the
+ * rules of the addressing mode defined by bits 31 and 32 of the given PSW
+ * (extendended/basic addressing mode).
+ *
+ * Depending on the addressing mode, the upper 40 bits (24 bit addressing
+ * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
+ * mode) of @ga will be zeroed and the remaining bits will be returned.
+ */
+static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
+ unsigned long ga)
+{
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
+ return ga;
+ if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
+ return ga & ((1UL << 31) - 1);
+ return ga & ((1UL << 24) - 1);
+}
+
+/**
* kvm_s390_logical_to_effective - convert guest logical to effective address
* @vcpu: guest virtual cpu
* @ga: guest logical address
@@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
unsigned long ga)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
-
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
- return ga;
- if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
- return ga & ((1UL << 31) - 1);
- return ga & ((1UL << 24) - 1);
+ return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
}
/*
@@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+/* MVPG PEI indication bits */
+#define PEI_DAT_PROT 2
+#define PEI_NOT_PTE 4
+
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr);
+ unsigned long saddr, unsigned long *datptr);
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2f09e9d7dc95..1296fc10f80c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("instruction_diag_44", diagnose_44),
VCPU_STAT("instruction_diag_9c", diagnose_9c),
VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+ VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
VCPU_STAT("instruction_diag_258", diagnose_258),
VCPU_STAT("instruction_diag_308", diagnose_308),
VCPU_STAT("instruction_diag_500", diagnose_500),
@@ -185,6 +186,11 @@ static bool use_gisa = true;
module_param(use_gisa, bool, 0644);
MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
+/* maximum diag9c forwarding per second */
+unsigned int diag9c_forwarding_hz;
+module_param(diag9c_forwarding_hz, uint, 0644);
+MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
+
/*
* For now we handle at most 16 double words as this is what the s390 base
* kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -544,6 +550,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_DIAG318:
r = 1;
break;
+ case KVM_CAP_SET_GUEST_DEBUG2:
+ r = KVM_GUESTDBG_VALID_MASK;
+ break;
case KVM_CAP_S390_HPAGE_1M:
r = 0;
if (hpage && !kvm_is_ucontrol(kvm))
@@ -4307,16 +4316,16 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu)
kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
if (MACHINE_HAS_GS) {
+ preempt_disable();
__ctl_set_bit(2, 4);
if (vcpu->arch.gs_enabled)
save_gs_cb(current->thread.gs_cb);
- preempt_disable();
current->thread.gs_cb = vcpu->arch.host_gscb;
restore_gs_cb(vcpu->arch.host_gscb);
- preempt_enable();
if (!vcpu->arch.host_gscb)
__ctl_clear_bit(2, 4);
vcpu->arch.host_gscb = NULL;
+ preempt_enable();
}
/* SIE will save etoken directly into SDNX and therefore kvm_run */
}
@@ -4542,7 +4551,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
/*
* As we are starting a second VCPU, we have to disable
* the IBS facility on all VCPUs to remove potentially
- * oustanding ENABLE requests.
+ * outstanding ENABLE requests.
*/
__disable_ibs_on_all_vcpus(vcpu->kvm);
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 79dcd647b378..9fad25109b0d 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
* @kvm: the KVM guest
*/
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
+
+/**
+ * diag9c_forwarding_hz
+ *
+ * Set the maximum number of diag9c forwarding per second
+ */
+extern unsigned int diag9c_forwarding_hz;
+
#endif
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index bd803e091918..4002a24bc43a 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
break;
- case ICPT_PARTEXEC:
- /* MVPG only */
- memcpy((void *)((u64)scb_o + 0xc0),
- (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
- break;
}
if (scb_s->ihcpu != 0xffffU)
@@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE);
+ prefix + PAGE_SIZE, NULL);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
current->thread.gmap_addr, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr);
+ current->thread.gmap_addr, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_addr,
@@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr);
+ vsie_page->fault_addr, NULL);
vsie_page->fault_addr = 0;
}
@@ -984,6 +979,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/*
+ * Get a register for a nested guest.
+ * @vcpu the vcpu of the guest
+ * @vsie_page the vsie_page for the nested guest
+ * @reg the register number, the upper 4 bits are ignored.
+ * returns: the value of the register.
+ */
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+ /* no need to validate the parameter and/or perform error handling */
+ reg &= 0xf;
+ switch (reg) {
+ case 15:
+ return vsie_page->scb_s.gg15;
+ case 14:
+ return vsie_page->scb_s.gg14;
+ default:
+ return vcpu->run->s.regs.gprs[reg];
+ }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+ u64 *pei_block = &vsie_page->scb_o->mcic;
+ int edat, rc_dest, rc_src;
+ union ctlreg0 cr0;
+
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
+ prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+
+ dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+ dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
+ src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+ src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
+
+ rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
+ rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+ /*
+ * Either everything went well, or something non-critical went wrong
+ * e.g. because of a race. In either case, simply retry.
+ */
+ if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
+ retry_vsie_icpt(vsie_page);
+ return -EAGAIN;
+ }
+ /* Something more serious went wrong, propagate the error */
+ if (rc_dest < 0)
+ return rc_dest;
+ if (rc_src < 0)
+ return rc_src;
+
+ /* The only possible suppressing exception: just deliver it */
+ if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
+ clear_vsie_icpt(vsie_page);
+ rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
+ WARN_ON_ONCE(rc_dest);
+ return 1;
+ }
+
+ /*
+ * Forward the PEI intercept to the guest if it was a page fault, or
+ * also for segment and region table faults if EDAT applies.
+ */
+ if (edat) {
+ rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
+ rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
+ } else {
+ rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
+ rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
+ }
+ if (!rc_dest && !rc_src) {
+ pei_block[0] = pei_dest;
+ pei_block[1] = pei_src;
+ return 1;
+ }
+
+ retry_vsie_icpt(vsie_page);
+
+ /*
+ * The host has edat, and the guest does not, or it was an ASCE type
+ * exception. The host needs to inject the appropriate DAT interrupts
+ * into the guest.
+ */
+ if (rc_dest)
+ return inject_fault(vcpu, rc_dest, dest, 1);
+ return inject_fault(vcpu, rc_src, src, 0);
+}
+
+/*
* Run the vsie on a shadow scb and a shadow gmap, without any further
* sanity checks, handling SIE faults.
*
@@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if ((scb_s->ipa & 0xf000) != 0xf000)
scb_s->ipa += 0x1000;
break;
+ case ICPT_PARTEXEC:
+ if (scb_s->ipa == 0xb254)
+ rc = vsie_handle_mvpg(vcpu, vsie_page);
+ break;
}
return rc;
}
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 445e3ece4c23..1d2507f22437 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -57,24 +57,6 @@ static inline int sh_pmu_initialized(void)
return !!sh_pmu;
}
-const char *perf_pmu_name(void)
-{
- if (!sh_pmu)
- return NULL;
-
- return sh_pmu->name;
-}
-EXPORT_SYMBOL_GPL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
- if (!sh_pmu)
- return 0;
-
- return sh_pmu->num_events;
-}
-EXPORT_SYMBOL_GPL(perf_num_counters);
-
/*
* Release the PMU if this is the last perf_event.
*/
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3c94316169a3..ac37830ae941 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -340,6 +340,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 10eca9e8f7f6..cbbcee0a84f9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -221,12 +221,22 @@ enum x86_intercept_stage;
#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
+#define KVM_GUESTDBG_VALID_MASK \
+ (KVM_GUESTDBG_ENABLE | \
+ KVM_GUESTDBG_SINGLESTEP | \
+ KVM_GUESTDBG_USE_HW_BP | \
+ KVM_GUESTDBG_USE_SW_BP | \
+ KVM_GUESTDBG_INJECT_BP | \
+ KVM_GUESTDBG_INJECT_DB)
+
+
#define PFERR_PRESENT_BIT 0
#define PFERR_WRITE_BIT 1
#define PFERR_USER_BIT 2
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
#define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
#define PFERR_GUEST_FINAL_BIT 32
#define PFERR_GUEST_PAGE_BIT 33
@@ -236,6 +246,7 @@ enum x86_intercept_stage;
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
@@ -1054,6 +1065,9 @@ struct kvm_arch {
u32 user_space_msr_mask;
struct kvm_x86_msr_filter __rcu *msr_filter;
+ /* Guest can access the SGX PROVISIONKEY. */
+ bool sgx_provisioning_allowed;
+
struct kvm_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
@@ -1068,25 +1082,36 @@ struct kvm_arch {
bool tdp_mmu_enabled;
/*
- * List of struct kvmp_mmu_pages being used as roots.
+ * List of struct kvm_mmu_pages being used as roots.
* All struct kvm_mmu_pages in the list should have
* tdp_mmu_page set.
- * All struct kvm_mmu_pages in the list should have a positive
- * root_count except when a thread holds the MMU lock and is removing
- * an entry from the list.
+ *
+ * For reads, this list is protected by:
+ * the MMU lock in read mode + RCU or
+ * the MMU lock in write mode
+ *
+ * For writes, this list is protected by:
+ * the MMU lock in read mode + the tdp_mmu_pages_lock or
+ * the MMU lock in write mode
+ *
+ * Roots will remain in the list until their tdp_mmu_root_count
+ * drops to zero, at which point the thread that decremented the
+ * count to zero should removed the root from the list and clean
+ * it up, freeing the root after an RCU grace period.
*/
struct list_head tdp_mmu_roots;
/*
* List of struct kvmp_mmu_pages not being used as roots.
* All struct kvm_mmu_pages in the list should have
- * tdp_mmu_page set and a root_count of 0.
+ * tdp_mmu_page set and a tdp_mmu_root_count of 0.
*/
struct list_head tdp_mmu_pages;
/*
* Protects accesses to the following fields when the MMU lock
* is held in read mode:
+ * - tdp_mmu_roots (above)
* - tdp_mmu_pages (above)
* - the link field of struct kvm_mmu_pages used by the TDP MMU
* - lpage_disallowed_mmu_pages
@@ -1143,6 +1168,9 @@ struct kvm_vcpu_stat {
u64 req_event;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
+ u64 nested_run;
+ u64 directed_yield_attempted;
+ u64 directed_yield_successful;
};
struct x86_instruction_info;
@@ -1269,8 +1297,8 @@ struct kvm_x86_ops {
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
- int pgd_level);
+ void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int root_level);
bool (*has_wbinvd_exit)(void);
@@ -1339,6 +1367,7 @@ struct kvm_x86_ops {
int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
@@ -1357,6 +1386,7 @@ struct kvm_x86_ops {
struct kvm_x86_nested_ops {
int (*check_events)(struct kvm_vcpu *vcpu);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+ void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
unsigned user_data_size);
@@ -1428,9 +1458,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1440,8 +1467,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1538,6 +1563,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
+int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
+int kvm_emulate_invd(struct kvm_vcpu *vcpu);
+int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
+int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -1566,14 +1596,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
-int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -1614,9 +1644,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
-int kvm_mmu_load(struct kvm_vcpu *vcpu);
-void kvm_mmu_unload(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -1735,11 +1762,7 @@ asmlinkage void kvm_spurious_fault(void);
_ASM_EXTABLE(666b, 667b)
#define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 31c4df123aa0..9c80c68d75b5 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -20,7 +20,6 @@
extern u64 sme_me_mask;
extern u64 sev_status;
-extern bool sev_enabled;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
unsigned long decrypted_kernel_vaddr,
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1c561945b426..772e60efe243 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -269,7 +269,9 @@ struct vmcb_save_area {
* SEV-ES guests when referenced through the GHCB or for
* saving to the host save area.
*/
- u8 reserved_7[80];
+ u8 reserved_7[72];
+ u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
+ u8 reserved_7b[4];
u32 pkru;
u8 reserved_7a[20];
u64 reserved_8; /* rax already available at 0x01f8 */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 358707f60d99..0ffaa3156a4e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -373,6 +373,7 @@ enum vmcs_field {
#define GUEST_INTR_STATE_MOV_SS 0x00000002
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010
/* GUEST_ACTIVITY_STATE flags */
#define GUEST_ACTIVITY_ACTIVE 0
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b8e650a985e3..946d761adbd3 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -27,6 +27,7 @@
#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5d32fa477a62..d307c22e5c18 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void)
}
}
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
static bool pv_tlb_flush_supported(void)
{
return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
@@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void)
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
}
-static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
-
-#ifdef CONFIG_SMP
-
static bool pv_ipi_supported(void)
{
return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
@@ -574,6 +574,54 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
}
}
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+ * We have to call flush only on online vCPUs. And
+ * queue flush_on_enter for pre-empted vCPUs
+ */
+ for_each_cpu(cpu, flushmask) {
+ /*
+ * The local vCPU is never preempted, so we do not explicitly
+ * skip check for local vCPU - it will never be cleared from
+ * flushmask.
+ */
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_multi(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+ int cpu;
+
+ if (!kvm_para_available() || nopv)
+ return 0;
+
+ if (pv_tlb_flush_supported() || pv_ipi_supported())
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+
+ return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
@@ -611,38 +659,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
local_irq_enable();
return 0;
}
-#endif
-
-static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
-{
- u8 state;
- int cpu;
- struct kvm_steal_time *src;
- struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
-
- cpumask_copy(flushmask, cpumask);
- /*
- * We have to call flush only on online vCPUs. And
- * queue flush_on_enter for pre-empted vCPUs
- */
- for_each_cpu(cpu, flushmask) {
- /*
- * The local vCPU is never preempted, so we do not explicitly
- * skip check for local vCPU - it will never be cleared from
- * flushmask.
- */
- src = &per_cpu(steal_time, cpu);
- state = READ_ONCE(src->preempted);
- if ((state & KVM_VCPU_PREEMPTED)) {
- if (try_cmpxchg(&src->preempted, &state,
- state | KVM_VCPU_FLUSH_TLB))
- __cpumask_clear_cpu(cpu, flushmask);
- }
- }
- native_flush_tlb_multi(flushmask, info);
-}
+#endif
static void __init kvm_guest_init(void)
{
@@ -658,12 +676,6 @@ static void __init kvm_guest_init(void)
static_call_update(pv_steal_clock, kvm_steal_clock);
}
- if (pv_tlb_flush_supported()) {
- pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
- pr_info("KVM setup pv remote TLB flush\n");
- }
-
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
@@ -673,6 +685,12 @@ static void __init kvm_guest_init(void)
}
#ifdef CONFIG_SMP
+ if (pv_tlb_flush_supported()) {
+ pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
+ pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (pv_sched_yield_supported()) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
@@ -739,7 +757,7 @@ static uint32_t __init kvm_detect(void)
static void __init kvm_apic_init(void)
{
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
if (pv_ipi_supported())
kvm_setup_pv_ipi();
#endif
@@ -799,32 +817,6 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
-static __init int kvm_alloc_cpumask(void)
-{
- int cpu;
- bool alloc = false;
-
- if (!kvm_para_available() || nopv)
- return 0;
-
- if (pv_tlb_flush_supported())
- alloc = true;
-
-#if defined(CONFIG_SMP)
- if (pv_ipi_supported())
- alloc = true;
-#endif
-
- if (alloc)
- for_each_possible_cpu(cpu) {
- zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
- GFP_KERNEL, cpu_to_node(cpu));
- }
-
- return 0;
-}
-arch_initcall(kvm_alloc_cpumask);
-
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index eafc4d601f25..c589db5d91b3 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -23,6 +23,8 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c02466a1410b..19606a341888 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
@@ -28,7 +29,7 @@
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
* aligned to sizeof(unsigned long) because it's not accessed via bitops.
*/
-u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
@@ -53,6 +54,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
}
#define F feature_bit
+#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
@@ -170,6 +172,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ /*
+ * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+ * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+ * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
+ * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+ * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
+ * '1' even on CPUs that don't support XSAVE.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+ if (best) {
+ best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
+ best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+ best->ecx |= XFEATURE_MASK_FPSSE;
+ }
+
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -347,13 +364,13 @@ out:
return r;
}
-static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
+static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
{
const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
struct kvm_cpuid_entry2 entry;
reverse_cpuid_check(leaf);
- kvm_cpu_caps[leaf] &= mask;
cpuid_count(cpuid.function, cpuid.index,
&entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
@@ -361,6 +378,27 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
}
+static __always_inline
+void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+ BUILD_BUG_ON(leaf < NCAPINTS);
+
+ kvm_cpu_caps[leaf] = mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
+static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
+{
+ /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+ BUILD_BUG_ON(leaf >= NCAPINTS);
+
+ kvm_cpu_caps[leaf] &= mask;
+
+ __kvm_cpu_cap_mask(leaf);
+}
+
void kvm_set_cpu_caps(void)
{
unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
@@ -371,12 +409,13 @@ void kvm_set_cpu_caps(void)
unsigned int f_gbpages = 0;
unsigned int f_lm = 0;
#endif
+ memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
- BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
sizeof(boot_cpu_data.x86_capability));
memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
- sizeof(kvm_cpu_caps));
+ sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
kvm_cpu_cap_mask(CPUID_1_ECX,
/*
@@ -407,7 +446,7 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_7_0_EBX,
- F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+ F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
@@ -418,7 +457,8 @@ void kvm_set_cpu_caps(void)
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
+ F(SGX_LC)
);
/* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57))
@@ -457,6 +497,10 @@ void kvm_set_cpu_caps(void)
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
);
+ kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+ SF(SGX1) | SF(SGX2)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -514,6 +558,10 @@ void kvm_set_cpu_caps(void)
*/
kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
+ kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
+ 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+ F(SME_COHERENT));
+
kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
@@ -778,6 +826,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
}
break;
+ case 0x12:
+ /* Intel SGX */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
+ * and max enclave sizes. The SGX sub-features and MISCSELECT
+ * are restricted by kernel and KVM capabilities (like most
+ * feature flags), while enclave size is unrestricted.
+ */
+ cpuid_entry_override(entry, CPUID_12_EAX);
+ entry->ebx &= SGX_MISC_EXINFO;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ /*
+ * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
+ * feature flags. Advertise all supported flags, including
+ * privileged attributes that require explicit opt-in from
+ * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
+ * expected to derive it from supported XCR0.
+ */
+ entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
+ SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
+ SGX_ATTR_KSS;
+ entry->ebx &= 0;
+ break;
/* Intel PT */
case 0x14:
if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
@@ -869,8 +949,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
/* Support memory encryption cpuid if host supports it */
case 0x8000001F:
- if (!boot_cpu_has(X86_FEATURE_SEV))
+ if (!kvm_cpu_cap_has(X86_FEATURE_SEV))
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ else
+ cpuid_entry_override(entry, CPUID_8000_001F_EAX);
break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 2a0c5064497f..c99edfff7f82 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -3,11 +3,12 @@
#define ARCH_X86_KVM_CPUID_H
#include "x86.h"
+#include "reverse_cpuid.h"
#include <asm/cpu.h>
#include <asm/processor.h>
#include <uapi/asm/kvm_para.h>
-extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
@@ -58,144 +59,8 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
}
-struct cpuid_reg {
- u32 function;
- u32 index;
- int reg;
-};
-
-static const struct cpuid_reg reverse_cpuid[] = {
- [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
- [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
- [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
- [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
- [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
- [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
- [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
- [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
- [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
- [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
- [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
- [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
- [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
- [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
- [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
-};
-
-/*
- * Reverse CPUID and its derivatives can only be used for hardware-defined
- * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
- * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
- * is nonsensical as the bit number/mask is an arbitrary software-defined value
- * and can't be used by KVM to query/control guest capabilities. And obviously
- * the leaf being queried must have an entry in the lookup table.
- */
-static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
-{
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
- BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
- BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
- BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
-}
-
-/*
- * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
- * the hardware defined bit number (stored in bits 4:0) and a software defined
- * "word" (stored in bits 31:5). The word is used to index into arrays of
- * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
- */
-static __always_inline u32 __feature_bit(int x86_feature)
-{
- reverse_cpuid_check(x86_feature / 32);
- return 1 << (x86_feature & 31);
-}
-
-#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
-
-static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
-{
- unsigned int x86_leaf = x86_feature / 32;
-
- reverse_cpuid_check(x86_leaf);
- return reverse_cpuid[x86_leaf];
-}
-
-static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
- u32 reg)
-{
- switch (reg) {
- case CPUID_EAX:
- return &entry->eax;
- case CPUID_EBX:
- return &entry->ebx;
- case CPUID_ECX:
- return &entry->ecx;
- case CPUID_EDX:
- return &entry->edx;
- default:
- BUILD_BUG();
- return NULL;
- }
-}
-
-static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
-
- return __cpuid_entry_get_reg(entry, cpuid.reg);
-}
-
-static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- return *reg & __feature_bit(x86_feature);
-}
-
-static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- return cpuid_entry_get(entry, x86_feature);
-}
-
-static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- *reg &= ~__feature_bit(x86_feature);
-}
-
-static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- *reg |= __feature_bit(x86_feature);
-}
-
-static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
- unsigned int x86_feature,
- bool set)
-{
- u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
-
- /*
- * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
- * compiler into using CMOV instead of Jcc when possible.
- */
- if (set)
- *reg |= __feature_bit(x86_feature);
- else
- *reg &= ~__feature_bit(x86_feature);
-}
-
static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
- enum cpuid_leafs leaf)
+ unsigned int leaf)
{
u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
@@ -248,6 +113,14 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
}
+static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
+}
+
static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -308,7 +181,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
@@ -316,7 +189,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
@@ -324,7 +197,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
{
- unsigned int x86_leaf = x86_feature / 32;
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
reverse_cpuid_check(x86_leaf);
return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cdd2a2b6550e..77e1c89a95a7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4220,7 +4220,7 @@ static bool valid_cr(int nr)
}
}
-static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+static int check_cr_access(struct x86_emulate_ctxt *ctxt)
{
if (!valid_cr(ctxt->modrm_reg))
return emulate_ud(ctxt);
@@ -4228,80 +4228,6 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int check_cr_write(struct x86_emulate_ctxt *ctxt)
-{
- u64 new_val = ctxt->src.val64;
- int cr = ctxt->modrm_reg;
- u64 efer = 0;
-
- static u64 cr_reserved_bits[] = {
- 0xffffffff00000000ULL,
- 0, 0, 0, /* CR3 checked later */
- CR4_RESERVED_BITS,
- 0, 0, 0,
- CR8_RESERVED_BITS,
- };
-
- if (!valid_cr(cr))
- return emulate_ud(ctxt);
-
- if (new_val & cr_reserved_bits[cr])
- return emulate_gp(ctxt, 0);
-
- switch (cr) {
- case 0: {
- u64 cr4;
- if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
- ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
- return emulate_gp(ctxt, 0);
-
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
- if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
- !(cr4 & X86_CR4_PAE))
- return emulate_gp(ctxt, 0);
-
- break;
- }
- case 3: {
- u64 rsvd = 0;
-
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA) {
- u64 maxphyaddr;
- u32 eax, ebx, ecx, edx;
-
- eax = 0x80000008;
- ecx = 0;
- if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
- &edx, true))
- maxphyaddr = eax & 0xff;
- else
- maxphyaddr = 36;
- rsvd = rsvd_bits(maxphyaddr, 63);
- if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
- rsvd &= ~X86_CR3_PCID_NOFLUSH;
- }
-
- if (new_val & rsvd)
- return emulate_gp(ctxt, 0);
-
- break;
- }
- case 4: {
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
-
- if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
- return emulate_gp(ctxt, 0);
-
- break;
- }
- }
-
- return X86EMUL_CONTINUE;
-}
-
static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
{
unsigned long dr7;
@@ -4841,10 +4767,10 @@ static const struct opcode twobyte_table[256] = {
D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */
/* 0x20 - 0x2F */
- DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access),
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
- check_cr_write),
+ check_cr_access),
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
check_dr_write),
N, N, N, N,
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 2e11da2f5621..3db5c42c9ecd 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -62,7 +62,12 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
}
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
+/*
+ * The "raw" register helpers are only for cases where the full 64 bits of a
+ * register are read/written irrespective of current vCPU mode. In other words,
+ * odds are good you shouldn't be using the raw variants.
+ */
+static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
{
if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
return 0;
@@ -73,8 +78,8 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
return vcpu->arch.regs[reg];
}
-static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
- unsigned long val)
+static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
+ unsigned long val)
{
if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
return;
@@ -85,22 +90,22 @@ static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
- return kvm_register_read(vcpu, VCPU_REGS_RIP);
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RIP);
}
static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
- kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+ kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val);
}
static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
{
- return kvm_register_read(vcpu, VCPU_REGS_RSP);
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RSP);
}
static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
{
- kvm_register_write(vcpu, VCPU_REGS_RSP, val);
+ kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val);
}
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cc369b9ad8f1..152591f9243a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -296,6 +296,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
+
+ /* Check if there are APF page ready requests pending */
+ if (enabled)
+ kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
@@ -2261,6 +2265,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
static_branch_slow_dec_deferred(&apic_hw_disabled);
+ /* Check if there are APF page ready requests pending */
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
} else {
static_branch_inc(&apic_hw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
@@ -2869,7 +2875,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
return;
if (is_guest_mode(vcpu)) {
- r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ r = kvm_check_nested_events(vcpu);
if (r < 0)
return;
/*
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c68bfc3e2402..88d0ed5225a4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
@@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
@@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
- vcpu->arch.mmu->shadow_root_level);
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->shadow_root_level);
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* write-protects guest page to sync the guest modification, b) another one is
* used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
* between these two sorts are:
- * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 1) the first case clears MMU-writable bit.
* 2) the first case requires flushing tlb immediately avoiding corrupting
* shadow page table between all vcpus so it should be in the protection of
* mmu-lock. And the another case does not need to flush tlb until returning
@@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* So, there is the problem: the first case can meet the corrupted tlb caused
* by another case which write-protects pages but without flush tlb
* immediately. In order to making the first case be aware this problem we let
- * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
- * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ * it flush tlb if we try to write-protect a spte whose MMU-writable bit
+ * is set, it works since another case never touches MMU-writable bit.
*
* Anyway, whenever a spte is updated (only permission and status bits are
- * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * changed) we need to check whether the spte with MMU-writable becomes
* readonly, if that happens, we need to flush tlb. Fortunately,
* mmu_spte_update() has already handled it perfectly.
*
- * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * The rules to use MMU-writable and PT_WRITABLE_MASK:
* - if we want to see if it has writable tlb entry or if the spte can be
- * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ * writable on the mmu mapping, check MMU-writable, this is the most
* case, otherwise
* - if we fix page fault on the spte or do write-protection by dirty logging,
* check PT_WRITABLE_MASK.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 62b1729277ef..4b3ee244ebe0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -48,6 +48,7 @@
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
@@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void)
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
- u64 mask = make_mmio_spte(vcpu, gfn, access);
+ u64 spte = make_mmio_spte(vcpu, gfn, access);
- trace_mark_mmio_spte(sptep, gfn, mask);
- mmu_spte_set(sptep, mask);
+ trace_mark_mmio_spte(sptep, gfn, spte);
+ mmu_spte_set(sptep, spte);
}
static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte)
return spte & shadow_mmio_access_mask;
}
-static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access)
-{
- if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(vcpu, sptep, gfn, access);
- return true;
- }
-
- return false;
-}
-
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
u64 kvm_gen, spte_gen, gen;
@@ -725,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
* handling slots that are not large page aligned.
*/
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+ const struct kvm_memory_slot *slot, int level)
{
unsigned long idx;
@@ -1118,7 +1107,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
- spte &= ~SPTE_MMU_WRITEABLE;
+ spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
return mmu_spte_update(sptep, spte);
@@ -1308,26 +1297,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
return flush;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
return kvm_zap_rmapp(kvm, rmap_head, slot);
}
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t pte)
{
u64 *sptep;
struct rmap_iterator iter;
int need_flush = 0;
u64 new_spte;
- pte_t *ptep = (pte_t *)data;
kvm_pfn_t new_pfn;
- WARN_ON(pte_huge(*ptep));
- new_pfn = pte_pfn(*ptep);
+ WARN_ON(pte_huge(pte));
+ new_pfn = pte_pfn(pte);
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
@@ -1336,7 +1324,7 @@ restart:
need_flush = 1;
- if (pte_write(*ptep)) {
+ if (pte_write(pte)) {
pte_list_remove(rmap_head, sptep);
goto restart;
} else {
@@ -1424,93 +1412,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-static __always_inline int
-kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn,
- int level,
- unsigned long data))
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t pte);
+
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ rmap_handler_t handler)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
struct slot_rmap_walk_iterator iterator;
- int ret = 0;
- int i;
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
+ bool ret = false;
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- gfn_start, gfn_end - 1,
- &iterator)
- ret |= handler(kvm, iterator.rmap, memslot,
- iterator.gfn, iterator.level, data);
- }
- }
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator)
+ ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+ iterator.level, range->pte);
return ret;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn, int level,
- unsigned long data))
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
- unsigned flags)
-{
- int r;
+ bool flush;
- r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+ flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
- return r;
+ return flush;
}
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int r;
+ bool flush;
- r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
if (is_tdp_mmu_enabled(kvm))
- r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+ flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
- return r;
+ return flush;
}
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1519,13 +1466,12 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
for_each_rmap_spte(rmap_head, &iter, sptep)
young |= mmu_spte_age(sptep);
- trace_kvm_age_page(gfn, level, slot, young);
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, pte_t unused)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1547,29 +1493,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
- kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(sp->role.level));
}
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young;
+
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
- young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+ young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
return young;
}
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- int young = false;
+ bool young;
+
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
- young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
- young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+ young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
return young;
}
@@ -2421,6 +2369,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
+ /*
+ * Note, this check is intentionally soft, it only guarantees that one
+ * page is available, while the caller may end up allocating as many as
+ * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
+ * exceeding the (arbitrary by default) limit will not harm the host,
+ * being too agressive may unnecessarily kill the guest, and getting an
+ * exact count is far more trouble than it's worth, especially in the
+ * page fault paths.
+ */
if (!kvm_mmu_available_pages(vcpu->kvm))
return -ENOSPC;
return 0;
@@ -2561,9 +2518,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
struct kvm_mmu_page *sp;
int ret;
- if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
- return 0;
-
sp = sptep_to_sp(sptep);
ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
@@ -2593,6 +2547,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ return RET_PF_EMULATE;
+ }
+
if (is_shadow_present_pte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
@@ -2626,9 +2585,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
KVM_PAGES_PER_HPAGE(level));
- if (unlikely(is_mmio_spte(*sptep)))
- ret = RET_PF_EMULATE;
-
/*
* The fault is fully spurious if and only if the new SPTE and old SPTE
* are identical, and emulation is not required.
@@ -2745,7 +2701,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
unsigned long hva;
pte_t *pte;
@@ -2771,8 +2727,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
return level;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
@@ -2946,9 +2903,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
return true;
}
- if (unlikely(is_noslot_pfn(pfn)))
+ if (unlikely(is_noslot_pfn(pfn))) {
vcpu_cache_mmio_info(vcpu, gva, gfn,
access & shadow_mmio_access_mask);
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!shadow_mmio_value)) {
+ *ret_val = RET_PF_EMULATE;
+ return true;
+ }
+ }
return false;
}
@@ -3061,6 +3028,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (!is_shadow_present_pte(spte))
break;
+ if (!is_shadow_present_pte(spte))
+ break;
+
sp = sptep_to_sp(iterator.sptep);
if (!is_last_spte(spte, sp->role.level))
break;
@@ -3150,12 +3120,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
- if (kvm_mmu_put_root(kvm, sp)) {
- if (is_tdp_mmu_page(sp))
- kvm_tdp_mmu_free_root(kvm, sp);
- else if (sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
- }
+ if (is_tdp_mmu_page(sp))
+ kvm_tdp_mmu_put_root(kvm, sp, false);
+ else if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
*root_hpa = INVALID_PAGE;
}
@@ -3193,14 +3161,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
- } else {
- for (i = 0; i < 4; ++i)
- if (mmu->pae_root[i] != 0)
- mmu_free_root_page(kvm,
- &mmu->pae_root[i],
- &invalid_list);
- mmu->root_hpa = INVALID_PAGE;
+ } else if (mmu->pae_root) {
+ for (i = 0; i < 4; ++i) {
+ if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+ continue;
+
+ mmu_free_root_page(kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ }
}
+ mmu->root_hpa = INVALID_PAGE;
mmu->root_pgd = 0;
}
@@ -3226,155 +3197,208 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
{
struct kvm_mmu_page *sp;
- write_lock(&vcpu->kvm->mmu_lock);
-
- if (make_mmu_pages_available(vcpu)) {
- write_unlock(&vcpu->kvm->mmu_lock);
- return INVALID_PAGE;
- }
sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
++sp->root_count;
- write_unlock(&vcpu->kvm->mmu_lock);
return __pa(sp->spt);
}
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
- u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u8 shadow_root_level = mmu->shadow_root_level;
hpa_t root;
unsigned i;
+ int r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->root_hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
- root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
- true);
-
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+ mmu->root_hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, true);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
+ mmu->pae_root[i] = root | PT_PRESENT_MASK |
+ shadow_me_mask;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
- } else
- BUG();
+ mmu->root_hpa = __pa(mmu->pae_root);
+ } else {
+ WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+ r = -EIO;
+ goto out_unlock;
+ }
/* root_pgd is ignored for direct MMUs. */
- vcpu->arch.mmu->root_pgd = 0;
-
- return 0;
+ mmu->root_pgd = 0;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
}
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
- u64 pdptr, pm_mask;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 pdptrs[4], pm_mask;
gfn_t root_gfn, root_pgd;
hpa_t root;
- int i;
+ unsigned i;
+ int r;
- root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ root_pgd = mmu->get_guest_pgd(vcpu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
/*
+ * On SVM, reading PDPTRs might access guest memory, which might fault
+ * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
+ */
+ if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ if (!(pdptrs[i] & PT_PRESENT_MASK))
+ continue;
+
+ if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ return 1;
+ }
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ /*
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
+ if (mmu->root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
- vcpu->arch.mmu->shadow_root_level, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->root_hpa = root;
+ mmu->shadow_root_level, false);
+ mmu->root_hpa = root;
goto set_root_pgd;
}
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
/*
* We shadow a 32 bit page table. This may be a legacy 2-level
* or a PAE 3-level page table. In either case we need to be aware that
* the shadow page table may be a PAE or a long mode page table.
*/
- pm_mask = PT_PRESENT_MASK;
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ pm_mask = PT_PRESENT_MASK | shadow_me_mask;
+ if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+ if (WARN_ON_ONCE(!mmu->lm_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+ }
+
for (i = 0; i < 4; ++i) {
- MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
- if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
- pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
- if (!(pdptr & PT_PRESENT_MASK)) {
- vcpu->arch.mmu->pae_root[i] = 0;
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
continue;
}
- root_gfn = pdptr >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
+ root_gfn = pdptrs[i] >> PAGE_SHIFT;
}
root = mmu_alloc_root(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, false);
- if (!VALID_PAGE(root))
- return -ENOSPC;
- vcpu->arch.mmu->pae_root[i] = root | pm_mask;
+ mmu->pae_root[i] = root | pm_mask;
}
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
+
+ if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ mmu->root_hpa = __pa(mmu->lm_root);
+ else
+ mmu->root_hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+ mmu->root_pgd = root_pgd;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return 0;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 *lm_root, *pae_root;
/*
- * If we shadow a 32 bit page table with a long mode page
- * table we enter this path.
+ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+ * tables are allocated and initialized at root creation as there is no
+ * equivalent level in the guest's NPT to shadow. Allocate the tables
+ * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
*/
- if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
- if (vcpu->arch.mmu->lm_root == NULL) {
- /*
- * The additional page necessary for this is only
- * allocated on demand.
- */
+ if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
+ mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+ return 0;
- u64 *lm_root;
+ /*
+ * This mess only works with 4-level paging and needs to be updated to
+ * work with 5-level paging.
+ */
+ if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
+ return -EIO;
- lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (lm_root == NULL)
- return 1;
+ if (mmu->pae_root && mmu->lm_root)
+ return 0;
- lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
+ /*
+ * The special roots should always be allocated in concert. Yell and
+ * bail if KVM ends up in a state where only one of the roots is valid.
+ */
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+ return -EIO;
- vcpu->arch.mmu->lm_root = lm_root;
- }
+ /*
+ * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+ * doesn't need to be decrypted.
+ */
+ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pae_root)
+ return -ENOMEM;
- vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
+ lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!lm_root) {
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
}
-set_root_pgd:
- vcpu->arch.mmu->root_pgd = root_pgd;
+ mmu->pae_root = pae_root;
+ mmu->lm_root = lm_root;
return 0;
}
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.mmu->direct_map)
- return mmu_alloc_direct_roots(vcpu);
- else
- return mmu_alloc_shadow_roots(vcpu);
-}
-
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
@@ -3422,7 +3446,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
- if (root && VALID_PAGE(root)) {
+ if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
mmu_sync_children(vcpu, sp);
@@ -3554,11 +3578,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
__is_rsvd_bits_set(rsvd_check, sptes[level], level);
if (reserved) {
- pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+ pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
__func__, addr);
for (level = root; level >= leaf; level--)
- pr_err("------ spte 0x%llx level %d.\n",
- sptes[level], level);
+ pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+ sptes[level], level,
+ rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
}
return reserved;
@@ -3653,6 +3678,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
+ return true;
+
/* Don't expose private memslots to L2. */
if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
*pfn = KVM_PFN_NOSLOT;
@@ -4615,12 +4648,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
- context->shadow_root_level = new_role.base.level;
-
__kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
- if (new_role.as_u64 != context->mmu_role.as_u64)
+ if (new_role.as_u64 != context->mmu_role.as_u64) {
shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+
+ /*
+ * Override the level set by the common init helper, nested TDP
+ * always uses the host's TDP configuration.
+ */
+ context->shadow_root_level = new_role.base.level;
+ }
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4802,16 +4840,23 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
if (r)
goto out;
- r = mmu_alloc_roots(vcpu);
- kvm_mmu_sync_roots(vcpu);
+ r = mmu_alloc_special_roots(vcpu);
+ if (r)
+ goto out;
+ if (vcpu->arch.mmu->direct_map)
+ r = mmu_alloc_direct_roots(vcpu);
+ else
+ r = mmu_alloc_shadow_roots(vcpu);
if (r)
goto out;
+
+ kvm_mmu_sync_roots(vcpu);
+
kvm_mmu_load_pgd(vcpu);
static_call(kvm_x86_tlb_flush_current)(vcpu);
out:
return r;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@@ -4820,7 +4865,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
}
-EXPORT_SYMBOL_GPL(kvm_mmu_unload);
static bool need_remote_flush(u64 old, u64 new)
{
@@ -5169,10 +5213,10 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_
static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+ gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool flush = false;
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
@@ -5180,7 +5224,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
flush |= fn(kvm, iterator.rmap, memslot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && lock_flush_tlb) {
+ if (flush && flush_on_yield) {
kvm_flush_remote_tlbs_with_address(kvm,
start_gfn,
iterator.gfn - start_gfn + 1);
@@ -5190,36 +5234,32 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
}
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
- end_gfn - start_gfn + 1);
- flush = false;
- }
-
return flush;
}
static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+ bool flush_on_yield)
{
return slot_handle_level_range(kvm, memslot, fn, start_level,
end_level, memslot->base_gfn,
memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
+ flush_on_yield, false);
}
static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- PG_LEVEL_4K, lock_flush_tlb);
+ PG_LEVEL_4K, flush_on_yield);
}
static void free_mmu_pages(struct kvm_mmu *mmu)
{
+ if (!tdp_enabled && mmu->pae_root)
+ set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->lm_root);
}
@@ -5240,9 +5280,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
* while the PDP table is a per-vCPU construct that's allocated at MMU
* creation. When emulating 32-bit mode, cr3 is only 32 bits even on
* x86_64. Therefore we need to allocate the PDP table in the first
- * 4GB of memory, which happens to fit the DMA32 zone. Except for
- * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
- * skip allocating the PDP table.
+ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
+ * generally doesn't use PAE paging and can skip allocating the PDP
+ * table. The main exception, handled here, is SVM's 32-bit NPT. The
+ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+ * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
*/
if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
@@ -5252,8 +5294,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
return -ENOMEM;
mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_mask);
+
for (i = 0; i < 4; ++i)
- mmu->pae_root[i] = INVALID_PAGE;
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
return 0;
}
@@ -5365,6 +5421,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
*/
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+ /* In order to ensure all threads see this change when
+ * handling the MMU reload signal, this must happen in the
+ * same critical section as kvm_reload_remote_mmus, and
+ * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
+ * could drop the MMU lock and yield.
+ */
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_invalidate_all_roots(kvm);
+
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
* Then all vcpus will switch to new shadow page table with the new
@@ -5377,10 +5442,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_zap_obsolete_pages(kvm);
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_zap_all(kvm);
-
write_unlock(&kvm->mmu_lock);
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
+ read_unlock(&kvm->mmu_lock);
+ }
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5420,7 +5488,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
- bool flush;
+ bool flush = false;
write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -5433,20 +5501,31 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (start >= end)
continue;
- slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
- PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true);
+ flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PG_LEVEL_4K,
+ KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
}
}
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+ write_unlock(&kvm->mmu_lock);
+
if (is_tdp_mmu_enabled(kvm)) {
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+ flush = false;
+
+ read_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+ gfn_end, flush, true);
if (flush)
- kvm_flush_remote_tlbs(kvm);
- }
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end);
- write_unlock(&kvm->mmu_lock);
+ read_unlock(&kvm->mmu_lock);
+ }
}
static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5465,10 +5544,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+ read_unlock(&kvm->mmu_lock);
+ }
+
/*
* We can flush all the TLBs out of the mmu lock without TLB
* corruption since we just change the spte from writable to
@@ -5476,9 +5559,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
* spte from present to present (changing the spte from present
* to nonpresent will flush all the TLBs immediately), in other
* words, the only case we care is mmu_spte_update() where we
- * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
- * instead of PT_WRITABLE_MASK, that means it does not depend
- * on PT_WRITABLE_MASK anymore.
+ * have checked Host-writable | MMU-writable instead of
+ * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
+ * anymore.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5529,21 +5612,32 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
+ bool flush;
write_lock(&kvm->mmu_lock);
- slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (is_tdp_mmu_enabled(kvm))
- kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
+
+ if (is_tdp_mmu_enabled(kvm)) {
+ flush = false;
+
+ read_lock(&kvm->mmu_lock);
+ flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ read_unlock(&kvm->mmu_lock);
+ }
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
/*
* All current use cases for flushing the TLBs for a specific memslot
- * are related to dirty logging, and do the TLB flush out of mmu_lock.
+ * related to dirty logging, and many do the TLB flush out of mmu_lock.
* The interaction between the various operations on memslot must be
* serialized by slots_locks to ensure the TLB flush from one operation
* is observed by any other operation on the same memslot.
@@ -5560,10 +5654,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu_enabled(kvm)) {
+ read_lock(&kvm->mmu_lock);
+ flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+ read_unlock(&kvm->mmu_lock);
+ }
+
/*
* It's also safe to flush TLBs out of mmu lock here as currently this
* function is only used for dirty logging, in which case flushing TLB
@@ -5701,25 +5799,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
-static void kvm_set_mmio_spte_mask(void)
-{
- u64 mask;
-
- /*
- * Set a reserved PA bit in MMIO SPTEs to generate page faults with
- * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
- * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
- * 52-bit physical addresses then there are no reserved PA bits in the
- * PTEs and so the reserved PA approach must be disabled.
- */
- if (shadow_phys_bits < 52)
- mask = BIT_ULL(51) | PT_PRESENT_MASK;
- else
- mask = 0;
-
- kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
-}
-
static bool get_nx_auto_mode(void)
{
/* Return true when CPU has the bug, and mitigations are ON */
@@ -5785,8 +5864,6 @@ int kvm_mmu_module_init(void)
kvm_mmu_reset_all_pte_masks();
- kvm_set_mmio_spte_mask();
-
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, SLAB_ACCOUNT, NULL);
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index ced15fd58fde..cedc17b2f60e 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu->pae_root[i];
- if (root && VALID_PAGE(root)) {
+ if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
__mmu_spte_walk(vcpu, sp, fn, 2);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 360983865398..d64ccb417c60 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -20,6 +20,16 @@ extern bool dbg;
#define MMU_WARN_ON(x) do { } while (0)
#endif
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid. And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT 0
+#define IS_VALID_PAE_ROOT(x) (!!(x))
+
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
@@ -40,7 +50,11 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- int root_count; /* Currently serving as active root */
+ /* Currently serving as active root */
+ union {
+ int root_count;
+ refcount_t tdp_mmu_root_count;
+ };
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
DECLARE_BITMAP(unsync_child_bitmap, 512);
@@ -78,9 +92,14 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+ return role.smm ? 1 : 0;
+}
+
static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
- return sp->role.smm ? 1 : 0;
+ return kvm_mmu_role_as_id(sp->role);
}
static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
@@ -108,22 +127,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- BUG_ON(!sp->root_count);
- lockdep_assert_held(&kvm->mmu_lock);
-
- ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- lockdep_assert_held(&kvm->mmu_lock);
- --sp->root_count;
-
- return !sp->root_count;
-}
-
/*
* Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
*
@@ -146,8 +149,9 @@ enum {
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
#define SET_SPTE_SPURIOUS BIT(2)
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level);
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 55d7b473ac44..70b7e44e3035 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -503,6 +503,7 @@ error:
#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.async_page_fault = false;
trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
@@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
nr_present++;
- host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+ host_writable = sp->spt[i] & shadow_host_writable_mask;
set_spte_ret |= set_spte(vcpu, &sp->spt[i],
pte_access, PG_LEVEL_4K,
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index ef55f0bc4ccf..66d43cec0c31 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -16,13 +16,20 @@
#include "spte.h"
#include <asm/e820/api.h>
+#include <asm/vmx.h>
+static bool __read_mostly enable_mmio_caching = true;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
u64 __read_mostly shadow_mmio_access_mask;
u64 __read_mostly shadow_present_mask;
u64 __read_mostly shadow_me_mask;
@@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
- BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen)
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
- u64 mask = generation_mmio_spte_mask(gen);
+ u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
+ WARN_ON_ONCE(!shadow_mmio_value);
+
access &= shadow_mmio_access_mask;
- mask |= shadow_mmio_value | access;
- mask |= gpa | shadow_nonpresent_or_rsvd_mask;
- mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ spte |= shadow_mmio_value | access;
+ spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+ spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
- return mask;
+ return spte;
}
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
bool can_unsync, bool host_writable, bool ad_disabled,
u64 *new_spte)
{
- u64 spte = 0;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
int ret = 0;
if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
else if (kvm_vcpu_ad_need_write_protect(vcpu))
- spte |= SPTE_AD_WRPROT_ONLY_MASK;
+ spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+
+ /*
+ * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set
+ * if PAE paging may be employed (shadow paging or any 32-bit KVM).
+ */
+ WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
+ (spte & SPTE_TDP_AD_MASK));
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
kvm_is_mmio_pfn(pfn));
if (host_writable)
- spte |= SPTE_HOST_WRITEABLE;
+ spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
@@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
- spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask;
/*
* Optimization: for pte sync, if spte was writable the hash
@@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
__func__, gfn);
ret |= SET_SPTE_WRITE_PROTECTED_PT;
pte_access &= ~ACC_WRITE_MASK;
- spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
}
@@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte = mark_spte_for_access_track(spte);
out:
+ WARN_ON(is_mmio_spte(spte));
*new_spte = spte;
return ret;
}
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
{
- u64 spte;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
- spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_mask;
+ spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (ad_disabled)
- spte |= SPTE_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED_MASK;
else
spte |= shadow_accessed_mask;
@@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
new_spte |= (u64)new_pfn << PAGE_SHIFT;
new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~SPTE_HOST_WRITEABLE;
+ new_spte &= ~shadow_host_writable_mask;
new_spte = mark_spte_for_access_track(new_spte);
@@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte)
return spte;
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
- WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
- shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
+
+ if (!enable_mmio_caching)
+ mmio_value = 0;
+
+ /*
+ * Disable MMIO caching if the MMIO value collides with the bits that
+ * are used to hold the relocated GFN when the L1TF mitigation is
+ * enabled. This should never fire as there is no known hardware that
+ * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+ * MMIO value are not susceptible to L1TF.
+ */
+ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+ SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+ mmio_value = 0;
+
+ /*
+ * The masked MMIO value must obviously match itself and a removed SPTE
+ * must not get a false positive. Removed SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * not set any RWX bits.
+ */
+ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+ WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ mmio_value = 0;
+
+ shadow_mmio_value = mmio_value;
+ shadow_mmio_mask = mmio_mask;
shadow_mmio_access_mask = access_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- * - Setting either @accessed_mask or @dirty_mask requires setting both
- * - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
{
- BUG_ON(!dirty_mask != !accessed_mask);
- BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
- shadow_present_mask = p_mask;
- shadow_acc_track_mask = acc_track_mask;
- shadow_me_mask = me_mask;
+ shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+ shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
+ shadow_nx_mask = 0ull;
+ shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_me_mask = 0ull;
+
+ shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
+
+ /*
+ * EPT Misconfigurations are generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+ VMX_EPT_RWX_MASK, 0);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
void kvm_mmu_reset_all_pte_masks(void)
{
u8 low_phys_bits;
-
- shadow_user_mask = 0;
- shadow_accessed_mask = 0;
- shadow_dirty_mask = 0;
- shadow_nx_mask = 0;
- shadow_x_mask = 0;
- shadow_present_mask = 0;
- shadow_acc_track_mask = 0;
+ u64 mask;
shadow_phys_bits = kvm_get_shadow_phys_bits();
@@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+ shadow_user_mask = PT_USER_MASK;
+ shadow_accessed_mask = PT_ACCESSED_MASK;
+ shadow_dirty_mask = PT_DIRTY_MASK;
+ shadow_nx_mask = PT64_NX_MASK;
+ shadow_x_mask = 0;
+ shadow_present_mask = PT_PRESENT_MASK;
+ shadow_acc_track_mask = 0;
+ shadow_me_mask = sme_me_mask;
+
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6de3950fd704..bca0ba11cccf 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -5,18 +5,33 @@
#include "mmu_internal.h"
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+/*
+ * A MMU present SPTE is backed by actual memory and may or may not be present
+ * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+. MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
/*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
+ * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * is must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE. This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
*/
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
+#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
@@ -51,16 +66,46 @@
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page. This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+ PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
/*
- * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
* the memslots generation and is derived as follows:
*
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -71,39 +116,44 @@
*/
#define MMIO_SPTE_GEN_LOW_START 3
-#define MMIO_SPTE_GEN_LOW_END 11
+#define MMIO_SPTE_GEN_LOW_END 10
-#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_START 52
#define MMIO_SPTE_GEN_HIGH_END 62
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+ (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
/* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
extern u64 __read_mostly shadow_me_mask;
/*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
* pages.
*/
@@ -121,28 +171,21 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
- PT64_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
-
-/*
* If a thread running without exclusive control of the MMU lock must perform a
* multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
- * This constant works because it is considered non-present on both AMD and
- * Intel CPUs and does not create a L1TF vulnerability because the pfn section
- * is zeroed out.
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability. Use only low bits to avoid 64-bit immediates.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE (1ull << 59)
+#define REMOVED_SPTE 0x5a0ULL
+
+/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_removed_spte(u64 spte)
{
@@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits;
static inline bool is_mmio_spte(u64 spte)
{
- return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
+ return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ likely(shadow_mmio_value);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+ return !!(pte & SPTE_MMU_PRESENT_MASK);
}
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
+ /*
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+ * and non-TDP SPTEs will never set these bits. Optimize for 64-bit
+ * TDP and do the A/D type check unconditionally.
+ */
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON(is_mmio_spte(spte));
+ MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
@@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte)
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
-static inline bool is_shadow_present_pte(u64 pte)
-{
- return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
-}
-
static inline bool is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
@@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte)
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
- return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
- (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+ return (spte & shadow_host_writable_mask) &&
+ (spte & shadow_mmu_writable_mask);
}
static inline u64 get_mmio_spte_generation(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 34207b874886..88f69a6cc492 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
if (!kvm->arch.tdp_mmu_enabled)
@@ -41,32 +50,85 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
rcu_barrier();
}
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
- if (kvm_mmu_put_root(kvm, root))
- kvm_tdp_mmu_free_root(kvm, root);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
}
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
- struct kvm_mmu_page *root)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
- lockdep_assert_held_write(&kvm->mmu_lock);
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
- if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
- return false;
+ tdp_mmu_free_sp(sp);
+}
- kvm_mmu_get_root(kvm, root);
- return true;
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
+
+ WARN_ON(!root->tdp_mmu_page);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+
+ zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
- struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root,
+ bool shared)
{
struct kvm_mmu_page *next_root;
- next_root = list_next_entry(root, link);
- tdp_mmu_put_root(kvm, root);
+ rcu_read_lock();
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link, typeof(*next_root), link);
+
+ rcu_read_unlock();
+
+ if (prev_root)
+ kvm_tdp_mmu_put_root(kvm, prev_root, shared);
+
return next_root;
}
@@ -75,35 +137,24 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* This makes it safe to release the MMU lock and yield within the loop, but
* if exiting the loop early, the caller must drop the reference to the most
* recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
*/
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
- for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \
- typeof(*_root), link); \
- tdp_mmu_next_root_valid(_kvm, _root); \
- _root = tdp_mmu_next_root(_kvm, _root))
-
-#define for_each_tdp_mmu_root(_kvm, _root) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush);
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WARN_ON(root->root_count);
- WARN_ON(!root->tdp_mmu_page);
-
- list_del(&root->link);
-
- zap_gfn_range(kvm, root, 0, max_gfn, false, false);
-
- free_page((unsigned long)root->spt);
- kmem_cache_free(mmu_page_header_cache, root);
-}
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
+ lockdep_is_held_type(&kvm->mmu_lock, 0) || \
+ lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \
+ if (kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
int level)
@@ -137,81 +188,46 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
return sp;
}
-static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
union kvm_mmu_page_role role;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+ lockdep_assert_held_write(&kvm->mmu_lock);
- write_lock(&kvm->mmu_lock);
+ role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
/* Check for an existing root before allocating a new one. */
- for_each_tdp_mmu_root(kvm, root) {
- if (root->role.word == role.word) {
- kvm_mmu_get_root(kvm, root);
- write_unlock(&kvm->mmu_lock);
- return root;
- }
+ for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ if (root->role.word == role.word &&
+ kvm_tdp_mmu_get_root(kvm, root))
+ goto out;
}
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
- root->root_count = 1;
-
- list_add(&root->link, &kvm->arch.tdp_mmu_roots);
-
- write_unlock(&kvm->mmu_lock);
-
- return root;
-}
-
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *root;
+ refcount_set(&root->tdp_mmu_root_count, 1);
- root = get_tdp_mmu_vcpu_root(vcpu);
- if (!root)
- return INVALID_PAGE;
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out:
return __pa(root->spt);
}
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
- free_page((unsigned long)sp->spt);
- kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
- struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
- rcu_head);
-
- tdp_mmu_free_sp(sp);
-}
-
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
- bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
return;
if (is_accessed_spte(old_spte) &&
- (!is_accessed_spte(new_spte) || pfn_changed))
+ (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
@@ -455,7 +471,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (was_leaf && is_dirty_spte(old_spte) &&
- (!is_dirty_spte(new_spte) || pfn_changed))
+ (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
/*
@@ -479,8 +495,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
}
/*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
*
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
@@ -488,9 +505,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* Returns: true if the SPTE was set, false if it was not. If false is returned,
* this function will have no side-effects.
*/
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
lockdep_assert_held_read(&kvm->mmu_lock);
@@ -498,19 +515,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
* Do not change removed SPTEs. Only the thread that froze the SPTE
* may modify it.
*/
- if (iter->old_spte == REMOVED_SPTE)
+ if (is_removed_spte(iter->old_spte))
return false;
if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
new_spte) != iter->old_spte)
return false;
- handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
+ __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+ handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
return true;
}
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+ return false;
+
+ handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ return true;
+}
+
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -569,7 +599,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(iter->old_spte == REMOVED_SPTE);
+ WARN_ON(is_removed_spte(iter->old_spte));
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
@@ -634,7 +664,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* Return false if a yield was not needed.
*/
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush)
+ struct tdp_iter *iter, bool flush,
+ bool shared)
{
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -646,7 +677,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
if (flush)
kvm_flush_remote_tlbs(kvm);
- cond_resched_rwlock_write(&kvm->mmu_lock);
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -664,24 +699,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
* If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup. Note, in some use cases a flush may be
- * required by prior actions. Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
*/
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush)
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared)
{
struct tdp_iter iter;
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
+retry:
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
flush = false;
continue;
}
@@ -699,8 +742,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
- flush = true;
+ if (!shared) {
+ tdp_mmu_set_spte(kvm, &iter, 0);
+ flush = true;
+ } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
}
rcu_read_unlock();
@@ -712,15 +764,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU in write mode.
*/
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+ gfn_t end, bool can_yield, bool flush,
+ bool shared)
{
struct kvm_mmu_page *root;
- bool flush = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root)
- flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+ flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+ shared);
return flush;
}
@@ -728,14 +786,116 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- bool flush;
+ bool flush = false;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+ flush, false);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root)
+{
+ struct kvm_mmu_page *next_root;
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root && !(next_root->role.invalid &&
+ refcount_read(&next_root->tdp_mmu_root_count)))
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link,
+ typeof(*next_root), link);
+
+ return next_root;
+}
+
+/*
+ * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+ gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ struct kvm_mmu_page *next_root;
+ struct kvm_mmu_page *root;
+ bool flush = false;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ root = next_invalidated_root(kvm, NULL);
+
+ while (root) {
+ next_root = next_invalidated_root(kvm, root);
+
+ rcu_read_unlock();
+
+ flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
+ true);
+
+ /*
+ * Put the reference acquired in
+ * kvm_tdp_mmu_invalidate_roots
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+
+ root = next_root;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
if (flush)
kvm_flush_remote_tlbs(kvm);
}
/*
+ * Mark each TDP MMU root as invalid so that other threads
+ * will drop their references and allow the root count to
+ * go to 0.
+ *
+ * Also take a reference on all roots so that this thread
+ * can do the bulk of the work required to free the roots
+ * once they are invalidated. Without this reference, a
+ * vCPU thread might drop the last reference to a root and
+ * get stuck with tearing down the entire paging structure.
+ *
+ * Roots which have a zero refcount should be skipped as
+ * they're already being torn down.
+ * Already invalid roots should be referenced again so that
+ * they aren't freed before kvm_tdp_mmu_zap_all_fast is
+ * done with them.
+ *
+ * This has essentially the same effect for the TDP MMU
+ * as updating mmu_valid_gen does for the shadow MMU.
+ */
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
+ if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+ root->role.invalid = true;
+}
+
+/*
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
*/
@@ -777,12 +937,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
ret = RET_PF_EMULATE;
- } else
+ } else {
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
+ }
- trace_kvm_mmu_set_spte(iter->level, iter->gfn,
- rcu_dereference(iter->sptep));
if (!prefault)
vcpu->stat.pf_fixed++;
@@ -882,199 +1041,139 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
return ret;
}
-static __always_inline int
-kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root,
- gfn_t start,
- gfn_t end,
- unsigned long data))
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
struct kvm_mmu_page *root;
- int ret = 0;
- int as_id;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- as_id = kvm_mmu_page_as_id(root);
- slots = __kvm_memslots(kvm, as_id);
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- ret |= handler(kvm, memslot, root, gfn_start,
- gfn_end, data);
- }
- }
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+ flush |= zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
- return ret;
+ return flush;
}
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start,
- gfn_t end, unsigned long unused)
-{
- return zap_gfn_range(kvm, root, start, end, false, false);
-}
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range);
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ tdp_handler_t handler)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- zap_gfn_range_hva_wrapper);
+ struct kvm_mmu_page *root;
+ struct tdp_iter iter;
+ bool ret = false;
+
+ rcu_read_lock();
+
+ /*
+ * Don't support rescheduling, none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code.
+ */
+ for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+ tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+ ret |= handler(kvm, &iter, range);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
}
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
*/
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start, gfn_t end,
- unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- int young = 0;
u64 new_spte = 0;
- rcu_read_lock();
+ /* If we have a non-accessed entry we don't need to change the pte. */
+ if (!is_accessed_spte(iter->old_spte))
+ return false;
- tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ new_spte = iter->old_spte;
+
+ if (spte_ad_enabled(new_spte)) {
+ new_spte &= ~shadow_accessed_mask;
+ } else {
/*
- * If we have a non-accessed entry we don't need to change the
- * pte.
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
*/
- if (!is_accessed_spte(iter.old_spte))
- continue;
-
- new_spte = iter.old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)&new_spte);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+ if (is_writable_pte(new_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(new_spte));
- new_spte = mark_spte_for_access_track(new_spte);
- }
- new_spte &= ~shadow_dirty_mask;
-
- tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
- young = 1;
-
- trace_kvm_age_page(iter.gfn, iter.level, slot, young);
+ new_spte = mark_spte_for_access_track(new_spte);
}
- rcu_read_unlock();
+ tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
- return young;
+ return true;
}
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
- age_gfn_range);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long unused2)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
-
- tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
- if (is_accessed_spte(iter.old_spte))
- return 1;
-
- return 0;
+ return is_accessed_spte(iter->old_spte);
}
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
- test_age_gfn);
+ return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
- unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_gfn_range *range)
{
- struct tdp_iter iter;
- pte_t *ptep = (pte_t *)data;
- kvm_pfn_t new_pfn;
u64 new_spte;
- int need_flush = 0;
-
- rcu_read_lock();
- WARN_ON(pte_huge(*ptep));
+ /* Huge pages aren't expected to be modified without first being zapped. */
+ WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
- new_pfn = pte_pfn(*ptep);
-
- tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
- if (iter.level != PG_LEVEL_4K)
- continue;
-
- if (!is_shadow_present_pte(iter.old_spte))
- break;
-
- tdp_mmu_set_spte(kvm, &iter, 0);
-
- kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);
+ if (iter->level != PG_LEVEL_4K ||
+ !is_shadow_present_pte(iter->old_spte))
+ return false;
- if (!pte_write(*ptep)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- iter.old_spte, new_pfn);
+ /*
+ * Note, when changing a read-only SPTE, it's not strictly necessary to
+ * zero the SPTE before setting the new PFN, but doing so preserves the
+ * invariant that the PFN of a present * leaf SPTE can never change.
+ * See __handle_changed_spte().
+ */
+ tdp_mmu_set_spte(kvm, iter, 0);
- tdp_mmu_set_spte(kvm, &iter, new_spte);
- }
+ if (!pte_write(range->pte)) {
+ new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+ pte_pfn(range->pte));
- need_flush = 1;
+ tdp_mmu_set_spte(kvm, iter, new_spte);
}
- if (need_flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
- rcu_read_unlock();
-
- return 0;
+ return true;
}
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * data is a pointer to the new pte_t mapping the HVA specified by the MMU
+ * notifier.
+ * Returns non-zero if a flush is needed before releasing the MMU lock.
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
- (unsigned long)host_ptep,
- set_tdp_spte);
+ bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+
+ /* FIXME: return 'flush' instead of flushing here. */
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
+
+ return false;
}
/*
@@ -1095,7 +1194,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1105,7 +1205,15 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+ new_spte)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
spte_set = true;
}
@@ -1122,17 +1230,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
int min_level)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
- }
return spte_set;
}
@@ -1154,7 +1258,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1169,7 +1274,15 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
}
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
+ new_spte)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
spte_set = true;
}
@@ -1187,17 +1300,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
- }
return spte_set;
}
@@ -1259,37 +1368,32 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
bool wrprot)
{
struct kvm_mmu_page *root;
- int root_as_id;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
- }
}
/*
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static void zap_collapsible_spte_range(struct kvm *kvm,
+static bool zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot,
+ bool flush)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
kvm_pfn_t pfn;
- bool spte_set = false;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
- if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
- spte_set = false;
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+ flush = false;
continue;
}
@@ -1303,38 +1407,43 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
pfn, PG_LEVEL_NUM))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
-
- spte_set = true;
+ if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+ /*
+ * The iter must explicitly re-read the SPTE because
+ * the atomic cmpxchg failed.
+ */
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+ goto retry;
+ }
+ flush = true;
}
rcu_read_unlock();
- if (spte_set)
- kvm_flush_remote_tlbs(kvm);
+
+ return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ bool flush)
{
struct kvm_mmu_page *root;
- int root_as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
+ lockdep_assert_held_read(&kvm->mmu_lock);
- zap_collapsible_spte_range(kvm, root, slot);
- }
+ for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+ flush = zap_collapsible_spte_range(kvm, root, slot, flush);
+
+ return flush;
}
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -1351,7 +1460,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
break;
new_spte = iter.old_spte &
- ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
@@ -1364,24 +1473,19 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
/*
* Removes write access on the last level SPTE mapping this GFN and unsets the
- * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
+ * MMU-writable bit to ensure future writes continue to be intercepted.
* Returns true if an SPTE was set and a TLB flush is needed.
*/
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
struct kvm_mmu_page *root;
- int root_as_id;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
+ for_each_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn);
- }
+
return spte_set;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 31096ece9b14..5fdf63090451 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -6,14 +6,28 @@
#include <linux/kvm_host.h>
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
-bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
- bool can_yield);
-static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
- gfn_t end)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+ struct kvm_mmu_page *root)
{
- return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+ if (root->role.invalid)
+ return false;
+
+ return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared);
+
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
+ gfn_t end, bool can_yield, bool flush,
+ bool shared);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
+ gfn_t start, gfn_t end, bool flush,
+ bool shared)
+{
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+ shared);
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -29,23 +43,23 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
* of the shadow page's gfn range and stop iterating before yielding.
*/
lockdep_assert_held_write(&kvm->mmu_lock);
- return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
+ sp->gfn, end, false, false, false);
}
+
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
bool prefault);
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
-
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
-
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
- pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
int min_level);
@@ -55,8 +69,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *slot);
+bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ bool flush);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
new file mode 100644
index 000000000000..a19d473d0184
--- /dev/null
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_REVERSE_CPUID_H
+#define ARCH_X86_KVM_REVERSE_CPUID_H
+
+#include <uapi/asm/kvm.h>
+#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
+
+/*
+ * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
+ * be directly used by KVM. Note, these word values conflict with the kernel's
+ * "bug" caps, but KVM doesn't use those.
+ */
+enum kvm_only_cpuid_leafs {
+ CPUID_12_EAX = NCAPINTS,
+ NR_KVM_CPU_CAPS,
+
+ NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
+};
+
+#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
+
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+ [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
+ [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
+ [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+};
+
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities. And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
+{
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+ if (x86_feature == X86_FEATURE_SGX1)
+ return KVM_X86_FEATURE_SGX1;
+ else if (x86_feature == X86_FEATURE_SGX2)
+ return KVM_X86_FEATURE_SGX2;
+
+ return x86_feature;
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+ return __feature_translate(x86_feature) / 32;
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5). The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+ x86_feature = __feature_translate(x86_feature);
+
+ reverse_cpuid_check(x86_feature / 32);
+ return 1 << (x86_feature & 31);
+}
+
+#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ reverse_cpuid_check(x86_leaf);
+ return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
+}
+
+static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+ return __cpuid_entry_get_reg(entry, cpuid.reg);
+}
+
+static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ return cpuid_entry_get(entry, x86_feature);
+}
+
+static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg |= __feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool set)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ /*
+ * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
+ * compiler into using CMOV instead of Jcc when possible.
+ */
+ if (set)
+ *reg |= __feature_bit(x86_feature);
+ else
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+#endif /* ARCH_X86_KVM_REVERSE_CPUID_H */
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 3e55674098be..712b4e0de481 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
return -EINVAL;
- if (!svm->vcpu.arch.apic->regs)
+ if (!vcpu->arch.apic->regs)
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
- svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+ svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
/* Setting AVIC backing page address in the phy APIC ID table */
entry = avic_get_physical_id_entry(vcpu, id);
@@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
}
}
-int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
@@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, svm->vcpu.vcpu_id, icrh, icrl);
+ index, vcpu->vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset)
return ret;
}
-int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret = 0;
u32 offset = svm->vmcb->control.exit_info_1 &
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
@@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
AVIC_UNACCEL_ACCESS_WRITE_MASK;
bool trap = is_avic_unaccelerated_access_trap(offset);
- trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
trap, write, vector);
if (trap) {
/* Handling Trap */
@@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
ret = avic_unaccel_trap_write(svm);
} else {
/* Handling Fault */
- ret = kvm_emulate_instruction(&svm->vcpu, 0);
+ ret = kvm_emulate_instruction(vcpu, 0);
}
return ret;
@@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (!avic || !irqchip_in_kernel(vcpu->kvm))
return 0;
- ret = avic_init_backing_page(&svm->vcpu);
+ ret = avic_init_backing_page(vcpu);
if (ret)
return ret;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fb204eaa8bb3..540d43ba2cf4 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -29,6 +29,8 @@
#include "lapic.h"
#include "svm.h"
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
@@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
return;
c = &svm->vmcb->control;
- h = &svm->nested.hsave->control;
+ h = &svm->vmcb01.ptr->control;
g = &svm->nested.ctl;
for (i = 0; i < MAX_INTERCEPT; i++)
@@ -213,44 +215,64 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
return true;
}
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
{
- struct vcpu_svm *svm = to_svm(vcpu);
+ u64 addr = PAGE_ALIGN(pa);
- if (WARN_ON(!is_guest_mode(vcpu)))
- return true;
-
- if (!nested_svm_vmrun_msrpm(svm)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return false;
- }
-
- return true;
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_control_area *control)
{
- if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+ if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
return false;
- if (control->asid == 0)
+ if (CC(control->asid == 0))
return false;
- if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
- !npt_enabled)
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ return false;
+
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
return false;
return true;
}
-static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area *save)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- bool vmcb12_lma;
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+ return false;
+ }
+
+ if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ return true;
+}
+/* Common checks that apply to both L1 and L2 state. */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area *save)
+{
/*
* FIXME: these should be done after copying the fields,
* to avoid TOC/TOU races. For these save area checks
@@ -258,31 +280,27 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
* kvm_set_cr4 handle failure; EFER_SVME is an exception
* so it is force-set later in nested_prepare_vmcb_save.
*/
- if ((vmcb12->save.efer & EFER_SVME) == 0)
+ if (CC(!(save->efer & EFER_SVME)))
return false;
- if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
return false;
- if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
return false;
- vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+ if (!nested_vmcb_check_cr3_cr4(vcpu, save))
+ return false;
- if (vmcb12_lma) {
- if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
- !(vmcb12->save.cr0 & X86_CR0_PE) ||
- kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
- return false;
- }
- if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
return false;
return true;
}
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
- struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
{
copy_vmcb_control_area(&svm->nested.ctl, control);
@@ -294,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
/*
* Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
*/
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
{
u32 mask;
svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
@@ -324,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
* Transfer any event that L0 or L1 wanted to inject into L2 to
* EXIT_INT_INFO.
*/
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
- struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 exit_int_info = 0;
@@ -369,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_npt)
{
- if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
return -EINVAL;
if (!nested_npt && is_pae_paging(vcpu) &&
(cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
- if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
return -EINVAL;
}
@@ -393,15 +411,42 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return 0;
}
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+
+ nested_vmcb02_compute_g_pat(svm);
+
/* Load the nested guest state */
- svm->vmcb->save.es = vmcb12->save.es;
- svm->vmcb->save.cs = vmcb12->save.cs;
- svm->vmcb->save.ss = vmcb12->save.ss;
- svm->vmcb->save.ds = vmcb12->save.ds;
- svm->vmcb->save.gdtr = vmcb12->save.gdtr;
- svm->vmcb->save.idtr = vmcb12->save.idtr;
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ svm->vmcb->save.es = vmcb12->save.es;
+ svm->vmcb->save.cs = vmcb12->save.cs;
+ svm->vmcb->save.ss = vmcb12->save.ss;
+ svm->vmcb->save.ds = vmcb12->save.ds;
+ svm->vmcb->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+ svm->vmcb->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+ }
+
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
/*
@@ -413,7 +458,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
@@ -422,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
- svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
- svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+ /* These bits will be set properly on the first execution when new_vmc12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+ }
}
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ /*
+ * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+ * avic_physical_id.
+ */
+ WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+ svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+ svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+ /* Done at vmrun: asid. */
+
+ /* Also overwritten later if necessary. */
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
if (nested_npt_enabled(svm))
nested_svm_init_mmu_context(&svm->vcpu);
@@ -439,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
- (svm->nested.hsave->control.int_ctl & mask);
+ (svm->vmcb01.ptr->control.int_ctl & mask);
svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
@@ -454,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
enter_guest_mode(&svm->vcpu);
/*
- * Merge guest and host intercepts - must be called with vcpu in
- * guest-mode to take affect here
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
*/
recalc_intercepts(svm);
+}
- vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
}
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
struct vmcb *vmcb12)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -482,8 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
svm->nested.vmcb12_gpa = vmcb12_gpa;
- nested_prepare_vmcb_control(svm);
- nested_prepare_vmcb_save(svm, vmcb12);
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm);
+ nested_vmcb02_prepare_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
nested_npt_enabled(svm));
@@ -491,47 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
return ret;
if (!npt_enabled)
- svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+ vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
svm_set_gif(svm, true);
return 0;
}
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int ret;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
u64 vmcb12_gpa;
- if (is_smm(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ ++vcpu->stat.nested_run;
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
vmcb12_gpa = svm->vmcb->save.rax;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
} else if (ret) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ ret = kvm_skip_emulated_instruction(vcpu);
vmcb12 = map.hva;
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
- load_nested_vmcb_control(svm, &vmcb12->control);
+ nested_load_control_from_vmcb12(svm, &vmcb12->control);
- if (!nested_vmcb_check_save(svm, vmcb12) ||
- !nested_vmcb_check_controls(&svm->nested.ctl)) {
+ if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+ !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
@@ -541,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
/* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
/*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
*/
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(&hsave->control, &vmcb->control);
+ svm->vmcb01.ptr->save.efer = vcpu->arch.efer;
+ svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu);
+ svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4;
+ svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+ svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu);
+
+ if (!npt_enabled)
+ svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
svm->nested.nested_run_pending = 1;
- if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
goto out_exit_err;
if (nested_svm_vmrun_msrpm(svm))
@@ -587,7 +667,7 @@ out_exit_err:
nested_svm_vmexit(svm);
out:
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
@@ -610,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
int nested_svm_vmexit(struct vcpu_svm *svm)
{
- int rc;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb12;
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
+ int rc;
- rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ /* Triple faults in L2 should never escape. */
+ WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
vmcb12 = map.hva;
/* Exit Guest-Mode */
- leave_guest_mode(&svm->vcpu);
+ leave_guest_mode(vcpu);
svm->nested.vmcb12_gpa = 0;
WARN_ON_ONCE(svm->nested.nested_run_pending);
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
/* in case we halted in L2 */
svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -644,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->save.gdtr = vmcb->save.gdtr;
vmcb12->save.idtr = vmcb->save.idtr;
vmcb12->save.efer = svm->vcpu.arch.efer;
- vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu);
- vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
vmcb12->save.cr2 = vmcb->save.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
- vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
- vmcb12->save.rip = kvm_rip_read(&svm->vcpu);
- vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu);
- vmcb12->save.rax = kvm_rax_read(&svm->vcpu);
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
vmcb12->save.dr7 = vmcb->save.dr7;
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
vmcb12->save.cpl = vmcb->save.cpl;
@@ -663,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
- nested_vmcb_save_pending_event(svm, vmcb12);
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
if (svm->nrips_enabled)
vmcb12->control.next_rip = vmcb->control.next_rip;
@@ -678,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.pause_filter_thresh =
svm->vmcb->control.pause_filter_thresh;
- /* Restore the original control entries */
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+ WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
- /* On vmexit the GIF is set to false */
+ /*
+ * On vmexit the GIF is set to false and
+ * no event can be injected in L1.
+ */
svm_set_gif(svm, false);
+ svm->vmcb->control.exit_int_info = 0;
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
- svm->vcpu.arch.l1_tsc_offset;
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ }
svm->nested.ctl.nested_cr3 = 0;
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
- kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- kvm_rax_write(&svm->vcpu, hsave->save.rax);
- kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
- kvm_rip_write(&svm->vcpu, hsave->save.rip);
- svm->vmcb->save.dr7 = DR7_FIXED_1;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+ svm_set_efer(vcpu, svm->vmcb->save.efer);
+ svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+ kvm_rax_write(vcpu, svm->vmcb->save.rax);
+ kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+ kvm_rip_write(vcpu, svm->vmcb->save.rip);
- vmcb_mark_all_dirty(svm->vmcb);
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
vmcb12->control.exit_info_1,
@@ -717,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.exit_int_info_err,
KVM_ISA_SVM);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ nested_svm_uninit_mmu_context(vcpu);
- rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+ rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
if (rc)
return 1;
- if (npt_enabled)
- svm->vmcb->save.cr3 = hsave->save.cr3;
-
/*
* Drop what we picked up for L2 via svm_complete_interrupts() so it
* doesn't end up in L1.
*/
svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+ * right now so that it an be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
return 0;
}
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
int svm_allocate_nested(struct vcpu_svm *svm)
{
- struct page *hsave_page;
+ struct page *vmcb02_page;
if (svm->nested.initialized)
return 0;
- hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!hsave_page)
+ vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb02_page)
return -ENOMEM;
- svm->nested.hsave = page_address(hsave_page);
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
- goto err_free_hsave;
+ goto err_free_vmcb02;
svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
-err_free_hsave:
- __free_page(hsave_page);
+err_free_vmcb02:
+ __free_page(vmcb02_page);
return -ENOMEM;
}
@@ -772,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm)
svm_vcpu_free_msrpm(svm->nested.msrpm);
svm->nested.msrpm = NULL;
- __free_page(virt_to_page(svm->nested.hsave));
- svm->nested.hsave = NULL;
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
svm->nested.initialized = false;
}
@@ -783,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm)
*/
void svm_leave_nested(struct vcpu_svm *svm)
{
- if (is_guest_mode(&svm->vcpu)) {
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
- leave_guest_mode(&svm->vcpu);
- copy_vmcb_control_area(&vmcb->control, &hsave->control);
- nested_svm_uninit_mmu_context(&svm->vcpu);
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+ nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
}
- kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -903,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
return vmexit;
}
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
{
- if (!(svm->vcpu.arch.efer & EFER_SVME) ||
- !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -960,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
nested_svm_vmexit(svm);
}
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_SMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
- trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
-static void nested_svm_init(struct vcpu_svm *svm)
-{
- svm->vmcb->control.exit_code = SVM_EXIT_INIT;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-}
-
-
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1017,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_init(svm))
return 0;
- nested_svm_init(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
return 0;
}
if (vcpu->arch.exception.pending) {
- if (block_nested_events)
+ /*
+ * Only a pending nested run can block a pending exception.
+ * Otherwise an injected NMI/interrupt should either be
+ * lost or delivered to the nested hypervisor in the EXITINTINFO
+ * vmcb field, while delivering the pending exception.
+ */
+ if (svm->nested.nested_run_pending)
return -EBUSY;
if (!nested_exit_on_exception(svm))
return 0;
@@ -1035,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_smi(svm))
return 0;
- nested_svm_smi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
return 0;
}
@@ -1044,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_nmi(svm))
return 0;
- nested_svm_nmi(svm);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
return 0;
}
@@ -1053,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
if (!nested_exit_on_intr(svm))
return 0;
- nested_svm_intr(svm);
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
return 0;
}
@@ -1072,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
- excp_bits)
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
return NESTED_EXIT_HOST;
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
svm->vcpu.arch.apf.host_apf_flags)
@@ -1137,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
sizeof(user_vmcb->control)))
return -EFAULT;
- if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
sizeof(user_vmcb->save)))
return -EFAULT;
-
out:
return kvm_state.size;
}
@@ -1150,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state *kvm_state)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area *ctl;
@@ -1195,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
ret = -ENOMEM;
- ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
- save = kzalloc(sizeof(*save), GFP_KERNEL);
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
+ save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
if (!ctl || !save)
goto out_free;
@@ -1207,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- if (!nested_vmcb_check_controls(ctl))
+ if (!nested_vmcb_check_controls(vcpu, ctl))
goto out_free;
/*
* Processor state contains L2 state. Check that it is
- * valid for guest mode (see nested_vmcb_checks).
+ * valid for guest mode (see nested_vmcb_check_save).
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1221,29 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
/*
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
- * TODO: validate reserved bits for all saved state.
*/
- if (!(save->cr0 & X86_CR0_PG))
- goto out_free;
- if (!(save->efer & EFER_SVME))
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !nested_vmcb_valid_sregs(vcpu, save))
goto out_free;
/*
- * All checks done, we can enter guest mode. L1 control fields
- * come from the nested save state. Guest state is already
- * in the registers, the save area of the nested state instead
- * contains saved L1 state.
+ * All checks done, we can enter guest mode. Userspace provides
+ * vmcb12.control, which will be combined with L1 and stored into
+ * vmcb02, and the L1 save state which we store in vmcb01.
+ * L2 registers if needed are moved from the current VMCB to VMCB02.
*/
svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
- copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = *save;
-
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
- load_nested_vmcb_control(svm, ctl);
- nested_prepare_vmcb_control(svm);
+ if (svm->current_vmcb == &svm->vmcb01)
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm->vmcb01.ptr->save.es = save->es;
+ svm->vmcb01.ptr->save.cs = save->cs;
+ svm->vmcb01.ptr->save.ss = save->ss;
+ svm->vmcb01.ptr->save.ds = save->ds;
+ svm->vmcb01.ptr->save.gdtr = save->gdtr;
+ svm->vmcb01.ptr->save.idtr = save->idtr;
+ svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+ svm->vmcb01.ptr->save.efer = save->efer;
+ svm->vmcb01.ptr->save.cr0 = save->cr0;
+ svm->vmcb01.ptr->save.cr3 = save->cr3;
+ svm->vmcb01.ptr->save.cr4 = save->cr4;
+ svm->vmcb01.ptr->save.rax = save->rax;
+ svm->vmcb01.ptr->save.rsp = save->rsp;
+ svm->vmcb01.ptr->save.rip = save->rip;
+ svm->vmcb01.ptr->save.cpl = 0;
+
+ nested_load_control_from_vmcb12(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+ nested_vmcb02_prepare_control(svm);
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
@@ -1254,8 +1336,31 @@ out_free:
return ret;
}
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm)))
+ return false;
+
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ return true;
+}
+
struct kvm_x86_nested_ops svm_nested_ops = {
.check_events = svm_check_nested_events,
+ .triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 415a49b8b8f8..1356ee095cd5 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -44,12 +44,25 @@
#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
#endif
+#ifdef CONFIG_KVM_AMD_SEV
+/* enable/disable SEV support */
+static bool sev_enabled = true;
+module_param_named(sev, sev_enabled, bool, 0444);
+
+/* enable/disable SEV-ES support */
+static bool sev_es_enabled = true;
+module_param_named(sev_es, sev_es_enabled, bool, 0444);
+#else
+#define sev_enabled false
+#define sev_es_enabled false
+#endif /* CONFIG_KVM_AMD_SEV */
+
static u8 sev_enc_bit;
-static int sev_flush_asids(void);
static DECLARE_RWSEM(sev_deactivate_lock);
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
@@ -61,9 +74,15 @@ struct enc_region {
unsigned long size;
};
-static int sev_flush_asids(void)
+/* Called with the sev_bitmap_lock held, or on shutdown */
+static int sev_flush_asids(int min_asid, int max_asid)
{
- int ret, error = 0;
+ int ret, pos, error = 0;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ pos = find_next_bit(sev_reclaim_asid_bitmap, max_asid, min_asid);
+ if (pos >= max_asid)
+ return -EBUSY;
/*
* DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
@@ -82,17 +101,15 @@ static int sev_flush_asids(void)
return ret;
}
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(int min_asid, int max_asid)
{
- int pos;
-
- /* Check if there are any ASIDs to reclaim before performing a flush */
- pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid);
- if (pos >= max_asid)
- return false;
-
- if (sev_flush_asids())
+ if (sev_flush_asids(min_asid, max_asid))
return false;
/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
@@ -184,49 +201,41 @@ static void sev_asid_free(struct kvm_sev_info *sev)
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
- struct sev_data_decommission *decommission;
- struct sev_data_deactivate *data;
+ struct sev_data_decommission decommission;
+ struct sev_data_deactivate deactivate;
if (!handle)
return;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return;
-
- /* deactivate handle */
- data->handle = handle;
+ deactivate.handle = handle;
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
down_read(&sev_deactivate_lock);
- sev_guest_deactivate(data, NULL);
+ sev_guest_deactivate(&deactivate, NULL);
up_read(&sev_deactivate_lock);
- kfree(data);
-
- decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
- if (!decommission)
- return;
-
/* decommission handle */
- decommission->handle = handle;
- sev_guest_decommission(decommission, NULL);
-
- kfree(decommission);
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
}
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ bool es_active = argp->id == KVM_SEV_ES_INIT;
int asid, ret;
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
ret = -EBUSY;
if (unlikely(sev->active))
return ret;
+ sev->es_active = es_active;
asid = sev_asid_new(sev);
if (asid < 0)
- return ret;
+ goto e_no_asid;
sev->asid = asid;
ret = sev_platform_init(&argp->error);
@@ -234,6 +243,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free;
sev->active = true;
+ sev->asid = asid;
INIT_LIST_HEAD(&sev->regions_list);
return 0;
@@ -241,34 +251,21 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
e_free:
sev_asid_free(sev);
sev->asid = 0;
+e_no_asid:
+ sev->es_active = false;
return ret;
}
-static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
- if (!sev_es)
- return -ENOTTY;
-
- to_kvm_svm(kvm)->sev_info.es_active = true;
-
- return sev_guest_init(kvm, argp);
-}
-
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
- struct sev_data_activate *data;
+ struct sev_data_activate activate;
int asid = sev_get_asid(kvm);
int ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
/* activate ASID on the given handle */
- data->handle = handle;
- data->asid = asid;
- ret = sev_guest_activate(data, error);
- kfree(data);
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
return ret;
}
@@ -298,7 +295,7 @@ static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_start *start;
+ struct sev_data_launch_start start;
struct kvm_sev_launch_start params;
void *dh_blob, *session_blob;
int *error = &argp->error;
@@ -310,20 +307,16 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
- if (!start)
- return -ENOMEM;
+ memset(&start, 0, sizeof(start));
dh_blob = NULL;
if (params.dh_uaddr) {
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
- if (IS_ERR(dh_blob)) {
- ret = PTR_ERR(dh_blob);
- goto e_free;
- }
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
- start->dh_cert_address = __sme_set(__pa(dh_blob));
- start->dh_cert_len = params.dh_len;
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
}
session_blob = NULL;
@@ -334,40 +327,38 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_dh;
}
- start->session_address = __sme_set(__pa(session_blob));
- start->session_len = params.session_len;
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
}
- start->handle = params.handle;
- start->policy = params.policy;
+ start.handle = params.handle;
+ start.policy = params.policy;
/* create memory encryption context */
- ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
if (ret)
goto e_free_session;
/* Bind ASID to this guest */
- ret = sev_bind_asid(kvm, start->handle, error);
+ ret = sev_bind_asid(kvm, start.handle, error);
if (ret)
goto e_free_session;
/* return handle to userspace */
- params.handle = start->handle;
+ params.handle = start.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
- sev_unbind_asid(kvm, start->handle);
+ sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
}
- sev->handle = start->handle;
+ sev->handle = start.handle;
sev->fd = argp->sev_fd;
e_free_session:
kfree(session_blob);
e_free_dh:
kfree(dh_blob);
-e_free:
- kfree(start);
return ret;
}
@@ -486,7 +477,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
- struct sev_data_launch_update_data *data;
+ struct sev_data_launch_update_data data;
struct page **inpages;
int ret;
@@ -496,20 +487,14 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
vaddr = params.uaddr;
size = params.len;
vaddr_end = vaddr + size;
/* Lock the user memory. */
inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
- if (IS_ERR(inpages)) {
- ret = PTR_ERR(inpages);
- goto e_free;
- }
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
/*
* Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
@@ -517,6 +502,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
*/
sev_clflush_pages(inpages, npages);
+ data.reserved = 0;
+ data.handle = sev->handle;
+
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
int offset, len;
@@ -531,10 +519,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
- data->handle = sev->handle;
- data->len = len;
- data->address = __sme_page_pa(inpages[i]) + offset;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
if (ret)
goto e_unpin;
@@ -550,8 +537,6 @@ e_unpin:
}
/* unlock the user pages */
sev_unpin_memory(kvm, inpages, npages);
-e_free:
- kfree(data);
return ret;
}
@@ -603,23 +588,22 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_update_vmsa *vmsa;
+ struct sev_data_launch_update_vmsa vmsa;
+ struct kvm_vcpu *vcpu;
int i, ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
- vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL);
- if (!vmsa)
- return -ENOMEM;
+ vmsa.reserved = 0;
- for (i = 0; i < kvm->created_vcpus; i++) {
- struct vcpu_svm *svm = to_svm(kvm->vcpus[i]);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
/* Perform some pre-encryption checks against the VMSA */
ret = sev_es_sync_vmsa(svm);
if (ret)
- goto e_free;
+ return ret;
/*
* The LAUNCH_UPDATE_VMSA command will perform in-place
@@ -629,27 +613,25 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
*/
clflush_cache_range(svm->vmsa, PAGE_SIZE);
- vmsa->handle = sev->handle;
- vmsa->address = __sme_pa(svm->vmsa);
- vmsa->len = PAGE_SIZE;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa,
+ vmsa.handle = sev->handle;
+ vmsa.address = __sme_pa(svm->vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
&argp->error);
if (ret)
- goto e_free;
+ return ret;
svm->vcpu.arch.guest_state_protected = true;
}
-e_free:
- kfree(vmsa);
- return ret;
+ return 0;
}
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *measure = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_measure *data;
+ struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
void __user *p = NULL;
void *blob = NULL;
@@ -661,9 +643,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, measure, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
@@ -671,23 +651,20 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
+ data.address = __psp_pa(blob);
+ data.len = params.len;
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
@@ -704,63 +681,50 @@ cmd:
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(measure, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_finish *data;
- int ret;
+ struct sev_data_launch_finish data;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
-
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
- kfree(data);
- return ret;
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
}
static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_guest_status params;
- struct sev_data_guest_status *data;
+ struct sev_data_guest_status data;
int ret;
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
if (ret)
- goto e_free;
+ return ret;
- params.policy = data->policy;
- params.state = data->state;
- params.handle = data->handle;
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
ret = -EFAULT;
-e_free:
- kfree(data);
+
return ret;
}
@@ -769,23 +733,17 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
int *error, bool enc)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_dbg *data;
- int ret;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ struct sev_data_dbg data;
- data->handle = sev->handle;
- data->dst_addr = dst;
- data->src_addr = src;
- data->len = size;
+ data.reserved = 0;
+ data.handle = sev->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
- ret = sev_issue_cmd(kvm,
- enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
- data, error);
- kfree(data);
- return ret;
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
}
static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
@@ -1005,7 +963,7 @@ err:
static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_launch_secret *data;
+ struct sev_data_launch_secret data;
struct kvm_sev_launch_secret params;
struct page **pages;
void *blob, *hdr;
@@ -1037,41 +995,36 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_unpin_memory;
}
- ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- goto e_unpin_memory;
+ memset(&data, 0, sizeof(data));
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- data->guest_address = __sme_page_pa(pages[0]) + offset;
- data->guest_len = params.guest_len;
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
if (IS_ERR(blob)) {
ret = PTR_ERR(blob);
- goto e_free;
+ goto e_unpin_memory;
}
- data->trans_address = __psp_pa(blob);
- data->trans_len = params.trans_len;
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
if (IS_ERR(hdr)) {
ret = PTR_ERR(hdr);
goto e_free_blob;
}
- data->hdr_address = __psp_pa(hdr);
- data->hdr_len = params.hdr_len;
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
kfree(hdr);
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
e_unpin_memory:
/* content of memory is updated, mark pages dirty */
for (i = 0; i < n; i++) {
@@ -1086,7 +1039,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
void __user *report = (void __user *)(uintptr_t)argp->data;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct sev_data_attestation_report *data;
+ struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
void __user *p;
void *blob = NULL;
@@ -1098,9 +1051,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
- if (!data)
- return -ENOMEM;
+ memset(&data, 0, sizeof(data));
/* User wants to query the blob length */
if (!params.len)
@@ -1108,23 +1059,20 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
p = (void __user *)(uintptr_t)params.uaddr;
if (p) {
- if (params.len > SEV_FW_BLOB_MAX_SIZE) {
- ret = -EINVAL;
- goto e_free;
- }
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
- ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
- goto e_free;
+ return -ENOMEM;
- data->address = __psp_pa(blob);
- data->len = params.len;
- memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
}
cmd:
- data->handle = sev->handle;
- ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
/*
* If we query the session length, FW responded with expected data.
*/
@@ -1140,22 +1088,417 @@ cmd:
}
done:
- params.len = data->len;
+ params.len = data.len;
if (copy_to_user(report, &params, sizeof(params)))
ret = -EFAULT;
e_free_blob:
kfree(blob);
-e_free:
- kfree(data);
return ret;
}
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ int ret;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+ if (ret < 0)
+ return ret;
+
+ params->session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ &params);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START field with system physical address */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ int ret;
+
+ data.handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+ if (ret < 0)
+ return ret;
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (!guest_page)
+ return -EFAULT;
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ params.hdr_len);
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* Get parameter from the userspace */
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret)
+ goto e_free_session;
+
+ params.handle = start.handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ &params, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+ /* Check if we are crossing the page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if ((params.guest_len + offset > PAGE_SIZE))
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ ret = -EFAULT;
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (!guest_page)
+ goto e_free_trans;
+
+ /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = sev->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = sev->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
int r;
- if (!svm_sev_enabled() || !sev)
+ if (!sev_enabled)
return -ENOTTY;
if (!argp)
@@ -1166,13 +1509,22 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
mutex_lock(&kvm->lock);
+ /* enc_context_owner handles all memory enc operations */
+ if (is_mirroring_enc_context(kvm)) {
+ r = -EINVAL;
+ goto out;
+ }
+
switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es_enabled) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
- case KVM_SEV_ES_INIT:
- r = sev_es_guest_init(kvm, &sev_cmd);
- break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
@@ -1203,6 +1555,27 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_GET_ATTESTATION_REPORT:
r = sev_get_attestation_report(kvm, &sev_cmd);
break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -1226,6 +1599,10 @@ int svm_register_enc_region(struct kvm *kvm,
if (!sev_guest(kvm))
return -ENOTTY;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
@@ -1292,6 +1669,10 @@ int svm_unregister_enc_region(struct kvm *kvm,
struct enc_region *region;
int ret;
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
if (!sev_guest(kvm)) {
@@ -1322,6 +1703,71 @@ failed:
return ret;
}
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct file *source_kvm_file;
+ struct kvm *source_kvm;
+ struct kvm_sev_info *mirror_sev;
+ unsigned int asid;
+ int ret;
+
+ source_kvm_file = fget(source_fd);
+ if (!file_is_kvm(source_kvm_file)) {
+ ret = -EBADF;
+ goto e_source_put;
+ }
+
+ source_kvm = source_kvm_file->private_data;
+ mutex_lock(&source_kvm->lock);
+
+ if (!sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto e_source_unlock;
+ }
+
+ /* Mirrors of mirrors should work, but let's not get silly */
+ if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ ret = -EINVAL;
+ goto e_source_unlock;
+ }
+
+ asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its asid can't
+ * disappear until we're done with it
+ */
+ kvm_get_kvm(source_kvm);
+
+ fput(source_kvm_file);
+ mutex_unlock(&source_kvm->lock);
+ mutex_lock(&kvm->lock);
+
+ if (sev_guest(kvm)) {
+ ret = -EINVAL;
+ goto e_mirror_unlock;
+ }
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev = &to_kvm_svm(kvm)->sev_info;
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->asid = asid;
+ mirror_sev->active = true;
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+e_mirror_unlock:
+ mutex_unlock(&kvm->lock);
+ kvm_put_kvm(source_kvm);
+ return ret;
+e_source_unlock:
+ mutex_unlock(&source_kvm->lock);
+e_source_put:
+ fput(source_kvm_file);
+ return ret;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1331,6 +1777,12 @@ void sev_vm_destroy(struct kvm *kvm)
if (!sev_guest(kvm))
return;
+ /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ if (is_mirroring_enc_context(kvm)) {
+ kvm_put_kvm(sev->enc_context_owner);
+ return;
+ }
+
mutex_lock(&kvm->lock);
/*
@@ -1358,12 +1810,24 @@ void sev_vm_destroy(struct kvm *kvm)
sev_asid_free(sev);
}
+void __init sev_set_cpu_caps(void)
+{
+ if (!sev_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV);
+ if (!sev_es_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+}
+
void __init sev_hardware_setup(void)
{
+#ifdef CONFIG_KVM_AMD_SEV
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
bool sev_es_supported = false;
bool sev_supported = false;
+ if (!sev_enabled || !npt_enabled)
+ goto out;
+
/* Does the CPU support SEV? */
if (!boot_cpu_has(X86_FEATURE_SEV))
goto out;
@@ -1376,12 +1840,12 @@ void __init sev_hardware_setup(void)
/* Maximum number of encrypted guests supported simultaneously */
max_sev_asid = ecx;
-
- if (!svm_sev_enabled())
+ if (!max_sev_asid)
goto out;
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
/* Initialize SEV ASID bitmaps */
sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
@@ -1389,8 +1853,11 @@ void __init sev_hardware_setup(void)
goto out;
sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
- if (!sev_reclaim_asid_bitmap)
+ if (!sev_reclaim_asid_bitmap) {
+ bitmap_free(sev_asid_bitmap);
+ sev_asid_bitmap = NULL;
goto out;
+ }
sev_asid_count = max_sev_asid - min_sev_asid + 1;
if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
@@ -1400,7 +1867,7 @@ void __init sev_hardware_setup(void)
sev_supported = true;
/* SEV-ES support requested? */
- if (!sev_es)
+ if (!sev_es_enabled)
goto out;
/* Does the CPU support SEV-ES? */
@@ -1419,21 +1886,36 @@ void __init sev_hardware_setup(void)
sev_es_supported = true;
out:
- sev = sev_supported;
- sev_es = sev_es_supported;
+ sev_enabled = sev_supported;
+ sev_es_enabled = sev_es_supported;
+#endif
}
void sev_hardware_teardown(void)
{
- if (!svm_sev_enabled())
+ if (!sev_enabled)
return;
+ /* No need to take sev_bitmap_lock, all VMs have been destroyed. */
+ sev_flush_asids(0, max_sev_asid);
+
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
+
misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+}
- sev_flush_asids();
+int sev_cpu_init(struct svm_cpu_data *sd)
+{
+ if (!sev_enabled)
+ return 0;
+
+ sd->sev_vmcbs = kcalloc(max_sev_asid + 1, sizeof(void *), GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ return -ENOMEM;
+
+ return 0;
}
/*
@@ -1825,7 +2307,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
len, GHCB_SCRATCH_AREA_LIMIT);
return false;
}
- scratch_va = kzalloc(len, GFP_KERNEL);
+ scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
return false;
@@ -1899,7 +2381,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
vcpu->arch.regs[VCPU_REGS_RCX] = 0;
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
ret = -EINVAL;
break;
@@ -1949,8 +2431,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
return ret;
}
-int sev_handle_vmgexit(struct vcpu_svm *svm)
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_gpa, exit_code;
struct ghcb *ghcb;
@@ -1962,13 +2445,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
return sev_handle_vmgexit_msr_protocol(svm);
if (!ghcb_gpa) {
- vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n");
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
return -EINVAL;
}
- if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) {
/* Unable to map GHCB from guest */
- vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
return -EINVAL;
}
@@ -1976,7 +2459,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
svm->ghcb = svm->ghcb_map.hva;
ghcb = svm->ghcb_map.hva;
- trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb);
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
exit_code = ghcb_get_sw_exit_code(ghcb);
@@ -1994,7 +2477,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
break;
- ret = kvm_sev_es_mmio_read(&svm->vcpu,
+ ret = kvm_sev_es_mmio_read(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->ghcb_sa);
@@ -2003,19 +2486,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
if (!setup_vmgexit_scratch(svm, false, control->exit_info_2))
break;
- ret = kvm_sev_es_mmio_write(&svm->vcpu,
+ ret = kvm_sev_es_mmio_write(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->ghcb_sa);
break;
case SVM_VMGEXIT_NMI_COMPLETE:
- ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET);
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
- ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+ ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
- struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
switch (control->exit_info_1) {
case 0:
@@ -2040,12 +2523,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm)
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
- vcpu_unimpl(&svm->vcpu,
+ vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
break;
default:
- ret = svm_invoke_exit_handler(svm, exit_code);
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
}
return ret;
@@ -2154,5 +2637,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
* the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
* non-zero value.
*/
+ if (!svm->ghcb)
+ return;
+
ghcb_set_sw_exit_info_2(svm->ghcb, 1);
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6dad89248312..9790c73f2a32 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -56,9 +56,6 @@ static const struct x86_cpu_id svm_cpu_id[] = {
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
@@ -95,6 +92,8 @@ static const struct svm_direct_access_msrs {
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
+ { .index = MSR_IA32_SYSENTER_EIP, .always = false },
+ { .index = MSR_IA32_SYSENTER_ESP, .always = false },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
{ .index = MSR_FS_BASE, .always = true },
@@ -186,14 +185,6 @@ module_param(vls, int, 0444);
static int vgif = true;
module_param(vgif, int, 0444);
-/* enable/disable SEV support */
-int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
-module_param(sev, int, 0444);
-
-/* enable/disable SEV-ES support */
-int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
-module_param(sev_es, int, 0444);
-
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -214,6 +205,15 @@ struct kvm_ldttss_desc {
DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+#define TSC_AUX_URET_SLOT 0
+
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
@@ -279,7 +279,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
* In this case we will return to the nested guest
* as soon as we leave SMM.
*/
- if (!is_smm(&svm->vcpu))
+ if (!is_smm(vcpu))
svm_free_nested(svm);
} else {
@@ -363,10 +363,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
bool has_error_code = vcpu->arch.exception.has_error_code;
u32 error_code = vcpu->arch.exception.error_code;
- kvm_deliver_exception_payload(&svm->vcpu);
+ kvm_deliver_exception_payload(vcpu);
if (nr == BP_VECTOR && !nrips) {
- unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
/*
* For guest debugging where we have to reinject #BP if some
@@ -375,8 +375,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- (void)skip_emulated_instruction(&svm->vcpu);
- rip = kvm_rip_read(&svm->vcpu);
+ (void)skip_emulated_instruction(vcpu);
+ rip = kvm_rip_read(vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
}
@@ -553,23 +553,21 @@ static void svm_cpu_uninit(int cpu)
static int svm_cpu_init(int cpu)
{
struct svm_cpu_data *sd;
+ int ret = -ENOMEM;
sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
if (!sd)
- return -ENOMEM;
+ return ret;
sd->cpu = cpu;
sd->save_area = alloc_page(GFP_KERNEL);
if (!sd->save_area)
goto free_cpu_data;
+
clear_page(page_address(sd->save_area));
- if (svm_sev_enabled()) {
- sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
- sizeof(void *),
- GFP_KERNEL);
- if (!sd->sev_vmcbs)
- goto free_save_area;
- }
+ ret = sev_cpu_init(sd);
+ if (ret)
+ goto free_save_area;
per_cpu(svm_data, cpu) = sd;
@@ -579,7 +577,7 @@ free_save_area:
__free_page(sd->save_area);
free_cpu_data:
kfree(sd);
- return -ENOMEM;
+ return ret;
}
@@ -681,14 +679,15 @@ void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
u32 *svm_vcpu_alloc_msrpm(void)
{
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
+ unsigned int order = get_order(MSRPM_SIZE);
+ struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
u32 *msrpm;
if (!pages)
return NULL;
msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
return msrpm;
}
@@ -707,7 +706,7 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
void svm_vcpu_free_msrpm(u32 *msrpm)
{
- __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER);
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
@@ -881,20 +880,20 @@ static __init void svm_adjust_mmio_mask(void)
*/
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
- kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
static void svm_hardware_teardown(void)
{
int cpu;
- if (svm_sev_enabled())
- sev_hardware_teardown();
+ sev_hardware_teardown();
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
+ get_order(IOPM_SIZE));
iopm_base = 0;
}
@@ -922,6 +921,9 @@ static __init void svm_set_cpu_caps(void)
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* CPUID 0x8000001F (SME/SEV features) */
+ sev_set_cpu_caps();
}
static __init int svm_hardware_setup(void)
@@ -930,14 +932,15 @@ static __init int svm_hardware_setup(void)
struct page *iopm_pages;
void *iopm_va;
int r;
+ unsigned int order = get_order(IOPM_SIZE);
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
@@ -956,6 +959,9 @@ static __init int svm_hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 32;
}
+ if (boot_cpu_has(X86_FEATURE_RDTSCP))
+ kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
+
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
@@ -969,21 +975,6 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
- if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) {
- sev_hardware_setup();
- } else {
- sev = false;
- sev_es = false;
- }
-
- svm_adjust_mmio_mask();
-
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
/*
* KVM's MMU doesn't support using 2-level paging for itself, and thus
* NPT isn't supported if the host is using 2-level paging since host
@@ -998,6 +989,17 @@ static __init int svm_hardware_setup(void)
kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ /* Note, SEV setup consumes npt_enabled. */
+ sev_hardware_setup();
+
+ svm_adjust_mmio_mask();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
if (nrips) {
if (!boot_cpu_has(X86_FEATURE_NRIPS))
nrips = false;
@@ -1084,8 +1086,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
if (is_guest_mode(vcpu)) {
/* Write L1's TSC offset. */
g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = offset;
+ svm->vmcb01.ptr->control.tsc_offset;
+ svm->vmcb01.ptr->control.tsc_offset = offset;
}
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1113,12 +1115,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
}
}
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
- svm->vcpu.arch.hflags = 0;
+ vcpu->arch.hflags = 0;
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1126,7 +1129,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ if (!kvm_vcpu_apicv_active(vcpu))
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
@@ -1170,12 +1173,12 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_RDPRU);
svm_set_intercept(svm, INTERCEPT_RSM);
- if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
svm_set_intercept(svm, INTERCEPT_MONITOR);
svm_set_intercept(svm, INTERCEPT_MWAIT);
}
- if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ if (!kvm_hlt_in_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_HLT);
control->iopm_base_pa = __sme_set(iopm_base);
@@ -1201,19 +1204,19 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_cr4(&svm->vcpu, 0);
- svm_set_efer(&svm->vcpu, 0);
+ svm_set_cr4(vcpu, 0);
+ svm_set_efer(vcpu, 0);
save->dr6 = 0xffff0ff0;
- kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
save->rip = 0x0000fff0;
- svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+ vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
/*
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
* It also updates the guest-visible cr0 value.
*/
- svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
- kvm_mmu_reset_context(&svm->vcpu);
+ svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ kvm_mmu_reset_context(vcpu);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -1225,17 +1228,18 @@ static void init_vmcb(struct vcpu_svm *svm)
clr_exception_intercept(svm, PF_VECTOR);
svm_clr_intercept(svm, INTERCEPT_CR3_READ);
svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = svm->vcpu.arch.pat;
+ save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
- svm->asid_generation = 0;
+ svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
svm->nested.vmcb12_gpa = 0;
- svm->vcpu.arch.hflags = 0;
+ svm->nested.last_vmcb12_gpa = 0;
+ vcpu->arch.hflags = 0;
- if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
control->pause_filter_count = pause_filter_count;
if (pause_filter_thresh)
control->pause_filter_thresh = pause_filter_thresh;
@@ -1246,18 +1250,15 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_check_invpcid(svm);
- if (kvm_vcpu_apicv_active(&svm->vcpu))
- avic_init_vmcb(svm);
-
/*
- * If hardware supports Virtual VMLOAD VMSAVE then enable it
- * in VMCB and clear intercepts to avoid #VMEXIT.
+ * If the host supports V_SPEC_CTRL then disable the interception
+ * of MSR_IA32_SPEC_CTRL.
*/
- if (vls) {
- svm_clr_intercept(svm, INTERCEPT_VMLOAD);
- svm_clr_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- }
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm);
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1265,11 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
- if (sev_guest(svm->vcpu.kvm)) {
+ if (sev_guest(vcpu->kvm)) {
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/* Perform SEV-ES specific VMCB updates */
sev_es_init_vmcb(svm);
}
@@ -1291,12 +1292,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
svm->virt_spec_ctrl = 0;
if (!init_event) {
- svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
- init_vmcb(svm);
+ init_vmcb(vcpu);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
kvm_rdx_write(vcpu, eax);
@@ -1305,10 +1306,16 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
+}
+
static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
- struct page *vmcb_page;
+ struct page *vmcb01_page;
struct page *vmsa_page = NULL;
int err;
@@ -1316,11 +1323,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!vmcb_page)
+ vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!vmcb01_page)
goto out;
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
/*
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
@@ -1356,20 +1363,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
- svm->vmcb = page_address(vmcb_page);
- svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT);
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
if (vmsa_page)
svm->vmsa = page_address(vmsa_page);
- svm->asid_generation = 0;
svm->guest_state_loaded = false;
- init_vmcb(svm);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+ init_vmcb(vcpu);
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
- if (sev_es_guest(svm->vcpu.kvm))
+ if (sev_es_guest(vcpu->kvm))
/* Perform SEV-ES specific VMCB creation updates */
sev_es_create_vcpu(svm);
@@ -1379,7 +1387,7 @@ error_free_vmsa_page:
if (vmsa_page)
__free_page(vmsa_page);
error_free_vmcb_page:
- __free_page(vmcb_page);
+ __free_page(vmcb01_page);
out:
return err;
}
@@ -1407,32 +1415,23 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
- __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
- __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- unsigned int i;
if (svm->guest_state_loaded)
return;
/*
- * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
- * area (non-sev-es). Save ones that aren't so we can restore them
- * individually later.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- /*
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
- if (sev_es_guest(svm->vcpu.kvm)) {
+ if (sev_es_guest(vcpu->kvm)) {
sev_es_prepare_guest_switch(svm, vcpu->cpu);
} else {
vmsave(__sme_page_pa(sd->save_area));
@@ -1446,29 +1445,15 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
}
}
- /* This assumes that the kernel never uses MSR_TSC_AUX */
if (static_cpu_has(X86_FEATURE_RDTSCP))
- wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true;
}
static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- unsigned int i;
-
- if (!svm->guest_state_loaded)
- return;
-
- /*
- * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
- * area (non-sev-es). Restore the ones that weren't.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- svm->guest_state_loaded = false;
+ to_svm(vcpu)->guest_state_loaded = false;
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1476,11 +1461,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- vmcb_mark_all_dirty(svm->vmcb);
- }
-
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
@@ -1564,7 +1544,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
/* Drop int_ctl fields related to VINTR injection. */
svm->vmcb->control.int_ctl &= mask;
if (is_guest_mode(&svm->vcpu)) {
- svm->nested.hsave->control.int_ctl &= mask;
+ svm->vmcb01.ptr->control.int_ctl &= mask;
WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
(svm->nested.ctl.int_ctl & V_TPR_MASK));
@@ -1577,16 +1557,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
switch (seg) {
case VCPU_SREG_CS: return &save->cs;
case VCPU_SREG_DS: return &save->ds;
case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
}
BUG();
return NULL;
@@ -1709,37 +1690,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
- ulong gcr0;
- u64 *hcr0;
-
- /*
- * SEV-ES guests must always keep the CR intercepts cleared. CR
- * tracking is done using the CR write traps.
- */
- if (sev_es_guest(svm->vcpu.kvm))
- return;
-
- gcr0 = svm->vcpu.arch.cr0;
- hcr0 = &svm->vmcb->save.cr0;
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
- vmcb_mark_dirty(svm->vmcb, VMCB_CR);
-
- if (gcr0 == *hcr0) {
- svm_clr_intercept(svm, INTERCEPT_CR0_READ);
- svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
- } else {
- svm_set_intercept(svm, INTERCEPT_CR0_READ);
- svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
- }
-}
-
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1757,7 +1711,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vcpu->arch.cr0 = cr0;
if (!npt_enabled)
- cr0 |= X86_CR0_PG | X86_CR0_WP;
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
/*
* re-enable caching here because the QEMU bios
@@ -1765,10 +1719,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* reboot
*/
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
- update_cr0_intercept(svm);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
}
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1847,7 +1817,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
- svm->asid_generation = sd->asid_generation;
+ svm->current_vmcb->asid_generation = sd->asid_generation;
svm->asid = sd->next_asid++;
}
@@ -1896,39 +1866,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
-static int pf_interception(struct vcpu_svm *svm)
+static int pf_interception(struct kvm_vcpu *vcpu)
{
- u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
- return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int npf_interception(struct vcpu_svm *svm)
+static int npf_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
u64 error_code = svm->vmcb->control.exit_info_1;
trace_kvm_page_fault(fault_address, error_code);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ return kvm_mmu_page_fault(vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
}
-static int db_interception(struct vcpu_svm *svm)
+static int db_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
- if (!(svm->vcpu.guest_debug &
+ if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
- kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
return 1;
}
@@ -1938,7 +1912,7 @@ static int db_interception(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
- if (svm->vcpu.guest_debug &
+ if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
@@ -1952,9 +1926,10 @@ static int db_interception(struct vcpu_svm *svm)
return 1;
}
-static int bp_interception(struct vcpu_svm *svm)
+static int bp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1962,14 +1937,14 @@ static int bp_interception(struct vcpu_svm *svm)
return 0;
}
-static int ud_interception(struct vcpu_svm *svm)
+static int ud_interception(struct kvm_vcpu *vcpu)
{
- return handle_ud(&svm->vcpu);
+ return handle_ud(vcpu);
}
-static int ac_interception(struct vcpu_svm *svm)
+static int ac_interception(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
return 1;
}
@@ -2012,7 +1987,7 @@ static bool is_erratum_383(void)
return true;
}
-static void svm_handle_mce(struct vcpu_svm *svm)
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
if (is_erratum_383()) {
/*
@@ -2021,7 +1996,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -2033,20 +2008,21 @@ static void svm_handle_mce(struct vcpu_svm *svm)
kvm_machine_check();
}
-static int mc_interception(struct vcpu_svm *svm)
+static int mc_interception(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int shutdown_interception(struct vcpu_svm *svm)
+static int shutdown_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
/*
* The VM save area has already been encrypted so it
* cannot be reinitialized - just terminate.
*/
- if (sev_es_guest(svm->vcpu.kvm))
+ if (sev_es_guest(vcpu->kvm))
return -EINVAL;
/*
@@ -2054,20 +2030,20 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm);
+ init_vmcb(vcpu);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
}
-static int io_interception(struct vcpu_svm *svm)
+static int io_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
- ++svm->vcpu.stat.io_exits;
+ ++vcpu->stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
port = io_info >> 16;
@@ -2082,93 +2058,69 @@ static int io_interception(struct vcpu_svm *svm)
svm->next_rip = svm->vmcb->control.exit_info_2;
- return kvm_fast_pio(&svm->vcpu, size, port, in);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
- return 1;
+ return kvm_fast_pio(vcpu, size, port, in);
}
-static int intr_interception(struct vcpu_svm *svm)
+static int nmi_interception(struct kvm_vcpu *vcpu)
{
- ++svm->vcpu.stat.irq_exits;
return 1;
}
-static int nop_on_interception(struct vcpu_svm *svm)
+static int intr_interception(struct kvm_vcpu *vcpu)
{
+ ++vcpu->stat.irq_exits;
return 1;
}
-static int halt_interception(struct vcpu_svm *svm)
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_hypercall(&svm->vcpu);
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
struct kvm_host_map map;
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
if (ret) {
if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
+ kvm_inject_gp(vcpu, 0);
return 1;
}
- nested_vmcb = map.hva;
+ vmcb12 = map.hva;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ if (vmload) {
+ nested_svm_vmloadsave(vmcb12, svm->vmcb);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else
+ nested_svm_vmloadsave(svm->vmcb, vmcb12);
- nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
-static int vmsave_interception(struct vcpu_svm *svm)
+static int vmload_interception(struct kvm_vcpu *vcpu)
{
- struct vmcb *nested_vmcb;
- struct kvm_host_map map;
- int ret;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
- if (ret) {
- if (ret == -EINVAL)
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- nested_vmcb = map.hva;
-
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
- nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- kvm_vcpu_unmap(&svm->vcpu, &map, true);
+ return vmload_vmsave_interception(vcpu, true);
+}
- return ret;
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
}
-static int vmrun_interception(struct vcpu_svm *svm)
+static int vmrun_interception(struct kvm_vcpu *vcpu)
{
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- return nested_svm_vmrun(svm);
+ return nested_svm_vmrun(vcpu);
}
enum {
@@ -2207,7 +2159,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
};
- int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_INSTR_VMRUN] = vmrun_interception,
[SVM_INSTR_VMLOAD] = vmload_interception,
[SVM_INSTR_VMSAVE] = vmsave_interception,
@@ -2216,17 +2168,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
int ret;
if (is_guest_mode(vcpu)) {
- svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
/* Returns '1' or -errno on failure, '0' on success. */
- ret = nested_svm_vmexit(svm);
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
if (ret)
return ret;
return 1;
}
- return svm_instr_handlers[opcode](svm);
+ return svm_instr_handlers[opcode](vcpu);
}
/*
@@ -2237,9 +2185,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
* regions (e.g. SMM memory on host).
* 2) VMware backdoor
*/
-static int gp_interception(struct vcpu_svm *svm)
+static int gp_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
u32 error_code = svm->vmcb->control.exit_info_1;
int opcode;
@@ -2304,73 +2252,58 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
}
}
-static int stgi_interception(struct vcpu_svm *svm)
+static int stgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, true);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
return ret;
}
-static int clgi_interception(struct vcpu_svm *svm)
+static int clgi_interception(struct kvm_vcpu *vcpu)
{
int ret;
- if (nested_svm_check_permissions(svm))
+ if (nested_svm_check_permissions(vcpu))
return 1;
- ret = kvm_skip_emulated_instruction(&svm->vcpu);
- svm_set_gif(svm, false);
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
return ret;
}
-static int invlpga_interception(struct vcpu_svm *svm)
+static int invlpga_interception(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
- kvm_rax_read(&svm->vcpu));
-
- /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
+ gva_t gva = kvm_rax_read(vcpu);
+ u32 asid = kvm_rcx_read(vcpu);
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
+ /* FIXME: Handle an address size prefix. */
+ if (!is_long_mode(vcpu))
+ gva = (u32)gva;
-static int skinit_interception(struct vcpu_svm *svm)
-{
- trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, gva);
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
- return kvm_emulate_wbinvd(&svm->vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int skinit_interception(struct kvm_vcpu *vcpu)
{
- u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_rcx_read(&svm->vcpu);
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
- int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
- return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int rdpru_interception(struct vcpu_svm *svm)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
-static int task_switch_interception(struct vcpu_svm *svm)
+static int task_switch_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
u16 tss_selector;
int reason;
int int_type = svm->vmcb->control.exit_int_info &
@@ -2399,7 +2332,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
if (reason == TASK_SWITCH_GATE) {
switch (type) {
case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = false;
+ vcpu->arch.nmi_injected = false;
break;
case SVM_EXITINTINFO_TYPE_EXEPT:
if (svm->vmcb->control.exit_info_2 &
@@ -2408,10 +2341,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
error_code =
(u32)svm->vmcb->control.exit_info_2;
}
- kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_exception_queue(vcpu);
break;
case SVM_EXITINTINFO_TYPE_INTR:
- kvm_clear_interrupt_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(vcpu);
break;
default:
break;
@@ -2422,77 +2355,58 @@ static int task_switch_interception(struct vcpu_svm *svm)
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
- if (!skip_emulated_instruction(&svm->vcpu))
+ if (!skip_emulated_instruction(vcpu))
return 0;
}
if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
int_vec = -1;
- return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
has_error_code, error_code);
}
-static int cpuid_interception(struct vcpu_svm *svm)
+static int iret_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_cpuid(&svm->vcpu);
-}
+ struct vcpu_svm *svm = to_svm(vcpu);
-static int iret_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.nmi_window_exits;
- svm->vcpu.arch.hflags |= HF_IRET_MASK;
- if (!sev_es_guest(svm->vcpu.kvm)) {
+ ++vcpu->stat.nmi_window_exits;
+ vcpu->arch.hflags |= HF_IRET_MASK;
+ if (!sev_es_guest(vcpu->kvm)) {
svm_clr_intercept(svm, INTERCEPT_IRET);
- svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
}
- kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
-static int invd_interception(struct vcpu_svm *svm)
-{
- /* Treat an INVD instruction as a NOP and just skip it. */
- return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
+static int invlpg_interception(struct kvm_vcpu *vcpu)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return kvm_emulate_instruction(&svm->vcpu, 0);
+ return kvm_emulate_instruction(vcpu, 0);
- kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
}
-static int emulate_on_interception(struct vcpu_svm *svm)
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction(&svm->vcpu, 0);
+ return kvm_emulate_instruction(vcpu, 0);
}
-static int rsm_interception(struct vcpu_svm *svm)
+static int rsm_interception(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
}
-static int rdpmc_interception(struct vcpu_svm *svm)
-{
- int err;
-
- if (!nrips)
- return emulate_on_interception(svm);
-
- err = kvm_rdpmc(&svm->vcpu);
- return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
unsigned long val)
{
- unsigned long cr0 = svm->vcpu.arch.cr0;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
bool ret = false;
- if (!is_guest_mode(&svm->vcpu) ||
+ if (!is_guest_mode(vcpu) ||
(!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
@@ -2509,17 +2423,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
#define CR_VALID (1ULL << 63)
-static int cr_interception(struct vcpu_svm *svm)
+static int cr_interception(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
int reg, cr;
unsigned long val;
int err;
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
- return emulate_on_interception(svm);
+ return emulate_on_interception(vcpu);
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
@@ -2530,61 +2445,61 @@ static int cr_interception(struct vcpu_svm *svm)
err = 0;
if (cr >= 16) { /* mov to cr */
cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_read(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- if (!check_selective_cr0_intercepted(svm, val))
- err = kvm_set_cr0(&svm->vcpu, val);
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
else
return 1;
break;
case 3:
- err = kvm_set_cr3(&svm->vcpu, val);
+ err = kvm_set_cr3(vcpu, val);