aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@s-opensource.com>2018-01-03 04:14:04 -0500
committerMauro Carvalho Chehab <mchehab@s-opensource.com>2018-01-03 04:14:04 -0500
commit3bdf481e39ff1d36c1f2e1b3862db2ac329b12cd (patch)
tree91ad50553511f1cb4583edce10c16371f69fa7aa
parentd0c8f6ad8b381dd572576ac50b9696d4d31142bb (diff)
parent30a7acd573899fd8b8ac39236eff6468b195ac7d (diff)
Merge tag 'v4.15-rc6' into patchwork
Linux 4.15-rc6 * tag 'v4.15-rc6': (734 commits) Linux 4.15-rc6 MAINTAINERS: mark arch/blackfin/ and its gubbins as orphaned x86/ldt: Make LDT pgtable free conditional x86/ldt: Plug memory leak in error path x86/mm: Remove preempt_disable/enable() from __native_flush_tlb() x86/smpboot: Remove stale TLB flush invocations objtool: Fix seg fault with clang-compiled objects objtool: Fix seg fault caused by missing parameter kbuild: add '-fno-stack-check' to kernel build options timerqueue: Document return values of timerqueue_add/del() timers: Invoke timer_start_debug() where it makes sense nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick() timers: Reinitialize per cpu bases on hotplug timers: Use deferrable base independent of base::nohz_active genirq/msi, x86/vector: Prevent reservation mode for non maskable MSI genirq/irqdomain: Rename early argument of irq_domain_activate_irq() x86/vector: Use IRQD_CAN_RESERVE flag genirq: Introduce IRQD_CAN_RESERVE flag genirq/msi: Handle reactivation only on success gpio: brcmstb: Make really use of the new lockdep class ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.rst1
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt18
-rw-r--r--Documentation/admin-guide/thunderbolt.rst2
-rw-r--r--Documentation/arm64/silicon-errata.txt1
-rw-r--r--Documentation/cgroup-v2.txt7
-rw-r--r--Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/da7218.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/da7219.txt2
-rw-r--r--Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt18
-rw-r--r--Documentation/filesystems/overlayfs.txt34
-rw-r--r--Documentation/locking/crossrelease.txt874
-rw-r--r--Documentation/vm/zswap.txt22
-rw-r--r--Documentation/x86/x86_64/mm.txt29
-rw-r--r--MAINTAINERS20
-rw-r--r--Makefile5
-rw-r--r--arch/arm/boot/dts/vf610-zii-dev-rev-c.dts4
-rw-r--r--arch/arm/lib/csumpartialcopyuser.S4
-rw-r--r--arch/arm64/Kconfig12
-rw-r--r--arch/arm64/include/asm/assembler.h10
-rw-r--r--arch/arm64/include/asm/cpufeature.h3
-rw-r--r--arch/arm64/include/asm/cputype.h2
-rw-r--r--arch/arm64/include/asm/pgtable.h41
-rw-r--r--arch/arm64/kernel/cpu-reset.S1
-rw-r--r--arch/arm64/kernel/cpufeature.c3
-rw-r--r--arch/arm64/kernel/efi-entry.S2
-rw-r--r--arch/arm64/kernel/fpsimd.c2
-rw-r--r--arch/arm64/kernel/head.S1
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c2
-rw-r--r--arch/arm64/kernel/relocate_kernel.S1
-rw-r--r--arch/arm64/kvm/hyp-init.S1
-rw-r--r--arch/arm64/kvm/hyp/debug-sr.c3
-rw-r--r--arch/arm64/mm/dump.c2
-rw-r--r--arch/arm64/mm/fault.c5
-rw-r--r--arch/arm64/mm/init.c3
-rw-r--r--arch/parisc/boot/compressed/misc.c4
-rw-r--r--arch/parisc/include/asm/thread_info.h5
-rw-r--r--arch/parisc/kernel/entry.S12
-rw-r--r--arch/parisc/kernel/hpmc.S1
-rw-r--r--arch/parisc/kernel/unwind.c1
-rw-r--r--arch/parisc/lib/delay.c2
-rw-r--r--arch/powerpc/include/asm/mmu_context.h5
-rw-r--r--arch/powerpc/kernel/process.c2
-rw-r--r--arch/powerpc/kvm/book3s_xive.c7
-rw-r--r--arch/powerpc/net/bpf_jit_comp64.c6
-rw-r--r--arch/powerpc/perf/core-book3s.c8
-rw-r--r--arch/powerpc/perf/imc-pmu.c17
-rw-r--r--arch/powerpc/sysdev/fsl_msi.c4
-rw-r--r--arch/riscv/include/asm/barrier.h19
-rw-r--r--arch/riscv/kernel/setup.c11
-rw-r--r--arch/riscv/kernel/sys_riscv.c2
-rw-r--r--arch/s390/include/asm/pgtable.h6
-rw-r--r--arch/s390/kernel/compat_linux.c1
-rw-r--r--arch/s390/net/bpf_jit_comp.c11
-rw-r--r--arch/sparc/lib/hweight.S4
-rw-r--r--arch/sparc/mm/fault_32.c2
-rw-r--r--arch/sparc/mm/fault_64.c2
-rw-r--r--arch/sparc/mm/gup.c4
-rw-r--r--arch/sparc/net/bpf_jit_comp_64.c6
-rw-r--r--arch/um/include/asm/Kbuild1
-rw-r--r--arch/um/include/asm/mmu_context.h3
-rw-r--r--arch/um/kernel/trap.c2
-rw-r--r--arch/unicore32/include/asm/mmu_context.h5
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/Kconfig.debug1
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/head_64.S16
-rw-r--r--arch/x86/boot/compressed/misc.c16
-rw-r--r--arch/x86/boot/compressed/pagetable.c3
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c28
-rw-r--r--arch/x86/boot/genimage.sh32
-rw-r--r--arch/x86/crypto/salsa20_glue.c7
-rw-r--r--arch/x86/entry/calling.h145
-rw-r--r--arch/x86/entry/entry_32.S14
-rw-r--r--arch/x86/entry/entry_64.S237
-rw-r--r--arch/x86/entry/entry_64_compat.S31
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c38
-rw-r--r--arch/x86/events/intel/core.c5
-rw-r--r--arch/x86/events/intel/ds.c130
-rw-r--r--arch/x86/events/perf_event.h23
-rw-r--r--arch/x86/include/asm/asm.h2
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h81
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/desc.h14
-rw-r--r--arch/x86/include/asm/disabled-features.h8
-rw-r--r--arch/x86/include/asm/espfix.h7
-rw-r--r--arch/x86/include/asm/fixmap.h7
-rw-r--r--arch/x86/include/asm/hypervisor.h25
-rw-r--r--arch/x86/include/asm/intel_ds.h36
-rw-r--r--arch/x86/include/asm/invpcid.h53
-rw-r--r--arch/x86/include/asm/irqdomain.h2
-rw-r--r--arch/x86/include/asm/irqflags.h3
-rw-r--r--arch/x86/include/asm/kdebug.h1
-rw-r--r--arch/x86/include/asm/mmu.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h113
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/pgalloc.h11
-rw-r--r--arch/x86/include/asm/pgtable.h30
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h15
-rw-r--r--arch/x86/include/asm/pgtable_64.h92
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h51
-rw-r--r--arch/x86/include/asm/processor-flags.h5
-rw-r--r--arch/x86/include/asm/processor.h82
-rw-r--r--arch/x86/include/asm/pti.h14
-rw-r--r--arch/x86/include/asm/stacktrace.h3
-rw-r--r--arch/x86/include/asm/suspend_32.h8
-rw-r--r--arch/x86/include/asm/suspend_64.h19
-rw-r--r--arch/x86/include/asm/switch_to.h13
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/tlbflush.h312
-rw-r--r--arch/x86/include/asm/trace/irq_vectors.h16
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/unwind.h7
-rw-r--r--arch/x86/include/asm/vsyscall.h1
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h7
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c2
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/msi.c8
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/vector.c20
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/asm-offsets.c10
-rw-r--r--arch/x86/kernel/asm-offsets_32.c9
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/cpu/common.c103
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c13
-rw-r--r--arch/x86/kernel/doublefault.c36
-rw-r--r--arch/x86/kernel/dumpstack.c81
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c12
-rw-r--r--arch/x86/kernel/head_64.S30
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/irq.c12
-rw-r--r--arch/x86/kernel/irq_64.c4
-rw-r--r--arch/x86/kernel/ldt.c198
-rw-r--r--arch/x86/kernel/machine_kexec_32.c4
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/process.c19
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c14
-rw-r--r--arch/x86/kernel/smpboot.c19
-rw-r--r--arch/x86/kernel/stacktrace.c6
-rw-r--r--arch/x86/kernel/tls.c11
-rw-r--r--arch/x86/kernel/traps.c77
-rw-r--r--arch/x86/kernel/unwind_orc.c88
-rw-r--r--arch/x86/kernel/vmlinux.lds.S17
-rw-r--r--arch/x86/kvm/emulate.c32
-rw-r--r--arch/x86/kvm/mmu.c8
-rw-r--r--arch/x86/kvm/vmx.c2
-rw-r--r--arch/x86/kvm/x86.c48
-rw-r--r--arch/x86/lib/delay.c4
-rw-r--r--arch/x86/lib/x86-opcode-map.txt13
-rw-r--r--arch/x86/mm/Makefile9
-rw-r--r--arch/x86/mm/cpu_entry_area.c166
-rw-r--r--arch/x86/mm/debug_pagetables.c80
-rw-r--r--arch/x86/mm/dump_pagetables.c141
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init.c80
-rw-r--r--arch/x86/mm/init_32.c6
-rw-r--r--arch/x86/mm/ioremap.c4
-rw-r--r--arch/x86/mm/kasan_init_64.c23
-rw-r--r--arch/x86/mm/kmmio.c12
-rw-r--r--arch/x86/mm/mem_encrypt.c4
-rw-r--r--arch/x86/mm/pgtable.c5
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/mm/pti.c387
-rw-r--r--arch/x86/mm/tlb.c64
-rw-r--r--arch/x86/pci/fixup.c27
-rw-r--r--arch/x86/platform/efi/efi_64.c5
-rw-r--r--arch/x86/platform/uv/tlb_uv.c2
-rw-r--r--arch/x86/platform/uv/uv_irq.c2
-rw-r--r--arch/x86/power/cpu.c115
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/enlighten.c81
-rw-r--r--arch/x86/xen/enlighten_pv.c5
-rw-r--r--arch/x86/xen/mmu_pv.c14
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--block/bio.c2
-rw-r--r--block/blk-map.c38
-rw-r--r--block/blk-throttle.c8
-rw-r--r--block/bounce.c6
-rw-r--r--block/kyber-iosched.c37
-rw-r--r--crypto/af_alg.c19
-rw-r--r--crypto/algif_aead.c18
-rw-r--r--crypto/algif_skcipher.c16
-rw-r--r--crypto/hmac.c6
-rw-r--r--crypto/mcryptd.c23
-rw-r--r--crypto/rsa_helper.c2
-rw-r--r--crypto/salsa20_generic.c7
-rw-r--r--crypto/shash.c5
-rw-r--r--crypto/skcipher.c10
-rw-r--r--drivers/acpi/apei/erst.c2
-rw-r--r--drivers/acpi/cppc_acpi.c2
-rw-r--r--drivers/acpi/device_pm.c2
-rw-r--r--drivers/acpi/nfit/core.c9
-rw-r--r--drivers/android/binder.c44
-rw-r--r--drivers/ata/ahci_mtk.c6
-rw-r--r--drivers/ata/ahci_qoriq.c12
-rw-r--r--drivers/ata/libata-core.c12
-rw-r--r--drivers/ata/pata_pdc2027x.c16
-rw-r--r--drivers/base/cacheinfo.c13
-rw-r--r--drivers/base/power/main.c15
-rw-r--r--drivers/block/null_blk.c4
-rw-r--r--drivers/char/ipmi/ipmi_si_intf.c44
-rw-r--r--drivers/char/ipmi/ipmi_si_parisc.c2
-rw-r--r--drivers/char/ipmi/ipmi_si_pci.c7
-rw-r--r--drivers/clk/clk.c8
-rw-r--r--drivers/clk/sunxi/clk-sun9i-mmc.c12
-rw-r--r--drivers/cpufreq/cpufreq_governor.c19
-rw-r--r--drivers/cpufreq/imx6q-cpufreq.c11
-rw-r--r--drivers/dma/at_hdmac.c4
-rw-r--r--drivers/dma/dma-jz4740.c4
-rw-r--r--drivers/dma/dmatest.c55
-rw-r--r--drivers/dma/fsl-edma.c28
-rw-r--r--drivers/dma/ioat/init.c2
-rw-r--r--drivers/gpio/gpio-bcm-kona.c3
-rw-r--r--drivers/gpio/gpio-brcmstb.c4
-rw-r--r--drivers/gpio/gpio-reg.c4
-rw-r--r--drivers/gpio/gpio-tegra.c4
-rw-r--r--drivers/gpio/gpio-xgene-sb.c2
-rw-r--r--drivers/gpio/gpiolib-acpi.c2
-rw-r--r--drivers/gpio/gpiolib-devprop.c17
-rw-r--r--drivers/gpio/gpiolib-of.c3
-rw-r--r--drivers/gpio/gpiolib.c27
-rw-r--r--drivers/gpio/gpiolib.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c2
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c13
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h2
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c51
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h1
-rw-r--r--drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c9
-rw-r--r--drivers/gpu/drm/amd/display/dc/core/dc_link.c4
-rw-r--r--drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c26
-rw-r--r--drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c9
-rw-r--r--drivers/gpu/drm/drm_connector.c63
-rw-r--r--drivers/gpu/drm/drm_crtc_internal.h1
-rw-r--r--drivers/gpu/drm/drm_edid.c52
-rw-r--r--drivers/gpu/drm/drm_lease.c26
-rw-r--r--drivers/gpu/drm/drm_mm.c8
-rw-r--r--drivers/gpu/drm/drm_mode_config.c5
-rw-r--r--drivers/gpu/drm/drm_plane.c42
-rw-r--r--drivers/gpu/drm/drm_syncobj.c77
-rw-r--r--drivers/gpu/drm/i915/gvt/display.c5
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c9
-rw-r--r--drivers/gpu/drm/i915/i915_sw_fence.c3
-rw-r--r--drivers/gpu/drm/i915/intel_breadcrumbs.c22
-rw-r--r--drivers/gpu/drm/i915/intel_ddi.c4
-rw-r--r--drivers/gpu/drm/i915/intel_display.c5
-rw-r--r--drivers/gpu/drm/i915/intel_lpe_audio.c2
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_bo.c5
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_drm.c2
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_drv.h11
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_fbcon.c2
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_mem.c6
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_ttm.c39
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_vmm.c2
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/engine/device/base.c2
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/bios/dp.c9
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c2
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c7
-rw-r--r--drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c20
-rw-r--r--drivers/gpu/drm/sun4i/sun4i_tcon.c4
-rw-r--r--drivers/gpu/drm/ttm/ttm_page_alloc.c3
-rw-r--r--drivers/gpu/drm/vc4/vc4_gem.c4
-rw-r--r--drivers/gpu/drm/vc4/vc4_irq.c1
-rw-r--r--drivers/hid/hid-core.c2
-rw-r--r--drivers/hid/hid-cp2112.c15
-rw-r--r--drivers/hid/hid-holtekff.c8
-rw-r--r--drivers/hv/vmbus_drv.c2
-rw-r--r--drivers/hwmon/hwmon.c21
-rw-r--r--drivers/hwtracing/stm/ftrace.c6
-rw-r--r--drivers/i2c/busses/i2c-cht-wc.c2
-rw-r--r--drivers/i2c/busses/i2c-piix4.c2
-rw-r--r--drivers/i2c/busses/i2c-stm32.h3
-rw-r--r--drivers/i2c/busses/i2c-stm32f4.c3
-rw-r--r--drivers/i2c/busses/i2c-stm32f7.c3
-rw-r--r--drivers/infiniband/core/cma.c2
-rw-r--r--drivers/infiniband/core/device.c2
-rw-r--r--drivers/infiniband/core/iwcm.c2
-rw-r--r--drivers/infiniband/core/nldev.c2
-rw-r--r--drivers/infiniband/core/security.c10
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c10
-rw-r--r--drivers/infiniband/core/verbs.c3
-rw-r--r--drivers/infiniband/hw/cxgb4/cq.c16
-rw-r--r--drivers/infiniband/hw/cxgb4/iw_cxgb4.h2
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c94
-rw-r--r--drivers/infiniband/hw/cxgb4/t4.h6
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h1
-rw-r--r--drivers/infiniband/hw/hfi1/pcie.c30
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c26
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.c11
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.h2
-rw-r--r--drivers/infiniband/hw/mlx5/main.c43
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h4
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c1
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma.h6
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c7
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c17
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c14
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c7
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c1
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c7
-rw-r--r--drivers/iommu/amd_iommu.c2
-rw-r--r--drivers/iommu/intel_irq_remapping.c2
-rw-r--r--drivers/irqchip/irq-gic-v3-its.c4
-rw-r--r--drivers/irqchip/irq-renesas-intc-irqpin.c6
-rw-r--r--drivers/leds/led-core.c2
-rw-r--r--drivers/md/dm-bufio.c8
-rw-r--r--drivers/md/dm-cache-target.c12
-rw-r--r--drivers/md/dm-mpath.c67
-rw-r--r--drivers/md/dm-snap.c48
-rw-r--r--drivers/md/dm-table.c5
-rw-r--r--drivers/md/dm-thin.c22
-rw-r--r--drivers/mfd/arizona-irq.c4
-rw-r--r--drivers/mfd/cros_ec_spi.c53
-rw-r--r--drivers/mfd/twl4030-audio.c9
-rw-r--r--drivers/mfd/twl6040.c12
-rw-r--r--drivers/misc/eeprom/at24.c26
-rw-r--r--drivers/misc/pti.c2
-rw-r--r--drivers/mmc/core/card.h2
-rw-r--r--drivers/mmc/core/mmc.c2
-rw-r--r--drivers/mmc/core/quirks.h8
-rw-r--r--drivers/mtd/mtdcore.c2
-rw-r--r--drivers/mtd/nand/brcmnand/brcmnand.c2
-rw-r--r--drivers/mtd/nand/gpio.c6
-rw-r--r--drivers/mtd/nand/gpmi-nand/gpmi-nand.c6
-rw-r--r--drivers/net/dsa/mv88e6xxx/port.c1
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_cfg.h5
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c16
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_hw.h29
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_nic.c82
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_nic.h2
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c5
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c17
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c29
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h6
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c80
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h6
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/ver.h6
-rw-r--r--drivers/net/ethernet/arc/emac.h2
-rw-r--r--drivers/net/ethernet/arc/emac_main.c164
-rw-r--r--drivers/net/ethernet/arc/emac_rockchip.c10
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c4
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c14
-rw-r--r--drivers/net/ethernet/broadcom/tg3.c21
-rw-r--r--drivers/net/ethernet/broadcom/tg3.h7
-rw-r--r--drivers/net/ethernet/freescale/fec_main.c6
-rw-r--r--drivers/net/ethernet/marvell/mvneta.c8
-rw-r--r--drivers/net/ethernet/marvell/skge.c1
-rw-r--r--drivers/net/ethernet/mediatek/mtk_eth_soc.c11
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_port.c57
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_selftest.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/mlx4_en.h3
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/resource_tracker.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/cmd.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h9
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c10
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c10
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c63
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eq.c20
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fs_core.c16
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/health.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/lag.c56
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c75
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/qp.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/rl.c22
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/vxlan.c64
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/vxlan.h1
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum.c18
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c15
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/main.c55
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/main.h8
-rw-r--r--drivers/net/ethernet/qualcomm/emac/emac-phy.c7
-rw-r--r--drivers/net/ethernet/qualcomm/emac/emac.c6
-rw-r--r--drivers/net/ethernet/renesas/ravb_main.c27
-rw-r--r--drivers/net/ethernet/renesas/sh_eth.c10
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/common.h2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c5
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/enh_desc.c3
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/norm_desc.c2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c6
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c2
-rw-r--r--drivers/net/hippi/rrunner.c2
-rw-r--r--drivers/net/phy/at803x.c4
-rw-r--r--drivers/net/phy/marvell.c18
-rw-r--r--drivers/net/phy/mdio-xgene.c21
-rw-r--r--drivers/net/phy/mdio_bus.c1
-rw-r--r--drivers/net/phy/meson-gxl.c74
-rw-r--r--drivers/net/phy/micrel.c1
-rw-r--r--drivers/net/phy/phy.c9
-rw-r--r--drivers/net/phy/phy_device.c10
-rw-r--r--drivers/net/phy/phylink.c2
-rw-r--r--drivers/net/usb/qmi_wwan.c2
-rw-r--r--drivers/net/vxlan.c19
-rw-r--r--drivers/net/wireless/mac80211_hwsim.c3
-rw-r--r--drivers/nvdimm/btt.c201
-rw-r--r--drivers/nvdimm/btt.h47
-rw-r--r--drivers/nvdimm/pfn_devs.c20
-rw-r--r--drivers/nvme/host/core.c11
-rw-r--r--drivers/nvme/host/fc.c1
-rw-r--r--drivers/nvmem/meson-mx-efuse.c4
-rw-r--r--drivers/of/of_mdio.c3
-rw-r--r--drivers/parisc/lba_pci.c33
-rw-r--r--drivers/pci/host/pci-hyperv.c8
-rw-r--r--drivers/pci/host/pcie-rcar.c8
-rw-r--r--drivers/pci/pci-driver.c9
-rw-r--r--drivers/phy/motorola/phy-cpcap-usb.c2
-rw-r--r--drivers/phy/renesas/Kconfig2
-rw-r--r--drivers/phy/rockchip/phy-rockchip-typec.c2
-rw-r--r--drivers/phy/tegra/xusb.c58
-rw-r--r--drivers/pinctrl/intel/pinctrl-cherryview.c16
-rw-r--r--drivers/pinctrl/pinctrl-single.c5
-rw-r--r--drivers/pinctrl/stm32/pinctrl-stm32.c2
-rw-r--r--drivers/platform/x86/asus-wireless.c1
-rw-r--r--drivers/platform/x86/dell-laptop.c17
-rw-r--r--drivers/platform/x86/dell-wmi.c2
-rw-r--r--drivers/s390/net/qeth_core.h6
-rw-r--r--drivers/s390/net/qeth_core_main.c15
-rw-r--r--drivers/s390/net/qeth_l3.h2
-rw-r--r--drivers/s390/net/qeth_l3_main.c36
-rw-r--r--drivers/s390/net/qeth_l3_sys.c75
-rw-r--r--drivers/scsi/aacraid/aacraid.h1
-rw-r--r--drivers/scsi/aacraid/commsup.c8
-rw-r--r--drivers/scsi/aacraid/linit.c2
-rw-r--r--drivers/scsi/bfa/bfad_bsg.c6
-rw-r--r--drivers/scsi/bfa/bfad_im.c6
-rw-r--r--drivers/scsi/bfa/bfad_im.h10
-rw-r--r--drivers/scsi/libfc/fc_lport.c4
-rw-r--r--drivers/scsi/libsas/sas_expander.c10
-rw-r--r--drivers/scsi/lpfc/lpfc_mem.c2
-rw-r--r--drivers/scsi/osd/osd_initiator.c4
-rw-r--r--drivers/scsi/scsi_debugfs.c6
-rw-r--r--drivers/scsi/scsi_devinfo.c33
-rw-r--r--drivers/scsi/scsi_lib.c2
-rw-r--r--drivers/scsi/scsi_scan.c13
-rw-r--r--drivers/scsi/scsi_sysfs.c10
-rw-r--r--drivers/scsi/scsi_transport_spi.c12
-rw-r--r--drivers/scsi/sd.c4
-rw-r--r--drivers/scsi/storvsc_drv.c3
-rw-r--r--drivers/spi/spi-armada-3700.c8
-rw-r--r--drivers/spi/spi-atmel.c2
-rw-r--r--drivers/spi/spi-rspi.c4
-rw-r--r--drivers/spi/spi-sun4i.c2
-rw-r--r--drivers/spi/spi-xilinx.c11
-rw-r--r--drivers/staging/android/ion/Kconfig2
-rw-r--r--drivers/staging/android/ion/ion.c4
-rw-r--r--drivers/staging/android/ion/ion_cma_heap.c15
-rw-r--r--drivers/staging/ccree/ssi_hash.c2
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c23
-rw-r--r--drivers/staging/pi433/rf69.c2
-rw-r--r--drivers/target/target_core_pscsi.c4
-rw-r--r--drivers/thunderbolt/nhi.c2
-rw-r--r--drivers/tty/n_tty.c4
-rw-r--r--drivers/usb/chipidea/ci_hdrc_msm.c2
-rw-r--r--drivers/usb/core/config.c6
-rw-r--r--drivers/usb/core/quirks.c6
-rw-r--r--drivers/usb/dwc2/core.h4
-rw-r--r--drivers/usb/dwc2/gadget.c42
-rw-r--r--drivers/usb/dwc2/params.c29
-rw-r--r--drivers/usb/dwc3/dwc3-of-simple.c5
-rw-r--r--drivers/usb/dwc3/gadget.c4
-rw-r--r--drivers/usb/gadget/Kconfig4
-rw-r--r--drivers/usb/gadget/legacy/Kconfig12
-rw-r--r--drivers/usb/host/xhci-debugfs.c16
-rw-r--r--drivers/usb/host/xhci-mem.c15
-rw-r--r--drivers/usb/host/xhci-pci.c3
-rw-r--r--drivers/usb/host/xhci-ring.c6
-rw-r--r--drivers/usb/host/xhci.c6
-rw-r--r--drivers/usb/musb/da8xx.c10
-rw-r--r--drivers/usb/serial/ftdi_sio.c1
-rw-r--r--drivers/usb/serial/ftdi_sio_ids.h6
-rw-r--r--drivers/usb/serial/option.c17
-rw-r--r--drivers/usb/serial/qcserial.c3
-rw-r--r--drivers/usb/storage/unusual_devs.h7
-rw-r--r--drivers/usb/storage/unusual_uas.h7
-rw-r--r--drivers/usb/usbip/stub_dev.c3
-rw-r--r--drivers/usb/usbip/stub_main.c5
-rw-r--r--drivers/usb/usbip/stub_rx.c47
-rw-r--r--drivers/usb/usbip/stub_tx.c13
-rw-r--r--drivers/usb/usbip/usbip_common.c16
-rw-r--r--drivers/usb/usbip/usbip_common.h1
-rw-r--r--drivers/usb/usbip/vhci_hcd.c12
-rw-r--r--drivers/usb/usbip/vhci_rx.c23
-rw-r--r--drivers/usb/usbip/vhci_sysfs.c25
-rw-r--r--drivers/usb/usbip/vhci_tx.c3
-rw-r--r--drivers/virtio/virtio_mmio.c43
-rw-r--r--drivers/xen/Kconfig2
-rw-r--r--drivers/xen/balloon.c65
-rw-r--r--fs/autofs4/waitq.c1
-rw-r--r--fs/ceph/mds_client.c42
-rw-r--r--fs/cifs/smb2ops.c3
-rw-r--r--fs/cifs/smb2pdu.c30
-rw-r--r--fs/cramfs/Kconfig1
-rw-r--r--fs/dax.c3
-rw-r--r--fs/exec.c14
-rw-r--r--fs/ext4/extents.c1
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c9
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/namespace.c1
-rw-r--r--fs/nfs/client.c11
-rw-r--r--fs/nfs/nfs4client.c17
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/nfsd/auth.c3
-rw-r--r--fs/overlayfs/Kconfig10
-rw-r--r--fs/overlayfs/dir.c3
-rw-r--r--fs/overlayfs/namei.c18
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/readdir.c7
-rw-r--r--fs/overlayfs/super.c87
-rw-r--r--fs/super.c37
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr.c20
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c9
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_defer.c39
-rw-r--r--fs/xfs/libxfs/xfs_defer.h5
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c10
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h1
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c4
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c52
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c99
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h16
-rw-r--r--fs/xfs/scrub/scrub.c1
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_fsops.c5
-rw-r--r--fs/xfs/xfs_icache.c35
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c61
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_qm.c4
-rw-r--r--fs/xfs/xfs_reflink.c23
-rw-r--r--fs/xfs/xfs_super.c9
-rw-r--r--fs/xfs/xfs_symlink.c15
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--include/asm-generic/mm_hooks.h5
-rw-r--r--include/asm-generic/pgtable.h5
-rw-r--r--include/crypto/internal/hash.h8
-rw-r--r--include/crypto/mcryptd.h1
-rw-r--r--include/drm/drm_connector.h10
-rw-r--r--include/drm/drm_edid.h2
-rw-r--r--include/drm/drm_mode_config.h18
-rw-r--r--include/kvm/arm_arch_timer.h2
-rw-r--r--include/linux/bio.h2
-rw-r--r--include/linux/blk_types.h9
-rw-r--r--include/linux/blkdev.h25
-rw-r--r--include/linux/bpf_verifier.h4
-rw-r--r--include/linux/compiler.h47
-rw-r--r--include/linux/completion.h45
-rw-r--r--include/linux/cpuhotplug.h2
-rw-r--r--include/linux/cred.h1
-rw-r--r--include/linux/gpio/driver.h33
-rw-r--r--include/linux/idr.h1
-rw-r--r--include/linux/intel-pti.h43
-rw-r--r--include/linux/ipv6.h3
-rw-r--r--include/linux/irq.h17
-rw-r--r--include/linux/irqdesc.h9
-rw-r--r--include/linux/irqdomain.h2
-rw-r--r--include/linux/lockdep.h125
-rw-r--r--include/linux/mfd/rtsx_pci.h2
-rw-r--r--include/linux/mlx5/driver.h7
-rw-r--r--include/linux/mlx5/mlx5_ifc.h8
-rw-r--r--include/linux/oom.h9
-rw-r--r--include/linux/pci.h3
-rw-r--r--include/linux/pm.h1
-rw-r--r--include/linux/pti.h50
-rw-r--r--include/linux/ptr_ring.h9
-rw-r--r--include/linux/rbtree.h2
-rw-r--r--include/linux/rwlock_types.h3
-rw-r--r--include/linux/sched.h17
-rw-r--r--include/linux/sched/coredump.h1
-rw-r--r--include/linux/spi/spi.h2
-rw-r--r--include/linux/spinlock.h5
-rw-r--r--include/linux/spinlock_types.h3
-rw-r--r--include/linux/string.h5
-rw-r--r--include/linux/tick.h1
-rw-r--r--include/linux/timer.h4
-rw-r--r--include/linux/trace.h2
-rw-r--r--include/net/cfg80211.h1
-rw-r--r--include/net/gue.h18
-rw-r--r--include/net/ip.h1
-rw-r--r--include/net/pkt_cls.h5
-rw-r--r--include/net/sch_generic.h1
-rw-r--r--include/net/sock.h5
-rw-r--r--include/net/xfrm.h3
-rw-r--r--include/trace/events/clk.h4
-rw-r--r--include/trace/events/kvm.h7
-rw-r--r--include/trace/events/preemptirq.h11
-rw-r--r--include/trace/events/tcp.h97
-rw-r--r--include/uapi/linux/pkt_sched.h1
-rw-r--r--include/uapi/linux/rtnetlink.h1
-rw-r--r--include/xen/balloon.h5
-rw-r--r--init/Kconfig6
-rw-r--r--init/main.c16
-rw-r--r--kernel/bpf/hashtab.c2
-rw-r--r--kernel/bpf/verifier.c283
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/stat.c8
-rw-r--r--kernel/cpu.c12
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/groups.c5
-rw-r--r--kernel/irq/debug.h5
-rw-r--r--kernel/irq/debugfs.c1
-rw-r--r--kernel/irq/generic-chip.c11
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdomain.c13
-rw-r--r--kernel/irq/msi.c64
-rw-r--r--kernel/kcov.c4
-rw-r--r--kernel/locking/lockdep.c652
-rw-r--r--kernel/locking/spinlock.c13
-rw-r--r--kernel/sched/core.c22
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/rt.c8
-rw-r--r--kernel/time/Kconfig1
-rw-r--r--kernel/time/posix-timers.c29
-rw-r--r--kernel/time/tick-sched.c32
-rw-r--r--kernel/time/timer.c35
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/bpf_trace.c19
-rw-r--r--kernel/trace/ring_buffer.c18
-rw-r--r--kernel/trace/trace.c54
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/workqueue.c33
-rw-r--r--lib/Kconfig.debug33
-rw-r--r--lib/kobject_uevent.c16
-rw-r--r--lib/rbtree.c10
-rw-r--r--lib/test_bpf.c43
-rw-r--r--lib/timerqueue.c8
-rw-r--r--mm/backing-dev.c5
-rw-r--r--mm/early_ioremap.c2
-rw-r--r--mm/frame_vector.c6
-rw-r--r--mm/gup.c2
-rw-r--r--mm/hmm.c8
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/memory.c11
-rw-r--r--mm/mmap.c10
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page_alloc.c11
-rw-r--r--mm/percpu.c4
-rw-r--r--mm/slab.c23
-rw-r--r--net/batman-adv/bat_iv_ogm.c4
-rw-r--r--net/batman-adv/bat_v.c2
-rw-r--r--net/batman-adv/fragmentation.c2
-rw-r--r--net/batman-adv/tp_meter.c4
-rw-r--r--net/bridge/br_netlink.c11
-rw-r--r--net/core/dev.c2
-rw-r--r--net/core/net_namespace.c2
-rw-r--r--net/core/netprio_cgroup.c1
-rw-r--r--net/core/skbuff.c17
-rw-r--r--net/dsa/slave.c1
-rw-r--r--net/ipv4/devinet.c2
-rw-r--r--net/ipv4/fib_frontend.c9
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/igmp.c44
-rw-r--r--net/ipv4/ip_gre.c3
-rw-r--r--net/ipv4/ip_tunnel.c4
-rw-r--r--net/ipv4/netfilter/arp_tables.c1
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c3
-rw-r--r--net/ipv4/raw.c15
-rw-r--r--net/ipv4/tcp_input.c10
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_timer.c2
-rw-r--r--net/ipv4/xfrm4_input.c12
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/ipv6/ip6_gre.c58
-rw-r--r--net/ipv6/ip6_output.c12
-rw-r--r--net/ipv6/ip6_tunnel.c9
-rw-r--r--net/ipv6/ipv6_sockglue.c1
-rw-r--r--net/ipv6/mcast.c25
-rw-r--r--net/ipv6/netfilter/ip6_tables.c1
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c8
-rw-r--r--net/ipv6/route.c20
-rw-r--r--net/ipv6/tcp_ipv6.c2
-rw-r--r--net/ipv6/xfrm6_input.c10
-rw-r--r--net/mac80211/ht.c5
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c128
-rw-r--r--net/netfilter/nf_conntrack_netlink.c13
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c3
-rw-r--r--net/netfilter/nf_tables_api.c7
-rw-r--r--net/netfilter/nfnetlink_cthelper.c10
-rw-r--r--net/netfilter/nfnetlink_log.c5
-rw-r--r--net/netfilter/nfnetlink_queue.c5
-rw-r--r--net/netfilter/nft_exthdr.c2
-rw-r--r--net/netfilter/x_tables.c9
-rw-r--r--net/netfilter/xt_bpf.c6
-rw-r--r--net/netfilter/xt_osf.c7
-rw-r--r--net/netlink/af_netlink.c3
-rw-r--r--net/openvswitch/flow.c15
-rw-r--r--net/rds/send.c3
-rw-r--r--net/sched/act_meta_mark.c1
-rw-r--r--net/sched/act_meta_skbtcindex.c1
-rw-r--r--net/sched/cls_api.c3
-rw-r--r--net/sched/cls_bpf.c93
-rw-r--r--net/sched/cls_u32.c1
-rw-r--r--net/sched/sch_api.c2
-rw-r--r--net/sched/sch_generic.c4
-rw-r--r--net/sched/sch_ingress.c15
-rw-r--r--net/sched/sch_red.c31
-rw-r--r--net/sctp/debug.c3
-rw-r--r--net/sctp/socket.c10
-rw-r--r--net/sctp/ulpqueue.c24
-rw-r--r--net/strparser/strparser.c2
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c1
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c1
-rw-r--r--net/sunrpc/svcauth_unix.c2
-rw-r--r--net/sunrpc/xprt.c28
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c6
-rw-r--r--net/sunrpc/xprtrdma/transport.c2
-rw-r--r--net/sunrpc/xprtrdma/verbs.c2
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h1
-rw-r--r--net/tipc/bearer.c5
-rw-r--r--net/tipc/group.c47
-rw-r--r--net/tipc/monitor.c6
-rw-r--r--net/tipc/socket.c4
-rw-r--r--net/wireless/Makefile39
-rw-r--r--net/wireless/certs/sforshee.hex86
-rw-r--r--net/wireless/certs/sforshee.x509bin680 -> 0 bytes
-rw-r--r--net/wireless/nl80211.c6
-rw-r--r--net/xfrm/xfrm_input.c69
-rw-r--r--net/xfrm/xfrm_policy.c9
-rw-r--r--net/xfrm/xfrm_state.c1
-rw-r--r--net/xfrm/xfrm_user.c26
-rwxr-xr-xscripts/checkpatch.pl22
-rwxr-xr-xscripts/faddr2line8
-rw-r--r--security/Kconfig10
-rw-r--r--sound/core/rawmidi.c15
-rw-r--r--sound/hda/hdac_i915.c2
-rw-r--r--sound/pci/hda/patch_conexant.c29
-rw-r--r--sound/pci/hda/patch_hdmi.c6
-rw-r--r--sound/pci/hda/patch_realtek.c49
-rw-r--r--sound/soc/amd/acp-pcm-dma.c7
-rw-r--r--sound/soc/atmel/Kconfig2
-rw-r--r--sound/soc/codecs/da7218.c2
-rw-r--r--sound/soc/codecs/msm8916-wcd-analog.c2
-rw-r--r--sound/soc/codecs/msm8916-wcd-digital.c4
-rw-r--r--sound/soc/codecs/nau8825.c1
-rw-r--r--sound/soc/codecs/rt5514-spi.c15
-rw-r--r--sound/soc/codecs/rt5514.c2
-rw-r--r--sound/soc/codecs/rt5645.c2
-rw-r--r--sound/soc/codecs/rt5663.c4
-rw-r--r--sound/soc/codecs/rt5663.h4
-rw-r--r--sound/soc/codecs/tlv320aic31xx.h2
-rw-r--r--sound/soc/codecs/twl4030.c4
-rw-r--r--sound/soc/codecs/wm_adsp.c12
-rw-r--r--sound/soc/fsl/fsl_asrc.h4
-rw-r--r--sound/soc/fsl/fsl_ssi.c44
-rw-r--r--sound/soc/intel/boards/kbl_rt5663_max98927.c2
-rw-r--r--sound/soc/intel/boards/kbl_rt5663_rt5514_max98927.c2
-rw-r--r--sound/soc/intel/skylake/skl-nhlt.c15
-rw-r--r--sound/soc/intel/skylake/skl-topology.c2
-rw-r--r--sound/soc/rockchip/rockchip_spdif.c18
-rw-r--r--sound/soc/sh/rcar/adg.c6
-rw-r--r--sound/soc/sh/rcar/core.c4
-rw-r--r--sound/soc/sh/rcar/dma.c86
-rw-r--r--sound/soc/sh/rcar/ssi.c16
-rw-r--r--sound/soc/sh/rcar/ssiu.c5
-rw-r--r--sound/usb/mixer.c27
-rw-r--r--sound/usb/quirks.c7
-rw-r--r--tools/arch/s390/include/uapi/asm/bpf_perf_event.h2
-rw-r--r--tools/arch/s390/include/uapi/asm/perf_regs.h44
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h1
-rw-r--r--tools/bpf/bpftool/map.c8
-rw-r--r--tools/bpf/bpftool/prog.c2
-rw-r--r--tools/include/linux/compiler.h21
-rw-r--r--tools/include/linux/lockdep.h1
-rw-r--r--tools/include/uapi/asm/bpf_perf_event.h7
-rw-r--r--tools/include/uapi/linux/kvm.h4
-rwxr-xr-xtools/kvm/kvm_stat/kvm_stat74
-rw-r--r--tools/kvm/kvm_stat/kvm_stat.txt4
-rw-r--r--tools/objtool/arch/x86/decode.c2
-rw-r--r--tools/objtool/arch/x86/lib/x86-opcode-map.txt15
-rw-r--r--tools/objtool/builtin-orc.c4
-rw-r--r--tools/objtool/orc_gen.c2
-rw-r--r--tools/perf/Makefile.config9
-rw-r--r--tools/perf/arch/s390/include/perf_regs.h2
-rwxr-xr-xtools/perf/check-headers.sh1
-rw-r--r--tools/perf/jvmti/jvmti_agent.c16
-rw-r--r--tools/perf/jvmti/jvmti_agent.h7
-rw-r--r--tools/perf/jvmti/libjvmti.c147
-rw-r--r--tools/perf/util/intel-pt-decoder/x86-opcode-map.txt13
-rw-r--r--tools/perf/util/mmap.h2
-rw-r--r--tools/testing/selftests/bpf/Makefile17
-rw-r--r--tools/testing/selftests/bpf/test_progs.c8
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c629
-rw-r--r--tools/testing/selftests/net/config1
-rw-r--r--tools/testing/selftests/x86/ldt_gdt.c12
-rw-r--r--tools/usb/usbip/libsrc/vhci_driver.c10
-rw-r--r--tools/usb/usbip/src/utils.c9
-rw-r--r--tools/virtio/ringtest/ptr_ring.c29
-rw-r--r--tools/vm/slabinfo-gnuplot.sh2
-rw-r--r--virt/kvm/arm/arch_timer.c40
-rw-r--r--virt/kvm/arm/arm.c2
-rw-r--r--virt/kvm/arm/mmio.c6
-rw-r--r--virt/kvm/arm/mmu.c10
806 files changed, 9729 insertions, 5644 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index b2598cc9834c..7242cbda15dd 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -109,6 +109,7 @@ parameter is applicable::
IPV6 IPv6 support is enabled.
ISAPNP ISA PnP code is enabled.
ISDN Appropriate ISDN support is enabled.
+ ISOL CPU Isolation is enabled.
JOY Appropriate joystick support is enabled.
KGDB Kernel debugger support is enabled.
KVM Kernel Virtual Machine support is enabled.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index dfb7c35b5826..56c1e6005132 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -328,11 +328,15 @@
not play well with APC CPU idle - disable it if you have
APC and your system crashes randomly.
- apic= [APIC,X86-32] Advanced Programmable Interrupt Controller
+ apic= [APIC,X86] Advanced Programmable Interrupt Controller
Change the output verbosity whilst booting
Format: { quiet (default) | verbose | debug }
Change the amount of debugging information output
when initialising the APIC and IO-APIC components.
+ For X86-32, this can also be used to specify an APIC
+ driver name.
+ Format: apic=driver_name
+ Examples: apic=bigsmp
apic_extnmi= [APIC,X86] External NMI delivery setting
Format: { bsp (default) | all | none }
@@ -1737,7 +1741,7 @@
isapnp= [ISAPNP]
Format: <RDP>,<reset>,<pci_scan>,<verbosity>
- isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
+ isolcpus= [KNL,SMP,ISOL] Isolate a given set of CPUs from disturbance.
[Deprecated - use cpusets instead]
Format: [flag-list,]<cpu-list>
@@ -2665,7 +2669,7 @@
Valid arguments: on, off
Default: on
- nohz_full= [KNL,BOOT]
+ nohz_full= [KNL,BOOT,SMP,ISOL]
The argument is a cpu list, as described above.
In kernels built with CONFIG_NO_HZ_FULL=y, set
the specified list of CPUs whose tick will be stopped
@@ -2711,6 +2715,8 @@
steal time is computed, but won't influence scheduler
behaviour
+ nopti [X86-64] Disable kernel page table isolation
+
nolapic [X86-32,APIC] Do not enable or use the local APIC.
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@@ -3285,6 +3291,12 @@
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
+ pti= [X86_64]
+ Control user/kernel address space isolation:
+ on - enable
+ off - disable
+ auto - default setting
+
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index de50a8561774..9b55952039a6 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -230,7 +230,7 @@ If supported by your machine this will be exposed by the WMI bus with
a sysfs attribute called "force_power".
For example the intel-wmi-thunderbolt driver exposes this attribute in:
- /sys/devices/platform/PNP0C14:00/wmi_bus/wmi_bus-PNP0C14:00/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
+ /sys/bus/wmi/devices/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
To force the power to on, write 1 to this attribute file.
To disable force power, write 0 to this attribute file.
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt
index 304bf22bb83c..fc1c884fea10 100644
--- a/Documentation/arm64/silicon-errata.txt
+++ b/Documentation/arm64/silicon-errata.txt
@@ -75,3 +75,4 @@ stable kernels.
| Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
| Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 |
+| Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 |
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 779211fbb69f..2cddab7efb20 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -898,6 +898,13 @@ controller implements weight and absolute bandwidth limit models for
normal scheduling policy and absolute bandwidth allocation model for
realtime scheduling policy.
+WARNING: cgroup2 doesn't yet support control of realtime processes and
+the cpu controller can only be enabled when all RT processes are in
+the root cgroup. Be aware that system management software may already
+have placed RT processes into nonroot cgroups during the system boot
+process, and these processes may need to be moved to the root cgroup
+before the cpu controller can be enabled.
+
CPU Interface Files
~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
index 376fa2f50e6b..956bb046e599 100644
--- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
+++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
@@ -13,7 +13,6 @@ Required properties:
at25df321a
at25df641
at26df081a
- en25s64
mr25h128
mr25h256
mr25h10
@@ -33,7 +32,6 @@ Required properties:
s25fl008k
s25fl064k
sst25vf040b
- sst25wf040b
m25p40
m25p80
m25p16
diff --git a/Documentation/devicetree/bindings/sound/da7218.txt b/Documentation/devicetree/bindings/sound/da7218.txt
index 5ca5a709b6aa..3ab9dfef38d1 100644
--- a/Documentation/devicetree/bindings/sound/da7218.txt
+++ b/Documentation/devicetree/bindings/sound/da7218.txt
@@ -73,7 +73,7 @@ Example:
compatible = "dlg,da7218";
reg = <0x1a>;
interrupt-parent = <&gpio6>;
- interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
wakeup-source;
VDD-supply = <&reg_audio>;
diff --git a/Documentation/devicetree/bindings/sound/da7219.txt b/Documentation/devicetree/bindings/sound/da7219.txt
index cf61681826b6..5b54d2d045c3 100644
--- a/Documentation/devicetree/bindings/sound/da7219.txt
+++ b/Documentation/devicetree/bindings/sound/da7219.txt
@@ -77,7 +77,7 @@ Example:
reg = <0x1a>;
interrupt-parent = <&gpio6>;
- interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
VDD-supply = <&reg_audio>;
VDDMIC-supply = <&reg_audio>;
diff --git a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
index 5bf13960f7f4..e3c48b20b1a6 100644
--- a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
+++ b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
@@ -12,24 +12,30 @@ Required properties:
- "fsl,imx53-ecspi" for SPI compatible with the one integrated on i.MX53 and later Soc
- reg : Offset and length of the register set for the device
- interrupts : Should contain CSPI/eCSPI interrupt
-- cs-gpios : Specifies the gpio pins to be used for chipselects.
- clocks : Clock specifiers for both ipg and per clocks.
- clock-names : Clock names should include both "ipg" and "per"
See the clock consumer binding,
Documentation/devicetree/bindings/clock/clock-bindings.txt
-- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
- Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: DMA request names should include "tx" and "rx" if present.
-Obsolete properties:
-- fsl,spi-num-chipselects : Contains the number of the chipselect
+Recommended properties:
+- cs-gpios : GPIOs to use as chip selects, see spi-bus.txt. While the native chip
+select lines can be used, they appear to always generate a pulse between each
+word of a transfer. Most use cases will require GPIO based chip selects to
+generate a valid transaction.
Optional properties:
+- num-cs : Number of total chip selects, see spi-bus.txt.
+- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
+Documentation/devicetree/bindings/dma/dma.txt.
+- dma-names: DMA request names, if present, should include "tx" and "rx".
- fsl,spi-rdy-drctl: Integer, representing the value of DRCTL, the register
controlling the SPI_READY handling. Note that to enable the DRCTL consideration,
the SPI_READY mode-flag needs to be set too.
Valid values are: 0 (disabled), 1 (edge-triggered burst) and 2 (level-triggered burst).
+Obsolete properties:
+- fsl,spi-num-chipselects : Contains the number of the chipselect
+
Example:
ecspi@70010000 {
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index 8caa60734647..e6a5f4912b6d 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -156,6 +156,40 @@ handle it in two different ways:
root of the overlay. Finally the directory is moved to the new
location.
+There are several ways to tune the "redirect_dir" feature.
+
+Kernel config options:
+
+- OVERLAY_FS_REDIRECT_DIR:
+ If this is enabled, then redirect_dir is turned on by default.
+- OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW:
+ If this is enabled, then redirects are always followed by default. Enabling
+ this results in a less secure configuration. Enable this option only when
+ worried about backward compatibility with kernels that have the redirect_dir
+ feature and follow redirects even if turned off.
+
+Module options (can also be changed through /sys/module/overlay/parameters/*):
+
+- "redirect_dir=BOOL":
+ See OVERLAY_FS_REDIRECT_DIR kernel config option above.
+- "redirect_always_follow=BOOL":
+ See OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW kernel config option above.
+- "redirect_max=NUM":
+ The maximum number of bytes in an absolute redirect (default is 256).
+
+Mount options:
+
+- "redirect_dir=on":
+ Redirects are enabled.
+- "redirect_dir=follow":
+ Redirects are not created, but followed.
+- "redirect_dir=off":
+ Redirects are not created and only followed if "redirect_always_follow"
+ feature is enabled in the kernel/module config.
+- "redirect_dir=nofollow":
+ Redirects are not created and not followed (equivalent to "redirect_dir=off"
+ if "redirect_always_follow" feature is not enabled).
+
Non-directories
---------------
diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt
deleted file mode 100644
index bdf1423d5f99..000000000000
--- a/Documentation/locking/crossrelease.txt
+++ /dev/null
@@ -1,874 +0,0 @@
-Crossrelease
-============
-
-Started by Byungchul Park <byungchul.park@lge.com>
-
-Contents:
-
- (*) Background
-
- - What causes deadlock
- - How lockdep works
-
- (*) Limitation
-
- - Limit lockdep
- - Pros from the limitation
- - Cons from the limitation
- - Relax the limitation
-
- (*) Crossrelease
-
- - Introduce crossrelease
- - Introduce commit
-
- (*) Implementation
-
- - Data structures
- - How crossrelease works
-
- (*) Optimizations
-
- - Avoid duplication
- - Lockless for hot paths
-
- (*) APPENDIX A: What lockdep does to work aggresively
-
- (*) APPENDIX B: How to avoid adding false dependencies
-
-
-==========
-Background
-==========
-
-What causes deadlock
---------------------
-
-A deadlock occurs when a context is waiting for an event to happen,
-which is impossible because another (or the) context who can trigger the
-event is also waiting for another (or the) event to happen, which is
-also impossible due to the same reason.
-
-For example:
-
- A context going to trigger event C is waiting for event A to happen.
- A context going to trigger event A is waiting for event B to happen.
- A context going to trigger event B is waiting for event C to happen.
-
-A deadlock occurs when these three wait operations run at the same time,
-because event C cannot be triggered if event A does not happen, which in
-turn cannot be triggered if event B does not happen, which in turn
-cannot be triggered if event C does not happen. After all, no event can
-be triggered since any of them never meets its condition to wake up.
-
-A dependency might exist between two waiters and a deadlock might happen
-due to an incorrect releationship between dependencies. Thus, we must
-define what a dependency is first. A dependency exists between them if:
-
- 1. There are two waiters waiting for each event at a given time.
- 2. The only way to wake up each waiter is to trigger its event.
- 3. Whether one can be woken up depends on whether the other can.
-
-Each wait in the example creates its dependency like:
-
- Event C depends on event A.
- Event A depends on event B.
- Event B depends on event C.
-
- NOTE: Precisely speaking, a dependency is one between whether a
- waiter for an event can be woken up and whether another waiter for
- another event can be woken up. However from now on, we will describe
- a dependency as if it's one between an event and another event for
- simplicity.
-
-And they form circular dependencies like:
-
- -> C -> A -> B -
- / \
- \ /
- ----------------
-
- where 'A -> B' means that event A depends on event B.
-
-Such circular dependencies lead to a deadlock since no waiter can meet
-its condition to wake up as described.
-
-CONCLUSION
-
-Circular dependencies cause a deadlock.
-
-
-How lockdep works
------------------
-
-Lockdep tries to detect a deadlock by checking dependencies created by
-lock operations, acquire and release. Waiting for a lock corresponds to
-waiting for an event, and releasing a lock corresponds to triggering an
-event in the previous section.
-
-In short, lockdep does:
-
- 1. Detect a new dependency.
- 2. Add the dependency into a global graph.
- 3. Check if that makes dependencies circular.
- 4. Report a deadlock or its possibility if so.
-
-For example, consider a graph built by lockdep that looks like:
-
- A -> B -
- \
- -> E
- /
- C -> D -
-
- where A, B,..., E are different lock classes.
-
-Lockdep will add a dependency into the graph on detection of a new
-dependency. For example, it will add a dependency 'E -> C' when a new
-dependency between lock E and lock C is detected. Then the graph will be:
-
- A -> B -
- \
- -> E -
- / \
- -> C -> D - \
- / /
- \ /
- ------------------
-
- where A, B,..., E are different lock classes.
-
-This graph contains a subgraph which demonstrates circular dependencies:
-
- -> E -
- / \
- -> C -> D - \
- / /
- \ /
- ------------------
-
- where C, D and E are different lock classes.
-
-This is the condition under which a deadlock might occur. Lockdep
-reports it on detection after adding a new dependency. This is the way
-how lockdep works.
-
-CONCLUSION
-
-Lockdep detects a deadlock or its possibility by checking if circular
-dependencies were created after adding each new dependency.
-
-
-==========
-Limitation
-==========
-
-Limit lockdep
--------------
-
-Limiting lockdep to work on only typical locks e.g. spin locks and
-mutexes, which are released within the acquire context, the
-implementation becomes simple but its capacity for detection becomes
-limited. Let's check pros and cons in next section.
-
-
-Pros from the limitation
-------------------------
-
-Given the limitation, when acquiring a lock, locks in a held_locks
-cannot be released if the context cannot acquire it so has to wait to
-acquire it, which means all waiters for the locks in the held_locks are
-stuck. It's an exact case to create dependencies between each lock in
-the held_locks and the lock to acquire.
-
-For example:
-
- CONTEXT X
- ---------
- acquire A
- acquire B /* Add a dependency 'A -> B' */
- release B
- release A
-
- where A and B are different lock classes.
-
-When acquiring lock A, the held_locks of CONTEXT X is empty thus no
-dependency is added. But when acquiring lock B, lockdep detects and adds
-a new dependency 'A -> B' between lock A in the held_locks and lock B.
-They can be simply added whenever acquiring each lock.
-
-And data required by lockdep exists in a local structure, held_locks
-embedded in task_struct. Forcing to access the data within the context,
-lockdep can avoid racy problems without explicit locks while handling
-the local data.
-
-Lastly, lockdep only needs to keep locks currently being held, to build
-a dependency graph. However, relaxing the limitation, it needs to keep
-even locks already released, because a decision whether they created
-dependencies might be long-deferred.
-
-To sum up, we can expect several advantages from the limitation:
-
- 1. Lockdep can easily identify a dependency when acquiring a lock.
- 2. Races are avoidable while accessing local locks in a held_locks.
- 3. Lockdep only needs to keep locks currently being held.
-
-CONCLUSION
-
-Given the limitation, the implementation becomes simple and efficient.
-
-
-Cons from the limitation
-------------------------
-
-Given the limitation, lockdep is applicable only to typical locks. For
-example, page locks for page access or completions for synchronization
-cannot work with lockdep.
-
-Can we detect deadlocks below, under the limitation?
-
-Example 1:
-
- CONTEXT X CONTEXT Y CONTEXT Z
- --------- --------- ----------
- mutex_lock A
- lock_page B
- lock_page B
- mutex_lock A /* DEADLOCK */
- unlock_page B held by X
- unlock_page B
- mutex_unlock A
- mutex_unlock A
-
- where A and B are different lock classes.
-
-No, we cannot.
-
-Example 2:
-
- CONTEXT X CONTEXT Y
- --------- ---------
- mutex_lock A
- mutex_lock A
- wait_for_complete B /* DEADLOCK */
- complete B
- mutex_unlock A
- mutex_unlock A
-
- where A is a lock class and B is a completion variable.
-
-No, we cannot.
-
-CONCLUSION
-
-Given the limitation, lockdep cannot detect a deadlock or its
-possibility caused by page locks or completions.
-
-
-Relax the limitation
---------------------
-
-Under the limitation, things to create dependencies are limited to
-typical locks. However, synchronization primitives like page locks and
-completions, which are allowed to be released in any context, also
-create dependencies and can cause a deadlock. So lockdep should track
-these locks to do a better job. We have to relax the limitation for
-these locks to work with lockdep.
-
-Detecting dependencies is very important for lockdep to work because
-adding a dependency means adding an opportunity to check whether it
-causes a deadlock. The more lockdep adds dependencies, the more it
-thoroughly works. Thus Lockdep has to do its best to detect and add as
-many true dependencies into a graph as possible.
-
-For example, considering only typical locks, lockdep builds a graph like:
-
- A -> B -
- \
- -> E
- /
- C -> D -
-
- where A, B,..., E are different lock classes.
-
-On the other hand, under the relaxation, additional dependencies might
-be created and added. Assuming additional 'FX -> C' and 'E -> GX' are
-added thanks to the relaxation, the graph will be:
-
- A -> B -
- \
- -> E -> GX
- /
- FX -> C -> D -
-
- where A, B,..., E, FX and GX are different lock classes, and a suffix
- 'X' is added on non-typical locks.
-
-The latter graph gives us more chances to check circular dependencies
-than the former. However, it might suffer performance degradation since
-relaxing the limitation, with which design and implementation of lockdep
-can be efficient, might introduce inefficiency inevitably. So lockdep
-should provide two options, strong detection and efficient detection.
-
-Choosing efficient detection:
-
- Lockdep works with only locks restricted to be released within the
- acquire context. However, lockdep works efficiently.
-
-Choosing strong detection:
-
- Lockdep works with all synchronization primitives. However, lockdep
- suffers performance degradation.
-
-CONCLUSION
-
-Relaxing the limitation, lockdep can add additional dependencies giving
-additional opportunities to check circular dependencies.
-
-
-============
-Crossrelease
-============
-
-Introduce crossrelease
-----------------------
-
-In order to allow lockdep to handle additional dependencies by what
-might be released in any context, namely 'crosslock', we have to be able
-to identify those created by crosslocks. The proposed 'crossrelease'
-feature provoides a way to do that.
-
-Crossrelease feature has to do:
-
- 1. Identify dependencies created by crosslocks.
- 2. Add the dependencies into a dependency graph.
-
-That's all. Once a meaningful dependency is added into graph, then
-lockdep would work with the graph as it did. The most important thing
-crossrelease feature has to do is to correctly identify and add true
-dependencies into the global graph.
-
-A dependency e.g. 'A -> B' can be identified only in the A's release
-context because a decision required to identify the dependency can be
-made only in the release context. That is to decide whether A can be
-released so that a waiter for A can be woken up. It cannot be made in
-other than the A's release context.
-
-It's no matter for typical locks because each acquire context is same as
-its release context, thus lockdep can decide whether a lock can be
-released in the acquire context. However for crosslocks, lockdep cannot
-make the decision in the acquire context but has to wait until the
-release context is identified.
-
-Therefore, deadlocks by crosslocks cannot be detected just when it
-happens, because those cannot be identified until the crosslocks are
-released. However, deadlock possibilities can be detected and it's very
-worth. See 'APPENDIX A' section to check why.
-
-CONCLUSION
-
-Using crossrelease feature, lockdep can work with what might be released
-in any context, namely crosslock.
-
-
-Introduce commit
-----------------
-
-Since crossrelease defers the work adding true dependencies of
-crosslocks until they are actually released, crossrelease has to queue
-all acquisitions which might create dependencies with the crosslocks.
-Then it identifies dependencies using the queued data in batches at a
-proper time. We call it 'commit'.
-
-There are four types of dependencies:
-
-1. TT type: 'typical lock A -> typical lock B'
-
- Just when acquiring B, lockdep can see it's in the A's release
- context. So the dependency between A and B can be identified
- immediately. Commit is unnecessary.
-
-2. TC type: 'typical lock A -> crosslock BX'
-
- Just when acquiring BX, lockdep can see it's in the A's release
- context. So the dependency between A and BX can be identified
- immediately. Commit is unnecessary, too.
-
-3. CT type: 'crosslock AX -> typical lock B'
-
- When acquiring B, lockdep cannot identify the dependency because
- there's no way to know if it's in the AX's release context. It has
- to wait until the decision can be made. Commit is necessary.
-
-4. CC type: 'crosslock AX -> crosslock BX'
-
- When acquiring BX, lockdep cannot identify the dependency because
- there's no way to know if it's in the AX's release context. It has
- to wait until the decision can be made. Commit is necessary.
- But, handling CC type is not implemented yet. It's a future work.
-
-Lockdep can work without commit for typical locks, but commit step is
-necessary once crosslocks are involved. Introducing commit, lockdep
-performs three steps. What lockdep does in each step is:
-
-1. Acquisition: For typical locks, lockdep does what it originally did
- and queues the lock so that CT type dependencies can be checked using
- it at the commit step. For crosslocks, it saves data which will be
- used at the commit step and increases a reference count for it.
-
-2. Commit: No action is reauired for typical locks. For crosslocks,
- lockdep adds CT type dependencies using the data saved at the
- acquisition step.
-
-3. Release: No changes are required for typical locks. When a crosslock
- is released, it decreases a reference count for it.
-
-CONCLUSION
-
-Crossrelease introduces commit step to handle dependencies of crosslocks
-in batches at a proper time.
-
-
-==============
-Implementation
-==============
-
-Data structures
----------------
-
-Crossrelease introduces two main data structures.
-
-1. hist_lock
-
- This is an array embedded in task_struct, for keeping lock history so
- that dependencies can be added using them at the commit step. Since
- it's local data, it can be accessed locklessly in the owner context.
- The array is filled at the acquisition step and consumed at the
- commit step. And it's managed in circular manner.
-
-2. cross_lock
-
- One per lockdep_map exists. This is for keeping data of crosslocks
- and used at the commit step.
-
-
-How crossrelease works
-----------------------
-
-It's the key of how crossrelease works, to defer necessary works to an
-appropriate point in time and perform in at once at the commit step.
-Let's take a look with examples step by step, starting from how lockdep
-works without crossrelease for typical locks.
-
- acquire A /* Push A onto held_locks */
- acquire B /* Push B onto held_locks and add 'A -> B' */
- acquire C /* Push C onto held_locks and add 'B -> C' */
- release C /* Pop C from held_locks */
- release B /* Pop B from held_locks */
- release A /* Pop A from held_locks */
-
- where A, B and C are different lock classes.
-
- NOTE: This document assumes that readers already understand how
- lockdep works without crossrelease thus omits details. But there's
- one thing to note. Lockdep pretends to pop a lock from held_locks
- when releasing it. But it's subtly different from the original pop
- operation because lockdep allows other than the top to be poped.
-
-In this case, lockdep adds 'the top of held_locks -> the lock to acquire'
-dependency every time acquiring a lock.
-
-After adding 'A -> B', a dependency graph will be:
-
- A -> B
-
- where A and B are different lock classes.
-
-And after adding 'B -> C', the graph will be:
-
- A -> B -> C
-
- where A, B and C are different lock classes.
-
-Let's performs commit step even for typical locks to add dependencies.
-Of course, commit step is not necessary for them, however, it would work
-well because this is a more general way.
-
- acquire A
- /*
- * Queue A into hist_locks
- *
- * In hist_locks: A
- * In graph: Empty
- */
-
- acquire B
- /*
- * Queue B into hist_locks
- *
- * In hist_locks: A, B
- * In graph: Empty
- */
-
- acquire C
- /*
- * Queue C into hist_locks
- *
- * In hist_locks: A, B, C
- * In graph: Empty
- */
-
- commit C
- /*
- * Add 'C -> ?'
- * Answer the following to decide '?'
- * What has been queued since acquire C: Nothing
- *
- * In hist_locks: A, B, C
- * In graph: Empty
- */
-
- release C
-
- commit B
- /*
- * Add 'B -> ?'
- * Answer the following to decide '?'
- * What has been queued since acquire B: C
- *
- * In hist_locks: A, B, C
- * In graph: 'B -> C'
- */
-
- release B
-
- commit A
- /*
- * Add 'A -> ?'
- * Answer the following to decide '?'
- * What has been queued since acquire A: B, C
- *
- * In hist_locks: A, B, C
- * In graph: 'B -> C', 'A -> B', 'A -> C'
- */
-
- release A
-
- where A, B and C are different lock classes.
-
-In this case, dependencies are added at the commit step as described.
-
-After commits for A, B and C, the graph will be:
-
- A -> B -> C
-
- where A, B and C are different lock classes.
-
- NOTE: A dependency 'A -> C' is optimized out.
-
-We can see the former graph built without commit step is same as the
-latter graph built using commit steps. Of course the former way leads to
-earlier finish for building the graph, which means we can detect a
-deadlock or its possibility sooner. So the former way would be prefered
-when possible. But we cannot avoid using the latter way for crosslocks.
-
-Let's look at how commit steps work for crosslocks. In this case, the
-commit step is performed only on crosslock AX as real. And it assumes
-that the AX release context is different from the AX acquire context.
-
- BX RELEASE CONTEXT BX ACQUIRE CONTEXT
- ------------------ ------------------
- acquire A
- /*
- * Push A onto held_locks
- * Queue A into hist_locks
- *
- * In held_locks: A
- * In hist_locks: A
- * In graph: Empty
- */
-
- acquire BX
- /*
- * Add 'the top of held_locks -> BX'
- *
- * In held_locks: A
- * In hist_locks: A
- * In graph: 'A -> BX'
- */
-
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- It must be guaranteed that the following operations are seen after
- acquiring BX globally. It can be done by things like barrier.
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- acquire C
- /*
- * Push C onto held_locks
- * Queue C into hist_locks
- *
- * In held_locks: C
- * In hist_locks: C
- * In graph: 'A -> BX'
- */
-
- release C
- /*
- * Pop C from held_locks
- *
- * In held_locks: Empty
- * In hist_locks: C
- * In graph: 'A -> BX'
- */
- acquire D
- /*
- * Push D onto held_locks
- * Queue D into hist_locks
- * Add 'the top of held_locks -> D'
- *
- * In held_locks: A, D
- * In hist_locks: A, D
- * In graph: 'A -> BX', 'A -> D'
- */
- acquire E
- /*
- * Push E onto held_locks
- * Queue E into hist_locks
- *
- * In held_locks: E
- * In hist_locks: C, E
- * In graph: 'A -> BX', 'A -> D'
- */
-
- release E
- /*
- * Pop E from held_locks
- *
- * In held_locks: Empty
- * In hist_locks: D, E
- * In graph: 'A -> BX', 'A -> D'
- */
- release D
- /*
- * Pop D from held_locks
- *
- * In held_locks: A
- * In hist_locks: A, D
- * In graph: 'A -> BX', 'A -> D'
- */
- commit BX
- /*
- * Add 'BX -> ?'
- * What has been queued since acquire BX: C, E
- *
- * In held_locks: Empty
- * In hist_locks: D, E
- * In graph: 'A -> BX', 'A -> D',
- * 'BX -> C', 'BX -> E'
- */
-
- release BX
- /*
- * In held_locks: Empty
- * In hist_locks: D, E
- * In graph: 'A -> BX', 'A -> D',
- * 'BX -> C', 'BX -> E'
- */
- release A
- /*
- * Pop A from held_locks
- *
- * In held_locks: Empty
- * In hist_locks: A, D
- * In graph: 'A -> BX', 'A -> D',
- * 'BX -> C', 'BX -> E'
- */
-
- where A, BX, C,..., E are different lock classes, and a suffix 'X' is
- added on crosslocks.
-
-Crossrelease considers all acquisitions after acqiuring BX are
-candidates which might create dependencies with BX. True dependencies
-will be determined when identifying the release context of BX. Meanwhile,
-all typical locks are queued so that they can be used at the commit step.
-And then two dependencies 'BX -> C' and 'BX -> E' are added at the
-commit step when identifying the release context.
-
-The final graph will be, with crossrelease:
-
- -> C
- /
- -> BX -
- / \
- A - -> E
- \
- -> D
-
- where A, BX, C,..., E are different lock classes, and a suffix 'X' is
- added on crosslocks.
-
-However, the final graph will be, without crossrelease:
-
- A -> D
-
- where A and D are different lock classes.
-
-The former graph has three more dependencies, 'A -> BX', 'BX -> C' and
-'BX -> E' giving additional opportunities to check if they cause
-deadlocks. This way lockdep can detect a deadlock or its possibility
-caused by crosslocks.
-
-CONCLUSION
-
-We checked how crossrelease works with several examples.
-
-
-=============
-Optimizations
-=============
-
-Avoid duplication
------------------
-
-Crossrelease feature uses a cache like what lockdep already uses for
-dependency chains, but this time it's for caching CT type dependencies.
-Once that dependency is cached, the same will never be added again.
-
-
-Lockless for hot paths
-----------------------
-
-To keep all locks for later use at the commit step, crossrelease adopts
-a local array embedded in task_struct, which makes access to the data
-lockless by forcing it to happen only within the owner context. It's
-like how lockdep handles held_locks. Lockless implmentation is important
-since typical locks are very frequently acquired and released.
-
-
-=================================================
-APPENDIX A: What lockdep does to work aggresively
-=================================================
-
-A deadlock actually occurs when all wait operations creating circular
-dependencies run at the same time. Even though they don't, a potential
-deadlock exists if the problematic dependencies exist. Thus it's
-meaningful to detect not only an actual deadlock but also its potential
-possibility. The latter is rather valuable. When a deadlock occurs
-actually, we can identify what happens in the system by some means or
-other even without lockdep. However, there's no way to detect possiblity
-without lockdep unless the whole code is parsed in head. It's terrible.
-Lockdep does the both, and crossrelease only focuses on the latter.
-
-Whether or not a deadlock actually occurs depends on several factors.
-For example, what order contexts are switched in is a factor. Assuming
-circular dependencies exist, a deadlock would occur when contexts are
-switched so that all wait operations creating the dependencies run
-simultaneously. Thus to detect a deadlock possibility even in the case
-that it has not occured yet, lockdep should consider all possible
-combinations of dependencies, trying to:
-
-1. Use a global dependency graph.
-
- Lockdep combines all dependencies into one global graph and uses them,
- regardless of which context generates them or what order contexts are
- switched in. Aggregated dependencies are only considered so they are
- prone to be circular if a problem exists.
-
-2. Check dependencies between classes instead of instances.
-
- What actually causes a deadlock are instances of lock. However,
- lockdep checks dependencies between classes instead of instances.
- This way lockdep can detect a deadlock which has not happened but
- might happen in future by others but the same class.
-
-3. Assume all acquisitions lead to waiting.
-
- Although locks might be acquired without waiting which is essential
- to create dependencies, lockdep assumes all acquisitions lead to
- waiting since it might be true some time or another.
-
-CONCLUSION
-
-Lockdep detects not only an actual deadlock but also its possibility,
-and the latter is more valuable.
-
-
-==================================================
-APPENDIX B: How to avoid adding false dependencies
-==================================================
-
-Remind what a dependency is. A dependency exists if:
-
- 1. There are two waiters waiting for each event at a given time.
- 2. The only way to wake up each waiter is to trigger its event.
- 3. Whether one can be woken up depends on whether the other can.
-
-For example:
-
- acquire A
- acquire B /* A dependency 'A -> B' exists */
- release B
- release A
-
- where A and B are different lock classes.
-
-A depedency 'A -> B' exists since:
-
- 1. A waiter for A and a waiter for B might exist when acquiring B.
- 2. Only way to wake up each is to release what it waits for.
- 3. Whether the waiter for A can be woken up depends on whether the
- other can. IOW, TASK X cannot release A if it fails to acquire B.
-
-For another example:
-
- TASK X TASK Y
- ------ ------
- acquire AX
- acquire B /* A dependency 'AX -> B' exists */
- release B
- release AX held by Y
-
- where AX and B are different lock classes, and a suffix 'X' is added
- on crosslocks.
-
-Even in this case involving crosslocks, the same rule can be applied. A
-depedency 'AX -> B' exists since:
-
- 1. A waiter for AX and a waiter for B might exist when acquiring B.
- 2. Only way to wake up each is to release what it waits for.
- 3. Whether the waiter for AX can be woken up depends on whether the
- other can. IOW, TASK X cannot release AX if it fails to acquire B.
-
-Let's take a look at more complicated example:
-
- TASK X TASK Y
- ------ ------
- acquire B
- release B
- fork Y
- acquire AX
- acquire C /* A dependency 'AX -> C' exists */
- release C
- release AX held by Y
-
- where AX, B and C are different lock classes, and a suffix 'X' is
- added on crosslocks.
-
-Does a dependency 'AX -> B' exist? Nope.
-
-Two waiters are essential to create a dependency. However, waiters for
-AX and B to create 'AX -> B' cannot exist at the same time in this
-example. Thus the dependency 'AX -> B' cannot be created.
-
-It would be ideal if the full set of true ones can be considered. But
-we can ensure nothing but what actually happened. Relying on what
-actually happens at runtime, we can anyway add only true ones, though
-they might be a subset of true ones. It's similar to how lockdep works
-for typical locks. There might be more true dependencies than what
-lockdep has detected in runtime. Lockdep has no choice but to rely on
-what actually happens. Crossrelease also relies on it.
-
-CONCLUSION
-
-Relying on what actually happens, lockdep can avoid adding false
-dependencies.
diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
index 89fff7d611cc..0b3a1148f9f0 100644
--- a/Documentation/vm/zswap.txt
+++ b/Documentation/vm/zswap.txt
@@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is uncompressed using its
original compressor. Once all pages are removed from an old zpool, the zpool
and its compressor are freed.
+Some of the pages in zswap are same-value filled pages (i.e. contents of the
+page have same value or repetitive pattern). These pages include zero-filled
+pages and they are handled differently. During store operation, a page is
+checked if it is a same-value filled page before compressing it. If true, the
+compressed length of the page is set to zero and the pattern or same-filled
+value is stored.
+
+Same-value filled pages identification feature is enabled by default and can be
+disabled at boot time by setting the "same_filled_pages_enabled" attribute to 0,
+e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled at
+runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
+
+echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
+
+When zswap same-filled page identification is disabled at runtime, it will stop
+checking for the same-value filled pages during store operation. However, the
+existing pages which are marked as same-value filled pages remain stored
+unchanged in zswap until they are either loaded or invalidated.
+
A debugfs interface is provided for various statistic about pool size, number
-of pages stored, and various counters for the reasons pages are rejected.
+of pages stored, same-value filled pages and various counters for the reasons
+pages are rejected.
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 3448e675b462..ad41b3813f0a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -1,6 +1,4 @@
-<previous description obsolete, deleted>
-
Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
@@ -14,13 +12,16 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable)
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Virtual memory map with 5 level page tables:
@@ -29,26 +30,29 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Architecture defines a 64-bit virtual address. Implementations can support
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes hole between user space and kernel addresses if you interpret them
+as unsigned.
The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
@@ -58,9 +62,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
the processes using the page fault handler, with init_top_pgt as
reference.
-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
-
We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
@@ -72,5 +73,3 @@ following fixmap section.
Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
physical memory, vmalloc/ioremap space and virtual memory map are randomized.
Their order is preserved but their base will be offset early at boot time.
-
--Andi Kleen, Jul 2004
diff --git a/MAINTAINERS b/MAINTAINERS
index 80212f8ce60f..58797b83dd8d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2613,24 +2613,22 @@ F: fs/bfs/
F: include/uapi/linux/bfs_fs.h
BLACKFIN ARCHITECTURE
-M: Steven Miao <realmz6@gmail.com>
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
T: git git://git.code.sf.net/p/adi-linux/code
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: arch/blackfin/
BLACKFIN EMAC DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/net/ethernet/adi/
BLACKFIN MEDIA DRIVER
-M: Scott Jiang <scott.jiang.linux@gmail.com>
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org/
-S: Supported
+S: Orphan
F: drivers/media/platform/blackfin/
F: drivers/media/i2c/adv7183*
F: drivers/media/i2c/vs6624*
@@ -2638,25 +2636,25 @@ F: drivers/media/i2c/vs6624*
BLACKFIN RTC DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/rtc/rtc-bfin.c
BLACKFIN SDH DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/mmc/host/bfin_sdh.c
BLACKFIN SERIAL DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/tty/serial/bfin_uart.c
BLACKFIN WATCHDOG DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/watchdog/bfin_wdt.c
BLINKM RGB LED DRIVER
@@ -5423,7 +5421,7 @@ F: drivers/media/tuners/fc2580*
FCOE SUBSYSTEM (libfc, libfcoe, fcoe)
M: Johannes Thumshirn <jth@kernel.org>
-L: fcoe-devel@open-fcoe.org
+L: linux-scsi@vger.kernel.org
W: www.Open-FCoE.org
S: Supported
F: drivers/scsi/libfc/
@@ -13128,6 +13126,7 @@ F: drivers/dma/dw/
SYNOPSYS DESIGNWARE ENTERPRISE ETHERNET DRIVER
M: Jie Deng <jiedeng@synopsys.com>
+M: Jose Abreu <Jose.Abreu@synopsys.com>
L: netdev@vger.kernel.org
S: Supported
F: drivers/net/ethernet/synopsys/
@@ -13512,6 +13511,7 @@ M: Mika Westerberg <mika.westerberg@linux.intel.com>
M: Yehezkel Bernat <yehezkel.bernat@intel.com>
T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
S: Maintained
+F: Documentation/admin-guide/thunderbolt.rst
F: drivers/thunderbolt/
F: include/linux/thunderbolt.h
diff --git a/Makefile b/Makefile
index 3f4d157add54..eb1f5973813e 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 4
PATCHLEVEL = 15
SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc6
NAME = Fearless Coyote
# *DOCUMENTATION*
@@ -789,6 +789,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
+# Make sure -fstack-check isn't enabled (like gentoo apparently did)
+KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
+
# conserve stack if available
KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts
index fbb3758ca2e3..4b8edc8982cf 100644
--- a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts
+++ b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts
@@ -121,7 +121,7 @@
switch0port10: port@10 {
reg = <10>;
label = "dsa";
- phy-mode = "xgmii";
+ phy-mode = "xaui";
link = <&switch1port10>;
};
};
@@ -208,7 +208,7 @@
switch1port10: port@10 {
reg = <10>;
label = "dsa";
- phy-mode = "xgmii";
+ phy-mode = "xaui";
link = <&switch0port10>;
};
};
diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S
index 1712f132b80d..b83fdc06286a 100644
--- a/arch/arm/lib/csumpartialcopyuser.S
+++ b/arch/arm/lib/csumpartialcopyuser.S
@@ -85,7 +85,11 @@
.pushsection .text.fixup,"ax"
.align 4
9001: mov r4, #-EFAULT
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ ldr r5, [sp, #9*4] @ *err_ptr
+#else
ldr r5, [sp, #8*4] @ *err_ptr
+#endif
str r4, [r5]
ldmia sp, {r1, r2} @ retrieve dst, len
add r2, r2, r1
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a93339f5178f..c9a7e9e1414f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -557,7 +557,6 @@ config QCOM_QDF2400_ERRATUM_0065
If unsure, say Y.
-
config SOCIONEXT_SYNQUACER_PREITS
bool "Socionext Synquacer: Workaround for GICv3 pre-ITS"
default y
@@ -576,6 +575,17 @@ config HISILICON_ERRATUM_161600802
a 128kB offset to be applied to the target address in this commands.
If unsure, say Y.
+
+config QCOM_FALKOR_ERRATUM_E1041
+ bool "Falkor E1041: Speculative instruction fetches might cause errant memory access"
+ default y
+ help
+ Falkor CPU may speculatively fetch instructions from an improper
+ memory location when MMU translation is changed from SCTLR_ELn[M]=1
+ to SCTLR_ELn[M]=0. Prefix an ISB instruction to fix the problem.
+
+ If unsure, say Y.
+
endmenu
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index aef72d886677..8b168280976f 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -512,4 +512,14 @@ alternative_else_nop_endif
#endif
.endm
+/**
+ * Errata workaround prior to disable MMU. Insert an ISB immediately prior
+ * to executing the MSR that will change SCTLR_ELn[M] from a value of 1 to 0.
+ */
+ .macro pre_disable_mmu_workaround
+#ifdef CONFIG_QCOM_FALKOR_ERRATUM_E1041
+ isb
+#endif
+ .endm
+
#endif /* __ASM_ASSEMBLER_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index ac67cfc2585a..060e3a4008ab 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -60,6 +60,9 @@ enum ftr_type {
#define FTR_VISIBLE true /* Feature visible to the user space */
#define FTR_HIDDEN false /* Feature is hidden from the user */
+#define FTR_VISIBLE_IF_IS_ENABLED(config) \
+ (IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN)
+
struct arm64_ftr_bits {
bool sign; /* Value is signed ? */
bool visible;
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 235e77d98261..cbf08d7cbf30 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -91,6 +91,7 @@
#define BRCM_CPU_PART_VULCAN 0x516
#define QCOM_CPU_PART_FALKOR_V1 0x800
+#define QCOM_CPU_PART_FALKOR 0xC00
#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
@@ -99,6 +100,7 @@
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
#define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
+#define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR)
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 149d05fb9421..bdcc7f1c9d06 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -42,6 +42,8 @@
#include <asm/cmpxchg.h>
#include <asm/fixmap.h>
#include <linux/mmdebug.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>
extern void __pte_error(const char *file, int line, unsigned long val);
extern void __pmd_error(const char *file, int line, unsigned long val);
@@ -149,12 +151,20 @@ static inline pte_t pte_mkwrite(pte_t pte)
static inline pte_t pte_mkclean(pte_t pte)
{
- return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
+ pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
+ pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
+
+ return pte;
}
static inline pte_t pte_mkdirty(pte_t pte)
{
- return set_pte_bit(pte, __pgprot(PTE_DIRTY));
+ pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
+
+ if (pte_write(pte))
+ pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
+
+ return pte;
}
static inline pte_t pte_mkold(pte_t pte)
@@ -207,9 +217,6 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
}
}
-struct mm_struct;
-struct vm_area_struct;
-
extern void __sync_icache_dcache(pte_t pteval, unsigned long addr);
/*
@@ -238,7 +245,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
* hardware updates of the pte (ptep_set_access_flags safely changes
* valid ptes without going through an invalid entry).
*/
- if (pte_valid(*ptep) && pte_valid(pte)) {
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(*ptep) && pte_valid(pte) &&
+ (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) {
VM_WARN_ONCE(!pte_young(pte),
"%s: racy access flag clearing: 0x%016llx -> 0x%016llx",
__func__, pte_val(*ptep), pte_val(pte));
@@ -641,28 +649,23 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * ptep_set_wrprotect - mark read-only while preserving the hardware update of
- * the Access Flag.
+ * ptep_set_wrprotect - mark read-only while trasferring potential hardware
+ * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
*/
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
pte_t old_pte, pte;
- /*
- * ptep_set_wrprotect() is only called on CoW mappings which are
- * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE &&
- * PTE_RDONLY) or writable and software-dirty (PTE_WRITE &&
- * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and
- * protection_map[]. There is no race with the hardware update of the
- * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM)
- * is set.
- */
- VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep),
- "%s: potential race with hardware DBM", __func__);
pte = READ_ONCE(*ptep);
do {
old_pte = pte;
+ /*
+ * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY
+ * clear), set the PTE_DIRTY bit.
+ */
+ if (pte_hw_dirty(pte))
+ pte = pte_mkdirty(pte);
pte = pte_wrprotect(pte);
pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
pte_val(old_pte), pte_val(pte));
diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S
index 65f42d257414..2a752cb2a0f3 100644
--- a/arch/arm64/kernel/cpu-reset.S
+++ b/arch/arm64/kernel/cpu-reset.S
@@ -37,6 +37,7 @@ ENTRY(__cpu_soft_restart)
mrs x12, sctlr_el1
ldr x13, =SCTLR_ELx_FLAGS
bic x12, x12, x13
+ pre_disable_mmu_workaround
msr sctlr_el1, x12
isb
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c5ba0097887f..a73a5928f09b 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -145,7 +145,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0),
S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S
index 4e6ad355bd05..6b9736c3fb56 100644
--- a/arch/arm64/kernel/efi-entry.S
+++ b/arch/arm64/kernel/efi-entry.S
@@ -96,6 +96,7 @@ ENTRY(entry)
mrs x0, sctlr_el2
bic x0, x0, #1 << 0 // clear SCTLR.M
bic x0, x0, #1 << 2 // clear SCTLR.C
+ pre_disable_mmu_workaround
msr sctlr_el2, x0
isb
b 2f
@@ -103,6 +104,7 @@ ENTRY(entry)
mrs x0, sctlr_el1
bic x0, x0, #1 << 0 // clear SCTLR.M
bic x0, x0, #1 << 2 // clear SCTLR.C
+ pre_disable_mmu_workaround
msr sctlr_el1, x0
isb
2:
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 540a1e010eb5..fae81f7964b4 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1043,7 +1043,7 @@ void fpsimd_update_current_state(struct fpsimd_state *state)
local_bh_disable();
- current->thread.fpsimd_state = *state;
+ current->thread.fpsimd_state.user_fpsimd = state->user_fpsimd;
if (system_supports_sve() && test_thread_flag(TIF_SVE))
fpsimd_to_sve(current);
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 67e86a0f57ac..e3cb9fbf96b6 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -750,6 +750,7 @@ __primary_switch:
* to take into account by discarding the current kernel mapping and
* creating a new one.
*/
+ pre_disable_mmu_workaround
msr sctlr_el1, x20 // disable the MMU
isb
bl __create_page_tables // recreate kernel mapping
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index 749f81779420..74bb56f656ef 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -28,6 +28,7 @@
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/smp.h>
+#include <linux/uaccess.h>
#include <asm/compat.h>
#include <asm/current.h>
@@ -36,7 +37,6 @@
#include <asm/traps.h>
#include <asm/cputype.h>
#include <asm/system_misc.h>
-#include <asm/uaccess.h>
/* Breakpoint currently in use for each BRP. */
static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]);
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
index ce704a4aeadd..f407e422a720 100644
--- a/arch/arm64/kernel/relocate_kernel.S
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -45,6 +45,7 @@ ENTRY(arm64_relocate_new_kernel)
mrs x0, sctlr_el2
ldr x1, =SCTLR_ELx_FLAGS
bic x0, x0, x1
+ pre_disable_mmu_workaround
msr sctlr_el2, x0
isb
1:
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 3f9615582377..870828c364c5 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -151,6 +151,7 @@ reset:
mrs x5, sctlr_el2
ldr x6, =SCTLR_ELx_FLAGS
bic x5, x5, x6 // Clear SCTL_M and etc
+ pre_disable_mmu_workaround
msr sctlr_el2, x5
isb
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index 321c9c05dd9e..f4363d40e2cd 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
{
u64 reg;
+ /* Clear pmscr in case of early return */
+ *pmscr_el1 = 0;
+
/* SPE present on this CPU? */
if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
ID_AA64DFR0_PMSVER_SHIFT))
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index ca74a2aace42..7b60d62ac593 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -389,7 +389,7 @@ void ptdump_check_wx(void)
.check_wx = true,
};
- walk_pgd(&st, &init_mm, 0);
+ walk_pgd(&st, &init_mm, VA_START);
note_page(&st, 0, 0, 0);
if (st.wx_pages || st.uxn_pages)
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 22168cd0dde7..9b7f89df49db 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -574,7 +574,6 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
struct siginfo info;
const struct fault_info *inf;
- int ret = 0;
inf = esr_to_fault_info(esr);
pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
@@ -589,7 +588,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
if (interrupts_enabled(regs))
nmi_enter();
- ret = ghes_notify_sea();
+ ghes_notify_sea();
if (interrupts_enabled(regs))
nmi_exit();
@@ -604,7 +603,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
info.si_addr = (void __user *)addr;
arm64_notify_die("", regs, &info, esr);
- return ret;
+ return 0;
}
static const struct fault_info fault_info[] = {
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 5960bef0170d..00e7b900ca41 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -476,6 +476,8 @@ void __init arm64_memblock_init(void)
reserve_elfcorehdr();
+ high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
+
dma_contiguous_reserve(arm64_dma_phys_limit);
memblock_allow_resize();
@@ -502,7 +504,6 @@ void __init bootmem_init(void)
sparse_init();
zone_sizes_init(min, max);
- high_memory = __va((max << PAGE_SHIFT) - 1) + 1;
memblock_dump_all();
}
diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c
index 9345b44b86f0..f57118e1f6b4 100644
--- a/arch/parisc/boot/compressed/misc.c
+++ b/arch/parisc/boot/compressed/misc.c
@@ -123,8 +123,8 @@ int puts(const char *s)
while ((nuline = strchr(s, '\n')) != NULL) {
if (nuline != s)
pdc_iodc_print(s, nuline - s);
- pdc_iodc_print("\r\n", 2);
- s = nuline + 1;
+ pdc_iodc_print("\r\n", 2);
+ s = nuline + 1;
}
if (*s != '\0')
pdc_iodc_print(s, strlen(s));
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index c980a02a52bc..598c8d60fa5e 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -35,7 +35,12 @@ struct thread_info {
/* thread information allocation */
+#ifdef CONFIG_IRQSTACKS
+#define THREAD_SIZE_ORDER 2 /* PA-RISC requires at least 16k stack */
+#else
#define THREAD_SIZE_ORDER 3 /* PA-RISC requires at least 32k stack */
+#endif
+
/* Be sure to hunt all references to this down when you change the size of
* the kernel stack */
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index a4fd296c958e..f3cecf5117cf 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi)
STREG %r19,PT_SR7(%r16)
intr_return:
- /* NOTE: Need to enable interrupts incase we schedule. */
- ssm PSW_SM_I, %r0
-
/* check for reschedule */
mfctl %cr30,%r1
LDREG TI_FLAGS(%r1),%r19 /* sched.h: TIF_NEED_RESCHED */
@@ -907,6 +904,11 @@ intr_check_sig:
LDREG PT_IASQ1(%r16), %r20
cmpib,COND(=),n 0,%r20,intr_restore /* backward */
+ /* NOTE: We need to enable interrupts if we have to deliver
+ * signals. We used to do this earlier but it caused kernel
+ * stack overflows. */
+ ssm PSW_SM_I, %r0
+
copy %r0, %r25 /* long in_syscall = 0 */
#ifdef CONFIG_64BIT
ldo -16(%r30),%r29 /* Reference param save area */
@@ -958,6 +960,10 @@ intr_do_resched:
cmpib,COND(=) 0, %r20, intr_do_preempt
nop
+ /* NOTE: We need to enable interrupts if we schedule. We used
+ * to do this earlier but it caused kernel stack overflows. */
+ ssm PSW_SM_I, %r0
+
#ifdef CONFIG_64BIT
ldo -16(%r30),%r29 /* Reference param save area */
#endif
diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S
index e3a8e5e4d5de..8d072c44f300 100644
--- a/arch/parisc/kernel/hpmc.S
+++ b/arch/parisc/kernel/hpmc.S
@@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc)
__INITRODATA
+ .align 4
.export os_hpmc_size
os_hpmc_size:
.word .os_hpmc_end-.os_hpmc
diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c
index 5a657986ebbf..143f90e2f9f3 100644
--- a/arch/parisc/kernel/unwind.c
+++ b/arch/parisc/kernel/unwind.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/kallsyms.h>
#include <linux/sort.h>
-#include <linux/sched.h>
#include <linux/uaccess.h>
#include <asm/assembly.h>
diff --git a/arch/parisc/lib/delay.c b/arch/parisc/lib/delay.c
index 7eab4bb8abe6..66e506520505 100644
--- a/arch/parisc/lib/delay.c
+++ b/arch/parisc/lib/delay.c
@@ -16,9 +16,7 @@
#include <linux/preempt.h>
#include <linux/init.h>
-#include <asm/processor.h>
#include <asm/delay.h>
-
#include <asm/special_insns.h> /* for mfctl() */
#include <asm/processor.h> /* for boot_cpu_data */
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 6177d43f0ce8..e2a2b8400490 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
#endif
}
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
{
+ return 0;
}
#ifndef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 5acb5a176dbe..72be0c32e902 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1403,7 +1403,7 @@ void show_regs(struct pt_regs * regs)
printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
regs->nip, regs->link, regs->ctr);
- printk("REGS: %p TRAP: %04lx %s (%s)\n",
+ printk("REGS: %px TRAP: %04lx %s (%s)\n",
regs, regs->trap, print_tainted(), init_utsname()->release);
printk("MSR: "REG" ", regs->msr);
print_msr_bits(regs->msr);
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index bf457843e032..0d750d274c4e 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
/* Return the per-cpu state for state saving/migration */
return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
- (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
+ (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
+ (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
}
int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
@@ -1558,7 +1559,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
/*
* Restore P and Q. If the interrupt was pending, we
- * force both P and Q, which will trigger a resend.
+ * force Q and !P, which will trigger a resend.
*
* That means that a guest that had both an interrupt
* pending (queued) and Q set will restore with only
@@ -1566,7 +1567,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
* is perfectly fine as coalescing interrupts that haven't
* been presented yet is always allowed.
*/
- if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+ if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
state->old_p = true;
if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
state->old_q = true;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 46d74e81aff1..d183b4801bdb 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -763,7 +763,8 @@ emit_clear:
func = (u8 *) __bpf_call_base + imm;
/* Save skb pointer if we need to re-cache skb data */
- if (bpf_helper_changes_pkt_data(func))
+ if ((ctx->seen & SEEN_SKB) &&
+ bpf_helper_changes_pkt_data(func))
PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -772,7 +773,8 @@ emit_clear:
PPC_MR(b2p[BPF_REG_0], 3);
/* refresh skb cache */
- if (bpf_helper_changes_pkt_data(func)) {
+ if ((ctx->seen & SEEN_SKB) &&
+ bpf_helper_changes_pkt_data(func)) {
/* reload skb pointer to r3 */
PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_skb_loads(image, ctx);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 153812966365..fce545774d50 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
int ret;
__u64 target;
- if (is_kernel_addr(addr))
- return branch_target((unsigned int *)addr);
+ if (is_kernel_addr(addr)) {
+ if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
+ return 0;
+
+ return branch_target(&instr);
+ }
/* Userspace: need copy instruction here then translate it */
pagefault_disable();
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 0ead3cd73caa..be4e7f84f70a 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -310,6 +310,19 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
return 0;
/*
+ * Check whether nest_imc is registered. We could end up here if the
+ * cpuhotplug callback registration fails. i.e, callback invokes the
+ * offline path for all successfully registered nodes. At this stage,
+ * nest_imc pmu will not be registered and we should return here.
+ *
+ * We return with a zero since this is not an offline failure. And
+ * cpuhp_setup_state() returns the actual failure reason to the caller,
+ * which in turn will call the cleanup routine.
+ */
+ if (!nest_pmus)
+ return 0;
+
+ /*
* Now that this cpu is one of the designated,
* find a next cpu a) which is online and b) in same chip.
*/
@@ -1171,6 +1184,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
if (nest_pmus == 1) {
cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
kfree(nest_imc_refc);
+ kfree(per_nest_pmu_arr);
}
if (nest_pmus > 0)
@@ -1195,7 +1209,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
kfree(pmu_ptr);
- kfree(per_nest_pmu_arr);
return;
}
@@ -1309,6 +1322,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id
ret = nest_pmu_cpumask_init();
if (ret) {
mutex_unlock(&nest_init_lock);
+ kfree(nest_imc_refc);
+ kfree(per_nest_pmu_arr);
goto err_free;
}
}
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 44cbf4c12ea1..df95102e732c 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)
}
static struct lock_class_key fsl_msi_irq_class;
+static struct lock_class_key fsl_msi_irq_request_class;
static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
int offset, int irq_index)
@@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
dev_err(&dev->dev, "No memory for MSI cascade data\n");
return -ENOMEM;
}
- irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class);
+ irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class,
+ &fsl_msi_irq_request_class);
cascade_data->index = offset;
cascade_data->msi_data = msi;
cascade_data->virq = virt_msir;
diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h
index 773c4e039cd7..c0319cbf1eec 100644
--- a/arch/riscv/include/asm/barrier.h
+++ b/arch/riscv/include/asm/barrier.h
@@ -38,6 +38,25 @@
#define smp_rmb() RISCV_FENCE(r,r)
#define smp_wmb() RISCV_FENCE(w,w)
+/*
+ * This is a very specific barrier: it's currently only used in two places in
+ * the kernel, both in the scheduler. See include/linux/spinlock.h for the two
+ * orderings it guarantees, but the "critical section is RCsc" guarantee
+ * mandates a barrier on RISC-V. The sequence looks like:
+ *
+ * lr.aq lock
+ * sc lock <= LOCKED
+ * smp_mb__after_spinlock()
+ * // critical section
+ * lr lock
+ * sc.rl lock <= UNLOCKED
+ *
+ * The AQ/RL pair provides a RCpc critical section, but there's not really any
+ * way we can take advantage of that here because the ordering is only enforced
+ * on that one lock. Thus, we're just doing a full fence.
+ */
+#define smp_mb__after_spinlock() RISCV_FENCE(rw,rw)
+
#include <asm-generic/barrier.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 8fbb6749910d..cb7b0c63014e 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -38,10 +38,6 @@
#include <asm/tlbflush.h>
#include <asm/thread_info.h>
-#ifdef CONFIG_HVC_RISCV_SBI
-#include <asm/hvc_riscv_sbi.h>
-#endif
-
#ifdef CONFIG_DUMMY_CONSOLE
struct screen_info screen_info = {
.orig_video_lines = 30,
@@ -212,13 +208,6 @@ static void __init setup_bootmem(void)
void __init setup_arch(char **cmdline_p)
{
-#if defined(CONFIG_HVC_RISCV_SBI)
- if (likely(early_console == NULL)) {
- early_console = &riscv_sbi_early_console_dev;
- register_console(early_console);
- }
-#endif
-
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c
index a2ae936a093e..79c78668258e 100644
--- a/arch/riscv/kernel/sys_riscv.c
+++ b/arch/riscv/kernel/sys_riscv.c
@@ -70,7 +70,7 @@ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end,
bool local = (flags & SYS_RISCV_FLUSH_ICACHE_LOCAL) != 0;
/* Check the reserved flags. */
- if (unlikely(flags & !SYS_RISCV_FLUSH_ICACHE_ALL))
+ if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_ALL))
return -EINVAL;
flush_icache_mm(mm, local);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 57d7bc92e0b8..0a6b0286c32e 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1264,12 +1264,6 @@ static inline pud_t pud_mkwrite(pud_t pud)
return pud;
}
-#define pud_write pud_write
-static inline int pud_write(pud_t pud)
-{
- return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0;
-}
-
static inline pud_t pud_mkclean(pud_t pud)
{
if (pud_large(pud)) {
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index f04db3779b34..59eea9c65d3e 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index e81c16838b90..9557d8b516df 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -55,8 +55,7 @@ struct bpf_jit {
#define SEEN_LITERAL 8 /* code uses literals */
#define SEEN_FUNC 16 /* calls C functions */
#define SEEN_TAIL_CALL 32 /* code uses tail calls */
-#define SEEN_SKB_CHANGE 64 /* code changes skb data */
-#define SEEN_REG_AX 128 /* code uses constant blinding */
+#define SEEN_REG_AX 64 /* code uses constant blinding */
#define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
/*
@@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
REG_15, 152);
}
- if (jit->seen & SEEN_SKB)
+ if (jit->seen & SEEN_SKB) {
emit_load_skb_data_hlen(jit);
- if (jit->seen & SEEN_SKB_CHANGE)
/* stg %b1,ST_OFF_SKBP(%r0,%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
STK_OFF_SKBP);
+ }
}
/*
@@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
EMIT2(0x0d00, REG_14, REG_W1);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb9040000, BPF_REG_0, REG_2);
- if (bpf_helper_changes_pkt_data((void *)func)) {
- jit->seen |= SEEN_SKB_CHANGE;
+ if ((jit->seen & SEEN_SKB) &&
+ bpf_helper_changes_pkt_data((void *)func)) {
/* lg %b1,ST_OFF_SKBP(%r15) */
EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
REG_15, STK_OFF_SKBP);
diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
index e5547b22cd18..0ddbbb031822 100644
--- a/arch/sparc/lib/hweight.S
+++ b/arch/sparc/lib/hweight.S
@@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
.previous
ENTRY(__arch_hweight64)
- sethi %hi(__sw_hweight16), %g1
- jmpl %g1 + %lo(__sw_hweight16), %g0
+ sethi %hi(__sw_hweight64), %g1
+ jmpl %g1 + %lo(__sw_hweight64), %g0
nop
ENDPROC(__arch_hweight64)
EXPORT_SYMBOL(__arch_hweight64)
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index be3136f142a9..a8103a84b4ac 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
if (!printk_ratelimit())
return;
- printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+ printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->pc, (void *)regs->u_regs[UREG_I7],
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 815c03d7a765..41363f46797b 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
if (!printk_ratelimit())
return;
- printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+ printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->tpc, (void *)regs->u_regs[UREG_I7],
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index 33c0f8bb0f33..5335ba3c850e 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
if (!(pmd_val(pmd) & _PAGE_VALID))
return 0;
- if (!pmd_access_permitted(pmd, write))
+ if (write && !pmd_write(pmd))
return 0;
refs = 0;
@@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
if (!(pud_val(pud) & _PAGE_VALID))
return 0;
- if (!pud_access_permitted(pud, write))
+ if (write && !pud_write(pud))
return 0;
refs = 0;
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 5765e7e711f7..ff5f9cb3039a 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
u8 *func = ((u8 *)__bpf_call_base) + imm;
ctx->saw_call = true;
+ if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+ emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
emit_call((u32 *)func, ctx);
emit_nop(ctx);
emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
- if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind)
- load_skb_regs(ctx, bpf2sparc[BPF_REG_6]);
+ if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+ load_skb_regs(ctx, L7);
break;
}
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 50a32c33d729..73c57f614c9e 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -1,4 +1,5 @@
generic-y += barrier.h
+generic-y += bpf_perf_event.h
generic-y += bug.h
generic-y += clkdev.h
generic-y += current.h
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index b668e351fd6c..fca34b2177e2 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
/*
* Needed since we do not use the asm-generic/mm_hooks.h:
*/
-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
uml_setup_stubs(mm);
+ return 0;
}
extern void arch_exit_mmap(struct mm_struct *mm);
static inline void arch_unmap(struct mm_struct *mm,
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 4e6fcb32620f..428644175956 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
if (!printk_ratelimit())
return;
- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
+ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
(void *)UPT_IP(regs), (void *)UPT_SP(regs),
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 59b06b48f27d..5c205a9cb5a6 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -81,9 +81,10 @@ do { \
} \
} while (0)
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
{
+ return 0;
}
static inline void arch_unmap(struct mm_struct *mm,
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8eed3f94bfc7..d4fc98c50378 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -926,7 +926,8 @@ config MAXSMP
config NR_CPUS
int "Maximum number of CPUs" if SMP && !MAXSMP
range 2 8 if SMP && X86_32 && !X86_BIGSMP
- range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
+ range 2 64 if SMP && X86_32 && X86_BIGSMP
+ range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
default "1" if !SMP
default "8192" if MAXSMP
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 6293a8768a91..672441c008c7 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER
config UNWINDER_GUESS
bool "Guess unwinder"
depends on EXPERT
+ depends on !STACKDEPOT
---help---
This option enables the "guess" unwinder for unwinding kernel stack
traces. It scans the stack and reports every kernel text address it
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1e9c322e973a..f25e1530e064 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -80,6 +80,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
vmlinux-objs-y += $(obj)/mem_encrypt.o
+ vmlinux-objs-y += $(obj)/pgtable_64.o
endif
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 20919b4f3133..fc313e29fe2c 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -305,10 +305,18 @@ ENTRY(startup_64)
leaq boot_stack_end(%rbx), %rsp
#ifdef CONFIG_X86_5LEVEL
- /* Check if 5-level paging has already enabled */
- movq %cr4, %rax
- testl $X86_CR4_LA57, %eax
- jnz lvl5
+ /*
+ * Check if we need to enable 5-level paging.
+ * RSI holds real mode data and need to be preserved across
+ * a function call.
+ */
+ pushq %rsi
+ call l5_paging_required
+ popq %rsi
+
+ /* If l5_paging_required() returned zero, we're done here. */
+ cmpq $0, %rax
+ je lvl5
/*
* At this point we are in long mode with 4-level paging enabled,
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b50c42455e25..98761a1576ce 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -169,6 +169,16 @@ void __puthex(unsigned long value)
}
}
+static bool l5_supported(void)
+{
+ /* Check if leaf 7 is supported. */
+ if (native_cpuid_eax(0) < 7)
+ return 0;
+
+ /* Check if la57 is supported. */
+ return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
+}
+
#if CONFIG_X86_NEED_RELOCS
static void handle_relocations(void *output, unsigned long output_len,
unsigned long virt_addr)
@@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
console_init();
debug_putstr("early console in extract_kernel\n");
+ if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
+ error("This linux kernel as configured requires 5-level paging\n"
+ "This CPU does not support the required 'cr4.la57' feature\n"
+ "Unable to boot - please use a kernel appropriate for your CPU\n");
+ }
+
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index d5364ca2e3f9..b5e5e02f8cde 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -23,6 +23,9 @@
*/
#undef CONFIG_AMD_MEM_ENCRYPT
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
#include "misc.h"
/* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
new file mode 100644
index 000000000000..b4469a37e9a1
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -0,0 +1,28 @@
+#include <asm/processor.h>
+
+/*
+ * __force_order is used by special_insns.h asm code to force instruction
+ * serialization.
+ *
+ * It is not referenced from the code, but GCC < 5 with -fPIE would fail
+ * due to an undefined symbol. Define it to make these ancient GCCs work.
+ */
+unsigned long __force_order;
+
+int l5_paging_required(void)
+{
+ /* Check if leaf 7 is supported. */
+
+ if (native_cpuid_eax(0) < 7)
+ return 0;
+
+ /* Check if la57 is supported. */
+ if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+ return 0;
+
+ /* Check if 5-level paging has already been enabled. */
+ if (native_read_cr4() & X86_CR4_LA57)
+ return 0;
+
+ return 1;
+}
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index 49f4970f693b..6a10d52a4145 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -44,9 +44,9 @@ FDINITRD=$6
# Make sure the files actually exist
verify "$FBZIMAGE"
-verify "$MTOOLSRC"
genbzdisk() {
+ verify "$MTOOLSRC"
mformat a:
syslinux $FIMAGE
echo "$KCMDLINE" | mcopy - a:syslinux.cfg
@@ -57,6 +57,7 @@ genbzdisk() {
}
genfdimage144() {
+ verify "$MTOOLSRC"
dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
mformat v:
syslinux $FIMAGE
@@ -68,6 +69,7 @@ genfdimage144() {
}
genfdimage288() {
+ verify "$MTOOLSRC"
dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
mformat w:
syslinux $FIMAGE
@@ -78,39 +80,43 @@ genfdimage288() {
mcopy $FBZIMAGE w:linux
}
-genisoimage() {
+geniso() {
tmp_dir=`dirname $FIMAGE`/isoimage
rm -rf $tmp_dir
mkdir $tmp_dir
- for i in lib lib64 share end ; do
+ for i in lib lib64 share ; do
for j in syslinux ISOLINUX ; do
if [ -f /usr/$i/$j/isolinux.bin ] ; then
isolinux=/usr/$i/$j/isolinux.bin
- cp $isolinux $tmp_dir
fi
done
for j in syslinux syslinux/modules/bios ; do
if [ -f /usr/$i/$j/ldlinux.c32 ]; then
ldlinux=/usr/$i/$j/ldlinux.c32
- cp $ldlinux $tmp_dir
fi
done
if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
break
fi
- if [ $i = end -a -z "$isolinux" ] ; then
- echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
- exit 1
- fi
done
+ if [ -z "$isolinux" ] ; then
+ echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+ exit 1
+ fi
+ if [ -z "$ldlinux" ] ; then
+ echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
+ exit 1
+ fi
+ cp $isolinux $tmp_dir
+ cp $ldlinux $tmp_dir
cp $FBZIMAGE $tmp_dir/linux
echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
if [ -f "$FDINITRD" ] ; then
cp "$FDINITRD" $tmp_dir/initrd.img
fi
- mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \
- -c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \
- $tmp_dir
+ genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
+ -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
+ -boot-info-table $tmp_dir
isohybrid $FIMAGE 2>/dev/null || true
rm -rf $tmp_dir
}
@@ -119,6 +125,6 @@ case $1 in
bzdisk) genbzdisk;;
fdimage144) genfdimage144;;
fdimage288) genfdimage288;;
- isoimage) genisoimage;;
+ isoimage) geniso;;
*) echo 'Unknown image format'; exit 1;
esac
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index 399a29d067d6..cb91a64a99e7 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -59,13 +59,6 @@ static int encrypt(struct blkcipher_desc *desc,
salsa20_ivsetup(ctx, walk.iv);
- if (likely(walk.nbytes == nbytes))
- {
- salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
- walk.dst.virt.addr, nbytes);
- return blkcipher_walk_done(desc, &walk, 0);
- }
-
while (walk.nbytes >= 64) {
salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
walk.dst.virt.addr,
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3fd8bc560fae..45a63e00a6af 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,6 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
/*
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+.macro SET_NOFLUSH_BIT reg:req
+ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+ ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+ /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ andq $(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask \
+ PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * Test if the ASID needs a flush.
+ */
+ movq \scratch_reg, \scratch_reg2
+ andq $(0x7FF), \scratch_reg /* mask ASID */
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ /* Flush needed, clear the bit */
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ movq \scratch_reg2, \scratch_reg
+ jmp .Lwrcr3_\@
+
+.Lnoflush_\@:
+ movq \scratch_reg2, \scratch_reg
+ SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+ /* Flip the PGD and ASID to the user version */
+ orq $(PTI_SWITCH_MASK), \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ pushq %rax
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+ popq %rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movq %cr3, \scratch_reg
+ movq \scratch_reg, \save_reg
+ /*
+ * Is the "switch mask" all zero? That means that both of
+ * these are zero:
+ *
+ * 1. The user/kernel PCID bit, and
+ * 2. The user/kernel "bit" that points CR3 to the
+ * bottom half of the 8k PGD
+ *
+ * That indicates a kernel CR3 value, not a user CR3.
+ */
+ testq $(PTI_SWITCH_MASK), \scratch_reg
+ jz .Ldone_\@
+
+ ADJUST_KERNEL_CR3 \scratch_reg
+ movq \scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * KERNEL pages can always resume with NOFLUSH as we do
+ * explicit flushes.
+ */
+ bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
+ jnc .Lnoflush_\@
+
+ /*
+ * Check if there's a pending flush for the user ASID we're
+ * about to set.
+ */
+ movq \save_reg, \scratch_reg
+ andq $(0x7FF), \scratch_reg
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jmp .Lwrcr3_\@
+
+.Lnoflush_\@:
+ SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+ /*
+ * The CR3 write could be avoided when not changing its value,
+ * but would require a CR3 read *and* a scratch register.
+ */
+ movq \save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
#endif /* CONFIG_X86_64 */
/*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4838037f97f6..ace8f321a5a1 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -941,9 +941,10 @@ ENTRY(debug)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
- cmpl $SIZEOF_SYSENTER_stack, %ecx
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
jb .Ldebug_from_sysenter_stack
TRACE_IRQS_OFF
@@ -984,9 +985,10 @@ ENTRY(nmi)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
- cmpl $SIZEOF_SYSENTER_stack, %ecx
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
jb .Lnmi_from_sysenter_stack
/* Not on SYSENTER stack. */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f81d50d7ceac..f048e384ff54 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -23,7 +23,6 @@
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
-#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@@ -40,6 +39,8 @@
#include <asm/frame.h>
#include <linux/err.h>
+#include "calling.h"
+
.code64
.section .entry.text, "ax"
@@ -140,6 +141,67 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
+ .pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline. This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address). So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped) entry
+ * trampoline. We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
+ SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+ UNWIND_HINT_EMPTY
+ swapgs
+
+ /* Stash the user RSP. */
+ movq %rsp, RSP_SCRATCH
+
+ /* Note: using %rsp as a scratch reg. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+ /* Load the top of the task stack into RSP */
+ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+ /* Start building the simulated IRET frame. */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq RSP_SCRATCH /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+
+ /*
+ * x86 lacks a near absolute jump, and we can't jump to the real
+ * entry text with a relative jump. We could push the target
+ * address and then use retq, but this destroys the pipeline on
+ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
+ * spill RDI and restore it in a second-stage trampoline.
+ */
+ pushq %rdi
+ movq $entry_SYSCALL_64_stage2, %rdi
+ jmp *%rdi
+END(entry_SYSCALL_64_trampoline)
+
+ .popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+ UNWIND_HINT_EMPTY
+ popq %rdi
+ jmp entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -149,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
*/
swapgs
+ /*
+ * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+ * is not required to switch CR3.
+ */
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -330,8 +396,25 @@ syscall_return_via_sysret:
popq %rsi /* skip rcx */
popq %rdx
popq %rsi
+
+ /*
+ * Now all regs are restored except RSP and RDI.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ pushq RSP-RDI(%rdi) /* RSP */
+ pushq (%rdi) /* RDI */
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
popq %rdi
- movq RSP-ORIG_RAX(%rsp), %rsp
+ popq %rsp
USERGS_SYSRET64
END(entry_SYSCALL_64)
@@ -466,12 +549,13 @@ END(irq_entries_start)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
- pushfq
- testl $X86_EFLAGS_IF, (%rsp)
+ pushq %rax
+ SAVE_FLAGS(CLBR_RAX)
+ testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
.Lokay_\@:
- addq $8, %rsp
+ popq %rax
#endif
.endm
@@ -563,6 +647,13 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
cld
+
+ testb $3, CS-ORIG_RAX(%rsp)
+ jz 1f
+ SWAPGS
+ call switch_to_thread_stack
+1:
+
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
@@ -572,12 +663,8 @@ END(irq_entries_start)
jz 1f
/*
- * IRQ from user mode. Switch to kernel gsbase and inform context
- * tracking that we're in kernel mode.
- */
- SWAPGS
-
- /*
+ * IRQ from user mode.
+ *
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
* (which can take locks). Since TRACE_IRQS_OFF idempotent,
@@ -630,10 +717,43 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
ud2
1:
#endif
- SWAPGS
POP_EXTRA_REGS
- POP_C_REGS
- addq $8, %rsp /* skip regs->orig_ax */
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+
+ /*
+ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ /* Copy the IRET frame to the trampoline stack. */
+ pushq 6*8(%rdi) /* SS */
+ pushq 5*8(%rdi) /* RSP */
+ pushq 4*8(%rdi) /* EFLAGS */
+ pushq 3*8(%rdi) /* CS */
+ pushq 2*8(%rdi) /* RIP */
+
+ /* Push user RDI on the trampoline stack. */
+ pushq (%rdi)
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ /* Restore RDI. */
+ popq %rdi
+ SWAPGS
INTERRUPT_RETURN
@@ -713,7 +833,9 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
- SWAPGS
+ SWAPGS /* to kernel GS */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
+
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -729,7 +851,6 @@ native_irq_return_ldt:
/* Now RAX == RSP. */
andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
- popq %rdi /* Restore user RDI */
/*
* espfix_stack[31:16] == 0. The page tables are set up such that
@@ -740,7 +861,11 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
- SWAPGS
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ SWAPGS /* to user GS */
+ popq %rdi /* Restore user RDI */
+
movq %rax, %rsp
UNWIND_HINT_IRET_REGS offset=8
@@ -829,7 +954,35 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
/*
* Exception entry points.
*/
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack. This is called with the IRET frame and
+ * orig_ax on the stack. (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+ UNWIND_HINT_FUNC
+
+ pushq %rdi
+ /* Need to switch before accessing the thread stack. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+ pushq 7*8(%rdi) /* regs->ss */
+ pushq 6*8(%rdi) /* regs->rsp */
+ pushq 5*8(%rdi) /* regs->eflags */
+ pushq 4*8(%rdi) /* regs->cs */
+ pushq 3*8(%rdi) /* regs->ip */
+ pushq 2*8(%rdi) /* regs->orig_ax */
+ pushq 8(%rdi) /* return address */
+ UNWIND_HINT_FUNC
+
+ movq (%rdi), %rdi
+ ret
+END(switch_to_thread_stack)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@@ -848,11 +1001,12 @@ ENTRY(\sym)
ALLOC_PT_GPREGS_ON_STACK
- .if \paranoid
- .if \paranoid == 1
+ .if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
- jnz 1f
+ jnz .Lfrom_usermode_switch_stack_\@
.endif
+
+ .if \paranoid
call paranoid_entry
.else
call error_entry
@@ -894,20 +1048,15 @@ ENTRY(\sym)
jmp error_exit
.endif
- .if \paranoid == 1
+ .if \paranoid < 2
/*
- * Paranoid entry from userspace. Switch stacks and treat it
+ * Entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
-1:
+.Lfrom_usermode_switch_stack_\@:
call error_entry
-
- movq %rsp, %rdi /* pt_regs pointer */
- call sync_regs
- movq %rax, %rsp /* switch stack */
-
movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
@@ -1119,7 +1268,11 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
-1: ret
+
+1:
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+ ret
END(paranoid_entry)
/*
@@ -1141,6 +1294,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
@@ -1168,8 +1322,18 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
+ /* We have user CR3. Change to kernel CR3. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
.Lerror_entry_from_usermode_after_swapgs:
+ /* Put us onto the real thread stack. */
+ popq %r12 /* save return addr in %12 */
+ movq %rsp, %rdi /* arg0 = pt_regs pointer */
+ call sync_regs
+ movq %rax, %rsp /* switch stack */
+ ENCODE_FRAME_POINTER
+ pushq %r12
+
/*
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
@@ -1206,6 +1370,7 @@ ENTRY(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
.Lbstep_iret:
@@ -1215,10 +1380,11 @@ ENTRY(error_entry)
.Lerror_bad_iret:
/*
- * We came from an IRET to user mode, so we have user gsbase.
- * Switch to kernel gsbase:
+ * We came from an IRET to user mode, so we have user
+ * gsbase and CR3. Switch to kernel gsbase and CR3:
*/
SWAPGS
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1250,6 +1416,10 @@ END(error_exit)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
* so we can use real assembly here.
+ *
+ * Registers:
+ * %r14: Used to save/restore the CR3 of the interrupted context
+ * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
@@ -1313,6 +1483,7 @@ ENTRY(nmi)
swapgs
cld
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1565,6 +1736,8 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi
+ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 568e130d932c..40f17009ec20 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,7 +48,11 @@
*/
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ SWAPGS
+
+ /* We are about to clobber %rsp anyway, clobbering here is OK */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -216,6 +220,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
pushq $0 /* pt_regs->r15 = 0 */
/*
+ * We just saved %rdi so it is safe to clobber. It is not
+ * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+ */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
+ /*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
*/
@@ -256,10 +266,22 @@ sysret32_from_system_call:
* when the system call started, which is already known to user
* code. We zero R8-R10 to avoid info leaks.
*/
+ movq RSP-ORIG_RAX(%rsp), %rsp
+
+ /*
+ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+ * on the process stack which is not mapped to userspace and
+ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
+ * switch until after after the last reference to the process
+ * stack.
+ *
+ * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+ */
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
- movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
END(entry_SYSCALL_compat)
@@ -306,8 +328,11 @@ ENTRY(entry_INT80_compat)
*/
movl %eax, %eax
- /* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */
+
+ /* switch to thread stack expects orig_ax to be pushed */
+ call switch_to_thread_stack
+
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index f279ba2643dc..577fa8adb785 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -37,6 +37,7 @@
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
+#include <asm/paravirt.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
@@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
WARN_ON_ONCE(address != regs->ip);
+ /* This should be unreachable in NATIVE mode. */
+ if (WARN_ON(vsyscall_mode == NATIVE))
+ return false;
+
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
@@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range. Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present. If so, we skip calling
+ * this.
+ */
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
+ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+ p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+ p4d->p4d |= _PAGE_USER;
+#endif
+ pud = pud_offset(p4d, VSYSCALL_ADDR);
+ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+ pmd = pmd_offset(pud, VSYSCALL_ADDR);
+ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- if (vsyscall_mode != NONE)
+ if (vsyscall_mode != NONE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
+ set_vsyscall_pgtable_user_bits(swapper_pg_dir);
+ }
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 09c26a4f139c..731153a4681e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = {
__init int intel_pmu_init(void)
{
+ struct attribute **extra_attr = NULL;
+ struct attribute **to_free = NULL;
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
@@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void)
unsigned int unused;
struct extra_reg *er;
int version, i;
- struct attribute **extra_attr = NULL;
char *name;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void)
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
extra_attr = merge_attr(extra_attr, skl_format_attr);
+ to_free = extra_attr;
x86_pmu.cpu_events = get_hsw_events_attrs();
intel_pmu_pebs_data_source_skl(
boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
@@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
+ kfree(to_free);
return 0;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3674a4b6f8bd..8f0aace08b87 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,16 +3,18 @@
#include <linux/types.h>
#include <linux/slab.h>
+#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "../perf_event.h"
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
-#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer);
-static int alloc_pebs_buffer(int cpu)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ phys_addr_t pa;
+ size_t msz = 0;
+
+ pa = virt_to_phys(addr);
+ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+ size_t msz = 0;
+
+ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+ unsigned int order = get_order(size);
int node = cpu_to_node(cpu);
- int max;
- void *buffer, *ibuffer;
+ struct page *page;
+
+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+ return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+ if (buffer)
+ free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ size_t bsiz = x86_pmu.pebs_buffer_size;
+ int max, node = cpu_to_node(cpu);
+ void *buffer, *ibuffer, *cea;
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
if (unlikely(!buffer))
return -ENOMEM;
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
- kfree(buffer);
+ dsfree_pages(buffer, bsiz);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
}
-
- max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+ hwev->ds_pebs_vaddr = buffer;
+ /* Update the cpu entry area mapping */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds->pebs_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
ds->pebs_index = ds->pebs_buffer_base;
- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
- max * x86_pmu.pebs_record_size;
-
+ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
return 0;
}
static void release_pebs_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *cea;
if (!ds || !x86_pmu.pebs)
return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
+ dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+ hwev->ds_pebs_vaddr = NULL;
}
static int alloc_bts_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
- int node = cpu_to_node(cpu);
- int max, thresh;
- void *buffer;
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *buffer, *cea;
+ int max;
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
}
-
- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
- thresh = max / 16;
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ hwev->ds_bts_vaddr = buffer;
+ /* Update the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds->bts_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum = ds->bts_buffer_base +
- max * BTS_RECORD_SIZE;
- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
- thresh * BTS_RECORD_SIZE;
-
+ max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+ ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
return 0;
}
static void release_bts_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *cea;
if (!ds || !x86_pmu.bts)
return;
- kfree((void *)(unsigned long)ds->bts_buffer_base);
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds_clear_cea(cea, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
+ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+ hwev->ds_bts_vaddr = NULL;
}
static int alloc_ds_buffer(int cpu)
{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
-
- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
- if (unlikely(!ds))
- return -ENOMEM;
+ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
+ memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
-
return 0;
}
static void release_ds_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- return;
-
per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
}
void release_ds_buffers(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f7aaadf9331f..8e4ea143ed96 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,6 +14,8 @@
#include <linux/perf_event.h>
+#include <asm/intel_ds.h>
+
/* To enable MSR tracing please use the generic trace points. */
/*
@@ -77,8 +79,6 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 8
#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
/*
@@ -95,23 +95,6 @@ struct amd_nb {
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
- u64 bts_buffer_base;
- u64 bts_index;
- u64 bts_absolute_maximum;
- u64 bts_interrupt_threshold;
- u64 pebs_buffer_base;
- u64 pebs_index;
- u64 pebs_absolute_maximum;
- u64 pebs_interrupt_threshold;
- u64 pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
#define PEBS_REGS \
(PERF_REG_X86_AX | \
PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
* Intel DebugStore bits
*/
struct debug_store *ds;
+ void *ds_pebs_vaddr;
+ void *ds_bts_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 219faaec51df..386a6900e206 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -136,6 +136,7 @@
#endif
#ifndef __ASSEMBLY__
+#ifndef __BPF__
/*
* This output constraint should be used for any inline asm which has a "call"
* instruction. Otherwise the asm may be inserted before the frame pointer
@@ -145,5 +146,6 @@
register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
#endif
+#endif
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
new file mode 100644
index 000000000000..4a7884b8dca5
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code. Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+ * The GDT is just below entry_stack and thus serves (on x86_64) as
+ * a a read-only guard page.
+ */
+ struct entry_stack_page entry_stack_page;
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+ char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+ /*
+ * Exception stacks used for IST entries.
+ *
+ * In the future, this should have a separate slot for each stack
+ * with guard pages between them.
+ */
+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+ /*
+ * Per CPU debug store for Intel performance monitoring. Wastes a
+ * full page at the moment.
+ */
+ struct debug_store cpu_debug_store;
+ /*
+ * The actual PEBS/BTS buffers must be mapped to user space
+ * Reserve enough fixmap PTEs.
+ */
+ struct debug_store_buffers cpu_debug_buffers;
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE \
+ (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+ return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bf6a76202a77..ea9a7dde62e5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
/*
* Static testing of CPU features. Used the same as boot_cpu_has().
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 800104c8a3ed..07cdd1715705 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -197,11 +197,12 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4011cb03ef08..13c5ee878a47 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -7,6 +7,7 @@
#include <asm/mmu.h>
#include <asm/fixmap.h>
#include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
#include <linux/smp.h>
#include <linux/percpu.h>
@@ -20,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
desc->type = (info->read_exec_only ^ 1) << 1;
desc->type |= info->contents << 2;
+ /* Set the ACCESS bit so it can be mapped RO */
+ desc->type |= 1;
desc->s = 1;
desc->dpl = 0x3;
@@ -60,17 +63,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
return this_cpu_ptr(&gdt_page)->gdt;
}
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
- return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
/* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
- unsigned int idx = get_cpu_gdt_ro_index(cpu);
- return (struct desc_struct *)__fix_to_virt(idx);
+ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
}
/* Provide the current read-only GDT */
@@ -185,7 +181,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
#endif
}
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{
struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 14d6d5007314..b027633e7300 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -50,6 +50,12 @@
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI 0
+#else
+# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -60,7 +66,7 @@
#define DISABLED_MASK4 (DISABLE_PCID)
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
-#define DISABLED_MASK7 0
+#define DISABLED_MASK7 (DISABLE_PTI)
#define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX)
#define DISABLED_MASK10 0
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 0211029076ea..6777480d8a42 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_ESPFIX_H
#define _ASM_X86_ESPFIX_H
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
#include <asm/percpu.h>
@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void);
extern void init_espfix_ap(int cpu);
-
-#endif /* CONFIG_X86_64 */
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b0c505fe9a95..64c4a30e0d39 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -44,7 +44,6 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE)
#endif
-
/*
* Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at
@@ -84,7 +83,6 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
- FIX_RO_IDT, /* Virtual mapping for read-only IDT */
#ifdef CONFIG_X86_32
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -100,9 +98,6 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
- /* Fixmap entries to remap the GDTs, one per processor. */
- FIX_GDT_REMAP_BEGIN,
- FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
#ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
@@ -143,7 +138,7 @@ enum fixed_addresses {
extern void reserve_top_address(unsigned long reserve);
#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
extern int fixmaps_set;
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 1b0a5abcd8ae..96aa6b9884dc 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,16 +20,7 @@
#ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H
-#ifdef CONFIG_HYPERVISOR_GUEST
-
-#include <asm/kvm_para.h>
-#include <asm/x86_init.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * x86 hypervisor information
- */
-
+/* x86 hypervisor types */
enum x86_hypervisor_type {
X86_HYPER_NATIVE = 0,
X86_HYPER_VMWARE,
@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
X86_HYPER_KVM,
};
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
struct hypervisor_x86 {
/* Hypervisor name */
const char *name;
@@ -58,7 +55,15 @@ struct hypervisor_x86 {
extern enum x86_hypervisor_type x86_hyper_type;
extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return x86_hyper_type == type;
+}
#else
static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return type == X86_HYPER_NATIVE;
+}
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000000..62a9f4966b42
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS 8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+ char bts_buffer[BTS_BUFFER_SIZE];
+ char pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h
new file mode 100644
index 000000000000..989cfa86de85
--- /dev/null
+++ b/arch/x86/include/asm/invpcid.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVPCID
+#define _ASM_X86_INVPCID
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+{
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+ * mappings, we don't want the compiler to reorder any subsequent
+ * memory accesses before the TLB flush.
+ *
+ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+ * invpcid (%rcx), %rax in long mode.
+ */
+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+ : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR 0
+#define INVPCID_TYPE_SINGLE_CTXT 1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
+#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+ unsigned long addr)
+{
+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
+#endif /* _ASM_X86_INVPCID */
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index 139feef467f7..c066ffae222b 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs);
extern int mp_irqdomain_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool early);
+ struct irq_data *irq_data, bool reserve);
extern void mp_irqdomain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data);
extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c8ef23f2c28f..89f08955fff7 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \
sysretl
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x) pushfq; popq %rax
+#endif
#else
#define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index f86a8caa561e..395c9631e000 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 9ea26f167497..5ff3e8af2c20 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,6 +3,7 @@
#define _ASM_X86_MMU_H
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/mutex.h>
#include <linux/atomic.h>
@@ -27,7 +28,8 @@ typedef struct {
atomic64_t tlb_gen;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
- struct ldt_struct *ldt;
+ struct rw_semaphore ldt_usr_sem;
+ struct ldt_struct *ldt;
#endif
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 6d16d15d09a0..c931b88982a0 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -50,22 +50,53 @@ struct ldt_struct {
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
- struct desc_struct *entries;
- unsigned int nr_entries;
+ struct desc_struct *entries;
+ unsigned int nr_entries;
+
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+ * in user mode. The whole array will be aliased at the addressed
+ * given by ldt_slot_va(slot). We use two slots so that we can allocate
+ * and map, and enable a new LDT without invalidating the mapping
+ * of an older, still-in-use LDT.
+ *
+ * slot will be -1 if this LDT doesn't have an alias mapping.
+ */
+ int slot;
};
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+ BUG();
+#endif
+}
+
/*
* Used for LDT copy/destruction.
*/
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+static inline void init_new_context_ldt(struct mm_struct *mm)
+{
+ mm->context.ldt = NULL;
+ init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context_ldt(struct task_struct *tsk,
- struct mm_struct *mm)
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+ struct mm_struct *mm)
{
return 0;
}
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
@@ -90,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see.
*/
- if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->nr_entries);
- else
+ if (unlikely(ldt)) {
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
+
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
clear_LDT();
+ }
#else
clear_LDT();
#endif
@@ -132,18 +184,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ mutex_init(&mm->context.lock);
+
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
- #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
}
- #endif
- return init_new_context_ldt(tsk, mm);
+#endif
+ init_new_context_ldt(mm);
+ return 0;
}
static inline void destroy_context(struct mm_struct *mm)
{
@@ -176,15 +231,16 @@ do { \
} while (0)
#endif
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
paravirt_arch_dup_mmap(oldmm, mm);
+ return ldt_dup_context(oldmm, mm);
}
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
@@ -282,33 +338,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
}
/*
- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
- * bits. This serves two purposes. It prevents a nasty situation in
- * which PCID-unaware code saves CR3, loads some other value (with PCID
- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
- * the saved ASID was nonzero. It also means that any bugs involving
- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
- * deterministically.
- */
-
-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
-{
- if (static_cpu_has(X86_FEATURE_PCID)) {
- VM_WARN_ON_ONCE(asid > 4094);
- return __sme_pa(mm->pgd) | (asid + 1);
- } else {
- VM_WARN_ON_ONCE(asid != 0);
- return __sme_pa(mm->pgd);
- }
-}
-
-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
-{
- VM_WARN_ON_ONCE(asid > 4094);
- return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
-}
-
-/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
@@ -317,7 +346,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
*/
static inline unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 283efcaac8af..892df375b615 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -927,6 +927,15 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 4b5e1eafada7..aff42e1da6ee 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
*/
extern gfp_t __userpte_alloc_gfp;
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
/*
* Allocate and free page tables.
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 95e2dfd75521..e42b8943cb1a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
void ptdump_walk_pgd_level_checkwx(void);
#ifdef CONFIG_DEBUG_WX
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
static inline int p4d_bad(p4d_t p4d)
{
- return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (p4d_flags(p4d) & ~ignore_flags) != 0;
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ unsigned long ignore_flags = _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
* pgd_offset() returns a (pgd_t *)
* pgd_index() is used get the offset into the pgd page's array of pgd_t's;
*/
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
/*
* a shortcut which implies the use of the kernel's pgd, instead
* of a process's
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+ /* Clone the user space pgd as well */
+ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+ count * sizeof(pgd_t));
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index f2ca9b28fd68..ce245b0cdfca 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
#define LAST_PKMAP 1024
#endif
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
- & PMD_MASK)
+/*
+ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
+ * to avoid include recursion hell
+ */
+#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
+
+#define CPU_ENTRY_AREA_BASE \
+ ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
+
+#define PKMAP_BASE \
+ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
#ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else
-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
#endif
#define MODULES_VADDR VMALLOC_START
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e9f05331e732..81462e9a34f6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif
}
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
+ * the user one is in the last 4k. To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr |= BIT(bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr &= ~BIT(bit);
+ return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return pgd;
+ return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif
+
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
+ p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
+#else
*p4dp = p4d;
+#endif
}
static inline void native_p4d_clear(p4d_t *p4d)
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ *pgdp = pti_set_user_pgd(pgdp, pgd);
+#else
*pgdp = pgd;
+#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6d5f45dcd4a1..b97a539bcdee 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,32 +76,45 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+
#ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(16384, UL)
-#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+# define VMALLOC_SIZE_TB _AC(12800, UL)
+# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
+# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY _AC(-112, UL)
+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else
-#define VMALLOC_SIZE_TB _AC(32, UL)
-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+# define VMALLOC_SIZE_TB _AC(32, UL)
+# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
+# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY _AC(-4, UL)
+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif
+
#ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START vmalloc_base
-#define VMEMMAP_START vmemmap_base
+# define VMALLOC_START vmalloc_base
+# define VMEMMAP_START vmemmap_base
#else
-#define VMALLOC_START __VMALLOC_BASE
-#define VMEMMAP_START __VMEMMAP_BASE
+# define VMALLOC_START __VMALLOC_BASE
+# define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+
+#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
-#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
+#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+
+#define CPU_ENTRY_AREA_PGD _AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+
+#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
#define EARLY_DYNAMIC_PAGE_TABLES 64
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 43212a43ee69..6a60fea90b9d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -38,6 +38,11 @@
#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT 11
+#endif
+
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cc16fa882e3e..d3a67fba200a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -163,9 +163,9 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
-extern struct tss_struct doublefault_tss;
-extern __u32 cpu_caps_cleared[NCAPINTS];
-extern __u32 cpu_caps_set[NCAPINTS];
+extern struct x86_hw_tss doublefault_tss;
+extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
write_cr3(__sme_pa(pgdir));
}
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
@@ -305,7 +310,13 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
+
+ /*
+ * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+ * Linux does not use ring 1, so sp1 is not otherwise needed.
+ */
u64 sp1;
+
u64 sp2;
u64 reserved2;
u64 ist[7];
@@ -323,12 +334,22 @@ struct x86_hw_tss {
#define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000
+struct entry_stack {
+ unsigned long words[64];
+};
+
+struct entry_stack_page {
+ struct entry_stack stack;
+} __aligned(PAGE_SIZE);
+
struct tss_struct {
/*
- * The hardware state:
+ * The fixed hardware portion. This must not cross a page boundary
+ * at risk of violating the SDM's advice and potentially triggering
+ * errata.
*/
struct x86_hw_tss x86_tss;
@@ -339,18 +360,9 @@ struct tss_struct {
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
-#ifdef CONFIG_X86_32
- /*
- * Space for the temporary SYSENTER stack.
- */
- unsigned long SYSENTER_stack_canary;
- unsigned long SYSENTER_stack[64];
-#endif
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* sizeof(unsigned long) coming from an extra "long" at the end
@@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif
/*
@@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
static inline void
native_load_sp0(unsigned long sp0)
{
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
@@ -535,12 +550,12 @@ static inline void native_swapgs(void)
static inline unsigned long current_top_of_stack(void)
{
-#ifdef CONFIG_X86_64
- return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
- /* sp0 on x86_32 is special in and around vm86 mode. */
+ /*
+ * We can't read directly from tss.sp0: sp0 on x86_32 is special in
+ * and around vm86 mode and sp0 on x86_64 is special because of the
+ * entry trampoline.
+ */
return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
}
static inline bool on_thread_stack(void)
@@ -837,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x)
#else
/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
new file mode 100644
index 000000000000..0b5ef05b2d2d
--- /dev/null
+++ b/arch/x86/include/asm/pti.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 8da111b3c342..f73706878772 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -16,6 +16,7 @@ enum stack_type {
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ,
+ STACK_TYPE_ENTRY,
STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};
@@ -28,6 +29,8 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
+bool in_entry_stack(unsigned long *stack, struct stack_info *info);
+
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 982c325dad33..8be6afb58471 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -12,7 +12,13 @@
/* image of the saved processor state */
struct saved_context {
- u16 es, fs, gs, ss;
+ /*
+ * On x86_32, all segment registers, with the possible exception of
+ * gs, are saved at kernel entry in pt_regs.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+ u16 gs;
+#endif
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
bool misc_enable_saved;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 7306e911faee..a7af9f53c0cb 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -20,8 +20,20 @@
*/
struct saved_context {
struct pt_regs regs;
- u16 ds, es, fs, gs, ss;
- unsigned long gs_base, gs_kernel_base, fs_base;
+
+ /*
+ * User CS and SS are saved in current_pt_regs(). The rest of the
+ * segment selectors need to be saved and restored here.
+ */
+ u16 ds, es, fs, gs;
+
+ /*
+ * Usermode FSBASE and GSBASE may not match the fs and gs selectors,
+ * so we save them separately. We save the kernelmode GSBASE to
+ * restore percpu access after resume.
+ */
+ unsigned long kernelmode_gs_base, usermode_gs_base, fs_base;
+
unsigned long cr0, cr2, cr3, cr4, cr8;
u64 misc_enable;
bool misc_enable_saved;
@@ -30,8 +42,7 @@ struct saved_context {
u16 gdt_pad; /* Unused */
struct desc_ptr gdt_desc;
u16 idt_pad;
- u16 idt_limit;
- unsigned long idt_base;
+ struct desc_ptr idt;
u16 ldt;
u16 tss;
unsigned long tr;
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8c6bd6863db9..eb5f7999a893 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
/* This runs runs on the previous thread's stack. */
-static inline void prepare_switch_to(struct task_struct *prev,
- struct task_struct *next)
+static inline void prepare_switch_to(struct task_struct *next)
{
#ifdef CONFIG_VMAP_STACK
/*
@@ -70,7 +69,7 @@ struct fork_frame {
#define switch_to(prev, next, last) \
do { \
- prepare_switch_to(prev, next); \
+ prepare_switch_to(next); \
\
((last) = __switch_to_asm((prev), (next))); \
} while (0)
@@ -79,10 +78,10 @@ do { \
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return;
- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif
@@ -90,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task)
{
+ /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
load_sp0(task->thread.sp0);
#else
- load_sp0(task_top_of_stack(task));
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ load_sp0(task_top_of_stack(task));
#endif
}
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 70f425947dc5..00223333821a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif
#endif
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 877b5c1a1b12..4a08dd2ab32a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -9,70 +9,130 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
+#include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
-static inline void __invpcid(unsigned long pcid, unsigned long addr,
- unsigned long type)
-{
- struct { u64 d[2]; } desc = { { pcid, addr } };
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID - [0, TLB_NR_DYN_ASIDS-1]
+ * the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * the value we write into the PCID part of CR3; corresponds to the
+ * ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * for KPTI each mm has two address spaces and thus needs two
+ * PCID values, but we can still do with a single ASID denomination
+ * for each mm. Corresponds to kPCID + 2048.
+ *
+ */
- /*
- * The memory clobber is because the whole point is to invalidate
- * stale TLB entries and, especially if we're flushing global
- * mappings, we don't want the compiler to reorder any subsequent
- * memory accesses before the TLB flush.
- *
- * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
- * invpcid (%rcx), %rax in long mode.
- */
- asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
- : : "m" (desc), "a" (type), "c" (&desc) : "memory");
-}
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
-#define INVPCID_TYPE_INDIV_ADDR 0
-#define INVPCID_TYPE_SINGLE_CTXT 1
-#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
-#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
-/* Flush all mappings for a given pcid and addr, not including globals. */
-static inline void invpcid_flush_one(unsigned long pcid,
- unsigned long addr)
-{
- __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
-}
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
-/* Flush all mappings for a given PCID, not including globals. */
-static inline void invpcid_flush_single_context(unsigned long pcid)
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
{
- __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Make sure that the dynamic ASID space does not confict with the
+ * bit we are using to switch between user and kernel ASIDs.
+ */
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
+ /*
+ * The ASID being passed in here should have respected the
+ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+ */
+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
+ /*
+ * The dynamically-assigned ASIDs that get passed in are small
+ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+ * so do not bother to clear it.
+ *
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the
+ * PCID bits. This serves two purposes. It prevents a nasty
+ * situation in which PCID-unaware code saves CR3, loads some other
+ * value (with PCID == 0), and then restores CR3, thus corrupting
+ * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+ * that any bugs involving loading a PCID-enabled CR3 with
+ * CR4.PCIDE off will trigger deterministically.
+ */
+ return asid + 1;
}
-/* Flush all mappings, including globals, for all PCIDs. */
-static inline void invpcid_flush_all(void)
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
{
- __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+ u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+#endif
+ return ret;
}
-/* Flush all mappings for all PCIDs except globals. */
-static inline void invpcid_flush_all_nonglobals(void)
+struct pgd_t;
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{
- __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ return __sme_pa(pgd) | kern_pcid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+ }
}
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
{
- u64 new_tlb_gen;
-
- /*
- * Bump the generation count. This also serves as a full barrier
- * that synchronizes with switch_mm(): callers are required to order
- * their read of mm_cpumask after their writes to the paging
- * structures.
- */
- smp_mb__before_atomic();
- new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
- smp_mb__after_atomic();
-
- return new_tlb_gen;
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
+ return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
}
#ifdef CONFIG_PARAVIRT
@@ -99,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
return !static_cpu_has(X86_FEATURE_PCID);
}
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -139,6 +193,24 @@ struct tlb_state {
bool is_lazy;
/*
+ * If set we changed the page tables in such a way that we
+ * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+ * This tells us to go invalidate all the non-loaded ctxs[]
+ * on the next context switch.
+ *
+ * The current ctx was kept up-to-date as it ran and does not
+ * need to be invalidated.
+ */
+ bool invalidate_other;
+
+ /*
+ * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+ * the corresponding user PCID needs a flush next time we
+ * switch to it; see SWITCH_TO_USER_CR3.
+ */
+ unsigned short user_pcid_flush_mask;
+
+ /*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
*/
@@ -219,6 +291,14 @@ static inline unsigned long cr4_read_shadow(void)
}
/*
+ * Mark all other ASIDs as invalid, preserves the current.
+ */
+static inline void invalidate_other_asid(void)
+{
+ this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot
* up after us can get the correct flags. This should only be used
@@ -237,37 +317,63 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
extern void initialize_tlbstate_and_flush(void);
-static inline void __native_flush_tlb(void)
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
{
+ /* There is no user ASID if address space separation is off */
+ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ return;
+
/*
- * If current->mm == NULL then we borrow a mm which may change during a
- * task switch and therefore we must not be preempted while we write CR3
- * back:
+ * We only have a single ASID if PCID is off and the CR3
+ * write will have flushed it.
*/
- preempt_disable();
- native_write_cr3(__native_read_cr3());
- preempt_enable();
+ if (!cpu_feature_enabled(X86_FEATURE_PCID))
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ __set_bit(kern_pcid(asid),
+ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
-static inline void __native_flush_tlb_global_irq_disabled(void)
+/*
+ * flush the entire current user mapping
+ */
+static inline void __native_flush_tlb(void)
{
- unsigned long cr4;
+ /*
+ * Preemption or interrupts must be disabled to protect the access
+ * to the per CPU variable and to prevent being preempted between
+ * read_cr3() and write_cr3().
+ */
+ WARN_ON_ONCE(preemptible());
- cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* If current->mm == NULL then the read_cr3() "borrows" an mm */
+ native_write_cr3(__native_read_cr3());
}
+/*
+ * flush everything
+ */
static inline void __native_flush_tlb_global(void)
{
- unsigned long flags;
+ unsigned long cr4, flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@@ -280,36 +386,69 @@ static inline void __native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- __native_flush_tlb_global_irq_disabled();
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ /* toggle PGE */
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ native_write_cr4(cr4);
raw_local_irq_restore(flags);
}
+/*
+ * flush one page in the user mapping
+ */
static inline void __native_flush_tlb_single(unsigned long addr)
{
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+ * Just use invalidate_user_asid() in case we are called early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+ invalidate_user_asid(loaded_mm_asid);
+ else
+ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
}
+/*
+ * flush everything
+ */
static inline void __flush_tlb_all(void)
{
- if (boot_cpu_has(X86_FEATURE_PGE))
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global();
- else
+ } else {
+ /*
+ * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+ */
__flush_tlb();
-
- /*
- * Note: if we somehow had PCID but not PGE, then this wouldn't work --
- * we'd end up flushing kernel translations for the current ASID but
- * we might fail to flush kernel translations for other cached ASIDs.
- *
- * To avoid this issue, we force PCID off if PGE is off.
- */
+ }
}
+/*
+ * flush one page in the kernel mapping
+ */
static inline void __flush_tlb_one(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * __flush_tlb_single() will have cleared the TLB entry for this ASID,
+ * but since kernel space is replicated across all, we must also
+ * invalidate all others.
+ */
+ invalidate_other_asid();
}
#define TLB_FLUSH_ALL -1UL
@@ -370,6 +509,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+ /*
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
+ */
+ return atomic64_inc_return(&mm->context.tlb_gen);
+}
+
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 84b9ec0c1bc0..22647a642e98 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed,
DECLARE_EVENT_CLASS(vector_activate,
TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
- bool early),
+ bool reserve),
- TP_ARGS(irq, is_managed, can_reserve, early),
+ TP_ARGS(irq, is_managed, can_reserve, reserve),
TP_STRUCT__entry(
__field( unsigned int, irq )
__field( bool, is_managed )
__field( bool, can_reserve )
- __field( bool, early )
+ __field( bool, reserve )
),
TP_fast_assign(
__entry->irq = irq;
__entry->is_managed = is_managed;
__entry->can_reserve = can_reserve;
- __entry->early = early;
+ __entry->reserve = reserve;
),
- TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d",
+ TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",
__entry->irq, __entry->is_managed, __entry->can_reserve,
- __entry->early)
+ __entry->reserve)
);
#define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \
DEFINE_EVENT_FN(vector_activate, name, \
TP_PROTO(unsigned int irq, bool is_managed, \
- bool can_reserve, bool early), \
- TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL); \
+ bool can_reserve, bool reserve), \
+ TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \
DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 1fadd310ff68..31051f35cbb7 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e9cc6fe1fc6f..c1688c2d0a12 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -7,6 +7,9 @@
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
struct unwind_state {
struct stack_info stack_info;
unsigned long stack_mask;
@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
}
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * WARNING: The entire pt_regs may not be safe to dereference. In some cases,
+ * only the iret frame registers are accessible. Use with caution!
+ */
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{
if (unwind_done(state))
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d9a7c659009c..b986b2ca688a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -7,6 +7,7 @@
#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
/*
* Called on instruction fetch fault in vsyscall page.
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 7e1e730396ae..bcba3c643e63 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -78,7 +78,12 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS 12
+#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6e272f3ea984..880441f24146 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2626,11 +2626,13 @@ static int __init apic_set_verbosity(char *arg)
apic_verbosity = APIC_DEBUG;
else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
+#ifdef CONFIG_X86_64
else {
pr_warning("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
+#endif
return 0;
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index aa85690e9b64..25a87028cb3f 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
+ .irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 1, /* logical */
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 7b659c4480c9..5078b5ce63a7 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
+ .irq_delivery_mode = dest_Fixed,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 201579dc5242..8a7963421460 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
}
int mp_irqdomain_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool early)
+ struct irq_data *irq_data, bool reserve)
{
unsigned long flags;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 9b18be764422..ce503c99f5c4 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL :
MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU :
- MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_DEST_ID(cfg->dest_apicid);
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED :
- MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_DELIVERY_FIXED |
MSI_DATA_VECTOR(cfg->vector);
}
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fa22017de806..02e8acb134f8 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
+ .irq_delivery_mode = dest_Fixed,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 750449152b04..f8b03bb8e725 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd)
irq_matrix_reserve(vector_matrix);
apicd->can_reserve = true;
apicd->has_reserved = true;
+ irqd_set_can_reserve(irqd);
trace_vector_reserve(irqd->irq, 0);
vector_assign_managed_shutdown(irqd);
}
@@ -368,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)
int ret;
ret = assign_irq_vector_any_locked(irqd);
- if (!ret)
+ if (!ret) {
apicd->has_reserved = false;
+ /*
+ * Core might have disabled reservation mode after
+ * allocating the irq descriptor. Ideally this should
+ * happen before allocation time, but that would require
+ * completely convoluted ways of transporting that
+ * information.
+ */
+ if (!irqd_can_reserve(irqd))
+ apicd->can_reserve = false;
+ }
return ret;
}
@@ -398,21 +409,21 @@ static int activate_managed(struct irq_data *irqd)
}
static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
- bool early)
+ bool reserve)
{
struct apic_chip_data *apicd = apic_chip_data(irqd);
unsigned long flags;
int ret = 0;
trace_vector_activate(irqd->irq, apicd->is_managed,
- apicd->can_reserve, early);
+ apicd->can_reserve, reserve);
/* Nothing to do for fixed assigned vectors */
if (!apicd->can_reserve && !apicd->is_managed)
return 0;
raw_spin_lock_irqsave(&vector_lock, flags);
- if (early || irqd_is_managed_and_shutdown(irqd))
+ if (reserve || irqd_is_managed_and_shutdown(irqd))
vector_assign_managed_shutdown(irqd);
else if (apicd->is_managed)
ret = activate_managed(irqd);
@@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
} else {
/* Release the vector */
apicd->can_reserve = true;
+ irqd_set_can_reserve(irqd);
clear_irq_vector(irqd);
realloc = true;
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 622f13ca8a94..8b04234e010b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
+ .irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 1, /* logical */
.disable_esr = 0,
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 8ea78275480d..76417a9aab73 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,6 +17,7 @@
#include <asm/sigframe.h>
#include <asm/bootparam.h>
#include <asm/suspend.h>
+#include <asm/tlbflush.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -93,4 +94,13 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+ /* TLB state for the entry code */
+ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dedf428b20b6..fa1261eefa16 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -47,13 +47,8 @@ void foo(void)
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- offsetofend(struct tss_struct, SYSENTER_stack));
-
- /* Offset from cpu_tss to SYSENTER_stack */
- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
- /* Size of SYSENTER_stack */
- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+ offsetofend(struct cpu_entry_area, entry_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 630212fa9b9d..bf51e51d808d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -23,6 +23,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
BLANK();
#endif
@@ -63,6 +66,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fa998ca8aa5a..c47de4ebf63a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu)
{
@@ -490,28 +490,23 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
-{
-#ifdef CONFIG_X86_64
- /* On 64-bit systems, we use a read-only fixmap GDT. */
- pgprot_t prot = PAGE_KERNEL_RO;
-#else
- /*
- * On native 32-bit systems, the GDT cannot be read-only because
- * our double fault handler uses a task gate, and entering through
- * a task gate needs to change an available TSS to busy. If the GDT
- * is read-only, that will triple fault.
- *
- * On Xen PV, the GDT must be read-only because the hypervisor requires
- * it.
- */
- pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
- PAGE_KERNEL_RO : PAGE_KERNEL;
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
- __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
-}
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
+#endif
/* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu)
@@ -747,7 +742,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{
int i;
- for (i = 0; i < NCAPINTS; i++) {
+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
}
@@ -927,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
}
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+ /* Assume for now that ALL x86 CPUs are insecure */
+ setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+
fpu__init_system(c);
#ifdef CONFIG_X86_32
@@ -1250,7 +1249,7 @@ void enable_sep_cpu(void)
return;
cpu = get_cpu();
- tss = &per_cpu(cpu_tss, cpu);
+ tss = &per_cpu(cpu_tss_rw, cpu);
/*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1259,11 +1258,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
- wrmsr(MSR_IA32_SYSENTER_ESP,
- (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
- 0);
-
+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu();
@@ -1357,25 +1352,22 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
+ extern char _entry_trampoline[];
+ extern char entry_SYSCALL_64_trampoline[];
+
+ int cpu = smp_processor_id();
+ unsigned long SYSCALL64_entry_trampoline =
+ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+ else
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1386,7 +1378,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1530,7 +1522,7 @@ void cpu_init(void)
if (cpu)
load_ucode_ap();
- t = &per_cpu(cpu_tss, cpu);
+ t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1569,7 +1561,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!oist->ist[0]) {
- char *estacks = per_cpu(exception_stacks, cpu);
+ char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v];
@@ -1580,7 +1572,7 @@ void cpu_init(void)
}
}
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/*
* <= is required because the CPU will access up to
@@ -1596,11 +1588,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, me);
/*
- * Initialize the TSS. Don't bother initializing sp0, as the initial
- * task never enters user mode.
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
*/
- set_tss_desc(cpu, t);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1612,7 +1605,6 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
- setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
@@ -1622,7 +1614,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu);
@@ -1657,12 +1649,12 @@ void cpu_init(void)
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
*/
- set_tss_desc(cpu, t);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_mm_ldt(&init_mm);
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
@@ -1674,7 +1666,6 @@ void cpu_init(void)
fpu__init_cpu();
- setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#endif
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 7dbcb7adf797..8ccdca6d3f9e 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
}
#else
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
- __native_flush_tlb_global_irq_disabled();
-}
-
static inline void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc;
@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
if (rev != mc->hdr.rev)
return -1;
-#ifdef CONFIG_X86_64
- /* Flush global tlb. This is precaution. */
- flush_tlb_early();
-#endif
uci->cpu_sig.rev = rev;
if (early)
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 0e662c55ae90..0b8cedb20d6d 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -50,25 +50,23 @@ static void doublefault_fn(void)
cpu_relax();
}
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __pa_nodebug(swapper_pg_dir),
- }
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+ .sp0 = STACK_START,
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+ .ip = (unsigned long) doublefault_fn,
+ /* 0x2 bit is always set */
+ .flags = X86_EFLAGS_SF | 0x2,
+ .sp = STACK_START,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
};
/* dummy for do_double_fault() call */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index f13b4c00a5de..5fa110699ed2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
#include <linux/nmi.h>
#include <linux/sysfs.h>
+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
@@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
+bool in_entry_stack(unsigned long *stack, struct stack_info *info)
+{
+ struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
+
+ void *begin = ss;
+ void *end = ss + 1;
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_ENTRY;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
@@ -50,6 +69,28 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
+void show_iret_regs(struct pt_regs *regs)
+{
+ printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+ regs->sp, regs->flags);
+}
+
+static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+{
+ if (on_stack(info, regs, sizeof(*regs)))
+ __show_regs(regs, 0);
+ else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+ IRET_FRAME_SIZE)) {
+ /*
+ * When an interrupt or exception occurs in entry code, the
+ * full pt_regs might not have been saved yet. In that case
+ * just print the iret frame.
+ */
+ show_iret_regs(regs);
+ }
+}
+
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
{
@@ -71,31 +112,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
+ * - entry stack
*
- * x86-32 can have up to three stacks:
+ * x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
+ * - entry stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
- /*
- * If we overflowed the task stack into a guard page, jump back
- * to the bottom of the usable stack.
- */
- if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
- stack = task_stack_page(task);
-
- if (get_stack_info(stack, task, &stack_info, &visit_mask))
- break;
+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
+ break;
+ }
stack_name = stack_type_name(stack_info.type);
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
- __show_regs(regs, 0);
+ if (regs)
+ show_regs_safe(&stack_info, regs);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,7 +164,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/*
* Don't print regs->ip again if it was already printed
- * by __show_regs() below.
+ * by show_regs_safe() below.
*/
if (regs && stack == &regs->ip)
goto next;
@@ -155,8 +200,8 @@ next:
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
- __show_regs(regs, 0);
+ if (regs)
+ show_regs_safe(&stack_info, regs);
}
if (stack_name)
@@ -252,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
unsigned long sp;
#endif
printk(KERN_DEFAULT
- "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+ "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
- IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
+ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
+ IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+ (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index daefae83a3aa..04170f63e3a1 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";
+ if (type == STACK_TYPE_ENTRY)
+ return "ENTRY_TRAMPOLINE";
+
return NULL;
}
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;
+ if (in_entry_stack(stack, info))
+ goto recursion_check;
+
if (in_hardirq_stack(stack, info))
goto recursion_check;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 88ce2ffdb110..563e28d14f2c 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";
+ if (type == STACK_TYPE_ENTRY) {
+ /*
+ * On 64-bit, we have a generic entry stack that we
+ * use for all the kernel entry points, including
+ * SYSENTER.
+ */
+ return "ENTRY_TRAMPOLINE";
+ }
+
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;
+ if (in_entry_stack(stack, info))
+ goto recursion_check;
+
goto unknown;
recursion_check:
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7dca675fe78d..04a625f0fcda 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL 0
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -350,13 +371,14 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
+ .fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
+ .fill PTI_USER_PGD_FILL,8,0
#endif
#ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 3feb648781c4..2f723301eb58 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(cpu_tss, get_cpu());
+ tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 49cfd9fe7589..68e1867cca80 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
- /*
- * NB: Unlike exception entries, IRQ entries do not reliably
- * handle context tracking in the low-level entry code. This is
- * because syscall entries execute briefly with IRQs on before
- * updating context tracking state, so we can take an IRQ from
- * kernel mode with CONTEXT_USER. The low-level entry code only
- * updates the context if we came from user mode, so we won't
- * switch to CONTEXT_KERNEL. We'll fix that once the syscall
- * code is cleaned up enough that we can cleanly defer enabling
- * IRQs.
- */
-
entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 020efbf5786b..d86e344f5b3d 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
- estack_top, estack_bottom);
+ estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 1c1eae961340..26d713ecad34 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -5,6 +5,11 @@
* Copyright (C) 2002 Andi Kleen
*
* This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ * contex.ldt_usr_sem
+ * mmap_sem
+ * context.lock
*/
#include <linux/errno.h>
@@ -19,6 +24,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -42,17 +48,15 @@ static void refresh_ldt_segments(void)
#endif
}
-/* context.lock is held for us, so we don't need any locking. */
+/* context.lock is held by the task which issued the smp function call */
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
- mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+ load_mm_ldt(mm);
refresh_ldt_segments();
}
@@ -89,25 +93,143 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return NULL;
}
+ /* The new LDT isn't aliased for PTI yet. */
+ new_ldt->slot = -1;
+
new_ldt->nr_entries = num_entries;
return new_ldt;
}
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function. Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ bool is_vmalloc, had_top_level_entry;
+ unsigned long va;
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ int i;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return 0;
+
+ /*
+ * Any given ldt_struct should have map_ldt_struct() called at most
+ * once.
+ */
+ WARN_ON(ldt->slot != -1);
+
+ /*
+ * Did we already have the top level entry allocated? We can't
+ * use pgd_none() for this because it doens't do anything on
+ * 4-level page table kernels.
+ */
+ pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ had_top_level_entry = (pgd->pgd != 0);
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn;
+ pte_t pte, *ptep;
+
+ va = (unsigned long)ldt_slot_va(slot) + offset;
+ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+ /*
+ * Treat the PTI LDT range as a *userspace* range.
+ * get_locked_pte() will allocate all needed pagetables
+ * and account for them in this mm.
+ */
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+ /*
+ * Map it RO so the easy to find address is not a primary
+ * target via some kernel interface which misses a
+ * permission check.
+ */
+ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_top_level_entry);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+ } else {
+ /*
+ * This is the first time we're mapping an LDT for this process.
+ * Sync the pgd to the usermode tables.
+ */
+ WARN_ON(had_top_level_entry);
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+ }
+ }
+
+ va = (unsigned long)ldt_slot_va(slot);
+ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+ ldt->slot = slot;
+#endif
+ return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}
-/* context.lock is held */
-static void install_ldt(struct mm_struct *current_mm,
- struct ldt_struct *ldt)
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
{
+ mutex_lock(&mm->context.lock);
+
/* Synchronizes with READ_ONCE in load_mm_ldt. */
- smp_store_release(&current_mm->context.ldt, ldt);
+ smp_store_release(&mm->context.ldt, ldt);
- /* Activate the LDT for all CPUs using current_mm. */
- on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+ /* Activate the LDT for all CPUs using currents mm. */
+ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+ mutex_unlock(&mm->context.lock);
}
static void free_ldt_struct(struct ldt_struct *ldt)
@@ -124,27 +246,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
}
/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
+ * the new task is not running, so nothing can be installed.
*/
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
struct ldt_struct *new_ldt;
- struct mm_struct *old_mm;
int retval = 0;
- mutex_init(&mm->context.lock);
- old_mm = current->mm;
- if (!old_mm) {
- mm->context.ldt = NULL;
+ if (!old_mm)
return 0;
- }
mutex_lock(&old_mm->context.lock);
- if (!old_mm->context.ldt) {
- mm->context.ldt = NULL;
+ if (!old_mm->context.ldt)
goto out_unlock;
- }
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
@@ -156,6 +271,12 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval) {
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
mm->context.ldt = new_ldt;
out_unlock:
@@ -174,13 +295,18 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
+}
+
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
unsigned long entries_size;
int retval;
- mutex_lock(&mm->context.lock);
+ down_read(&mm->context.ldt_usr_sem);
if (!mm->context.ldt) {
retval = 0;
@@ -209,7 +335,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
retval = bytecount;
out_unlock:
- mutex_unlock(&mm->context.lock);
+ up_read(&mm->context.ldt_usr_sem);
return retval;
}
@@ -269,7 +395,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
ldt.avl = 0;
}
- mutex_lock(&mm->context.lock);
+ if (down_write_killable(&mm->context.ldt_usr_sem))
+ return -EINTR;
old_ldt = mm->context.ldt;
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@@ -286,12 +413,31 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
+ /*
+ * If we are using PTI, map the new LDT into the userspace pagetables.
+ * If there is already an LDT, use the other slot so that other CPUs
+ * will continue to use the old LDT until install_ldt() switches
+ * them over to the new LDT.
+ */
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ /*
+ * This only can fail for the first LDT setup. If an LDT is
+ * already installed then the PTE page is already
+ * populated. Mop up a half populated page table.
+ */
+ if (!WARN_ON_ONCE(old_ldt))
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+
install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
error = 0;
out_unlock:
- mutex_unlock(&mm->context.lock);
+ up_write(&mm->context.ldt_usr_sem);
out:
return error;
}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 00bc751c861c..edfede768688 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -48,8 +48,6 @@ static void load_segments(void)
"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
"\tmovl %%eax,%%ds\n"
"\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
: : : "eax", "memory");
#undef STR
@@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0), 0);
idt_invalidate(phys_to_virt(0));
+ set_gdt(phys_to_virt(0), 0);
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index ac0be8283325..9edadabf04f6 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index bb988a24db92..aed9d94bd46f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -47,7 +47,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower
@@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
* Poison it.
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+#ifdef CONFIG_X86_64
+ /*
+ * .sp1 is cpu_current_top_of_stack. The init task never
+ * runs user code, but cpu_current_top_of_stack should still
+ * be well defined before the first context switch.
+ */
+ .sp1 = TOP_OF_INIT_STACK,
+#endif
+
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
@@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
-#ifdef CONFIG_X86_32
- .SYSENTER_stack_canary = STACK_END_MAGIC,
-#endif
};
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (bp) {
- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 45bf0c5f93e1..5224c6099184 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eeeb34f85c25..c75466232016 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
- regs->sp, regs->flags);
+ show_iret_regs(regs);
+
if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
+ if (!all)
+ return;
+
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
- if (!all)
- return;
-
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = __read_cr3();
@@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
@@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
+ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
update_sp0(next_p);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 05a97d5fe298..ed556d50d7ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
/* Maximum number of SMT threads on any online core */
-int __max_smt_threads __read_mostly;
+int __read_mostly __max_smt_threads = 1;
/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;
@@ -126,14 +126,10 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0xa, 0xf);
spin_unlock_irqrestore(&rtc_lock, flags);
- local_flush_tlb();
- pr_debug("1.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
start_eip >> 4;
- pr_debug("2.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
start_eip & 0xf;
- pr_debug("3.\n");
}
static inline void smpboot_restore_warm_reset_vector(void)
@@ -141,11 +137,6 @@ static inline void smpboot_restore_warm_reset_vector(void)
unsigned long flags;
/*
- * Install writable page 0 entry to set BIOS data area.
- */
- local_flush_tlb();
-
- /*
* Paranoid: Set warm reset code and vector here back
* to default values.
*/
@@ -932,12 +923,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
- /*
- * Enable the espfix hack for this CPU
- */
-#ifdef CONFIG_X86_ESPFIX64
+ /* Enable the espfix hack for this CPU */
init_espfix_ap(cpu);
-#endif
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -1304,7 +1291,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
* Today neither Intel nor AMD support heterogenous systems so
* extrapolate the boot cpu's data to all packages.
*/
- ncpus = cpu_data(0).booted_cores * smp_num_siblings;
+ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
__max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
pr_info("Max logical packages: %u\n", __max_logical_packages);
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 77835bc021c7..20161ef53537 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk,
{
int ret;
+ /*
+ * If the task doesn't have a stack (e.g., a zombie), the stack is
+ * "reliably" empty.
+ */
if (!try_get_task_stack(tsk))
- return -EINVAL;
+ return 0;
ret = __save_stack_trace_reliable(trace, tsk);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9a9c9b076955..a5b802a12212 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info) || LDT_zero(info)) {
+ if (LDT_empty(info) || LDT_zero(info))
memset(desc, 0, sizeof(*desc));
- } else {
+ else
fill_ldt(desc, info);
-
- /*
- * Always set the accessed bit so that the CPU
- * doesn't try to write to the (read-only) GDT.
- */
- desc->type |= 1;
- }
++info;
++desc;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 989514c94a55..446c9ef8cfc3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -51,6 +51,7 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
+#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
@@ -348,23 +349,42 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
- * end up promoting it to a doublefault. In that case, modify
- * the stack to make it look like we just entered the #GP
- * handler from user space, similar to bad_iret.
+ * end up promoting it to a doublefault. In that case, take
+ * advantage of the fact that we're not using the normal (TSS.sp0)
+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
+ * and then modify our own IRET frame so that, when we return,
+ * we land directly at the #GP(0) vector with the stack already
+ * set up according to its expectations.
+ *
+ * The net result is that our #GP handler will think that we
+ * entered from usermode with the bad user context.
*
* No need for ist_enter here because we don't use RCU.
*/
- if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
+ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
- struct pt_regs *normal_regs = task_pt_regs(current);
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
- /* Fake a #GP(0) from userspace. */
- memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
+ /*
+ * regs->sp points to the failing IRET frame on the
+ * ESPFIX64 stack. Copy it to the entry stack. This fills
+ * in gpregs->ss through gpregs->ip.
+ *
+ */
+ memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
+
+ /*
+ * Adjust our frame so that we return straight to the #GP
+ * vector with the expected RSP value. This is safe because
+ * we won't enable interupts or schedule before we invoke
+ * general_protection, so nothing will clobber the stack
+ * frame we just set up.
+ */
regs->ip = (unsigned long)general_protection;
- regs->sp = (unsigned long)&normal_regs->orig_ax;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
@@ -389,7 +409,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
- * deliv- ered, the faulting linear address of the second fault will
+ * delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
@@ -605,14 +625,15 @@ NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch off the IST stack if the
- * interrupted code was in user mode. The actual stack switch is done in
- * entry_64.S
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = task_pt_regs(current);
- *regs = *eregs;
+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+ if (regs != eregs)
+ *regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
@@ -628,13 +649,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
- * correctly, we want move our stack frame to task_pt_regs
- * and we want to pretend that the exception came from the
- * iret target.
+ * correctly, we want to move our stack frame to where it would
+ * be had we entered directly on the entry stack (rather than
+ * just below the IRET frame) and we want to pretend that the
+ * exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
- container_of(task_pt_regs(current),
- struct bad_iret_stack, regs);
+ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -795,14 +816,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
-#if defined(CONFIG_X86_32)
- /*
- * This is the most likely code path that involves non-trivial use
- * of the SYSENTER stack. Check that we haven't overrun it.
- */
- WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
- "Overran or corrupted SYSENTER stack\n");
-#endif
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -929,6 +942,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
void __init trap_init(void)
{
+ /* Init cpu_entry_area before IST entries are set up */
+ setup_cpu_entry_areas();
+
idt_setup_traps();
/*
@@ -936,8 +952,9 @@ void __init trap_init(void)
* "sidt" instruction will not leak the location of the kernel, and
* to defend the IDT against arbitrary memory write vulnerabilities.
* It will be reloaded in cpu_init() */
- __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
- idt_descr.address = fix_to_virt(FIX_RO_IDT);
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
/*
* Should be a barrier for any external CPU state:
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a3f973b2c97a..be86a865087a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL;
}
-static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
+ void *addr = (void *)_addr;
- /*
- * If the address isn't on the current stack, switch to the next one.
- *
- * We may have to traverse multiple stacks to deal with the possibility
- * that info->next_sp could point to an empty stack and the address
- * could be on a subsequent stack.
- */
- while (!on_stack(info, (void *)addr, len))
- if (get_stack_info(info->next_sp, state->task, info,
- &state->stack_mask))
- return false;
+ if (!on_stack(info, addr, len) &&
+ (get_stack_info(addr, state->task, info, &state->stack_mask)))
+ return false;
return true;
}
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
return true;
}
-#define REGS_SIZE (sizeof(struct pt_regs))
-#define SP_OFFSET (offsetof(struct pt_regs, sp))
-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
-
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
- unsigned long *ip, unsigned long *sp, bool full)
+ unsigned long *ip, unsigned long *sp)
{
- size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
- size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
- struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
-
- if (IS_ENABLED(CONFIG_X86_64)) {
- if (!stack_access_ok(state, addr, regs_size))
- return false;
+ struct pt_regs *regs = (struct pt_regs *)addr;
- *ip = regs->ip;
- *sp = regs->sp;
+ /* x86-32 support will be more complicated due to the &regs->sp hack */
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
- return true;
- }
-
- if (!stack_access_ok(state, addr, sp_offset))
+ if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
*ip = regs->ip;
+ *sp = regs->sp;
+ return true;
+}
- if (user_mode(regs)) {
- if (!stack_access_ok(state, addr + sp_offset,
- REGS_SIZE - SP_OFFSET))
- return false;
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
- *sp = regs->sp;
- } else
- *sp = (unsigned long)&regs->sp;
+ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+ return false;
+ *ip = regs->ip;
+ *sp = regs->sp;
return true;
}
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
- struct pt_regs *ptregs;
bool indirect = false;
if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS:
- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS_IRET:
- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
- ptregs = container_of((void *)sp, struct pt_regs, ip);
- if ((unsigned long)ptregs >= prev_sp &&
- on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
- state->regs = ptregs;
- state->full_regs = false;
- } else
- state->regs = NULL;
-
+ state->regs = (void *)sp - IRET_FRAME_OFFSET;
+ state->full_regs = false;
state->signal = true;
break;
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
if (get_stack_info((unsigned long *)state->sp, state->task,
- &state->stack_info, &state->stack_mask))
- return;
+ &state->stack_info, &state->stack_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ if (get_stack_info(next_page, state->task, &state->stack_info,
+ &state->stack_mask))
+ return;
+ }
/*
* The caller can provide the address of the first frame directly
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a4009fb9be87..1e413a9326aa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
+#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+
#else
#define X64_ALIGN_RODATA_BEGIN
#define X64_ALIGN_RODATA_END
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+
#endif
PHDRS {
@@ -102,11 +108,22 @@ SECTIONS
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
+ ALIGN_ENTRY_TEXT_BEGIN
ENTRY_TEXT
IRQENTRY_TEXT
+ ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ _entry_trampoline = .;
+ *(.entry_trampoline)
+ . = ALIGN(PAGE_SIZE);
+ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
/* End of text section */
_etext = .;
} :text = 0x9090
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abe74f779f9d..b514b2b2845a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2390,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
}
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
- u64 cr0, u64 cr4)
+ u64 cr0, u64 cr3, u64 cr4)
{
int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
/*
* First enable PAE, long mode needs it before CR0.PG = 1 is set.
@@ -2411,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
bad = ctxt->ops->set_cr(ctxt, 4, cr4);
if (bad)
return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
}
return X86EMUL_CONTINUE;
@@ -2421,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
struct desc_struct desc;
struct desc_ptr dt;
u16 selector;
- u32 val, cr0, cr4;
+ u32 val, cr0, cr3, cr4;
int i;
cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
+ cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
@@ -2467,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
- return rsm_enter_protected_mode(ctxt, cr0, cr4);
+ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
struct desc_struct desc;
struct desc_ptr dt;
- u64 val, cr0, cr4;
+ u64 val, cr0, cr3, cr4;
u32 base3;
u16 selector;
int i, r;
@@ -2491,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
+ cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
val = GET_SMSTATE(u64, smbase, 0x7ed0);
@@ -2519,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
ctxt->ops->set_gdt(ctxt, &dt);
- r = rsm_enter_protected_mode(ctxt, cr0, cr4);
+ r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
if (r != X86EMUL_CONTINUE)
return r;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e5e66e5c6640..c4deb1f34faa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if(make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
- return 1;
+ return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, 0, 0,
vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
@@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
- return 1;
+ return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
@@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
- return 1;
+ return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
@@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu) < 0) {
spin_unlock(&vcpu->kvm->mmu_lock);
- return 1;
+ return -ENOSPC;
}
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
0, ACC_ALL);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8eba631c4dbd..023afa0c8887 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
- (unsigned long)this_cpu_ptr(&cpu_tss));
+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index faf843c9b916..1cec2c62a0b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4384,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
- trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
handled += n;
addr += n;
len -= n;
@@ -4643,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
{
if (vcpu->mmio_read_completed) {
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
- vcpu->mmio_fragments[0].gpa, *(u64 *)val);
+ vcpu->mmio_fragments[0].gpa, val);
vcpu->mmio_read_completed = 0;
return 1;
}
@@ -4665,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
{
- trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
return vcpu_mmio_write(vcpu, gpa, bytes, val);
}
static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
- trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
return X86EMUL_IO_NEEDED;
}
@@ -7264,13 +7264,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
- struct fpu *fpu = &current->thread.fpu;
int r;
- fpu__initialize(fpu);
-
kvm_sigset_activate(vcpu);
+ kvm_load_guest_fpu(vcpu);
+
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
@@ -7296,14 +7295,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
- kvm_load_guest_fpu(vcpu);
-
if (unlikely(vcpu->arch.complete_userspace_io)) {
int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
vcpu->arch.complete_userspace_io = NULL;
r = cui(vcpu);
if (r <= 0)
- goto out_fpu;
+ goto out;
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
@@ -7312,9 +7309,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
else
r = vcpu_run(vcpu);
-out_fpu:
- kvm_put_guest_fpu(vcpu);
out:
+ kvm_put_guest_fpu(vcpu);
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
@@ -7384,7 +7380,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
#endif
kvm_rip_write(vcpu, regs->rip);
- kvm_set_rflags(vcpu, regs->rflags);
+ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
vcpu->arch.exception.pending = false;
@@ -7498,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
+int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) {
+ /*
+ * When EFER.LME and CR0.PG are set, the processor is in
+ * 64-bit mode (though maybe in a 32-bit code segment).
+ * CR4.PAE and EFER.LMA must be set.
+ */
+ if (!(sregs->cr4 & X86_CR4_PAE_BIT)
+ || !(sregs->efer & EFER_LMA))
+ return -EINVAL;
+ } else {
+ /*
+ * Not in 64-bit mode: EFER.LMA is clear and the code
+ * segment cannot be 64-bit.
+ */
+ if (sregs->efer & EFER_LMA || sregs->cs.l)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -7510,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
(sregs->cr4 & X86_CR4_OSXSAVE))
return -EINVAL;
+ if (kvm_valid_sregs(vcpu, sregs))
+ return -EINVAL;
+
apic_base_msr.data = sregs->apic_base;
apic_base_msr.host_initiated = true;
if (kvm_set_apic_base(vcpu, &apic_base_msr))
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 553f8fd23cc4..4846eff7e4c8 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
/*
- * Use cpu_tss as a cacheline-aligned, seldomly
+ * Use cpu_tss_rw as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target.
*/
- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index c4d55919fac1..e0b85930dd77 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
-ff:
+ff: UD0
EndTable
Table: 3-byte opcode 1 (0x0f 0x38)
@@ -717,7 +717,7 @@ AVXcode: 2
7e: vpermt2d/q Vx,Hx,Wx (66),(ev)
7f: vpermt2ps/d Vx,Hx,Wx (66),(ev)
80: INVEPT Gy,Mdq (66)
-81: INVPID Gy,Mdq (66)
+81: INVVPID Gy,Mdq (66)
82: INVPCID Gy,Mdq (66)
83: vpmultishiftqb Vx,Hx,Wx (66),(ev)
88: vexpandps/d Vpd,Wpd (66),(ev)
@@ -970,6 +970,15 @@ GrpTable: Grp9
EndTable
GrpTable: Grp10
+# all are UD1
+0: UD1
+1: UD1
+2: UD1
+3: UD1
+4: UD1
+5: UD1
+6: UD1
+7: UD1
EndTable
# Grp11A and Grp11B are expressed as Grp11 in Intel SDM
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 8e13b8cc6bed..27e9e90a8d35 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
new file mode 100644
index 000000000000..b9283cc27622
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+ unsigned long va = (unsigned long) cea_vaddr;
+
+ set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+ int npages;
+ void *cea;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+ npages = sizeof(struct debug_store) / PAGE_SIZE;
+ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+ PAGE_KERNEL);
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+ /*
+ * Force the population of PMDs for not yet allocated per cpu
+ * memory like debug store buffers.
+ */
+ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+ for (; npages; npages--, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+ extern char _entry_trampoline[];
+
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
+ *
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
+ */
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+ gdt_prot);
+
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+ per_cpu_ptr(&entry_stack_storage, cpu), 1,
+ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+ *
+ * Avoid placing a page boundary in the part of the TSS that the
+ * processor reads during a task switch (the first 104 bytes). The
+ * processor may not correctly perform address translations if a
+ * boundary occurs in this area. During a task switch, the processor
+ * reads and writes into the first 104 bytes of each TSS (using
+ * contiguous physical addresses beginning with the physical address
+ * of the first byte of the TSS). So, after TSS access begins, if
+ * part of the 104 bytes is not physically contiguous, the processor
+ * will access incorrect information without generating a page-fault
+ * exception.
+ *
+ * There are also a lot of errata involving the TSS spanning a page
+ * boundary. Assert that we're not doing that.
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+ BUILD_BUG_ON(sizeof(exception_stacks) !=
+ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+ &per_cpu(exception_stacks, cpu),
+ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+ percpu_setup_debug_store(cpu);
+}
+
+static __init void setup_cpu_entry_area_ptes(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long start, end;
+
+ BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + CPU_ENTRY_AREA_MAP_SIZE;
+
+ /* Careful here: start + PMD_SIZE might wrap around */
+ for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
+ populate_extra_pte(start);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+ unsigned int cpu;
+
+ setup_cpu_entry_area_ptes();
+
+ for_each_possible_cpu(cpu)
+ setup_cpu_entry_area(cpu);
+}
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index bfcffdf6c577..421f2664ffa0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
static int ptdump_show(struct seq_file *m, void *v)
{
- ptdump_walk_pgd_level(m, NULL);
+ ptdump_walk_pgd_level_debugfs(m, NULL, false);
return 0;
}
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
.release = single_release,
};
-static struct dentry *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curknl,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curusr,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
{
- pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
- &ptdump_fops);
- if (!pe)
+ dir = debugfs_create_dir("page_tables", NULL);
+ if (!dir)
return -ENOMEM;
+ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+ &ptdump_fops);
+ if (!pe_knl)
+ goto err;
+
+ pe_curknl = debugfs_create_file("current_kernel", 0400,
+ dir, NULL, &ptdump_curknl_fops);
+ if (!pe_curknl)
+ goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pe_curusr = debugfs_create_file("current_user", 0400,
+ dir, NULL, &ptdump_curusr_fops);
+ if (!pe_curusr)
+ goto err;
+#endif
return 0;
+err:
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
}
static void __exit pt_dump_debug_exit(void)
{
- debugfs_remove_recursive(pe);
+ debugfs_remove_recursive(dir);
}
module_init(pt_dump_debug_init);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 5e3ac6fe6c9e..f56902c1f04b 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -44,68 +44,97 @@ struct addr_marker {
unsigned long max_lines;
};
-/* indices for address_markers; keep sync'd w/ address_markers below */
+/* Address space markers hints */
+
+#ifdef CONFIG_X86_64
+
enum address_markers_idx {
USER_SPACE_NR = 0,
-#ifdef CONFIG_X86_64
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
-# ifdef CONFIG_X86_ESPFIX64
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
+ CPU_ENTRY_AREA_NR,
+#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
-# endif
+#endif
+#ifdef CONFIG_EFI
+ EFI_END_NR,
+#endif
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
-#else
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
+ [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
+#ifdef CONFIG_KASAN
+ [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
+ [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
+#endif
+ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
+#ifdef CONFIG_X86_ESPFIX64
+ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+#endif
+#ifdef CONFIG_EFI
+ [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
+#endif
+ [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
+ [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
+ [MODULES_END_NR] = { MODULES_END, "End Modules" },
+ [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
+};
+
+#else /* CONFIG_X86_64 */
+
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
VMALLOC_START_NR,
VMALLOC_END_NR,
-# ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
-# endif
- FIXADDR_START_NR,
#endif
+ CPU_ENTRY_AREA_NR,
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
};
-/* Address space markers hints */
static struct addr_marker address_markers[] = {
- { 0, "User Space" },
-#ifdef CONFIG_X86_64
- { 0x8000000000000000UL, "Kernel Space" },
- { 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
- { 0/* VMALLOC_START */, "vmalloc() Area" },
- { 0/* VMEMMAP_START */, "Vmemmap" },
-#ifdef CONFIG_KASAN
- { KASAN_SHADOW_START, "KASAN shadow" },
- { KASAN_SHADOW_END, "KASAN shadow end" },
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
+#ifdef CONFIG_HIGHMEM
+ [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
-# ifdef CONFIG_X86_ESPFIX64
- { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
-# endif
-# ifdef CONFIG_EFI
- { EFI_VA_END, "EFI Runtime Services" },
-# endif
- { __START_KERNEL_map, "High Kernel Mapping" },
- { MODULES_VADDR, "Modules" },
- { MODULES_END, "End Modules" },
-#else
- { PAGE_OFFSET, "Kernel Mapping" },
- { 0/* VMALLOC_START */, "vmalloc() Area" },
- { 0/*VMALLOC_END*/, "vmalloc() End" },
-# ifdef CONFIG_HIGHMEM
- { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
-# endif
- { 0/*FIXADDR_START*/, "Fixmap Area" },
-#endif
- { -1, NULL } /* End of list */
+ [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
+ [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
};
+#endif /* !CONFIG_X86_64 */
+
/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
@@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
static const char * const level_name[] =
{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
- if (!pgprot_val(prot)) {
+ if (!(pr & _PAGE_PRESENT)) {
/* Not present */
pt_dump_cont_printf(m, dmsg, " ");
} else {
@@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
- bool checkwx)
+ bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
if (pgd) {
start = pgd;
- st.to_dmesg = true;
+ st.to_dmesg = dmesg;
}
st.check_wx = checkwx;
@@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
- ptdump_walk_pgd_level_core(m, pgd, false);
+ ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (user && static_cpu_has(X86_FEATURE_PTI))
+ pgd = kernel_to_user_pgdp(pgd);
+#endif
+ ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("x86/mm: Checking user space page tables\n");
+ pgd = kernel_to_user_pgdp(pgd);
+ ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
}
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
void ptdump_walk_pgd_level_checkwx(void)
{
- ptdump_walk_pgd_level_core(NULL, NULL, true);
+ ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+ ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
@@ -525,8 +578,8 @@ static int __init pt_dump_init(void)
address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+ address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
-
return 0;
}
__initcall(pt_dump_init);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index febf6980e653..06fe3d51d385 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;
- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6fdf91ef130a..8ca324d07282 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,6 +20,7 @@
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
+#include <asm/pti.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {
static int page_size_mask;
+static void enable_global_pages(void)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ __supported_pte_mask |= _PAGE_GLOBAL;
+}
+
static void __init probe_page_size_mask(void)
{
/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- } else
- __supported_pte_mask &= ~_PAGE_GLOBAL;
+ enable_global_pages();
+ }
/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)
static void setup_pcid(void)
{
-#ifdef CONFIG_X86_64
- if (boot_cpu_has(X86_FEATURE_PCID)) {
- if (boot_cpu_has(X86_FEATURE_PGE)) {
- /*
- * This can't be cr4_set_bits_and_update_boot() --
- * the trampoline code can't handle CR4.PCIDE and
- * it wouldn't do any good anyway. Despite the name,
- * cr4_set_bits_and_update_boot() doesn't actually
- * cause the bits in question to remain set all the
- * way through the secondary boot asm.
- *
- * Instead, we brute-force it and set CR4.PCIDE
- * manually in start_secondary().
- */
- cr4_set_bits(X86_CR4_PCIDE);
- } else {
- /*
- * flush_tlb_all(), as currently implemented, won't
- * work if PCID is on but PGE is not. Since that
- * combination doesn't exist on real hardware, there's
- * no reason to try to fully support it, but it's
- * polite to avoid corrupting data if we're on
- * an improperly configured VM.
- */
- setup_clear_cpu_cap(X86_FEATURE_PCID);
- }
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * This can't be cr4_set_bits_and_update_boot() -- the
+ * trampoline code can't handle CR4.PCIDE and it wouldn't
+ * do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually cause
+ * the bits in question to remain set all the way through
+ * the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE manually in
+ * start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+
+ /*
+ * INVPCID's single-context modes (2/3) only work if we set
+ * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
+ * on systems that have X86_CR4_PCIDE clear, or that have
+ * no INVPCID support at all.
+ */
+ if (boot_cpu_has(X86_FEATURE_INVPCID))
+ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't work if
+ * PCID is on but PGE is not. Since that combination
+ * doesn't exist on real hardware, there's no reason to try
+ * to fully support it, but it's polite to avoid corrupting
+ * data if we're on an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
}
-#endif
}
#ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
{
unsigned long end;
+ pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();
@@ -845,7 +863,7 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8a64a6f2848d..135c9a7898c7 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
#include <asm/init.h>
#include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,
+ CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+ CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
#ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10,
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 6e4573b1da34..c45b6ec5357b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr)
return;
}
+ mmiotrace_iounmap(addr);
+
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
- mmiotrace_iounmap(addr);
-
/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 99dfed6dfef8..47388f0c0e59 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
extern struct range pfn_mapped[E820_MAX_ENTRIES];
@@ -277,6 +278,7 @@ void __init kasan_early_init(void)
void __init kasan_init(void)
{
int i;
+ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
#ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier);
@@ -321,16 +323,33 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]);
}
+ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+ PAGE_SIZE);
+
+ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
+ shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+ PAGE_SIZE);
+
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
- kasan_mem_to_shadow((void *)__START_KERNEL_map));
+ shadow_cpu_entry_begin);
+
+ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+ (unsigned long)shadow_cpu_entry_end, 0);
+
+ kasan_populate_zero_shadow(shadow_cpu_entry_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
- (void *)KASAN_SHADOW_END);
+ (void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt);
__flush_tlb_all();
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index c21c2ed04612..58477ec3d66d 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p)
unsigned long flags;
int ret = 0;
unsigned long size = 0;
+ unsigned long addr = p->addr & PAGE_MASK;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
unsigned int l;
pte_t *pte;
spin_lock_irqsave(&kmmio_lock, flags);
- if (get_kmmio_probe(p->addr)) {
+ if (get_kmmio_probe(addr)) {
ret = -EEXIST;
goto out;
}
- pte = lookup_address(p->addr, &l);
+ pte = lookup_address(addr, &l);
if (!pte) {
ret = -EINVAL;
goto out;
@@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
- if (add_kmmio_fault_page(p->addr + size))
+ if (add_kmmio_fault_page(addr + size))
pr_err("Unable to set page fault.\n");
size += page_level_size(l);
}
@@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
{
unsigned long flags;
unsigned long size = 0;
+ unsigned long addr = p->addr & PAGE_MASK;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
unsigned int l;
pte_t *pte;
- pte = lookup_address(p->addr, &l);
+ pte = lookup_address(addr, &l);
if (!pte)
return;
spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
- release_kmmio_fault_page(p->addr + size, &release_list);
+ release_kmmio_fault_page(addr + size, &release_list);
size += page_level_size(l);
}
list_del_rcu(&p->list);
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index d9a9e9fc75dd..391b13402e40 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -405,13 +405,13 @@ bool sme_active(void)
{
return sme_me_mask && !sev_enabled;
}
-EXPORT_SYMBOL_GPL(sme_active);
+EXPORT_SYMBOL(sme_active);
bool sev_active(void)
{
return sme_me_mask && sev_enabled;
}
-EXPORT_SYMBOL_GPL(sev_active);
+EXPORT_SYMBOL(sev_active);
static const struct dma_map_ops sev_dma_ops = {
.alloc = sev_alloc,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 96d456a94b03..004abf9ebf12 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
+
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6b9bf023a700..c3c5274410a9 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/spinlock.h>
+#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000000..bce8aea65606
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ * https://github.com/IAIK/KAISER
+ *
+ * The original work was written by and and signed off by for the Linux
+ * kernel by:
+ *
+ * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ * Andy Lutomirsky <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
+
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK 0
+#endif
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ pr_info("%s\n", reason);
+}
+
+static void __init pti_print_if_secure(const char *reason)
+{
+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+ char arg[5];
+ int ret;
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+ pti_print_if_insecure("disabled on XEN PV.");
+ return;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+ if (ret > 0) {
+ if (ret == 3 && !strncmp(arg, "off", 3)) {
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+ if (ret == 2 && !strncmp(arg, "on", 2)) {
+ pti_print_if_secure("force enabled on command line.");
+ goto enable;
+ }
+ if (ret == 4 && !strncmp(arg, "auto", 4))
+ goto autosel;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+
+autosel:
+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ return;
+enable:
+ setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ /*
+ * Changes to the high (kernel) portion of the kernelmode page
+ * tables are not automatically propagated to the usermode tables.
+ *
+ * Users should keep in mind that, unlike the kernelmode tables,
+ * there is no vmalloc_fault equivalent for the usermode tables.
+ * Top-level entries added to init_mm's usermode pgd after boot
+ * will not be automatically propagated to other mms.
+ */
+ if (!pgdp_maps_userspace(pgdp))
+ return pgd;
+
+ /*
+ * The user page tables get the full PGD, accessible from
+ * userspace:
+ */
+ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+ /*
+ * If this is normal user memory, make it NX in the kernel
+ * pagetables so that, if we somehow screw up and return to
+ * usermode with the kernel CR3 loaded, we'll get a page fault
+ * instead of allowing user code to execute with the wrong CR3.
+ *
+ * As exceptions, we don't set NX if:
+ * - _PAGE_USER is not set. This could be an executable
+ * EFI runtime mapping or something similar, and the kernel
+ * may execute from it
+ * - we don't have NX support
+ * - we're clearing the PGD (i.e. the new pgd is not present).
+ */
+ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+ (__supported_pte_mask & _PAGE_NX))
+ pgd.pgd |= _PAGE_NX;
+
+ /* return the copy of the PGD we want the kernel to use: */
+ return pgd;
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (address < PAGE_OFFSET) {
+ WARN_ONCE(1, "attempt to walk user address\n");
+ return NULL;
+ }
+
+ if (pgd_none(*pgd)) {
+ unsigned long new_p4d_page = __get_free_page(gfp);
+ if (!new_p4d_page)
+ return NULL;
+
+ if (pgd_none(*pgd)) {
+ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+ new_p4d_page = 0;
+ }
+ if (new_p4d_page)
+ free_page(new_p4d_page);
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+ pud_t *pud;
+
+ BUILD_BUG_ON(p4d_large(*p4d) != 0);
+ if (p4d_none(*p4d)) {
+ unsigned long new_pud_page = __get_free_page(gfp);
+ if (!new_pud_page)
+ return NULL;
+
+ if (p4d_none(*p4d)) {
+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+ new_pud_page = 0;
+ }
+ if (new_pud_page)
+ free_page(new_pud_page);
+ }
+
+ pud = pud_offset(p4d, address);
+ /* The user page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+
+ if (pud_none(*pud)) {
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ new_pmd_page = 0;
+ }
+ if (new_pmd_page)
+ free_page(new_pmd_page);
+ }
+
+ return pmd_offset(pud, address);
+}
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down. Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables. It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+ pte_t *pte;
+
+ /* We can't do anything sensible if we hit a large mapping. */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+
+ if (pmd_none(*pmd)) {
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ new_pte_page = 0;
+ }
+ if (new_pte_page)
+ free_page(new_pte_page);
+ }
+
+ pte = pte_offset_kernel(pmd, address);
+ if (pte_flags(*pte) & _PAGE_USER) {
+ WARN_ONCE(1, "attempt to walk to user pte\n");
+ return NULL;
+ }
+ return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+ pte_t *pte, *target_pte;
+ unsigned int level;
+
+ pte = lookup_address(VSYSCALL_ADDR, &level);
+ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+ return;
+
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ if (WARN_ON(!target_pte))
+ return;
+
+ *target_pte = *pte;
+ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+ unsigned long addr;
+
+ /*
+ * Clone the populated PMDs which cover start to end. These PMD areas
+ * can have holes.
+ */
+ for (addr = start; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd, *target_pmd;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(addr);
+ if (WARN_ON(pgd_none(*pgd)))
+ return;
+ p4d = p4d_offset(pgd, addr);
+ if (WARN_ON(p4d_none(*p4d)))
+ return;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ target_pmd = pti_user_pagetable_walk_pmd(addr);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Copy the PMD. That is, the kernelmode and usermode
+ * tables will share the last-level page tables of this
+ * address range
+ */
+ *target_pmd = pmd_clear_flags(*pmd, clear);
+ }
+}
+
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+ p4d_t *kernel_p4d, *user_p4d;
+ pgd_t *kernel_pgd;
+
+ user_p4d = pti_user_pagetable_walk_p4d(addr);
+ kernel_pgd = pgd_offset_k(addr);
+ kernel_p4d = p4d_offset(kernel_pgd, addr);
+ *user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
+/*
+ * Clone the ESPFIX P4D into the user space visinble page table
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+ pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+ pti_clone_pmds((unsigned long) __entry_text_start,
+ (unsigned long) __irqentry_text_end, _PAGE_RW);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("enabled\n");
+
+ pti_clone_user_shared();
+ pti_clone_entry_text();
+ pti_setup_espfix64();
+ pti_setup_vsyscall();
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3118392cdf75..a1561957dccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts. We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+ u16 asid;
+
+ /*
+ * This is only expected to be set if we have disabled
+ * kernel _PAGE_GLOBAL pages.
+ */
+ if (!static_cpu_has(X86_FEATURE_PTI)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ /* Do not need to flush the current asid */
+ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+ continue;
+ /*
+ * Make sure the next time we go to switch to
+ * this asid, we do a flush:
+ */
+ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+ }
+ this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
+ if (this_cpu_read(cpu_tlbstate.invalidate_other))
+ clear_asid_other();
+
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+ unsigned long new_mm_cr3;
+
+ if (need_flush) {
+ invalidate_user_asid(new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid);
+ } else {
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -128,7 +182,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(build_cr3(next, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(build_cr3_noflush(next, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
@@ -288,7 +342,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm, 0));
+ write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -551,7 +605,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_single(addr);
+ __flush_tlb_one(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 1e996df687a3..e663d6bf1328 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -665,6 +665,16 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
unsigned i;
u32 base, limit, high;
struct resource *res, *conflict;
+ struct pci_dev *other;
+
+ /* Check that we are the only device of that type */
+ other = pci_get_device(dev->vendor, dev->device, NULL);
+ if (other != dev ||
+ (other = pci_get_device(dev->vendor, dev->device, other))) {
+ /* This is a multi-socket system, don't touch it for now */
+ pci_dev_put(other);
+ return;
+ }
for (i = 0; i < 8; i++) {
pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base);
@@ -696,8 +706,13 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
res->end = 0xfd00000000ull - 1;
/* Just grab the free area behind system memory for this */
- while ((conflict = request_resource_conflict(&iomem_resource, res)))
+ while ((conflict = request_resource_conflict(&iomem_resource, res))) {
+ if (conflict->end >= res->end) {
+ kfree(res);
+ return;
+ }
res->start = conflict->end + 1;
+ }
dev_info(&dev->dev, "adding root bus resource %pR\n", res);
@@ -714,10 +729,10 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
pci_bus_add_resource(dev->bus, res, 0);
}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
#endif
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6a151ce70e86..d87ac96e37ed 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -196,6 +196,9 @@ static pgd_t *efi_pgd;
* because we want to avoid inserting EFI region mappings (EFI_VA_END
* to EFI_VA_START) into the standard kernel page tables. Everything
* else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
*/
int __init efi_alloc_page_tables(void)
{
@@ -208,7 +211,7 @@ int __init efi_alloc_page_tables(void)
return 0;
gfp_mask = GFP_KERNEL | __GFP_ZERO;
- efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
if (!efi_pgd)
return -ENOMEM;
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index f44c0bc95aa2..8538a6723171 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
local_flush_tlb();
stat->d_alltlb++;
} else {
- __flush_tlb_one(msg->address);
+ __flush_tlb_single(msg->address);
stat->d_onetlb++;
}
stat->d_requestee++;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 5f6fd860820a..e4cb9f4cde8a 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -128,7 +128,7 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
* on the specified blade to allow the sending of MSIs to the specified CPU.
*/
static int uv_domain_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool early)
+ struct irq_data *irq_data, bool reserve)
{
uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
return 0;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 5191de14f4df..a7d966964c6f 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -82,12 +82,8 @@ static void __save_processor_state(struct saved_context *ctxt)
/*
* descriptor tables
*/
-#ifdef CONFIG_X86_32
store_idt(&ctxt->idt);
-#else
-/* CONFIG_X86_64 */
- store_idt((struct desc_ptr *)&ctxt->idt_limit);
-#endif
+
/*
* We save it here, but restore it only in the hibernate case.
* For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit
@@ -103,22 +99,18 @@ static void __save_processor_state(struct saved_context *ctxt)
/*
* segment registers
*/
-#ifdef CONFIG_X86_32
- savesegment(es, ctxt->es);
- savesegment(fs, ctxt->fs);
+#ifdef CONFIG_X86_32_LAZY_GS
savesegment(gs, ctxt->gs);
- savesegment(ss, ctxt->ss);
-#else
-/* CONFIG_X86_64 */
- asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
- asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
- asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
- asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
- asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
+#endif
+#ifdef CONFIG_X86_64
+ savesegment(gs, ctxt->gs);
+ savesegment(fs, ctxt->fs);
+ savesegment(ds, ctxt->ds);
+ savesegment(es, ctxt->es);
rdmsrl(MSR_FS_BASE, ctxt->fs_base);
- rdmsrl(MSR_GS_BASE, ctxt->gs_base);
- rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+ rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
mtrr_save_fixed_ranges(NULL);
rdmsrl(MSR_EFER, ctxt->efer);
@@ -160,17 +152,19 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
- set_tss_desc(cpu, t); /*
- * This just modifies memory; should not be
- * necessary. But... This is necessary, because
- * 386 hardware has concept of busy TSS or some
- * similar stupidity.
- */
+
+ /*
+ * We need to reload TR, which requires that we change the
+ * GDT entry to indicate "available" first.
+ *
+ * XXX: This could probably all be replaced by a call to
+ * force_reload_TR().
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
#ifdef CONFIG_X86_64
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
@@ -178,6 +172,9 @@ static void fix_processor_context(void)
write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
syscall_init(); /* This sets MSR_*STAR and related */
+#else
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ enable_sep_cpu();
#endif
load_TR_desc(); /* This does ltr */
load_mm_ldt(current->active_mm); /* This does lldt */
@@ -190,9 +187,12 @@ static void fix_processor_context(void)
}
/**
- * __restore_processor_state - restore the contents of CPU registers saved
- * by __save_processor_state()
- * @ctxt - structure to load the registers contents from
+ * __restore_processor_state - restore the contents of CPU registers saved
+ * by __save_processor_state()
+ * @ctxt - structure to load the registers contents from
+ *
+ * The asm code that gets us here will have restored a usable GDT, although
+ * it will be pointing to the wrong alias.
*/
static void notrace __restore_processor_state(struct saved_context *ctxt)
{
@@ -215,57 +215,50 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
write_cr2(ctxt->cr2);
write_cr0(ctxt->cr0);
+ /* Restore the IDT. */
+ load_idt(&ctxt->idt);
+
/*
- * now restore the descriptor tables to their proper values
- * ltr is done i fix_processor_context().
+ * Just in case the asm code got us here with the SS, DS, or ES
+ * out of sync with the GDT, update them.
*/
-#ifdef CONFIG_X86_32
- load_idt(&ctxt->idt);
-#else
-/* CONFIG_X86_64 */
- load_idt((const struct desc_ptr *)&ctxt->idt_limit);
-#endif
+ loadsegment(ss, __KERNEL_DS);
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
-#ifdef CONFIG_X86_64
/*
- * We need GSBASE restored before percpu access can work.
- * percpu access can happen in exception handlers or in complicated
- * helpers like load_gs_index().
+ * Restore percpu access. Percpu access can happen in exception
+ * handlers or in complicated helpers like load_gs_index().
*/
- wrmsrl(MSR_GS_BASE, ctxt->gs_base);
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+#else
+ loadsegment(fs, __KERNEL_PERCPU);
+ loadsegment(gs, __KERNEL_STACK_CANARY);
#endif
+ /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */
fix_processor_context();
/*
- * Restore segment registers. This happens after restoring the GDT
- * and LDT, which happen in fix_processor_context().
+ * Now that we have descriptor tables fully restored and working
+ * exception handling, restore the usermode segments.
*/
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_64
+ loadsegment(ds, ctxt->es);
loadsegment(es, ctxt->es);
loadsegment(fs, ctxt->fs);
- loadsegment(gs, ctxt->gs);
- loadsegment(ss, ctxt->ss);
-
- /*
- * sysenter MSRs
- */
- if (boot_cpu_has(X86_FEATURE_SEP))
- enable_sep_cpu();
-#else
-/* CONFIG_X86_64 */
- asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
- asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
- asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
load_gs_index(ctxt->gs);
- asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
/*
- * Restore FSBASE and user GSBASE after reloading the respective
- * segment selectors.
+ * Restore FSBASE and GSBASE after restoring the selectors, since
+ * restoring the selectors clobbers the bases. Keep in mind
+ * that MSR_KERNEL_GS_BASE is horribly misnamed.
*/
wrmsrl(MSR_FS_BASE, ctxt->fs_base);
- wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
+#elif defined(CONFIG_X86_32_LAZY_GS)
+ loadsegment(gs, ctxt->gs);
#endif
do_fpu_end();
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 6b830d4cb4c8..de58533d3664 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -57,7 +57,7 @@ static u32 xen_apic_read(u32 reg)
return 0;
if (reg == APIC_LVR)
- return 0x10;
+ return 0x14;
#ifdef CONFIG_X86_32
if (reg == APIC_LDR)
return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d669e9d89001..c9081c6671f0 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1,8 +1,12 @@
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+#include <linux/bootmem.h>
+#endif
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/interface/memory.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -331,3 +335,80 @@ void xen_arch_unregister_cpu(int num)
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+void __init arch_xen_balloon_init(struct resource *hostmem_resource)
+{
+ struct xen_memory_map memmap;
+ int rc;
+ unsigned int i, last_guest_ram;
+ phys_addr_t max_addr = PFN_PHYS(max_pfn);
+ struct e820_table *xen_e820_table;
+ const struct e820_entry *entry;
+ struct resource *res;
+
+ if (!xen_initial_domain())
+ return;
+
+ xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL);
+ if (!xen_e820_table)
+ return;
+
+ memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries);
+ set_xen_guest_handle(memmap.buffer, xen_e820_table->entries);
+ rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap);
+ if (rc) {
+ pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc);
+ goto out;
+ }
+
+ last_guest_ram = 0;
+ for (i = 0; i < memmap.nr_entries; i++) {
+ if (xen_e820_table->entries[i].addr >= max_addr)
+ break;
+ if (xen_e820_table->entries[i].type == E820_TYPE_RAM)
+ last_guest_ram = i;
+ }
+
+ entry = &xen_e820_table->entries[last_guest_ram];
+ if (max_addr >= entry->addr + entry->size)
+ goto out; /* No unallocated host RAM. */
+
+ hostmem_resource->start = max_addr;
+ hostmem_resource->end = entry->addr + entry->size;
+
+ /*
+ * Mark non-RAM regions between the end of dom0 RAM and end of host RAM
+ * as unavailable. The rest of that region can be used for hotplug-based
+ * ballooning.
+ */
+ for (; i < memmap.nr_entries; i++) {
+ entry = &xen_e820_table->entries[i];
+
+ if (entry->type == E820_TYPE_RAM)
+ continue;
+
+ if (entry->addr >= hostmem_resource->end)
+ break;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ goto out;
+
+ res->name = "Unavailable host RAM";
+ res->start = entry->addr;
+ res->end = (entry->addr + entry->size < hostmem_resource->end) ?
+ entry->addr + entry->size : hostmem_resource->end;
+ rc = insert_resource(hostmem_resource, res);
+ if (rc) {
+ pr_warn("%s: Can't insert [%llx - %llx) (%d)\n",
+ __func__, res->start, res->end, rc);
+ kfree(res);
+ goto out;
+ }
+ }
+
+ out:
+ kfree(xen_e820_table);
+}
+#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f2414c6c5e7c..c047f42552e1 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -88,6 +88,8 @@
#include "multicalls.h"
#include "pmu.h"
+#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
+
void *xen_initial_gdt;
static int xen_cpu_up_prepare_pv(unsigned int cpu);
@@ -826,7 +828,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
void xen_set_iopl_mask(unsigned mask)
@@ -1258,6 +1260,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
/* Work out if we support NX */
+ get_cpu_cap(&boot_cpu_data);
x86_configure_nx();
/* Get mfn list */
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index fc048ec686e7..4d62c071b166 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1902,6 +1902,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2);
+ /*
+ * Zap execute permission from the ident map. Due to the sharing of
+ * L1 entries we need to do this in the L2.
+ */
+ if (__supported_pte_mask & _PAGE_NX) {
+ for (i = 0; i < PTRS_PER_PMD; ++i) {
+ if (pmd_none(level2_ident_pgt[i]))
+ continue;
+ level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX);
+ }
+ }
+
/* Copy the initial P->M table mappings if necessary. */
i = pgd_index(xen_start_info->mfn_list);
if (i && i < pgd_index(__START_KERNEL_map))
@@ -2261,7 +2273,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
switch (idx) {
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
- case FIX_RO_IDT:
#ifdef CONFIG_X86_32
case FIX_WP_TEST:
# ifdef CONFIG_HIGHMEM
@@ -2272,7 +2283,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
- case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index c114ca767b3b..6e0d2086eacb 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -808,7 +808,6 @@ char * __init xen_memory_setup(void)
addr = xen_e820_table.entries[0].addr;
size = xen_e820_table.entries[0].size;
while (i < xen_e820_table.nr_entries) {
- bool discard = false;
chunk_size = size;
type = xen_e820_table.entries[i].type;
@@ -824,11 +823,10 @@ char * __init xen_memory_setup(void)
xen_add_extra_mem(pfn_s, n_pfns);
xen_max_p2m_pfn = pfn_s + n_pfns;
} else
- discard = true;
+ type = E820_TYPE_UNUSABLE;
}
- if (!discard)
- xen_align_and_add_e820_region(addr, chunk_size, type);
+ xen_align_and_add_e820_region(addr, chunk_size, type);
addr += chunk_size;
size -= chunk_size;
diff --git a/block/bio.c b/block/bio.c
index 8bfdea58159b..9ef6cf3addb3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
bio->bi_disk = bio_src->bi_disk;
bio->bi_partno = bio_src->bi_partno;
bio_set_flag(bio, BIO_CLONED);
+ if (bio_flagged(bio_src, BIO_THROTTLED))
+ bio_set_flag(bio, BIO_THROTTLED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
diff --git a/block/blk-map.c b/block/blk-map.c
index b21f8e86f120..d3a94719f03f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -12,22 +12,29 @@
#include "blk.h"
/*
- * Append a bio to a passthrough request. Only works can be merged into
- * the request based on the driver constraints.
+ * Append a bio to a passthrough request. Only works if the bio can be merged
+ * into the request based on the driver constraints.
*/
-int blk_rq_append_bio(struct request *rq, struct bio *bio)
+int blk_rq_append_bio(struct request *rq, struct bio **bio)
{
- blk_queue_bounce(rq->q, &bio);
+ struct bio *orig_bio = *bio;
+
+ blk_queue_bounce(rq->q, bio);
if (!rq->bio) {
- blk_rq_bio_prep(rq->q, rq, bio);
+ blk_rq_bio_prep(rq->q, rq, *bio);
} else {
- if (!ll_back_merge_fn(rq->q, rq, bio))
+ if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+ if (orig_bio != *bio) {
+ bio_put(*bio);
+ *bio = orig_bio;
+ }
return -EINVAL;
+ }
- rq->biotail->bi_next = bio;
- rq->biotail = bio;
- rq->__data_len += bio->bi_iter.bi_size;
+ rq->biotail->bi_next = *bio;
+ rq->biotail = *bio;
+ rq->__data_len += (*bio)->bi_iter.bi_size;
}
return 0;
@@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
- ret = blk_rq_append_bio(rq, bio);
- bio_get(bio);
+ ret = blk_rq_append_bio(rq, &bio);
if (ret) {
- bio_endio(bio);
__blk_rq_unmap_user(orig_bio);
- bio_put(bio);
return ret;
}
+ bio_get(bio);
return 0;
}
@@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
int do_copy = 0;
- struct bio *bio;
+ struct bio *bio, *orig_bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
@@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (do_copy)
rq->rq_flags |= RQF_COPY_USER;
- ret = blk_rq_append_bio(rq, bio);
+ orig_bio = bio;
+ ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
/* request is too big */
- bio_put(bio);
+ bio_put(orig_bio);
return ret;
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 825bc29767e6..d19f416d6101 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2226,13 +2226,7 @@ again:
out_unlock:
spin_unlock_irq(q->queue_lock);
out:
- /*
- * As multiple blk-throtls may stack in the same issue path, we
- * don't want bios to leave with the flag set. Clear the flag if
- * being issued.
- */
- if (!throttled)
- bio_clear_flag(bio, BIO_THROTTLED);
+ bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)
diff --git a/block/bounce.c b/block/bounce.c
index fceb1a96480b..1d05c422c932 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
unsigned i = 0;
bool bounce = false;
int sectors = 0;
+ bool passthrough = bio_is_passthrough(*bio_orig);
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_PAGES)
@@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
if (!bounce)
return;
- if (sectors < bio_sectors(*bio_orig)) {
+ if (!passthrough && sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
bio_chain(bio, *bio_orig);
generic_make_request(*bio_orig);
*bio_orig = bio;
}
- bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
+ bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+ bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b4df317c2916..f95c60774ce8 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -100,9 +100,13 @@ struct kyber_hctx_data {
unsigned int cur_domain;
unsigned int batching;
wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+ struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
};
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+ void *key);
+
static int rq_sched_domain(const struct request *rq)
{
unsigned int op = rq->cmd_flags;
@@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
INIT_LIST_HEAD(&khd->rqs[i]);
+ init_waitqueue_func_entry(&khd->domain_wait[i],
+ kyber_domain_wake);
+ khd->domain_wait[i].private = hctx;
INIT_LIST_HEAD(&khd->domain_wait[i].entry);
atomic_set(&khd->wait_index[i], 0);
}
@@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
int nr;
nr = __sbitmap_queue_get(domain_tokens);
- if (nr >= 0)
- return nr;
/*
* If we failed to get a domain token, make sure the hardware queue is
* run when one becomes available. Note that this is serialized on
* khd->lock, but we still need to be careful about the waker.
*/
- if (list_empty_careful(&wait->entry)) {
- init_waitqueue_func_entry(wait, kyber_domain_wake);
- wait->private = hctx;
+ if (nr < 0 && list_empty_careful(&wait->entry)) {
ws = sbq_wait_ptr(domain_tokens,
&khd->wait_index[sched_domain]);
+ khd->domain_ws[sched_domain] = ws;
add_wait_queue(&ws->wait, wait);
/*
* Try again in case a token was freed before we got on the wait
- * queue. The waker may have already removed the entry from the
- * wait queue, but list_del_init() is okay with that.
+ * queue.
*/
nr = __sbitmap_queue_get(domain_tokens);
- if (nr >= 0) {
- unsigned long flags;
+ }
- spin_lock_irqsave(&ws->wait.lock, flags);
- list_del_init(&wait->entry);
- spin_unlock_irqrestore(&ws->wait.lock, flags);
- }
+ /*
+ * If we got a token while we were on the wait queue, remove ourselves
+ * from the wait queue to ensure that all wake ups make forward
+ * progress. It's possible that the waker already deleted the entry
+ * between the !list_empty_careful() check and us grabbing the lock, but
+ * list_del_init() is okay with that.
+ */
+ if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+ ws = khd->domain_ws[sched_domain];
+ spin_lock_irq(&ws->wait.lock);
+ list_del_init(&wait->entry);
+ spin_unlock_irq(&ws->wait.lock);
}
+
return nr;
}
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 358749c38894..444a387df219 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -672,14 +672,15 @@ void af_alg_free_areq_sgls(struct af_alg_async_req *areq)
}
tsgl = areq->tsgl;
- for_each_sg(tsgl, sg, areq->tsgl_entries, i) {
- if (!sg_page(sg))
- continue;
- put_page(sg_page(sg));
- }
+ if (tsgl) {
+ for_each_sg(tsgl, sg, areq->tsgl_entries, i) {
+ if (!sg_page(sg))
+ continue;
+ put_page(sg_page(sg));
+ }
- if (areq->tsgl && areq->tsgl_entries)
sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl));
+ }
}
EXPORT_SYMBOL_GPL(af_alg_free_areq_sgls);
@@ -1137,12 +1138,6 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
if (!af_alg_readable(sk))
break;
- if (!ctx->used) {
- err = af_alg_wait_for_data(sk, flags);
- if (err)
- return err;
- }
-
seglen = min_t(size_t, (maxsize - len),
msg_data_left(msg));
diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 805f485ddf1b..ddcc45f77edd 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -111,6 +111,12 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
size_t usedpages = 0; /* [in] RX bufs to be used from user */
size_t processed = 0; /* [in] TX bufs to be consumed */
+ if (!ctx->used) {
+ err = af_alg_wait_for_data(sk, flags);
+ if (err)
+ return err;
+ }
+
/*
* Data length provided by caller via sendmsg/sendpage that has not
* yet been processed.
@@ -285,6 +291,10 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
/* AIO operation */
sock_hold(sk);
areq->iocb = msg->msg_iocb;
+
+ /* Remember output size that will be generated. */
+ areq->outlen = outlen;
+
aead_request_set_callback(&areq->cra_u.aead_req,
CRYPTO_TFM_REQ_MAY_BACKLOG,
af_alg_async_cb, areq);
@@ -292,12 +302,8 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
crypto_aead_decrypt(&areq->cra_u.aead_req);
/* AIO operation in progress */
- if (err == -EINPROGRESS || err == -EBUSY) {
- /* Remember output size that will be generated. */
- areq->outlen = outlen;
-
+ if (err == -EINPROGRESS || err == -EBUSY)
return -EIOCBQUEUED;
- }
sock_put(sk);
} else {
@@ -503,6 +509,7 @@ static void aead_release(void *private)
struct aead_tfm *tfm = private;
crypto_free_aead(tfm->aead);
+ crypto_put_default_null_skcipher2();
kfree(tfm);
}
@@ -535,7 +542,6 @@ static void aead_sock_destruct(struct sock *sk)
unsigned int ivlen = crypto_aead_ivsize(tfm);
af_alg_pull_tsgl(sk, ctx->used, NULL, 0);
- crypto_put_default_null_skcipher2();
sock_kzfree_s(sk, ctx->iv, ivlen);
sock_kfree_s(sk, ctx, ctx->len);
af_alg_release_parent(sk);
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 30cff827dd8f..baef9bfccdda 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -72,6 +72,12 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
int err = 0;
size_t len = 0;
+ if (!ctx->used) {
+ err = af_alg_wait_for_data(sk, flags);
+ if (err)
+ return err;
+ }
+
/* Allocate cipher request for current operation. */
areq = af_alg_alloc_areq(sk, sizeof(struct af_alg_async_req) +
crypto_skcipher_reqsize(tfm));
@@ -119,6 +125,10 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
/* AIO operation */
sock_hold(sk);
areq->iocb = msg->msg_iocb;
+
+ /* Remember output size that will be generated. */
+ areq->outlen = len;
+
skcipher_request_set_callback(&areq->cra_u.skcipher_req,
CRYPTO_TFM_REQ_MAY_SLEEP,
af_alg_async_cb, areq);
@@ -127,12 +137,8 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
crypto_skcipher_decrypt(&areq->cra_u.skcipher_req);
/* AIO operation in progress */
- if (err == -EINPROGRESS || err == -EBUSY) {
- /* Remember output size that will be generated. */
- areq->outlen = len;
-
+ if (err == -EINPROGRESS || err == -EBUSY)
return -EIOCBQUEUED;
- }
sock_put(sk);
} else {
diff --git a/crypto/hmac.c b/crypto/hmac.c
index 92871dc2a63e..e74730224f0a 100644
--- a/crypto/hmac.c
+++ b/crypto/hmac.c
@@ -195,11 +195,15 @@ static int hmac_create(struct crypto_template *tmpl, struct rtattr **tb)
salg = shash_attr_alg(tb[1], 0, 0);
if (IS_ERR(salg))
return PTR_ERR(salg);
+ alg = &salg->base;
+ /* The underlying hash algorithm must be unkeyed */
err = -EINVAL;
+ if (crypto_shash_alg_has_setkey(salg))
+ goto out_put_alg;
+
ds = salg->digestsize;
ss = salg->statesize;
- alg = &salg->base;
if (ds > alg->cra_blocksize ||
ss < alg->cra_blocksize)
goto out_put_alg;
diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c
index 4e6472658852..eca04d3729b3 100644
--- a/crypto/mcryptd.c
+++ b/crypto/mcryptd.c
@@ -81,6 +81,7 @@ static int mcryptd_init_queue(struct mcryptd_queue *queue,
pr_debug("cpu_queue #%d %p\n", cpu, queue->cpu_queue);
crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
INIT_WORK(&cpu_queue->work, mcryptd_queue_worker);
+ spin_lock_init(&cpu_queue->q_lock);
}
return 0;
}
@@ -104,15 +105,16 @@ static int mcryptd_enqueue_request(struct mcryptd_queue *queue,
int cpu, err;
struct mcryptd_cpu_queue *cpu_queue;
- cpu = get_cpu();
- cpu_queue = this_cpu_ptr(queue->cpu_queue);
- rctx->tag.cpu = cpu;
+ cpu_queue = raw_cpu_ptr(queue->cpu_queue);
+ spin_lock(&cpu_queue->q_lock);
+ cpu = smp_processor_id();
+ rctx->tag.cpu = smp_processor_id();
err = crypto_enqueue_request(&cpu_queue->queue, request);
pr_debug("enqueue request: cpu %d cpu_queue %p request %p\n",
cpu, cpu_queue, request);
+ spin_unlock(&cpu_queue->q_lock);
queue_work_on(cpu, kcrypto_wq, &cpu_queue->work);
- put_cpu();
return err;
}
@@ -161,16 +163,11 @@ static void mcryptd_queue_worker(struct work_struct *work)
cpu_queue = container_of(work, struct mcryptd_cpu_queue, work);
i = 0;
while (i < MCRYPTD_BATCH || single_task_running()) {
- /*
- * preempt_disable/enable is used to prevent
- * being preempted by mcryptd_enqueue_request()
- */
- local_bh_disable();
- preempt_disable();
+
+ spin_lock_bh(&cpu_queue->q_lock);
backlog = crypto_get_backlog(&cpu_queue->queue);
req = crypto_dequeue_request(&cpu_queue->queue);
- preempt_enable();
- local_bh_enable();
+ spin_unlock_bh(&cpu_queue->q_lock);
if (!req) {
mcryptd_opportunistic_flush();
@@ -185,7 +182,7 @@ static void mcryptd_queue_worker(struct work_struct *work)
++i;
}
if (cpu_queue->queue.qlen)
- queue_work(kcrypto_wq, &cpu_queue->work);
+ queue_work_on(smp_processor_id(), kcrypto_wq, &cpu_queue->work);
}
void mcryptd_flusher(struct work_struct *__work)
diff --git a/crypto/rsa_helper.c b/crypto/rsa_helper.c
index 0b66dc824606..cad395d70d78 100644
--- a/crypto/rsa_helper.c
+++ b/crypto/rsa_helper.c
@@ -30,7 +30,7 @@ int rsa_get_n(void *context, size_t hdrlen, unsigned char tag,
return -EINVAL;
if (fips_enabled) {
- while (!*ptr && n_sz) {
+ while (n_sz && !*ptr) {
ptr++;
n_sz--;
}
diff --git a/crypto/salsa20_generic.c b/crypto/salsa20_generic.c
index f550b5d94630..d7da0eea5622 100644
--- a/crypto/salsa20_generic.c
+++ b/crypto/salsa20_generic.c
@@ -188,13 +188,6 @@ static int encrypt(struct blkcipher_desc *desc,
salsa20_ivsetup(ctx, walk.iv);
- if (likely(walk.nbytes == nbytes))
- {
- salsa20_encrypt_bytes(ctx, walk.dst.virt.addr,
- walk.src.virt.addr, nbytes);
- return blkcipher_walk_done(desc, &walk, 0);
- }
-
while (walk.nbytes >= 64) {
salsa20_encrypt_bytes(ctx, walk.dst.virt.addr,
walk.src.virt.addr,
diff --git a/crypto/shash.c b/crypto/shash.c
index 325a14da5827..e849d3ee2e27 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -25,11 +25,12 @@
static const struct crypto_type crypto_shash_type;
-static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
- unsigned int keylen)
+int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
{
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(shash_no_setkey);
static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen)
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 778e0ff42bfa..11af5fd6a443 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -449,6 +449,8 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
walk->total = req->cryptlen;
walk->nbytes = 0;
+ walk->iv = req->iv;
+ walk->oiv = req->iv;
if (unlikely(!walk->total))
return 0;
@@ -456,9 +458,6 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
scatterwalk_start(&walk->in, req->src);
scatterwalk_start(&walk->out, req->dst);
- walk->iv = req->iv;
- walk->oiv = req->iv;
-
walk->flags &= ~SKCIPHER_WALK_SLEEP;
walk->flags |= req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
SKCIPHER_WALK_SLEEP : 0;
@@ -510,6 +509,8 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
int err;
walk->nbytes = 0;
+ walk->iv = req->iv;
+ walk->oiv = req->iv;
if (unlikely(!walk->total))
return 0;
@@ -525,9 +526,6 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
scatterwalk_done(&walk->in, 0, walk->total);
scatterwalk_done(&walk->out, 0, walk->total);
- walk->iv = req->iv;
- walk->oiv = req->iv;
-
if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)
walk->flags |= SKCIPHER_WALK_SLEEP;
else
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6742f6c68034..9bff853e85f3 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -1007,7 +1007,7 @@ skip:
/* The record may be cleared by others, try read next record */
if (len == -ENOENT)
goto skip;
- else if (len < sizeof(*rcd)) {
+ else if (len < 0 || len < sizeof(*rcd)) {
rc = -EIO;
goto out;
}
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 30e84cc600ae..06ea4749ebd9 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1171,7 +1171,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
struct cpc_register_resource *desired_reg;
int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
- struct cppc_pcc_data *pcc_ss_data = pcc_data[pcc_ss_id];
+ struct cppc_pcc_data *pcc_ss_data;
int ret = 0;
if (!cpc_desc || pcc_ss_id < 0) {
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index e4ffaeec9ec2..a4c8ad98560d 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -1138,7 +1138,7 @@ int acpi_subsys_thaw_noirq(struct device *dev)
* skip all of the subsequent "thaw" callbacks for the device.
*/
if (dev_pm_smart_suspend_and_suspended(dev)) {
- dev->power.direct_complete = true;
+ dev_pm_skip_next_resume_phases(dev);
return 0;
}
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ff2580e7611d..abeb4df4f22e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1670,6 +1670,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
dev_name(&adev_dimm->dev));
return -ENXIO;
}
+ /*
+ * Record nfit_mem for the notification path to track back to
+ * the nfit sysfs attributes for this dimm device object.
+ */
+ dev_set_drvdata(&adev_dimm->dev, nfit_mem);
/*
* Until standardization materializes we need to consider 4
@@ -1752,9 +1757,11 @@ static void shutdown_dimm_notify(void *data)
sysfs_put(nfit_mem->flags_attr);
nfit_mem->flags_attr = NULL;
}
- if (adev_dimm)
+ if (adev_dimm) {
acpi_remove_notify_handler(adev_dimm->handle,
ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify);
+ dev_set_drvdata(&adev_dimm->dev, NULL);
+ }
}
mutex_unlock(&acpi_desc->init_mutex);
}
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index bccec9de0533..a7ecfde66b7b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -482,7 +482,8 @@ enum binder_deferred_state {
* @tsk task_struct for group_leader of process
* (invariant after initialized)
* @files files_struct for process
- * (invariant after initialized)
+ * (protected by @files_lock)
+ * @files_lock mutex to protect @files
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -530,6 +531,7 @@ struct binder_proc {
int pid;
struct task_struct *tsk;
struct files_struct *files;
+ struct mutex files_lock;
struct hlist_node deferred_work_node;
int deferred_work;
bool is_dead;
@@ -877,20 +879,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
{
- struct files_struct *files = proc->files;
unsigned long rlim_cur;
unsigned long irqs;
+ int ret;
- if (files == NULL)
- return -ESRCH;
-
- if (!lock_task_sighand(proc->tsk, &irqs))
- return -EMFILE;
-
+ mutex_lock(&proc->files_lock);
+ if (proc->files == NULL) {
+ ret = -ESRCH;
+ goto err;
+ }
+ if (!lock_task_sighand(proc->tsk, &irqs)) {
+ ret = -EMFILE;
+ goto err;
+ }
rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
unlock_task_sighand(proc->tsk, &irqs);
- return __alloc_fd(files, 0, rlim_cur, flags);
+ ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
+err:
+ mutex_unlock(&proc->files_lock);
+ return ret;
}
/*
@@ -899,8 +907,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
static void task_fd_install(
struct binder_proc *proc, unsigned int fd, struct file *file)
{
+ mutex_lock(&proc->files_lock);
if (proc->files)
__fd_install(proc->files, fd, file);
+ mutex_unlock(&proc->files_lock);
}
/*
@@ -910,9 +920,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
{
int retval;
- if (proc->files == NULL)
- return -ESRCH;
-
+ mutex_lock(&proc->files_lock);
+ if (proc->files == NULL) {
+ retval = -ESRCH;
+ goto err;
+ }
retval = __close_fd(proc->files, fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
@@ -920,7 +932,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
retval == -ERESTARTNOHAND ||
retval == -ERESTART_RESTARTBLOCK))
retval = -EINTR;
-
+err:
+ mutex_unlock(&proc->files_lock);
return retval;
}
@@ -4627,7 +4640,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
ret = binder_alloc_mmap_handler(&proc->alloc, vma);
if (ret)
return ret;
+ mutex_lock(&proc->files_lock);
proc->files = get_files_struct(current);
+ mutex_unlock(&proc->files_lock);
return 0;
err_bad_arg:
@@ -4651,6 +4666,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ mutex_init(&proc->files_lock);
INIT_LIST_HEAD(&proc->todo);
proc->default_priority = task_nice(current);
binder_dev = container_of(filp->private_data, struct binder_device,
@@ -4903,9 +4919,11 @@ static void binder_deferred_func(struct work_struct *work)
files = NULL;
if (defer & BINDER_DEFERRED_PUT_FILES) {
+ mutex_lock(&proc->files_lock);
files = proc->files;
if (files)
proc->files = NULL;
+ mutex_unlock(&proc->files_lock);
}
if (defer & BINDER_DEFERRED_FLUSH)
diff --git a/drivers/ata/ahci_mtk.c b/drivers/ata/ahci_mtk.c
index 80854f71559a..0ae6971c2a4c 100644
--- a/drivers/ata/ahci_mtk.c
+++ b/drivers/ata/ahci_mtk.c
@@ -1,5 +1,5 @@
/*
- * MeidaTek AHCI SATA driver
+ * MediaTek AHCI SATA driver
*
* Copyright (c) 2017 MediaTek Inc.
* Author: Ryder Lee <ryder.lee@mediatek.com>
@@ -25,7 +25,7 @@
#include <linux/reset.h>
#include "ahci.h"
-#define DRV_NAME "ahci"
+#define DRV_NAME "ahci-mtk"
#define SYS_CFG 0x14
#define SYS_CFG_SATA_MSK GENMASK(31, 30)
@@ -192,5 +192,5 @@ static struct platform_driver mtk_ahci_driver = {
};
module_platform_driver(mtk_ahci_driver);
-MODULE_DESCRIPTION("MeidaTek SATA AHCI Driver");
+MODULE_DESCRIPTION("MediaTek SATA AHCI Driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c
index b6b0bf76dfc7..2685f28160f7 100644
--- a/drivers/ata/ahci_qoriq.c
+++ b/drivers/ata/ahci_qoriq.c
@@ -35,6 +35,8 @@
/* port register default value */
#define AHCI_PORT_PHY_1_CFG 0xa003fffe
+#define AHCI_PORT_PHY2_CFG 0x28184d1f
+#define AHCI_PORT_PHY3_CFG 0x0e081509
#define AHCI_PORT_TRANS_CFG 0x08000029
#define AHCI_PORT_AXICC_CFG 0x3fffffff
@@ -183,6 +185,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv)
writel(readl(qpriv->ecc_addr) | ECC_DIS_ARMV8_CH2,
qpriv->ecc_addr);
writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1);
+ writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2);
+ writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3);
writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS);
if (qpriv->is_dmacoherent)
writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC);
@@ -190,6 +194,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv)
case AHCI_LS2080A:
writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1);
+ writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2);
+ writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3);
writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS);
if (qpriv->is_dmacoherent)
writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC);
@@ -201,6 +207,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv)
writel(readl(qpriv->ecc_addr) | ECC_DIS_ARMV8_CH2,
qpriv->ecc_addr);
writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1);
+ writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2);
+ writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3);
writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS);
if (qpriv->is_dmacoherent)
writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC);
@@ -212,6 +220,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv)
writel(readl(qpriv->ecc_addr) | ECC_DIS_LS1088A,
qpriv->ecc_addr);
writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1);
+ writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2);
+ writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3);
writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS);
if (qpriv->is_dmacoherent)
writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC);
@@ -219,6 +229,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv)
case AHCI_LS2088A:
writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1);
+ writel(AHCI_PORT_PHY2_CFG, reg_base + PORT_PHY2);
+ writel(AHCI_PORT_PHY3_CFG, reg_base + PORT_PHY3);
writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS);
if (qpriv->is_dmacoherent)
writel(AHCI_PORT_AXICC_CFG, reg_base + PORT_AXICC);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 2a882929de4a..8193b38a1cae 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3082,13 +3082,19 @@ int sata_down_spd_limit(struct ata_link *link, u32 spd_limit)
bit = fls(mask) - 1;
mask &= ~(1 << bit);
- /* Mask off all speeds higher than or equal to the current
- * one. Force 1.5Gbps if current SPD is not available.
+ /*
+ * Mask off all speeds higher than or equal to the current one. At
+ * this point, if current SPD is not available and we previously
+ * recorded the link speed from SStatus, the driver has already
+ * masked off the highest bit so mask should already be 1 or 0.
+ * Otherwise, we should not force 1.5Gbps on a link where we have
+ * not previously recorded speed from SStatus. Just return in this
+ * case.
*/
if (spd > 1)
mask &= (1 << (spd - 1)) - 1;
else
- mask &= 1;
+ return -EINVAL;
/* were we already at the bottom? */
if (!mask)
diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c
index ffd8d33c6e0f..6db2e34bd52f 100644
--- a/drivers/ata/pata_pdc2027x.c
+++ b/drivers/ata/pata_pdc2027x.c
@@ -82,7 +82,7 @@ static int pdc2027x_set_mode(struct ata_link *link, struct ata_device **r_failed
* is issued to the device. However, if the controller clock is 133MHz,
* the following tables must be used.
*/
-static struct pdc2027x_pio_timing {
+static const struct pdc2027x_pio_timing {
u8 value0, value1, value2;
} pdc2027x_pio_timing_tbl[] = {
{ 0xfb, 0x2b, 0xac }, /* PIO mode 0 */
@@ -92,7 +92,7 @@ static struct pdc2027x_pio_timing {
{ 0x23, 0x09, 0x25 }, /* PIO mode 4, IORDY on, Prefetch off */
};
-static struct pdc2027x_mdma_timing {
+static const struct pdc2027x_mdma_timing {
u8 value0, value1;
} pdc2027x_mdma_timing_tbl[] = {
{ 0xdf, 0x5f }, /* MDMA mode 0 */
@@ -100,7 +100,7 @@ static struct pdc2027x_mdma_timing {
{ 0x69, 0x25 }, /* MDMA mode 2 */
};
-static struct pdc2027x_udma_timing {
+static const struct pdc2027x_udma_timing {
u8 value0, value1, value2;
} pdc2027x_udma_timing_tbl[] = {
{ 0x4a, 0x0f, 0xd5 }, /* UDMA mode 0 */
@@ -649,7 +649,7 @@ static long pdc_detect_pll_input_clock(struct ata_host *host)
* @host: target ATA host
* @board_idx: board identifier
*/
-static int pdc_hardware_init(struct ata_host *host, unsigned int board_idx)
+static void pdc_hardware_init(struct ata_host *host, unsigned int board_idx)
{
long pll_clock;
@@ -665,8 +665,6 @@ static int pdc_hardware_init(struct ata_host *host, unsigned int board_idx)
/* Adjust PLL control register */
pdc_adjust_pll(host, pll_clock, board_idx);
-
- return 0;
}
/**
@@ -753,8 +751,7 @@ static int pdc2027x_init_one(struct pci_dev *pdev,
//pci_enable_intx(pdev);
/* initialize adapter */
- if (pdc_hardware_init(host, board_idx) != 0)
- return -EIO;
+ pdc_hardware_init(host, board_idx);
pci_set_master(pdev);
return ata_host_activate(host, pdev->irq, ata_bmdma_interrupt,
@@ -778,8 +775,7 @@ static int pdc2027x_reinit_one(struct pci_dev *pdev)
else
board_idx = PDC_UDMA_133;
- if (pdc_hardware_init(host, board_idx))
- return -EIO;
+ pdc_hardware_init(host, board_idx);
ata_host_resume(host);
return 0;
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index eb3af2739537..07532d83be0b 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
}
+static bool cache_node_is_unified(struct cacheinfo *this_leaf)
+{
+ return of_property_read_bool(this_leaf->of_node, "cache-unified");
+}
+
static void cache_of_override_properties(unsigned int cpu)
{
int index;
@@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
for (index = 0; index < cache_leaves(cpu); index++) {
this_leaf = this_cpu_ci->info_list + index;
+ /*
+ * init_cache_level must setup the cache level correctly
+ * overriding the architecturally specified levels, so
+ * if type is NONE at this stage, it should be unified
+ */
+ if (this_leaf->type == CACHE_TYPE_NOCACHE &&
+ cache_node_is_unified(this_leaf))
+ this_leaf->type = CACHE_TYPE_UNIFIED;
cache_size(this_leaf);
cache_get_line_size(this_leaf);
cache_nr_sets(this_leaf);
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index db2f04415927..08744b572af6 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -526,6 +526,21 @@ static void dpm_watchdog_clear(struct dpm_watchdog *wd)
/*------------------------- Resume routines -------------------------*/
/**
+ * dev_pm_skip_next_resume_phases - Skip next system resume phases for device.
+ * @dev: Target device.
+ *
+ * Make the core skip the "early resume" and "resume" phases for @dev.
+ *
+ * This function can be called by middle-layer code during the "noirq" phase of
+ * system resume if necessary, but not by device drivers.
+ */
+void dev_pm_skip_next_resume_phases(struct device *dev)
+{
+ dev->power.is_late_suspended = false;
+ dev->power.is_suspended = false;
+}
+
+/**
* device_resume_noirq - Execute a "noirq resume" callback for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index ccb9975a97fa..ad0477ae820f 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -35,13 +35,13 @@ static inline u64 mb_per_tick(int mbps)
struct nullb_cmd {
struct list_head list;
struct llist_node ll_list;
- call_single_data_t csd;
+ struct __call_single_data csd;
struct request *rq;
struct bio *bio;
unsigned int tag;
+ blk_status_t error;
struct nullb_queue *nq;
struct hrtimer timer;
- blk_status_t error;
};
struct nullb_queue {
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 779869ed32b1..71fad747c0c7 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -199,6 +199,9 @@ struct smi_info {
/* The timer for this si. */
struct timer_list si_timer;
+ /* This flag is set, if the timer can be set */
+ bool timer_can_start;
+
/* This flag is set, if the timer is running (timer_pending() isn't enough) */
bool timer_running;
@@ -355,6 +358,8 @@ out:
static void smi_mod_timer(struct smi_info *smi_info, unsigned long new_val)
{
+ if (!smi_info->timer_can_start)
+ return;
smi_info->last_timeout_jiffies = jiffies;
mod_timer(&smi_info->si_timer, new_val);
smi_info->timer_running = true;
@@ -374,21 +379,18 @@ static void start_new_msg(struct smi_info *smi_info, unsigned char *msg,
smi_info->handlers->start_transaction(smi_info->si_sm, msg, size);
}
-static void start_check_enables(struct smi_info *smi_info, bool start_timer)
+static void start_check_enables(struct smi_info *smi_info)
{
unsigned char msg[2];
msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD;
- if (start_timer)
- start_new_msg(smi_info, msg, 2);
- else
- smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2);
+ start_new_msg(smi_info, msg, 2);
smi_info->si_state = SI_CHECKING_ENABLES;
}
-static void start_clear_flags(struct smi_info *smi_info, bool start_timer)
+static void start_clear_flags(struct smi_info *smi_info)
{
unsigned char msg[3];
@@ -397,10 +399,7 @@ static void start_clear_flags(struct smi_info *smi_info, bool start_timer)
msg[1] = IPMI_CLEAR_MSG_FLAGS_CMD;
msg[2] = WDT_PRE_TIMEOUT_INT;
- if (start_timer)
- start_new_msg(smi_info, msg, 3);
- else
- smi_info->handlers->start_transaction(smi_info->si_sm, msg, 3);
+ start_new_msg(smi_info, msg, 3);
smi_info->si_state = SI_CLEARING_FLAGS;
}
@@ -435,11 +434,11 @@ static void start_getting_events(struct smi_info *smi_info)
* Note that we cannot just use disable_irq(), since the interrupt may
* be shared.
*/
-static inline bool disable_si_irq(struct smi_info *smi_info, bool start_timer)
+static inline bool disable_si_irq(struct smi_info *smi_info)
{
if ((smi_info->io.irq) && (!smi_info->interrupt_disabled)) {
smi_info->interrupt_disabled = true;
- start_check_enables(smi_info, start_timer);
+ start_check_enables(smi_info);
return true;
}
return false;
@@ -449,7 +448,7 @@ static inline bool enable_si_irq(struct smi_info *smi_info)
{
if ((smi_info->io.irq) && (smi_info->interrupt_disabled)) {
smi_info->interrupt_disabled = false;
- start_check_enables(smi_info, true);
+ start_check_enables(smi_info);
return true;
}
return false;
@@ -467,7 +466,7 @@ static struct ipmi_smi_msg *alloc_msg_handle_irq(struct smi_info *smi_info)
msg = ipmi_alloc_smi_msg();
if (!msg) {
- if (!disable_si_irq(smi_info, true))
+ if (!disable_si_irq(smi_info))
smi_info->si_state = SI_NORMAL;
} else if (enable_si_irq(smi_info)) {
ipmi_free_smi_msg(msg);
@@ -483,7 +482,7 @@ retry:
/* Watchdog pre-timeout */
smi_inc_stat(smi_info, watchdog_pretimeouts);
- start_clear_flags(smi_info, true);
+ start_clear_flags(smi_info);
smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT;
if (smi_info->intf)
ipmi_smi_watchdog_pretimeout(smi_info->intf);
@@ -866,7 +865,7 @@ restart:
* disable and messages disabled.
*/
if (smi_info->supports_event_msg_buff || smi_info->io.irq) {
- start_check_enables(smi_info, true);
+ start_check_enables(smi_info);
} else {
smi_info->curr_msg = alloc_msg_handle_irq(smi_info);
if (!smi_info->curr_msg)
@@ -1167,6 +1166,7 @@ static int smi_start_processing(void *send_info,
/* Set up the timer that drives the interface. */
timer_setup(&new_smi->si_timer, smi_timeout, 0);
+ new_smi->timer_can_start = true;
smi_mod_timer(new_smi, jiffies + SI_TIMEOUT_JIFFIES);
/* Try to claim any interrupts. */
@@ -1936,10 +1936,12 @@ static void check_for_broken_irqs(struct smi_info *smi_info)
check_set_rcv_irq(smi_info);
}
-static inline void wait_for_timer_and_thread(struct smi_info *smi_info)
+static inline void stop_timer_and_thread(struct smi_info *smi_info)
{
if (smi_info->thread != NULL)
kthread_stop(smi_info->thread);
+
+ smi_info->timer_can_start = false;
if (smi_info->timer_running)
del_timer_sync(&smi_info->si_timer);
}
@@ -2152,7 +2154,7 @@ static int try_smi_init(struct smi_info *new_smi)
* Start clearing the flags before we enable interrupts or the
* timer to avoid racing with the timer.
*/
- start_clear_flags(new_smi, false);
+ start_clear_flags(new_smi);
/*
* IRQ is defined to be set when non-zero. req_events will
@@ -2238,7 +2240,7 @@ out_err_remove_attrs:
dev_set_drvdata(new_smi->io.dev, NULL);
out_err_stop_timer:
- wait_for_timer_and_thread(new_smi);
+ stop_timer_and_thread(new_smi);
out_err:
new_smi->interrupt_disabled = true;
@@ -2388,7 +2390,7 @@ static void cleanup_one_si(struct smi_info *to_clean)
*/
if (to_clean->io.irq_cleanup)
to_clean->io.irq_cleanup(&to_clean->io);
- wait_for_timer_and_thread(to_clean);
+ stop_timer_and_thread(to_clean);
/*
* Timeouts are stopped, now make sure the interrupts are off
@@ -2400,7 +2402,7 @@ static void cleanup_one_si(struct smi_info *to_clean)
schedule_timeout_uninterruptible(1);
}
if (to_clean->handlers)
- disable_si_irq(to_clean, false);
+ disable_si_irq(to_clean);
while (to_clean->curr_msg || (to_clean->si_state != SI_NORMAL)) {
poll(to_clean);
schedule_timeout_uninterruptible(1);
diff --git a/drivers/char/ipmi/ipmi_si_parisc.c b/drivers/char/ipmi/ipmi_si_parisc.c
index 090b073ab441..6b10f0e18a95 100644
--- a/drivers/char/ipmi/ipmi_si_parisc.c
+++ b/drivers/char/ipmi/ipmi_si_parisc.c
@@ -10,6 +10,8 @@ static int __init ipmi_parisc_probe(struct parisc_device *dev)
{
struct si_sm_io io;
+ memset(&io, 0, sizeof(io));
+
io.si_type = SI_KCS;
io.addr_source = SI_DEVICETREE;
io.addr_type = IPMI_MEM_ADDR_SPACE;
diff --git a/drivers/char/ipmi/ipmi_si_pci.c b/drivers/char/ipmi/ipmi_si_pci.c
index 99771f5cad07..27dd11c49d21 100644
--- a/drivers/char/ipmi/ipmi_si_pci.c
+++ b/drivers/char/ipmi/ipmi_si_pci.c
@@ -103,10 +103,13 @@ static int ipmi_pci_probe(struct pci_dev *pdev,
io.addr_source_cleanup = ipmi_pci_cleanup;
io.addr_source_data = pdev;
- if (pci_resource_flags(pdev, 0) & IORESOURCE_IO)
+ if (pci_resource_flags(pdev, 0) & IORESOURCE_IO) {
io.addr_type = IPMI_IO_ADDR_SPACE;
- else
+ io.io_setup = ipmi_si_port_setup;
+ } else {
io.addr_type = IPMI_MEM_ADDR_SPACE;
+ io.io_setup = ipmi_si_mem_setup;
+ }
io.addr_data = pci_resource_start(pdev, 0);
io.regspacing = ipmi_pci_probe_regspacing(&io);
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 647d056df88c..b56c11f51baf 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -220,7 +220,8 @@ static bool clk_core_is_enabled(struct clk_core *core)
ret = core->ops->is_enabled(core->hw);
done:
- clk_pm_runtime_put(core);
+ if (core->dev)
+ pm_runtime_put(core->dev);
return ret;
}
@@ -1564,6 +1565,9 @@ static void clk_change_rate(struct clk_core *core)
best_parent_rate = core->parent->rate;
}
+ if (clk_pm_runtime_get(core))
+ return;
+
if (core->flags & CLK_SET_RATE_UNGATE) {
unsigned long flags;
@@ -1634,6 +1638,8 @@ static void clk_change_rate(struct clk_core *core)
/* handle the new child who might not be in core->children yet */
if (core->new_child)
clk_change_rate(core->new_child);
+
+ clk_pm_runtime_put(core);
}
static int clk_core_set_rate_nolock(struct clk_core *core,
diff --git a/drivers/clk/sunxi/clk-sun9i-mmc.c b/drivers/clk/sunxi/clk-sun9i-mmc.c
index a1a634253d6f..f00d8758ba24 100644
--- a/drivers/clk/sunxi/clk-sun9i-mmc.c
+++ b/drivers/clk/sunxi/clk-sun9i-mmc.c
@@ -16,6 +16,7 @@
#include <linux/clk.h>
#include <linux/clk-provider.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/of.h>
#include <linux/of_device.h>
@@ -83,9 +84,20 @@ static int sun9i_mmc_reset_deassert(struct reset_controller_dev *rcdev,
return 0;
}
+static int sun9i_mmc_reset_reset(struct reset_controller_dev *rcdev,
+ unsigned long id)
+{
+ sun9i_mmc_reset_assert(rcdev, id);
+ udelay(10);
+ sun9i_mmc_reset_deassert(rcdev, id);
+
+ return 0;
+}
+
static const struct reset_control_ops sun9i_mmc_reset_ops = {
.assert = sun9i_mmc_reset_assert,
.deassert = sun9i_mmc_reset_deassert,
+ .reset = sun9i_mmc_reset_reset,
};
static int sun9i_a80_mmc_config_clk_probe(struct platform_device *pdev)
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 58d4f4e1ad6a..ca38229b045a 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -22,6 +22,8 @@
#include "cpufreq_governor.h"
+#define CPUFREQ_DBS_MIN_SAMPLING_INTERVAL (2 * TICK_NSEC / NSEC_PER_USEC)
+
static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
static DEFINE_MUTEX(gov_dbs_data_mutex);
@@ -47,11 +49,15 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
struct policy_dbs_info *policy_dbs;
+ unsigned int sampling_interval;
int ret;
- ret = sscanf(buf, "%u", &dbs_data->sampling_rate);
- if (ret != 1)
+
+ ret = sscanf(buf, "%u", &sampling_interval);
+ if (ret != 1 || sampling_interval < CPUFREQ_DBS_MIN_SAMPLING_INTERVAL)
return -EINVAL;
+ dbs_data->sampling_rate = sampling_interval;
+
/*
* We are operating under dbs_data->mutex and so the list and its
* entries can't be freed concurrently.
@@ -430,7 +436,14 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy)
if (ret)
goto free_policy_dbs_info;
- dbs_data->sampling_rate = cpufreq_policy_transition_delay_us(policy);
+ /*
+ * The sampling interval should not be less than the transition latency
+ * of the CPU and it also cannot be too small for dbs_update() to work
+ * correctly.
+ */
+ dbs_data->sampling_rate = max_t(unsigned int,
+ CPUFREQ_DBS_MIN_SAMPLING_INTERVAL,
+ cpufreq_policy_transition_delay_us(policy));
if (!have_governor_per_policy())
gov->gdbs_data = dbs_data;
diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 628fe899cb48..d9b2c2de49c4 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -226,17 +226,18 @@ static void imx6q_opp_check_speed_grading(struct device *dev)
val >>= OCOTP_CFG3_SPEED_SHIFT;
val &= 0x3;
- if ((val != OCOTP_CFG3_SPEED_1P2GHZ) &&
- of_machine_is_compatible("fsl,imx6q"))
- if (dev_pm_opp_disable(dev, 1200000000))
- dev_warn(dev, "failed to disable 1.2GHz OPP\n");
if (val < OCOTP_CFG3_SPEED_996MHZ)
if (dev_pm_opp_disable(dev, 996000000))
dev_warn(dev, "failed to disable 996MHz OPP\n");
- if (of_machine_is_compatible("fsl,imx6q")) {
+
+ if (of_machine_is_compatible("fsl,imx6q") ||
+ of_machine_is_compatible("fsl,imx6qp")) {
if (val != OCOTP_CFG3_SPEED_852MHZ)
if (dev_pm_opp_disable(dev, 852000000))
dev_warn(dev, "failed to disable 852MHz OPP\n");
+ if (val != OCOTP_CFG3_SPEED_1P2GHZ)
+ if (dev_pm_opp_disable(dev, 1200000000))
+ dev_warn(dev, "failed to disable 1.2GHz OPP\n");
}
iounmap(base);
put_node:
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
index fbab271b3bf9..a861b5b4d443 100644
--- a/drivers/dma/at_hdmac.c
+++ b/drivers/dma/at_hdmac.c
@@ -708,7 +708,7 @@ atc_prep_dma_interleaved(struct dma_chan *chan,
unsigned long flags)
{
struct at_dma_chan *atchan = to_at_dma_chan(chan);
- struct data_chunk *first = xt->sgl;
+ struct data_chunk *first;
struct at_desc *desc = NULL;
size_t xfer_count;
unsigned int dwidth;
@@ -720,6 +720,8 @@ atc_prep_dma_interleaved(struct dma_chan *chan,
if (unlikely(!xt || xt->numf != 1 || !xt->frame_size))
return NULL;
+ first = xt->sgl;
+
dev_info(chan2dev(chan),
"%s: src=%pad, dest=%pad, numf=%d, frame_size=%d, flags=0x%lx\n",
__func__, &xt->src_start, &xt->dst_start, xt->numf,
diff --git a/drivers/dma/dma-jz4740.c b/drivers/dma/dma-jz4740.c
index d50273fed715..afd5e10f8927 100644
--- a/drivers/dma/dma-jz4740.c
+++ b/drivers/dma/dma-jz4740.c
@@ -555,7 +555,7 @@ static int jz4740_dma_probe(struct platform_device *pdev)
ret = dma_async_device_register(dd);
if (ret)
- return ret;
+ goto err_clk;
irq = platform_get_irq(pdev, 0);
ret = request_irq(irq, jz4740_dma_irq, 0, dev_name(&pdev->dev), dmadev);
@@ -568,6 +568,8 @@ static int jz4740_dma_probe(struct platform_device *pdev)
err_unregister:
dma_async_device_unregister(dd);
+err_clk:
+ clk_disable_unprepare(dmadev->clk);
return ret;
}
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index 47edc7fbf91f..ec5f9d2bc820 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -155,6 +155,12 @@ MODULE_PARM_DESC(run, "Run the test (default: false)");
#define PATTERN_COUNT_MASK 0x1f
#define PATTERN_MEMSET_IDX 0x01
+/* poor man's completion - we want to use wait_event_freezable() on it */
+struct dmatest_done {
+ bool done;
+ wait_queue_head_t *wait;
+};
+
struct dmatest_thread {
struct list_head node;
struct dmatest_info *info;
@@ -165,6 +171,8 @@ struct dmatest_thread {
u8 **dsts;
u8 **udsts;
enum dma_transaction_type type;
+ wait_queue_head_t done_wait;
+ struct dmatest_done test_done;
bool done;
};
@@ -342,18 +350,25 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
return error_count;
}
-/* poor man's completion - we want to use wait_event_freezable() on it */
-struct dmatest_done {
- bool done;
- wait_queue_head_t *wait;
-};
static void dmatest_callback(void *arg)
{
struct dmatest_done *done = arg;
-
- done->done = true;
- wake_up_all(done->wait);
+ struct dmatest_thread *thread =
+ container_of(arg, struct dmatest_thread, done_wait);
+ if (!thread->done) {
+ done->done = true;
+ wake_up_all(done->wait);
+ } else {
+ /*
+ * If thread->done, it means that this callback occurred
+ * after the parent thread has cleaned up. This can
+ * happen in the case that driver doesn't implement
+ * the terminate_all() functionality and a dma operation
+ * did not occur within the timeout period
+ */
+ WARN(1, "dmatest: Kernel memory may be corrupted!!\n");
+ }
}
static unsigned int min_odd(unsigned int x, unsigned int y)
@@ -424,9 +439,8 @@ static unsigned long long dmatest_KBs(s64 runtime, unsigned long long len)
*/
static int dmatest_func(void *data)
{
- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(done_wait);
struct dmatest_thread *thread = data;
- struct dmatest_done done = { .wait = &done_wait };
+ struct dmatest_done *done = &thread->test_done;
struct dmatest_info *info;
struct dmatest_params *params;
struct dma_chan *chan;
@@ -673,9 +687,9 @@ static int dmatest_func(void *data)
continue;
}
- done.done = false;
+ done->done = false;
tx->callback = dmatest_callback;
- tx->callback_param = &done;
+ tx->callback_param = done;
cookie = tx->tx_submit(tx);
if (dma_submit_error(cookie)) {
@@ -688,21 +702,12 @@ static int dmatest_func(void *data)
}
dma_async_issue_pending(chan);
- wait_event_freezable_timeout(done_wait, done.done,
+ wait_event_freezable_timeout(thread->done_wait, done->done,
msecs_to_jiffies(params->timeout));
status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
- if (!done.done) {
- /*
- * We're leaving the timed out dma operation with
- * dangling pointer to done_wait. To make this
- * correct, we'll need to allocate wait_done for
- * each test iteration and perform "who's gonna
- * free it this time?" dancing. For now, just
- * leave it dangling.
- */
- WARN(1, "dmatest: Kernel stack may be corrupted!!\n");
+ if (!done->done) {
dmaengine_unmap_put(um);
result("test timed out", total_tests, src_off, dst_off,
len, 0);
@@ -789,7 +794,7 @@ err_thread_type:
dmatest_KBs(runtime, total_len), ret);
/* terminate all transfers on specified channels */
- if (ret)
+ if (ret || failed_tests)
dmaengine_terminate_all(chan);
thread->done = true;
@@ -849,6 +854,8 @@ static int dmatest_add_threads(struct dmatest_info *info,
thread->info = info;
thread->chan = dtc->chan;
thread->type = type;
+ thread->test_done.wait = &thread->done_wait;
+ init_waitqueue_head(&thread->done_wait);
smp_wmb();
thread->task = kthread_create(dmatest_func, thread, "%s-%s%u",
dma_chan_name(chan), op, i);
diff --git a/drivers/dma/fsl-edma.c b/drivers/dma/fsl-edma.c
index 6775f2c74e25..c7568869284e 100644
--- a/drivers/dma/fsl-edma.c
+++ b/drivers/dma/fsl-edma.c
@@ -863,11 +863,11 @@ static void fsl_edma_irq_exit(
}
}
-static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma)
+static void fsl_disable_clocks(struct fsl_edma_engine *fsl_edma, int nr_clocks)
{
int i;
- for (i = 0; i < DMAMUX_NR; i++)
+ for (i = 0; i < nr_clocks; i++)
clk_disable_unprepare(fsl_edma->muxclk[i]);
}
@@ -904,25 +904,25 @@ static int fsl_edma_probe(struct platform_device *pdev)
res = platform_get_resource(pdev, IORESOURCE_MEM, 1 + i);
fsl_edma->muxbase[i] = devm_ioremap_resource(&pdev->dev, res);
- if (IS_ERR(fsl_edma->muxbase[i]))
+ if (IS_ERR(fsl_edma->muxbase[i])) {
+ /* on error: disable all previously enabled clks */
+ fsl_disable_clocks(fsl_edma, i);
return PTR_ERR(fsl_edma->muxbase[i]);
+ }
sprintf(clkname, "dmamux%d", i);
fsl_edma->muxclk[i] = devm_clk_get(&pdev->dev, clkname);
if (IS_ERR(fsl_edma->muxclk[i])) {
dev_err(&pdev->dev, "Missing DMAMUX block clock.\n");
+ /* on error: disable all previously enabled clks */
+ fsl_disable_clocks(fsl_edma, i);
return PTR_ERR(fsl_edma->muxclk[i]);
}
ret = clk_prepare_enable(fsl_edma->muxclk[i]);
- if (ret) {
- /* disable only clks which were enabled on error */
- for (; i >= 0; i--)
- clk_disable_unprepare(fsl_edma->muxclk[i]);
-
- dev_err(&pdev->dev, "DMAMUX clk block failed.\n");
- return ret;
- }
+ if (ret)
+ /* on error: disable all previously enabled clks */
+ fsl_disable_clocks(fsl_edma, i);
}
@@ -976,7 +976,7 @@ static int fsl_edma_probe(struct platform_device *pdev)
if (ret) {
dev_err(&pdev->dev,
"Can't register Freescale eDMA engine. (%d)\n", ret);
- fsl_disable_clocks(fsl_edma);
+ fsl_disable_clocks(fsl_edma, DMAMUX_NR);
return ret;
}
@@ -985,7 +985,7 @@ static int fsl_edma_probe(struct platform_device *pdev)
dev_err(&pdev->dev,
"Can't register Freescale eDMA of_dma. (%d)\n", ret);
dma_async_device_unregister(&fsl_edma->dma_dev);
- fsl_disable_clocks(fsl_edma);
+ fsl_disable_clocks(fsl_edma, DMAMUX_NR);
return ret;
}
@@ -1015,7 +1015,7 @@ static int fsl_edma_remove(struct platform_device *pdev)
fsl_edma_cleanup_vchan(&fsl_edma->dma_dev);
of_dma_controller_free(np);
dma_async_device_unregister(&fsl_edma->dma_dev);
- fsl_disable_clocks(fsl_edma);
+ fsl_disable_clocks(fsl_edma, DMAMUX_NR);
return 0;
}
diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c
index 2f31d3d0caa6..7792a9186f9c 100644
--- a/drivers/dma/ioat/init.c
+++ b/drivers/dma/ioat/init.c
@@ -390,7 +390,7 @@ static int ioat_dma_self_test(struct ioatdma_device *ioat_dma)
if (memcmp(src, dest, IOAT_TEST_SIZE)) {
dev_err(dev, "Self-test copy failed compare, disabling\n");
err = -ENODEV;
- goto free_resources;
+ goto unmap_dma;
}
unmap_dma:
diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c
index dfcf56ee3c61..76861a00bb92 100644
--- a/drivers/gpio/gpio-bcm-kona.c
+++ b/drivers/gpio/gpio-bcm-kona.c
@@ -522,6 +522,7 @@ static struct of_device_id const bcm_kona_gpio_of_match[] = {
* category than their parents, so it won't report false recursion.
*/
static struct lock_class_key gpio_lock_class;
+static struct lock_class_key gpio_request_class;
static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq,
irq_hw_number_t hwirq)
@@ -531,7 +532,7 @@ static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq,
ret = irq_set_chip_data(irq, d->host_data);
if (ret < 0)
return ret;
- irq_set_lockdep_class(irq, &gpio_lock_class);
+ irq_set_lockdep_class(irq, &gpio_lock_class, &gpio_request_class);
irq_set_chip_and_handler(irq, &bcm_gpio_irq_chip, handle_simple_irq);
irq_set_noprobe(irq);
diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c
index 545d43a587b7..bb4f8cf18bd9 100644
--- a/drivers/gpio/gpio-brcmstb.c
+++ b/drivers/gpio/gpio-brcmstb.c
@@ -327,6 +327,7 @@ static struct brcmstb_gpio_bank *brcmstb_gpio_hwirq_to_bank(
* category than their parents, so it won't report false recursion.
*/
static struct lock_class_key brcmstb_gpio_irq_lock_class;
+static struct lock_class_key brcmstb_gpio_irq_request_class;
static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
@@ -346,7 +347,8 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq,
ret = irq_set_chip_data(irq, &bank->gc);
if (ret < 0)
return ret;
- irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class);
+ irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class,
+ &brcmstb_gpio_irq_request_class);
irq_set_chip_and_handler(irq, &priv->irq_chip, handle_level_irq);
irq_set_noprobe(irq);
return 0;
diff --git a/drivers/gpio/gpio-reg.c b/drivers/gpio/gpio-reg.c
index 23e771dba4c1..e85903eddc68 100644
--- a/drivers/gpio/gpio-reg.c
+++ b/drivers/gpio/gpio-reg.c
@@ -103,8 +103,8 @@ static int gpio_reg_to_irq(struct gpio_chip *gc, unsigned offset)
struct gpio_reg *r = to_gpio_reg(gc);
int irq = r->irqs[offset];
- if (irq >= 0 && r->irq.domain)
- irq = irq_find_mapping(r->irq.domain, irq);
+ if (irq >= 0 && r->irqdomain)
+ irq = irq_find_mapping(r->irqdomain, irq);
return irq;
}
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index 8db47f671708..02fa8fe2292a 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -565,6 +565,7 @@ static const struct dev_pm_ops tegra_gpio_pm_ops = {
* than their parents, so it won't report false recursion.
*/
static struct lock_class_key gpio_lock_class;
+static struct lock_class_key gpio_request_class;
static int tegra_gpio_probe(struct platform_device *pdev)
{
@@ -670,7 +671,8 @@ static int tegra_gpio_probe(struct platform_device *pdev)
bank = &tgi->bank_info[GPIO_BANK(gpio)];
- irq_set_lockdep_class(irq, &gpio_lock_class);
+ irq_set_lockdep_class(irq, &gpio_lock_class,
+ &gpio_request_class);
irq_set_chip_data(irq, bank);
irq_set_chip_and_handler(irq, &tgi->ic, handle_simple_irq);
}
diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c
index 2313af82fad3..acd59113e08b 100644
--- a/drivers/gpio/gpio-xgene-sb.c
+++ b/drivers/gpio/gpio-xgene-sb.c
@@ -139,7 +139,7 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio)
static int xgene_gpio_sb_domain_activate(struct irq_domain *d,
struct irq_data *irq_data,
- bool early)
+ bool reserve)
{
struct xgene_gpio_sb *priv = d->host_data;
u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq);
diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index eb4528c87c0b..d6f3d9ee1350 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -1074,7 +1074,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
}
if (!chip->names)
- devprop_gpiochip_set_names(chip);
+ devprop_gpiochip_set_names(chip, dev_fwnode(chip->parent));
acpi_gpiochip_request_regions(acpi_gpio);
acpi_gpiochip_scan_gpios(acpi_gpio);
diff --git a/drivers/gpio/gpiolib-devprop.c b/drivers/gpio/gpiolib-devprop.c
index 27f383bda7d9..f748aa3e77f7 100644
--- a/drivers/gpio/gpiolib-devprop.c
+++ b/drivers/gpio/gpiolib-devprop.c
@@ -19,30 +19,27 @@
/**
* devprop_gpiochip_set_names - Set GPIO line names using device properties
* @chip: GPIO chip whose lines should be named, if possible
+ * @fwnode: Property Node containing the gpio-line-names property
*
* Looks for device property "gpio-line-names" and if it exists assigns
* GPIO line names for the chip. The memory allocated for the assigned
* names belong to the underlying firmware node and should not be released
* by the caller.
*/
-void devprop_gpiochip_set_names(struct gpio_chip *chip)
+void devprop_gpiochip_set_names(struct gpio_chip *chip,
+ const struct fwnode_handle *fwnode)
{
struct gpio_device *gdev = chip->gpiodev;
const char **names;
int ret, i;
- if (!chip->parent) {
- dev_warn(&gdev->dev, "GPIO chip parent is NULL\n");
- return;
- }
-
- ret = device_property_read_string_array(chip->parent, "gpio-line-names",
+ ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
NULL, 0);
if (ret < 0)
return;
if (ret != gdev->ngpio) {
- dev_warn(chip->parent,
+ dev_warn(&gdev->dev,
"names %d do not match number of GPIOs %d\n", ret,
gdev->ngpio);
return;
@@ -52,10 +49,10 @@ void devprop_gpiochip_set_names(struct gpio_chip *chip)
if (!names)
return;
- ret = device_property_read_string_array(chip->parent, "gpio-line-names",
+ ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
names, gdev->ngpio);
if (ret < 0) {
- dev_warn(chip->parent, "failed to read GPIO line names\n");
+ dev_warn(&gdev->dev, "failed to read GPIO line names\n");
kfree(names);
return;
}
diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index e0d59e61b52f..72a0695d2ac3 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -493,7 +493,8 @@ int of_gpiochip_add(struct gpio_chip *chip)
/* If the chip defines names itself, these take precedence */
if (!chip->names)
- devprop_gpiochip_set_names(chip);
+ devprop_gpiochip_set_names(chip,
+ of_fwnode_handle(chip->of_node));
of_node_get(chip->of_node);
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index aad84a6306c4..44332b793718 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -73,7 +73,8 @@ LIST_HEAD(gpio_devices);
static void gpiochip_free_hogs(struct gpio_chip *chip);
static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
- struct lock_class_key *key);
+ struct lock_class_key *lock_key,
+ struct lock_class_key *request_key);
static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip);
static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip);
static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip);
@@ -1100,7 +1101,8 @@ static void gpiochip_setup_devs(void)
}
int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
- struct lock_class_key *key)
+ struct lock_class_key *lock_key,
+ struct lock_class_key *request_key)
{
unsigned long flags;
int status = 0;
@@ -1246,7 +1248,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data,
if (status)
goto err_remove_from_list;
- status = gpiochip_add_irqchip(chip, key);
+ status = gpiochip_add_irqchip(chip, lock_key, request_key);
if (status)
goto err_remove_chip;
@@ -1632,7 +1634,7 @@ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq,
* This lock class tells lockdep that GPIO irqs are in a different
* category than their parents, so it won't report false recursion.
*/
- irq_set_lockdep_class(irq, chip->irq.lock_key);
+ irq_set_lockdep_class(irq, chip->irq.lock_key, chip->irq.request_key);
irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler);
/* Chips that use nested thread handlers have them marked */
if (chip->irq.threaded)
@@ -1712,10 +1714,12 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset)
/**
* gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip
* @gpiochip: the GPIO chip to add the IRQ chip to
- * @lock_key: lockdep class
+ * @lock_key: lockdep class for IRQ lock
+ * @request_key: lockdep class for IRQ request
*/
static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
- struct lock_class_key *lock_key)
+ struct lock_class_key *lock_key,
+ struct lock_class_key *request_key)
{
struct irq_chip *irqchip = gpiochip->irq.chip;
const struct irq_domain_ops *ops;
@@ -1753,6 +1757,7 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
gpiochip->to_irq = gpiochip_to_irq;
gpiochip->irq.default_type = type;
gpiochip->irq.lock_key = lock_key;
+ gpiochip->irq.request_key = request_key;
if (gpiochip->irq.domain_ops)
ops = gpiochip->irq.domain_ops;
@@ -1850,7 +1855,8 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip)
* @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE
* to have the core avoid setting up any default type in the hardware.
* @threaded: whether this irqchip uses a nested thread handler
- * @lock_key: lockdep class
+ * @lock_key: lockdep class for IRQ lock
+ * @request_key: lockdep class for IRQ request
*
* This function closely associates a certain irqchip with a certain
* gpiochip, providing an irq domain to translate the local IRQs to
@@ -1872,7 +1878,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
irq_flow_handler_t handler,
unsigned int type,
bool threaded,
- struct lock_class_key *lock_key)
+ struct lock_class_key *lock_key,
+ struct lock_class_key *request_key)
{
struct device_node *of_node;
@@ -1913,6 +1920,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
gpiochip->irq.default_type = type;
gpiochip->to_irq = gpiochip_to_irq;
gpiochip->irq.lock_key = lock_key;
+ gpiochip->irq.request_key = request_key;
gpiochip->irq.domain = irq_domain_add_simple(of_node,
gpiochip->ngpio, first_irq,
&gpiochip_domain_ops, gpiochip);
@@ -1940,7 +1948,8 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key);
#else /* CONFIG_GPIOLIB_IRQCHIP */
static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip,
- struct lock_class_key *key)
+ struct lock_class_key *lock_key,
+ struct lock_class_key *request_key)
{
return 0;
}
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index af48322839c3..6c44d1652139 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -228,7 +228,8 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc)
return desc - &desc->gdev->descs[0];
}
-void devprop_gpiochip_set_names(struct gpio_chip *chip);
+void devprop_gpiochip_set_names(struct gpio_chip *chip,
+ const struct fwnode_handle *fwnode);
/* With descriptor prefix */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index da43813d67a4..5aeb5f8816f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2467,7 +2467,7 @@ static int gfx_v9_0_kiq_kcq_enable(struct amdgpu_device *adev)
PACKET3_MAP_QUEUES_PIPE(ring->pipe) |
PACKET3_MAP_QUEUES_ME((ring->me == 1 ? 0 : 1)) |
PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
- PACKET3_MAP_QUEUES_ALLOC_FORMAT(1) | /* alloc format: all_on_one_pipe */
+ PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pi