aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt8
-rw-r--r--Documentation/admin-guide/thunderbolt.rst2
-rw-r--r--Documentation/x86/x86_64/mm.txt5
-rw-r--r--MAINTAINERS17
-rw-r--r--Makefile3
-rw-r--r--arch/sparc/lib/hweight.S4
-rw-r--r--arch/x86/boot/compressed/pagetable.c3
-rw-r--r--arch/x86/entry/calling.h145
-rw-r--r--arch/x86/entry/entry_64.S48
-rw-r--r--arch/x86/entry/entry_64_compat.S24
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c6
-rw-r--r--arch/x86/events/intel/ds.c130
-rw-r--r--arch/x86/events/perf_event.h23
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h13
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/disabled-features.h8
-rw-r--r--arch/x86/include/asm/intel_ds.h36
-rw-r--r--arch/x86/include/asm/mmu_context.h59
-rw-r--r--arch/x86/include/asm/pgalloc.h11
-rw-r--r--arch/x86/include/asm/pgtable.h30
-rw-r--r--arch/x86/include/asm/pgtable_64.h92
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/processor-flags.h5
-rw-r--r--arch/x86/include/asm/processor.h23
-rw-r--r--arch/x86/include/asm/pti.h14
-rw-r--r--arch/x86/include/asm/tlbflush.h202
-rw-r--r--arch/x86/include/asm/vsyscall.h1
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h7
-rw-r--r--arch/x86/kernel/asm-offsets.c4
-rw-r--r--arch/x86/kernel/cpu/common.c9
-rw-r--r--arch/x86/kernel/dumpstack.c6
-rw-r--r--arch/x86/kernel/head_64.S30
-rw-r--r--arch/x86/kernel/ldt.c144
-rw-r--r--arch/x86/kernel/tls.c11
-rw-r--r--arch/x86/kernel/vmlinux.lds.S8
-rw-r--r--arch/x86/mm/Makefile7
-rw-r--r--arch/x86/mm/cpu_entry_area.c27
-rw-r--r--arch/x86/mm/debug_pagetables.c80
-rw-r--r--arch/x86/mm/dump_pagetables.c43
-rw-r--r--arch/x86/mm/init.c80
-rw-r--r--arch/x86/mm/pgtable.c5
-rw-r--r--arch/x86/mm/pti.c387
-rw-r--r--arch/x86/mm/tlb.c58
-rw-r--r--arch/x86/platform/efi/efi_64.c5
-rw-r--r--drivers/android/binder.c44
-rw-r--r--drivers/base/cacheinfo.c13
-rw-r--r--drivers/clk/clk.c3
-rw-r--r--drivers/gpu/drm/i915/gvt/display.c5
-rw-r--r--drivers/gpu/drm/i915/intel_display.c2
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_drm.c2
-rw-r--r--drivers/hid/hid-core.c2
-rw-r--r--drivers/hid/hid-cp2112.c15
-rw-r--r--drivers/hid/hid-holtekff.c8
-rw-r--r--drivers/hv/vmbus_drv.c2
-rw-r--r--drivers/infiniband/core/security.c3
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c4
-rw-r--r--drivers/infiniband/core/verbs.c3
-rw-r--r--drivers/infiniband/hw/cxgb4/cq.c13
-rw-r--r--drivers/infiniband/hw/cxgb4/iw_cxgb4.h2
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c72
-rw-r--r--drivers/infiniband/hw/cxgb4/t4.h6
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h1
-rw-r--r--drivers/infiniband/hw/hfi1/pcie.c30
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.c11
-rw-r--r--drivers/infiniband/hw/mlx5/cmd.h2
-rw-r--r--drivers/infiniband/hw/mlx5/main.c43
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h4
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c1
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma.h6
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c7
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c17
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c14
-rw-r--r--drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c7
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c7
-rw-r--r--drivers/leds/led-core.c2
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c4
-rw-r--r--drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c14
-rw-r--r--drivers/net/ethernet/broadcom/tg3.c19
-rw-r--r--drivers/net/ethernet/broadcom/tg3.h7
-rw-r--r--drivers/net/ethernet/freescale/fec_main.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/lag.c56
-rw-r--r--drivers/net/phy/micrel.c1
-rw-r--r--drivers/net/phy/phylink.c2
-rw-r--r--drivers/nvmem/meson-mx-efuse.c4
-rw-r--r--drivers/phy/motorola/phy-cpcap-usb.c2
-rw-r--r--drivers/phy/renesas/Kconfig2
-rw-r--r--drivers/phy/rockchip/phy-rockchip-typec.c2
-rw-r--r--drivers/phy/tegra/xusb.c58
-rw-r--r--drivers/scsi/scsi_sysfs.c5
-rw-r--r--drivers/scsi/storvsc_drv.c3
-rw-r--r--drivers/staging/android/ion/Kconfig2
-rw-r--r--drivers/staging/android/ion/ion.c4
-rw-r--r--drivers/staging/android/ion/ion_cma_heap.c15
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c23
-rw-r--r--drivers/thunderbolt/nhi.c2
-rw-r--r--drivers/tty/n_tty.c4
-rw-r--r--drivers/usb/chipidea/ci_hdrc_msm.c2
-rw-r--r--drivers/usb/core/config.c2
-rw-r--r--drivers/usb/core/quirks.c6
-rw-r--r--drivers/usb/host/xhci-debugfs.c16
-rw-r--r--drivers/usb/host/xhci-pci.c3
-rw-r--r--drivers/usb/host/xhci.c6
-rw-r--r--drivers/usb/serial/ftdi_sio.c1
-rw-r--r--drivers/usb/serial/ftdi_sio_ids.h6
-rw-r--r--drivers/usb/serial/option.c17
-rw-r--r--drivers/usb/serial/qcserial.c3
-rw-r--r--drivers/usb/usbip/stub_dev.c3
-rw-r--r--drivers/usb/usbip/stub_main.c5
-rw-r--r--drivers/usb/usbip/stub_rx.c18
-rw-r--r--drivers/usb/usbip/stub_tx.c6
-rw-r--r--drivers/usb/usbip/usbip_common.c16
-rw-r--r--drivers/usb/usbip/vhci_hcd.c12
-rw-r--r--drivers/usb/usbip/vhci_rx.c23
-rw-r--r--drivers/usb/usbip/vhci_tx.c3
-rw-r--r--include/linux/mlx5/driver.h4
-rw-r--r--include/linux/pti.h11
-rw-r--r--include/linux/tick.h1
-rw-r--r--include/net/sock.h5
-rw-r--r--include/net/xfrm.h3
-rw-r--r--include/trace/events/tcp.h97
-rw-r--r--init/main.c3
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/time/tick-sched.c13
-rw-r--r--lib/kobject_uevent.c16
-rw-r--r--net/core/skbuff.c6
-rw-r--r--net/ipv4/xfrm4_input.c12
-rw-r--r--net/ipv6/ip6_gre.c57
-rw-r--r--net/ipv6/xfrm6_input.c10
-rw-r--r--net/rds/send.c3
-rw-r--r--net/sched/cls_api.c2
-rw-r--r--net/sched/sch_generic.c4
-rw-r--r--net/sctp/socket.c4
-rw-r--r--net/strparser/strparser.c2
-rw-r--r--net/tipc/bearer.c5
-rw-r--r--net/tipc/group.c31
-rw-r--r--net/tipc/monitor.c6
-rw-r--r--net/tipc/socket.c2
-rw-r--r--net/xfrm/xfrm_input.c69
-rw-r--r--net/xfrm/xfrm_policy.c9
-rw-r--r--net/xfrm/xfrm_state.c1
-rw-r--r--net/xfrm/xfrm_user.c26
-rw-r--r--security/Kconfig10
-rw-r--r--tools/bpf/bpftool/map.c8
-rw-r--r--tools/bpf/bpftool/prog.c2
-rw-r--r--tools/testing/selftests/bpf/Makefile2
-rw-r--r--tools/testing/selftests/x86/ldt_gdt.c3
-rw-r--r--tools/usb/usbip/src/utils.c9
148 files changed, 2407 insertions, 602 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6571fbfdb2a1..e49311d53504 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2708,6 +2708,8 @@
steal time is computed, but won't influence scheduler
behaviour
+ nopti [X86-64] Disable kernel page table isolation
+
nolapic [X86-32,APIC] Do not enable or use the local APIC.
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
+ pti= [X86_64]
+ Control user/kernel address space isolation:
+ on - enable
+ off - disable
+ auto - default setting
+
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index de50a8561774..9b55952039a6 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -230,7 +230,7 @@ If supported by your machine this will be exposed by the WMI bus with
a sysfs attribute called "force_power".
For example the intel-wmi-thunderbolt driver exposes this attribute in:
- /sys/devices/platform/PNP0C14:00/wmi_bus/wmi_bus-PNP0C14:00/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
+ /sys/bus/wmi/devices/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
To force the power to on, write 1 to this attribute file.
To disable force power, write 0 to this attribute file.
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 51101708a03a..ad41b3813f0a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
diff --git a/MAINTAINERS b/MAINTAINERS
index a6e86e20761e..b46c9cea5ae5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2621,24 +2621,22 @@ F: fs/bfs/
F: include/uapi/linux/bfs_fs.h
BLACKFIN ARCHITECTURE
-M: Steven Miao <realmz6@gmail.com>
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
T: git git://git.code.sf.net/p/adi-linux/code
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: arch/blackfin/
BLACKFIN EMAC DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/net/ethernet/adi/
BLACKFIN MEDIA DRIVER
-M: Scott Jiang <scott.jiang.linux@gmail.com>
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org/
-S: Supported
+S: Orphan
F: drivers/media/platform/blackfin/
F: drivers/media/i2c/adv7183*
F: drivers/media/i2c/vs6624*
@@ -2646,25 +2644,25 @@ F: drivers/media/i2c/vs6624*
BLACKFIN RTC DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/rtc/rtc-bfin.c
BLACKFIN SDH DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/mmc/host/bfin_sdh.c
BLACKFIN SERIAL DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/tty/serial/bfin_uart.c
BLACKFIN WATCHDOG DRIVER
L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
F: drivers/watchdog/bfin_wdt.c
BLINKM RGB LED DRIVER
@@ -13493,6 +13491,7 @@ M: Mika Westerberg <mika.westerberg@linux.intel.com>
M: Yehezkel Bernat <yehezkel.bernat@intel.com>
T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
S: Maintained
+F: Documentation/admin-guide/thunderbolt.rst
F: drivers/thunderbolt/
F: include/linux/thunderbolt.h
diff --git a/Makefile b/Makefile
index ac8c441866b7..92b74bcd3c2a 100644
--- a/Makefile
+++ b/Makefile
@@ -789,6 +789,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
+# Make sure -fstack-check isn't enabled (like gentoo apparently did)
+KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
+
# conserve stack if available
KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
index e5547b22cd18..0ddbbb031822 100644
--- a/arch/sparc/lib/hweight.S
+++ b/arch/sparc/lib/hweight.S
@@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
.previous
ENTRY(__arch_hweight64)
- sethi %hi(__sw_hweight16), %g1
- jmpl %g1 + %lo(__sw_hweight16), %g0
+ sethi %hi(__sw_hweight64), %g1
+ jmpl %g1 + %lo(__sw_hweight64), %g0
nop
ENDPROC(__arch_hweight64)
EXPORT_SYMBOL(__arch_hweight64)
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index d5364ca2e3f9..b5e5e02f8cde 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -23,6 +23,9 @@
*/
#undef CONFIG_AMD_MEM_ENCRYPT
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
#include "misc.h"
/* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3fd8bc560fae..45a63e00a6af 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,6 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
/*
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+.macro SET_NOFLUSH_BIT reg:req
+ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+ ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+ /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ andq $(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask \
+ PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * Test if the ASID needs a flush.
+ */
+ movq \scratch_reg, \scratch_reg2
+ andq $(0x7FF), \scratch_reg /* mask ASID */
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ /* Flush needed, clear the bit */
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ movq \scratch_reg2, \scratch_reg
+ jmp .Lwrcr3_\@
+
+.Lnoflush_\@:
+ movq \scratch_reg2, \scratch_reg
+ SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+ /* Flip the PGD and ASID to the user version */
+ orq $(PTI_SWITCH_MASK), \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ pushq %rax
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+ popq %rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movq %cr3, \scratch_reg
+ movq \scratch_reg, \save_reg
+ /*
+ * Is the "switch mask" all zero? That means that both of
+ * these are zero:
+ *
+ * 1. The user/kernel PCID bit, and
+ * 2. The user/kernel "bit" that points CR3 to the
+ * bottom half of the 8k PGD
+ *
+ * That indicates a kernel CR3 value, not a user CR3.
+ */
+ testq $(PTI_SWITCH_MASK), \scratch_reg
+ jz .Ldone_\@
+
+ ADJUST_KERNEL_CR3 \scratch_reg
+ movq \scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * KERNEL pages can always resume with NOFLUSH as we do
+ * explicit flushes.
+ */
+ bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
+ jnc .Lnoflush_\@
+
+ /*
+ * Check if there's a pending flush for the user ASID we're
+ * about to set.
+ */
+ movq \save_reg, \scratch_reg
+ andq $(0x7FF), \scratch_reg
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jmp .Lwrcr3_\@
+
+.Lnoflush_\@:
+ SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+ /*
+ * The CR3 write could be avoided when not changing its value,
+ * but would require a CR3 read *and* a scratch register.
+ */
+ movq \save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
#endif /* CONFIG_X86_64 */
/*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3d19c830e1b1..f048e384ff54 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -23,7 +23,6 @@
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
-#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@@ -40,6 +39,8 @@
#include <asm/frame.h>
#include <linux/err.h>
+#include "calling.h"
+
.code64
.section .entry.text, "ax"
@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH
+ /* Note: using %rsp as a scratch reg. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
/* Load the top of the task stack into RSP */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
*/
swapgs
+ /*
+ * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+ * is not required to switch CR3.
+ */
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -403,6 +411,7 @@ syscall_return_via_sysret:
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
* We can do future final exit work right here.
*/
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
/* Restore RDI. */
popq %rdi
SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
- SWAPGS
+ SWAPGS /* to kernel GS */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
+
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
/* Now RAX == RSP. */
andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
- popq %rdi /* Restore user RDI */
/*
* espfix_stack[31:16] == 0. The page tables are set up such that
@@ -849,7 +861,11 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
- SWAPGS
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ SWAPGS /* to user GS */
+ popq %rdi /* Restore user RDI */
+
movq %rax, %rsp
UNWIND_HINT_IRET_REGS offset=8
@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC
pushq %rdi
+ /* Need to switch before accessing the thread stack. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
-1: ret
+
+1:
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+ ret
END(paranoid_entry)
/*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
+ /* We have user CR3. Change to kernel CR3. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
.Lbstep_iret:
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)
.Lerror_bad_iret:
/*
- * We came from an IRET to user mode, so we have user gsbase.
- * Switch to kernel gsbase:
+ * We came from an IRET to user mode, so we have user
+ * gsbase and CR3. Switch to kernel gsbase and CR3:
*/
SWAPGS
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
* so we can use real assembly here.
+ *
+ * Registers:
+ * %r14: Used to save/restore the CR3 of the interrupted context
+ * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)
swapgs
cld
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi
+ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 95ad40eb7eff..40f17009ec20 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -49,6 +49,10 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS
+
+ /* We are about to clobber %rsp anyway, clobbering here is OK */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -216,6 +220,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
pushq $0 /* pt_regs->r15 = 0 */
/*
+ * We just saved %rdi so it is safe to clobber. It is not
+ * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+ */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
+ /*
* User mode is traced as though IRQs are on, and SYSENTER
* turned them off.
*/
@@ -256,10 +266,22 @@ sysret32_from_system_call:
* when the system call started, which is already known to user
* code. We zero R8-R10 to avoid info leaks.
*/
+ movq RSP-ORIG_RAX(%rsp), %rsp
+
+ /*
+ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+ * on the process stack which is not mapped to userspace and
+ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
+ * switch until after after the last reference to the process
+ * stack.
+ *
+ * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+ */
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
- movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
END(entry_SYSCALL_compat)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 1faf40f2dda9..577fa8adb785 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
* vsyscalls but leave the page not present. If so, we skip calling
* this.
*/
-static void __init set_vsyscall_pgtable_user_bits(void)
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- pgd = pgd_offset_k(VSYSCALL_ADDR);
+ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
#if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
- set_vsyscall_pgtable_user_bits();
+ set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 3674a4b6f8bd..8f0aace08b87 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,16 +3,18 @@
#include <linux/types.h>
#include <linux/slab.h>
+#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "../perf_event.h"
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
-#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer);
-static int alloc_pebs_buffer(int cpu)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ phys_addr_t pa;
+ size_t msz = 0;
+
+ pa = virt_to_phys(addr);
+ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+ size_t msz = 0;
+
+ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+ unsigned int order = get_order(size);
int node = cpu_to_node(cpu);
- int max;
- void *buffer, *ibuffer;
+ struct page *page;
+
+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+ return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+ if (buffer)
+ free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ size_t bsiz = x86_pmu.pebs_buffer_size;
+ int max, node = cpu_to_node(cpu);
+ void *buffer, *ibuffer, *cea;
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
if (unlikely(!buffer))
return -ENOMEM;
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
- kfree(buffer);
+ dsfree_pages(buffer, bsiz);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
}
-
- max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+ hwev->ds_pebs_vaddr = buffer;
+ /* Update the cpu entry area mapping */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds->pebs_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
ds->pebs_index = ds->pebs_buffer_base;
- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
- max * x86_pmu.pebs_record_size;
-
+ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
return 0;
}
static void release_pebs_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *cea;
if (!ds || !x86_pmu.pebs)
return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
+ dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+ hwev->ds_pebs_vaddr = NULL;
}
static int alloc_bts_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
- int node = cpu_to_node(cpu);
- int max, thresh;
- void *buffer;
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *buffer, *cea;
+ int max;
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
}
-
- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
- thresh = max / 16;
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ hwev->ds_bts_vaddr = buffer;
+ /* Update the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds->bts_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum = ds->bts_buffer_base +
- max * BTS_RECORD_SIZE;
- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
- thresh * BTS_RECORD_SIZE;
-
+ max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+ ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
return 0;
}
static void release_bts_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *cea;
if (!ds || !x86_pmu.bts)
return;
- kfree((void *)(unsigned long)ds->bts_buffer_base);
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds_clear_cea(cea, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
+ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+ hwev->ds_bts_vaddr = NULL;
}
static int alloc_ds_buffer(int cpu)
{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
-
- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
- if (unlikely(!ds))
- return -ENOMEM;
+ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
+ memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
-
return 0;
}
static void release_ds_buffer(int cpu)
{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- return;
-
per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
}
void release_ds_buffers(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f7aaadf9331f..8e4ea143ed96 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -14,6 +14,8 @@
#include <linux/perf_event.h>
+#include <asm/intel_ds.h>
+
/* To enable MSR tracing please use the generic trace points. */
/*
@@ -77,8 +79,6 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 8
#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
/*
@@ -95,23 +95,6 @@ struct amd_nb {
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
- u64 bts_buffer_base;
- u64 bts_index;
- u64 bts_absolute_maximum;
- u64 bts_interrupt_threshold;
- u64 pebs_buffer_base;
- u64 pebs_index;
- u64 pebs_absolute_maximum;
- u64 pebs_interrupt_threshold;
- u64 pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
#define PEBS_REGS \
(PERF_REG_X86_AX | \
PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
* Intel DebugStore bits
*/
struct debug_store *ds;
+ void *ds_pebs_vaddr;
+ void *ds_bts_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 2fbc69a0916e..4a7884b8dca5 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -5,6 +5,7 @@
#include <linux/percpu-defs.h>
#include <asm/processor.h>
+#include <asm/intel_ds.h>
/*
* cpu_entry_area is a percpu region that contains things needed by the CPU
@@ -40,6 +41,18 @@ struct cpu_entry_area {
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+ /*
+ * Per CPU debug store for Intel performance monitoring. Wastes a
+ * full page at the moment.
+ */
+ struct debug_store cpu_debug_store;
+ /*
+ * The actual PEBS/BTS buffers must be mapped to user space
+ * Reserve enough fixmap PTEs.
+ */
+ struct debug_store_buffers cpu_debug_buffers;
+#endif
};
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 800104c8a3ed..07cdd1715705 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -197,11 +197,12 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index ec8be07c0cda..13c5ee878a47 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
desc->type = (info->read_exec_only ^ 1) << 1;
desc->type |= info->contents << 2;
+ /* Set the ACCESS bit so it can be mapped RO */
+ desc->type |= 1;
desc->s = 1;
desc->dpl = 0x3;
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 14d6d5007314..b027633e7300 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -50,6 +50,12 @@
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI 0
+#else
+# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -60,7 +66,7 @@
#define DISABLED_MASK4 (DISABLE_PCID)
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
-#define DISABLED_MASK7 0
+#define DISABLED_MASK7 (DISABLE_PTI)
#define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX)
#define DISABLED_MASK10 0
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000000..62a9f4966b42
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS 8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+ char bts_buffer[BTS_BUFFER_SIZE];
+ char pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5ede7cae1d67..c931b88982a0 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -50,10 +50,33 @@ struct ldt_struct {
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
- struct desc_struct *entries;
- unsigned int nr_entries;
+ struct desc_struct *entries;
+ unsigned int nr_entries;
+
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+ * in user mode. The whole array will be aliased at the addressed
+ * given by ldt_slot_va(slot). We use two slots so that we can allocate
+ * and map, and enable a new LDT without invalidating the mapping
+ * of an older, still-in-use LDT.
+ *
+ * slot will be -1 if this LDT doesn't have an alias mapping.
+ */
+ int slot;
};
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+ BUG();
+#endif
+}
+
/*
* Used for LDT copy/destruction.
*/
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
{
return 0;
}
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see.
*/
- if (unlikely(ldt))
- set_ldt(ldt->entries, ldt->nr_entries);
- else
+ if (unlikely(ldt)) {
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
+
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
clear_LDT();
+ }
#else
clear_LDT();
#endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 4b5e1eafada7..aff42e1da6ee 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
*/
extern gfp_t __userpte_alloc_gfp;
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
/*
* Allocate and free page tables.
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 95e2dfd75521..e42b8943cb1a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
void ptdump_walk_pgd_level_checkwx(void);
#ifdef CONFIG_DEBUG_WX
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
static inline int p4d_bad(p4d_t p4d)
{
- return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (p4d_flags(p4d) & ~ignore_flags) != 0;
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ unsigned long ignore_flags = _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
* pgd_offset() returns a (pgd_t *)
* pgd_index() is used get the offset into the pgd page's array of pgd_t's;
*/
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
/*
* a shortcut which implies the use of the kernel's pgd, instead
* of a process's
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+ /* Clone the user space pgd as well */
+ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+ count * sizeof(pgd_t));
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e9f05331e732..81462e9a34f6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif
}
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
+ * the user one is in the last 4k. To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr |= BIT(bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr &= ~BIT(bit);
+ return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return pgd;
+ return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif
+
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
+ p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
+#else
*p4dp = p4d;
+#endif
}
static inline void native_p4d_clear(p4d_t *p4d)
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ *pgdp = pti_set_user_pgd(pgdp, pgd);
+#else
*pgdp = pgd;
+#endif
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3d27831bc58d..b97a539bcdee 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#ifdef CONFIG_X86_5LEVEL
-# define VMALLOC_SIZE_TB _AC(16384, UL)
-# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
+# define VMALLOC_SIZE_TB _AC(12800, UL)
+# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY _AC(-112, UL)
+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else
# define VMALLOC_SIZE_TB _AC(32, UL)
# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY _AC(-4, UL)
+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 43212a43ee69..6a60fea90b9d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -38,6 +38,11 @@
#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT 11
+#endif
+
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cad8dab266bc..d3a67fba200a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -852,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x)
#else
/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
new file mode 100644
index 000000000000..0b5ef05b2d2d
--- /dev/null
+++ b/arch/x86/include/asm/pti.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e1884cf35257..f68f9c836cca 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,38 +10,90 @@
#include <asm/special_insns.h>
#include <asm/smp.h>
#include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
-{
- /*
- * Bump the generation count. This also serves as a full barrier
- * that synchronizes with switch_mm(): callers are required to order
- * their read of mm_cpumask after their writes to the paging
- * structures.
- */
- return atomic64_inc_return(&mm->context.tlb_gen);
-}
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID - [0, TLB_NR_DYN_ASIDS-1]
+ * the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * the value we write into the PCID part of CR3; corresponds to the
+ * ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * for KPTI each mm has two address spaces and thus needs two
+ * PCID values, but we can still do with a single ASID denomination
+ * for each mm. Corresponds to kPCID + 2048.
+ *
+ */
/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS 12
+
/*
* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
-#define PTI_CONSUMED_ASID_BITS 0
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
/*
* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
- * for them being zero-based. Another -1 is because ASID 0 is reserved for
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
* use by non-PCID-aware users.
*/
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+/*
+ * Given @asid, compute kPCID
+ */
static inline u16 kern_pcid(u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Make sure that the dynamic ASID space does not confict with the
+ * bit we are using to switch between user and kernel ASIDs.
+ */
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
+ /*
+ * The ASID being passed in here should have respected the
+ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+ */
+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
/*
+ * The dynamically-assigned ASIDs that get passed in are small
+ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+ * so do not bother to clear it.
+ *
* If PCID is on, ASID-aware code paths put the ASID+1 into the
* PCID bits. This serves two purposes. It prevents a nasty
* situation in which PCID-unaware code saves CR3, loads some other
@@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
return asid + 1;
}
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+ u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+#endif
+ return ret;
+}
+
struct pgd_t;
static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{
@@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
return !static_cpu_has(X86_FEATURE_PCID);
}
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -135,6 +193,24 @@ struct tlb_state {
bool is_lazy;
/*
+ * If set we changed the page tables in such a way that we
+ * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+ * This tells us to go invalidate all the non-loaded ctxs[]
+ * on the next context switch.
+ *
+ * The current ctx was kept up-to-date as it ran and does not
+ * need to be invalidated.
+ */
+ bool invalidate_other;
+
+ /*
+ * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+ * the corresponding user PCID needs a flush next time we
+ * switch to it; see SWITCH_TO_USER_CR3.
+ */
+ unsigned short user_pcid_flush_mask;
+
+ /*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
*/
@@ -215,6 +291,14 @@ static inline unsigned long cr4_read_shadow(void)
}
/*
+ * Mark all other ASIDs as invalid, preserves the current.
+ */
+static inline void invalidate_other_asid(void)
+{
+ this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot
* up after us can get the correct flags. This should only be used
@@ -234,14 +318,41 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
extern void initialize_tlbstate_and_flush(void);
/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+ /* There is no user ASID if address space separation is off */
+ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ return;
+
+ /*
+ * We only have a single ASID if PCID is off and the CR3
+ * write will have flushed it.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_PCID))
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ __set_bit(kern_pcid(asid),
+ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
+/*
* flush the entire current user mapping
*/
static inline void __native_flush_tlb(void)
{
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/*
- * If current->mm == NULL then we borrow a mm which may change during a
- * task switch and therefore we must not be preempted while we write CR3
- * back:
+ * If current->mm == NULL then we borrow a mm which may change
+ * during a task switch and therefore we must not be preempted
+ * while we write CR3 back:
*/
preempt_disable();
native_write_cr3(__native_read_cr3());
@@ -259,6 +370,8 @@ static inline void __native_flush_tlb_global(void)
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@@ -285,7 +398,21 @@ static inline void __native_flush_tlb_global(void)
*/
static inline void __native_flush_tlb_single(unsigned long addr)
{
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+ * Just use invalidate_user_asid() in case we are called early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+ invalidate_user_asid(loaded_mm_asid);
+ else
+ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
}
/*
@@ -301,14 +428,6 @@ static inline void __flush_tlb_all(void)
*/
__flush_tlb();
}
-
- /*
- * Note: if we somehow had PCID but not PGE, then this wouldn't work --
- * we'd end up flushing kernel translations for the current ASID but
- * we might fail to flush kernel translations for other cached ASIDs.
- *
- * To avoid this issue, we force PCID off if PGE is off.
- */
}
/*
@@ -318,6 +437,16 @@ static inline void __flush_tlb_one(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * __flush_tlb_single() will have cleared the TLB entry for this ASID,
+ * but since kernel space is replicated across all, we must also
+ * invalidate all others.
+ */
+ invalidate_other_asid();
}
#define TLB_FLUSH_ALL -1UL
@@ -378,6 +507,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+ /*
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
+ */
+ return atomic64_inc_return(&mm->context.tlb_gen);
+}
+
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d9a7c659009c..b986b2ca688a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -7,6 +7,7 @@
#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
/*
* Called on instruction fetch fault in vsyscall page.
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 7e1e730396ae..bcba3c643e63 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -78,7 +78,12 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS 12
+#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 676b7cf4b62b..76417a9aab73 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,6 +17,7 @@
#include <asm/sigframe.h>
#include <asm/bootparam.h>
#include <asm/suspend.h>
+#include <asm/tlbflush.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+ /* TLB state for the entry code */
+ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c9757f07d738..c47de4ebf63a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -922,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
}
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+ /* Assume for now that ALL x86 CPUs are insecure */
+ setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+
fpu__init_system(c);
#ifdef CONFIG_X86_32
@@ -1360,7 +1364,10 @@ void syscall_init(void)
(entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+ else
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 36b17e0febe8..5fa110699ed2 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
unsigned long sp;
#endif
printk(KERN_DEFAULT
- "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+ "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
- IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
+ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
+ IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+ (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7dca675fe78d..04a625f0fcda 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL 0
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -350,13 +371,14 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
+ .fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
+ .fill PTI_USER_PGD_FILL,8,0
#endif
#ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a6b5d62f45a7..579cc4a66fdf 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -24,6 +24,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
- mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
- pc = &mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+ load_mm_ldt(mm);
refresh_ldt_segments();
}
@@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return NULL;
}
+ /* The new LDT isn't aliased for PTI yet. */
+ new_ldt->slot = -1;
+
new_ldt->nr_entries = num_entries;
return new_ldt;
}
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function. Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ bool is_vmalloc, had_top_level_entry;
+ unsigned long va;
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ int i;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return 0;
+
+ /*
+ * Any given ldt_struct should have map_ldt_struct() called at most
+ * once.
+ */
+ WARN_ON(ldt->slot != -1);
+
+ /*
+ * Did we already have the top level entry allocated? We can't
+ * use pgd_none() for this because it doens't do anything on
+ * 4-level page table kernels.
+ */
+ pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ had_top_level_entry = (pgd->pgd != 0);
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn;
+ pte_t pte, *ptep;
+
+ va = (unsigned long)ldt_slot_va(slot) + offset;
+ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+ /*
+ * Treat the PTI LDT range as a *userspace* range.
+ * get_locked_pte() will allocate all needed pagetables
+ * and account for them in this mm.
+ */
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+ /*
+ * Map it RO so the easy to find address is not a primary
+ * target via some kernel interface which misses a
+ * permission check.
+ */
+ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_top_level_entry);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+ } else {
+ /*
+ * This is the first time we're mapping an LDT for this process.
+ * Sync the pgd to the usermode tables.
+ */
+ WARN_ON(had_top_level_entry);
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+ }
+ }
+
+ va = (unsigned long)ldt_slot_va(slot);
+ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+ ldt->slot = slot;
+#endif
+ return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
@@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval) {
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
mm->context.ldt = new_ldt;
out_unlock:
@@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
+}
+
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
@@ -287,6 +413,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
+ /*
+ * If we are using PTI, map the new LDT into the userspace pagetables.
+ * If there is already an LDT, use the other slot so that other CPUs
+ * will continue to use the old LDT until install_ldt() switches
+ * them over to the new LDT.
+ */
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ free_ldt_struct(old_ldt);
+ goto out_unlock;
+ }
+
install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
error = 0;
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9a9c9b076955..a5b802a12212 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info) || LDT_zero(info)) {
+ if (LDT_empty(info) || LDT_zero(info))
memset(desc, 0, sizeof(*desc));
- } else {
+ else
fill_ldt(desc, info);
-
- /*
- * Always set the accessed bit so that the CPU
- * doesn't try to write to the (read-only) GDT.
- */
- desc->type |= 1;
- }
++info;
++desc;
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d2a8b5a24a44..1e413a9326aa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
+#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+
#else
#define X64_ALIGN_RODATA_BEGIN
#define X64_ALIGN_RODATA_END
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+
#endif
PHDRS {
@@ -102,8 +108,10 @@ SECTIONS
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
+ ALIGN_ENTRY_TEXT_BEGIN
ENTRY_TEXT
IRQENTRY_TEXT
+ ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 52195ee3f6d5..27e9e90a8d35 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index fe814fd5e014..b9283cc27622 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+ int npages;
+ void *cea;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+ npages = sizeof(struct debug_store) / PAGE_SIZE;
+ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+ PAGE_KERNEL);
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+ /*
+ * Force the population of PMDs for not yet allocated per cpu
+ * memory like debug store buffers.
+ */
+ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+ for (; npages; npages--, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
+ percpu_setup_debug_store(cpu);
}
static __init void setup_cpu_entry_area_ptes(void)
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index bfcffdf6c577..421f2664ffa0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -5,7 +5,7 @@
static int ptdump_show(struct seq_file *m, void *v)
{
- ptdump_walk_pgd_level(m, NULL);
+ ptdump_walk_pgd_level_debugfs(m, NULL, false);
return 0;
}
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
.release = single_release,
};
-static struct dentry *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curknl,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curusr,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
{
- pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
- &ptdump_fops);
- if (!pe)
+ dir = debugfs_create_dir("page_tables", NULL);
+ if (!dir)
return -ENOMEM;
+ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+ &ptdump_fops);
+ if (!pe_knl)
+ goto err;
+
+ pe_curknl = debugfs_create_file("current_kernel", 0400,
+ dir, NULL, &ptdump_curknl_fops);
+ if (!pe_curknl)
+ goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pe_curusr = debugfs_create_file("current_user", 0400,
+ dir, NULL, &ptdump_curusr_fops);
+ if (!pe_curusr)
+ goto err;
+#endif
return 0;
+err:
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
}
static void __exit pt_dump_debug_exit(void)
{
- debugfs_remove_recursive(pe);
+ debugfs_remove_recursive(dir);
}
module_init(pt_dump_debug_init);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 43dedbfb7257..f56902c1f04b 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -52,12 +52,18 @@ enum address_markers_idx {
USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+ LDT_NR,
+#endif
CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
@@ -82,6 +88,9 @@ static struct addr_marker address_markers[] = {
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
+#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
[ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
- bool checkwx)
+ bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
if (pgd) {
start = pgd;
- st.to_dmesg = true;
+ st.to_dmesg = dmesg;
}
st.check_wx = checkwx;
@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
- ptdump_walk_pgd_level_core(m, pgd, false);
+ ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (user && static_cpu_has(X86_FEATURE_PTI))
+ pgd = kernel_to_user_pgdp(pgd);
+#endif
+ ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("x86/mm: Checking user space page tables\n");
+ pgd = kernel_to_user_pgdp(pgd);
+ ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
}
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
void ptdump_walk_pgd_level_checkwx(void)
{
- ptdump_walk_pgd_level_core(NULL, NULL, true);
+ ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+ ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6fdf91ef130a..8ca324d07282 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -20,6 +20,7 @@
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
+#include <asm/pti.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {
static int page_size_mask;
+static void enable_global_pages(void)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ __supported_pte_mask |= _PAGE_GLOBAL;
+}
+
static void __init probe_page_size_mask(void)
{
/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
- __supported_pte_mask |= _PAGE_GLOBAL;
- } else
- __supported_pte_mask &= ~_PAGE_GLOBAL;
+ enable_global_pages();
+ }
/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)
static void setup_pcid(void)
{
-#ifdef CONFIG_X86_64
- if (boot_cpu_has(X86_FEATURE_PCID)) {
- if (boot_cpu_has(X86_FEATURE_PGE)) {
- /*
- * This can't be cr4_set_bits_and_update_boot() --
- * the trampoline code can't handle CR4.PCIDE and
- * it wouldn't do any good anyway. Despite the name,
- * cr4_set_bits_and_update_boot() doesn't actually
- * cause the bits in question to remain set all the
- * way through the secondary boot asm.
- *
- * Instead, we brute-force it and set CR4.PCIDE
- * manually in start_secondary().
- */
- cr4_set_bits(X86_CR4_PCIDE);
- } else {
- /*
- * flush_tlb_all(), as currently implemented, won't
- * work if PCID is on but PGE is not. Since that
- * combination doesn't exist on real hardware, there's
- * no reason to try to fully support it, but it's
- * polite to avoid corrupting data if we're on
- * an improperly configured VM.
- */
- setup_clear_cpu_cap(X86_FEATURE_PCID);
- }
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * This can't be cr4_set_bits_and_update_boot() -- the
+ * trampoline code can't handle CR4.PCIDE and it wouldn't
+ * do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually cause
+ * the bits in question to remain set all the way through
+ * the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE manually in
+ * start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+
+ /*
+ * INVPCID's single-context modes (2/3) only work if we set
+ * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
+ * on systems that have X86_CR4_PCIDE clear, or that have
+ * no INVPCID support at all.
+ */
+ if (boot_cpu_has(X86_FEATURE_INVPCID))
+ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't work if
+ * PCID is on but PGE is not. Since that combination
+ * doesn't exist on real hardware, there's no reason to try
+ * to fully support it, but it's polite to avoid corrupting
+ * data if we're on an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
}
-#endif
}
#ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
{
unsigned long end;
+ pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();
@@ -845,7 +863,7 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 96d456a94b03..004abf9ebf12 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
+
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000000..bce8aea65606
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ * https://github.com/IAIK/KAISER
+ *
+ * The original work was written by and and signed off by for the Linux
+ * kernel by:
+ *
+ * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ * Andy Lutomirsky <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
+
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK 0
+#endif
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+ if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ pr_info("%s\n", reason);
+}
+
+static void __init pti_print_if_secure(const char *reason)
+{
+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+ char arg[5];
+ int ret;
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+ pti_print_if_insecure("disabled on XEN PV.");
+ return;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+ if (ret > 0) {
+ if (ret == 3 && !strncmp(arg, "off", 3)) {
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+ if (ret == 2 && !strncmp(arg, "on", 2)) {
+ pti_print_if_secure("force enabled on command line.");
+ goto enable;
+ }
+ if (ret == 4 && !strncmp(arg, "auto", 4))
+ goto autosel;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+
+autosel:
+ if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+ return;
+enable:
+ setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ /*
+ * Changes to the high (kernel) portion of the kernelmode page
+ * tables are not automatically propagated to the usermode tables.
+ *
+ * Users should keep in mind that, unlike the kernelmode tables,
+ * there is no vmalloc_fault equivalent for the usermode tables.
+ * Top-level entries added to init_mm's usermode pgd after boot
+ * will not be automatically propagated to other mms.
+ */
+ if (!pgdp_maps_userspace(pgdp))
+ return pgd;
+
+ /*
+ * The user page tables get the full PGD, accessible from
+ * userspace:
+ */
+ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+ /*
+ * If this is normal user memory, make it NX in the kernel
+ * pagetables so that, if we somehow screw up and return to
+ * usermode with the kernel CR3 loaded, we'll get a page fault
+ * instead of allowing user code to execute with the wrong CR3.
+ *
+ * As exceptions, we don't set NX if:
+ * - _PAGE_USER is not set. This could be an executable
+ * EFI runtime mapping or something similar, and the kernel
+ * may execute from it
+ * - we don't have NX support
+ * - we're clearing the PGD (i.e. the new pgd is not present).
+ */
+ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+ (__supported_pte_mask & _PAGE_NX))
+ pgd.pgd |= _PAGE_NX;
+
+ /* return the copy of the PGD we want the kernel to use: */
+ return pgd;
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (address < PAGE_OFFSET) {
+ WARN_ONCE(1, "attempt to walk user address\n");
+ return NULL;
+ }
+
+ if (pgd_none(*pgd)) {
+ unsigned long new_p4d_page = __get_free_page(gfp);
+ if (!new_p4d_page)
+ return NULL;
+
+ if (pgd_none(*pgd)) {
+ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+ new_p4d_page = 0;
+ }
+ if (new_p4d_page)
+ free_page(new_p4d_page);
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+ pud_t *pud;
+
+ BUILD_BUG_ON(p4d_large(*p4d) != 0);
+ if (p4d_none(*p4d)) {
+ unsigned long new_pud_page = __get_free_page(gfp);
+ if (!new_pud_page)
+ return NULL;
+
+ if (p4d_none(*p4d)) {
+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+ new_pud_page = 0;
+ }
+ if (new_pud_page)
+ free_page(new_pud_page);
+ }
+
+ pud = pud_offset(p4d, address);
+ /* The user page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+
+ if (pud_none(*pud)) {
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ new_pmd_page = 0;
+ }
+ if (new_pmd_page)
+ free_page(new_pmd_page);
+ }
+
+ return pmd_offset(pud, address);
+}
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down. Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables. It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+ pte_t *pte;
+
+ /* We can't do anything sensible if we hit a large mapping. */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+
+ if (pmd_none(*pmd)) {
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ new_pte_page = 0;
+ }
+ if (new_pte_page)
+ free_page(new_pte_page);
+ }
+
+ pte = pte_offset_kernel(pmd, address);
+ if (pte_flags(*pte) & _PAGE_USER) {
+ WARN_ONCE(1, "attempt to walk to user pte\n");
+ return NULL;
+ }
+ return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+ pte_t *pte, *target_pte;
+ unsigned int level;
+
+ pte = lookup_address(VSYSCALL_ADDR, &level);
+ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+ return;
+
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ if (WARN_ON(!target_pte))
+ return;
+
+ *target_pte = *pte;
+ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+ unsigned long addr;
+
+ /*
+ * Clone the populated PMDs which cover start to end. These PMD areas
+ * can have holes.
+ */
+ for (addr = start; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd, *target_pmd;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(addr);
+ if (WARN_ON(pgd_none(*pgd)))
+ return;
+ p4d = p4d_offset(pgd, addr);
+ if (WARN_ON(p4d_none(*p4d)))
+ return;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ target_pmd = pti_user_pagetable_walk_pmd(addr);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Copy the PMD. That is, the kernelmode and usermode
+ * tables will share the last-level page tables of this
+ * address range
+ */
+ *target_pmd = pmd_clear_flags(*pmd, clear);
+ }
+}
+
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+ p4d_t *kernel_p4d, *user_p4d;
+ pgd_t *kernel_pgd;
+
+ user_p4d = pti_user_pagetable_walk_p4d(addr);
+ kernel_pgd = pgd_offset_k(addr);
+ kernel_p4d = p4d_offset(kernel_pgd, addr);
+ *user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
+/*
+ * Clone the ESPFIX P4D into the user space visinble page table
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+ pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+ pti_clone_pmds((unsigned long) __entry_text_start,
+ (unsigned long) __irqentry_text_end, _PAGE_RW);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("enabled\n");
+
+ pti_clone_user_shared();
+ pti_clone_entry_text();
+ pti_setup_espfix64();
+ pti_setup_vsyscall();
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0a1be3adc97e..a1561957dccb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,38 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts. We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+ u16 asid;
+
+ /*
+ * This is only expected to be set if we have disabled
+ * kernel _PAGE_GLOBAL pages.
+ */
+ if (!static_cpu_has(X86_FEATURE_PTI)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ /* Do not need to flush the current asid */
+ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+ continue;
+ /*
+ * Make sure the next time we go to switch to
+ * this asid, we do a flush:
+ */
+ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+ }
+ this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
+ if (this_cpu_read(cpu_tlbstate.invalidate_other))
+ clear_asid_other();
+
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+ unsigned long new_mm_cr3;
+
+ if (need_flush) {
+ invalidate_user_asid(new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid);
+ } else {
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(build_cr3(next->pgd, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(build_cr3_noflush(next->pgd, new_asid));
+ load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6a151ce70e86..d87ac96e37ed 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -196,6 +196,9 @@ static pgd_t *efi_pgd;
* because we want to avoid inserting EFI region mappings (EFI_VA_END
* to EFI_VA_START) into the standard kernel page tables. Everything
* else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
*/
int __init efi_alloc_page_tables(void)
{
@@ -208,7 +211,7 @@ int __init efi_alloc_page_tables(void)
return 0;
gfp_mask = GFP_KERNEL | __GFP_ZERO;
- efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
if (!efi_pgd)
return -ENOMEM;
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index bccec9de0533..a7ecfde66b7b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -482,7 +482,8 @@ enum binder_deferred_state {
* @tsk task_struct for group_leader of process
* (invariant after initialized)
* @files files_struct for process
- * (invariant after initialized)
+ * (protected by @files_lock)
+ * @files_lock mutex to protect @files
* @deferred_work_node: element for binder_deferred_list
* (protected by binder_deferred_lock)
* @deferred_work: bitmap of deferred work to perform
@@ -530,6 +531,7 @@ struct binder_proc {
int pid;
struct task_struct *tsk;
struct files_struct *files;
+ struct mutex files_lock;
struct hlist_node deferred_work_node;
int deferred_work;
bool is_dead;
@@ -877,20 +879,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
{
- struct files_struct *files = proc->files;
unsigned long rlim_cur;
unsigned long irqs;
+ int ret;
- if (files == NULL)
- return -ESRCH;
-
- if (!lock_task_sighand(proc->tsk, &irqs))
- return -EMFILE;
-
+ mutex_lock(&proc->files_lock);
+ if (proc->files == NULL) {
+ ret = -ESRCH;
+ goto err;
+ }
+ if (!lock_task_sighand(proc->tsk, &irqs)) {
+ ret = -EMFILE;
+ goto err;
+ }
rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
unlock_task_sighand(proc->tsk, &irqs);
- return __alloc_fd(files, 0, rlim_cur, flags);
+ ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
+err:
+ mutex_unlock(&proc->files_lock);
+ return ret;
}
/*
@@ -899,8 +907,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
static void task_fd_install(
struct binder_proc *proc, unsigned int fd, struct file *file)
{
+ mutex_lock(&proc->files_lock);
if (proc->files)
__fd_install(proc->files, fd, file);
+ mutex_unlock(&proc->files_lock);
}
/*
@@ -910,9 +920,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
{
int retval;
- if (proc->files == NULL)
- return -ESRCH;
-
+ mutex_lock(&proc->files_lock);
+ if (proc->files == NULL) {
+ retval = -ESRCH;
+ goto err;
+ }
retval = __close_fd(proc->files, fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
@@ -920,7 +932,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
retval == -ERESTARTNOHAND ||
retval == -ERESTART_RESTARTBLOCK))
retval = -EINTR;
-
+err:
+ mutex_unlock(&proc->files_lock);
return retval;
}
@@ -4627,7 +4640,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
ret = binder_alloc_mmap_handler(&proc->alloc, vma);
if (ret)
return ret;
+ mutex_lock(&proc->files_lock);
proc->files = get_files_struct(current);
+ mutex_unlock(&proc->files_lock);
return 0;
err_bad_arg:
@@ -4651,6 +4666,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
spin_lock_init(&proc->outer_lock);
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
+ mutex_init(&proc->files_lock);
INIT_LIST_HEAD(&proc->todo);
proc->default_priority = task_nice(current);
binder_dev = container_of(filp->private_data, struct binder_device,
@@ -4903,9 +4919,11 @@ static void binder_deferred_func(struct work_struct *work)
files = NULL;
if (defer & BINDER_DEFERRED_PUT_FILES) {
+ mutex_lock(&proc->files_lock);
files = proc->files;
if (files)
proc->files = NULL;
+ mutex_unlock(&proc->files_lock);
}
if (defer & BINDER_DEFERRED_FLUSH)
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index eb3af2739537..07532d83be0b 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
}
+static bool cache_node_is_unified(struct cacheinfo *this_leaf)
+{
+ return of_property_read_bool(this_leaf->of_node, "cache-unified");
+}
+
static void cache_of_override_properties(unsigned int cpu)
{
int index;
@@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
for (index = 0; index < cache_leaves(cpu); index++) {
this_leaf = this_cpu_ci->info_list + index;
+ /*
+ * init_cache_level must setup the cache level correctly
+ * overriding the architecturally specified levels, so
+ * if type is NONE at this stage, it should be unified
+ */
+ if (this_leaf->type == CACHE_TYPE_NOCACHE &&
+ cache_node_is_unified(this_leaf))
+ this_leaf->type = CACHE_TYPE_UNIFIED;
cache_size(this_leaf);
cache_get_line_size(this_leaf);
cache_nr_sets(this_leaf);
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 8a1860a36c77..b56c11f51baf 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -220,7 +220,8 @@ static bool clk_core_is_enabled(struct clk_core *core)
ret = core->ops->is_enabled(core->hw);
done:
- clk_pm_runtime_put(core);
+ if (core->dev)
+ pm_runtime_put(core->dev);
return ret;
}
diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c
index 355120865efd..309f3fa6794a 100644
--- a/drivers/gpu/drm/i915/gvt/display.c
+++ b/drivers/gpu/drm/i915/gvt/display.c
@@ -266,6 +266,8 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
/* Clear host CRT status, so guest couldn't detect this host CRT. */
if (IS_BROADWELL(dev_priv))
vgpu_vreg(vgpu, PCH_ADPA) &= ~ADPA_CRT_HOTPLUG_MONITOR_MASK;
+
+ vgpu_vreg(vgpu, PIPECONF(PIPE_A)) |= PIPECONF_ENABLE;
}
static void clean_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num)
@@ -282,7 +284,6 @@ static void clean_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num)
static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num,
int type, unsigned int resolution)
{
- struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
if (WARN_ON(resolution >= GVT_EDID_NUM))
@@ -308,7 +309,7 @@ static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num,
port->type = type;
emulate_monitor_status_change(vgpu);
- vgpu_vreg(vgpu, PIPECONF(PIPE_A)) |= PIPECONF_ENABLE;
+
return 0;
}
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index ff9397030092..30cf273d57aa 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -13194,7 +13194,7 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe)
primary->frontbuffer_bit = INTEL_FRONTBUFFER_PRIMARY(pipe);
primary->check_plane = intel_check_primary_plane;
- if (INTEL_GEN(dev_priv) >= 10 || IS_GEMINILAKE(dev_priv)) {
+ if (INTEL_GEN(dev_priv) >= 10) {
intel_primary_formats = skl_primary_formats;
num_formats = ARRAY_SIZE(skl_primary_formats);
modifiers = skl_format_modifiers_ccs;
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 8d4a5be3b913..56fe261b6268 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -152,9 +152,9 @@ nouveau_cli_work_queue(struct nouveau_cli *cli, struct dma_fence *fence,
work->cli = cli;
mutex_lock(&cli->lock);
list_add_tail(&work->head, &cli->worker);
- mutex_unlock(&cli->lock);
if (dma_fence_add_callback(fence, &work->cb, nouveau_cli_work_fence))
nouveau_cli_work_fence(fence, &work->cb);
+ mutex_unlock(&cli->lock);
}
static void
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index f3fcb836a1f9..0c3f608131cf 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -551,7 +551,7 @@ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item)
ret = hid_add_field(parser, HID_FEATURE_REPORT, data);
break;
default:
- hid_err(parser->device, "unknown main item tag 0x%x\n", item->tag);
+ hid_warn(parser->device, "unknown main item tag 0x%x\n", item->tag);
ret = 0;
}
diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c
index 68cdc962265b..271f31461da4 100644
--- a/drivers/hid/hid-cp2112.c
+++ b/drivers/hid/hid-cp2112.c
@@ -696,8 +696,16 @@ static int cp2112_xfer(struct i2c_adapter *adap, u16 addr,
(u8 *)&word, 2);
break;
case I2C_SMBUS_I2C_BLOCK_DATA:
- size = I2C_SMBUS_BLOCK_DATA;
- /* fallthrough */
+ if (read_write == I2C_SMBUS_READ) {
+ read_length = data->block[0];
+ count = cp2112_write_read_req(buf, addr, read_length,
+ command, NULL, 0);
+ } else {
+ count = cp2112_write_req(buf, addr, command,
+ data->block + 1,
+ data->block[0]);
+ }
+ break;
case I2C_SMBUS_BLOCK_DATA:
if (I2C_SMBUS_READ == read_write) {
count = cp2112_write_read_req(buf, addr,
@@ -785,6 +793,9 @@ static int cp2112_xfer(struct i2c_adapter *adap, u16 addr,
case I2C_SMBUS_WORD_DATA:
data->word = le16_to_cpup((__le16 *)buf);
break;
+ case I2C_SMBUS_I2C_BLOCK_DATA:
+ memcpy(data->block + 1, buf, read_length);
+ break;
case I2C_SMBUS_BLOCK_DATA:
if (read_length > I2C_SMBUS_BLOCK_MAX) {
ret = -EPROTO;
diff --git a/drivers/hid/hid-holtekff.c b/drivers/hid/hid-holtekff.c
index 9325545fc3ae..edc0f64bb584 100644
--- a/drivers/hid/hid-holtekff.c
+++ b/drivers/hid/hid-holtekff.c
@@ -32,10 +32,6 @@
#ifdef CONFIG_HOLTEK_FF
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Anssi Hannula <anssi.hannula@iki.fi>");
-MODULE_DESCRIPTION("Force feedback support for Holtek On Line Grip based devices");
-
/*
* These commands and parameters are currently known:
*
@@ -223,3 +219,7 @@ static struct hid_driver holtek_driver = {
.probe = holtek_probe,
};
module_hid_driver(holtek_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Anssi Hannula <anssi.hannula@iki.fi>");
+MODULE_DESCRIPTION("Force feedback support for Holtek On Line Grip based devices");
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 76ed9a216f10..610223f0e945 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1378,6 +1378,8 @@ void vmbus_device_unregister(struct hv_device *device_obj)
pr_debug("child device %s unregistered\n",
dev_name(&device_obj->device));
+ kset_unregister(device_obj->channels_kset);
+
/*
* Kick off the process of unregistering the device.
* This will call vmbus_remove() and eventually vmbus_device_release()
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index feafdb961c48..59b2f96d986a 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -386,6 +386,9 @@ int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
if (ret)
return ret;
+ if (!qp->qp_sec)
+ return 0;
+
mutex_lock(&real_qp->qp_sec->mutex);
ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
qp->qp_sec);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index d0202bb176a4..840b24096690 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2074,8 +2074,8 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
return -EOPNOTSUPP;
if (ucore->inlen > sizeof(cmd)) {
- if (ib_is_udata_cleared(ucore, sizeof(cmd),
- ucore->inlen - sizeof(cmd)))
+ if (!ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
return -EOPNOTSUPP;
}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3fb8fb6cc824..e36d27ed4daa 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1438,7 +1438,8 @@ int ib_close_qp(struct ib_qp *qp)
spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
atomic_dec(&real_qp->usecnt);
- ib_close_shared_qp_security(qp->qp_sec);
+ if (qp->qp_sec)
+ ib_close_shared_qp_security(qp->qp_sec);
kfree(qp);
return 0;
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index b7bfc536e00f..6f2b26126c64 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -395,7 +395,7 @@ next_cqe:
static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
{
- if (CQE_OPCODE(cqe) == C4IW_DRAIN_OPCODE) {
+ if (DRAIN_CQE(cqe)) {
WARN_ONCE(1, "Unexpected DRAIN CQE qp id %u!\n", wq->sq.qid);
return 0;
}
@@ -494,7 +494,7 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
/*
* Special cqe for drain WR completions...
*/
- if (CQE_OPCODE(hw_cqe) == C4IW_DRAIN_OPCODE) {
+ if (DRAIN_CQE(hw_cqe)) {
*cookie = CQE_DRAIN_COOKIE(hw_cqe);
*cqe = *hw_cqe;
goto skip_cqe;
@@ -571,10 +571,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
ret = -EAGAIN;
goto skip_cqe;
}
- if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
+ if (unlikely(!CQE_STATUS(hw_cqe) &&
+ CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
t4_set_wq_in_error(wq);
- hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
- goto proc_cqe;
+ hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
}
goto proc_cqe;
}
@@ -748,9 +748,6 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
c4iw_invalidate_mr(qhp->rhp,
CQE_WRID_FR_STAG(&cqe));
break;
- case C4IW_DRAIN_OPCODE:
- wc->opcode = IB_WC_SEND;
- break;
default:
pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n",
CQE_OPCODE(&cqe), CQE_QPID(&cqe));
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 470f97a79ebb..65dd3726ca02 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -693,8 +693,6 @@ static inline int to_ib_qp_state(int c4iw_qp_state)
return IB_QPS_ERR;
}
-#define C4IW_DRAIN_OPCODE FW_RI_SGE_EC_CR_RETURN
-
static inline u32 c4iw_ib_to_tpt_access(int a)
{
return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 38bddd02a943..d5c92fc520d6 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -790,21 +790,57 @@ static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc)
return 0;
}
-static void complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr)
+static int ib_to_fw_opcode(int ib_opcode)
+{
+ int opcode;
+
+ switch (ib_opcode) {
+ case IB_WR_SEND_WITH_INV:
+ opcode = FW_RI_SEND_WITH_INV;
+ break;
+ case IB_WR_SEND:
+ opcode = FW_RI_SEND;
+ break;
+ case IB_WR_RDMA_WRITE:
+ opcode = FW_RI_RDMA_WRITE;
+ break;
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ opcode = FW_RI_READ_REQ;
+ break;
+ case IB_WR_REG_MR:
+ opcode = FW_RI_FAST_REGISTER;
+ break;
+ case IB_WR_LOCAL_INV:
+ opcode = FW_RI_LOCAL_INV;
+ break;
+ default:
+ opcode = -EINVAL;
+ }
+ return opcode;
+}
+
+static int complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr)
{
struct t4_cqe cqe = {};
struct c4iw_cq *schp;
unsigned long flag;
struct t4_cq *cq;
+ int opcode;
schp = to_c4iw_cq(qhp->ibqp.send_cq);
cq = &schp->cq;
+ opcode = ib_to_fw_opcode(wr->opcode);
+ if (opcode < 0)
+ return opcode;
+
cqe.u.drain_cookie = wr->wr_id;
cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
- CQE_OPCODE_V(C4IW_DRAIN_OPCODE) |
+ CQE_OPCODE_V(opcode) |
CQE_TYPE_V(1) |
CQE_SWCQE_V(1) |
+ CQE_DRAIN_V(1) |
CQE_QPID_V(qhp->wq.sq.qid));
spin_lock_irqsave(&schp->lock, flag);
@@ -819,6 +855,23 @@ static void complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr)
schp->ibcq.cq_context);
spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
}
+ return 0;
+}
+
+static int complete_sq_drain_wrs(struct c4iw_qp *qhp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ int ret = 0;
+
+ while (wr) {
+ ret = complete_sq_drain_wr(qhp, wr);
+ if (ret) {
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+ return ret;
}
static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
@@ -833,9 +886,10 @@ static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
cqe.u.drain_cookie = wr->wr_id;
cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
- CQE_OPCODE_V(C4IW_DRAIN_OPCODE) |
+ CQE_OPCODE_V(FW_RI_SEND) |
CQE_TYPE_V(0) |
CQE_SWCQE_V(1) |
+ CQE_DRAIN_V(1) |
CQE_QPID_V(qhp->wq.sq.qid));
spin_lock_irqsave(&rchp->lock, flag);
@@ -852,6 +906,14 @@ static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
}
}
+static void complete_rq_drain_wrs(struct c4iw_qp *qhp, struct ib_recv_wr *wr)
+{
+ while (wr) {
+ complete_rq_drain_wr(qhp, wr);
+ wr = wr->next;
+ }
+}
+
int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
struct ib_send_wr **bad_wr)
{
@@ -875,7 +937,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
*/
if (qhp->wq.flushed) {
spin_unlock_irqrestore(&qhp->lock, flag);
- complete_sq_drain_wr(qhp, wr);
+ err = complete_sq_drain_wrs(qhp, wr, bad_wr);
return err;
}
num_wrs = t4_sq_avail(&qhp->wq);
@@ -1023,7 +1085,7 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
*/
if (qhp->wq.flushed) {
spin_unlock_irqrestore(&qhp->lock, flag);
- complete_rq_drain_wr(qhp, wr);
+ complete_rq_drain_wrs(qhp, wr);
return err;
}
num_wrs = t4_rq_avail(&qhp->wq);
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index e9ea94268d51..79e8ee12c391 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -197,6 +197,11 @@ struct t4_cqe {
#define CQE_SWCQE_G(x) ((((x) >> CQE_SWCQE_S)) & CQE_SWCQE_M)
#define CQE_SWCQE_V(x) ((x)<<CQE_SWCQE_S)
+#define CQE_DRAIN_S 10
+#define CQE_DRAIN_M 0x1
+#define CQE_DRAIN_G(x) ((((x) >> CQE_DRAIN_S)) & CQE_DRAIN_M)
+#define CQE_DRAIN_V(x) ((x)<<CQE_DRAIN_S)
+
#define CQE_STATUS_S 5
#define CQE_STATUS_M 0x1F
#define CQE_STATUS_G(x) ((((x) >> CQE_STATUS_S)) & CQE_STATUS_M)
@@ -213,6 +218,7 @@ struct t4_cqe {
#define CQE_OPCODE_V(x) ((x)<<CQE_OPCODE_S)
#define SW_CQE(x) (CQE_SWCQE_G(be32_to_cpu((x)->header)))
+#define DRAIN_CQE(x) (CQE_DRAIN_G(be32_to_cpu((x)->header)))
#define CQE_QPID(x) (CQE_QPID_G(be32_to_cpu((x)->header)))
#define CQE_TYPE(x) (CQE_TYPE_G(be32_to_cpu((x)->header)))
#define SQ_TYPE(x) (CQE_TYPE((x)))
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 4a9b4d7efe63..8ce9118d4a7f 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -1131,7 +1131,6 @@ struct hfi1_devdata {
u16 pcie_lnkctl;
u16 pcie_devctl2;
u32 pci_msix0;
- u32 pci_lnkctl3;
u32 pci_tph2;
/*
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 09e50fd2a08f..8c7e7a60b715 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -411,15 +411,12 @@ int restore_pci_variables(struct hfi1_devdata *dd)
if (ret)
goto error;
- ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
- dd->pci_lnkctl3);
- if (ret)
- goto error;
-
- ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
- if (ret)
- goto error;
-
+ if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
+ ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2,
+ dd->pci_tph2);
+ if (ret)
+ goto error;
+ }
return 0;
error:
@@ -469,15 +466,12 @@ int save_pci_variables(struct hfi1_devdata *dd)
if (ret)
goto error;
- ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
- &dd->pci_lnkctl3);
- if (ret)
- goto error;
-
- ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
- if (ret)
- goto error;
-
+ if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
+ ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2,
+ &dd->pci_tph2);
+ if (ret)
+ goto error;
+ }
return 0;
error:
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index 470995fa38d2..6f6712f87a73 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -47,17 +47,6 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey)
return err;
}
-int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
- bool reset, void *out, int out_size)
-{
- u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { };
-
- MLX5_SET(query_cong_statistics_in, in, opcode,
- MLX5_CMD_OP_QUERY_CONG_STATISTICS);
- MLX5_SET(query_cong_statistics_in, in, clear, reset);
- return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
-}
-
int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
void *out, int out_size)
{
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index af4c24596274..78ffded7cc2c 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -37,8 +37,6 @@
#include <linux/mlx5/driver.h>
int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
-int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
- bool reset, void *out, int out_size);
int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
void *out, int out_size);
int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 543d0a4c8bf3..8ac50de2b242 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1463,6 +1463,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
}
INIT_LIST_HEAD(&context->vma_private_list);
+ mutex_init(&context->vma_private_list_mutex);
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
@@ -1624,7 +1625,9 @@ static void mlx5_ib_vma_close(struct vm_area_struct *area)
* mlx5_ib_disassociate_ucontext().
*/
mlx5_ib_vma_priv_data->vma = NULL;
+ mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
list_del(&mlx5_ib_vma_priv_data->list);
+ mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
kfree(mlx5_ib_vma_priv_data);
}
@@ -1644,10 +1647,13 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
return -ENOMEM;
vma_prv->vma = vma;
+ vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
vma->vm_private_data = vma_prv;
vma->vm_ops = &mlx5_ib_vm_ops;
+ mutex_lock(&ctx->vma_private_list_mutex);
list_add(&vma_prv->list, vma_head);
+ mutex_unlock(&ctx->vma_private_list_mutex);
return 0;
}
@@ -1690,6 +1696,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
* mlx5_ib_vma_close.
*/
down_write(&owning_mm->mmap_sem);
+ mutex_lock(&context->vma_private_list_mutex);
list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
list) {
vma = vma_private->vma;
@@ -1704,6 +1711,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
list_del(&vma_private->list);
kfree(vma_private);
}
+ mutex_unlock(&context->vma_private_list_mutex);
up_write(&owning_mm->mmap_sem);
mmput(owning_mm);
put_task_struct(owning_process);
@@ -3737,34 +3745,6 @@ free:
return ret;
}
-static int mlx5_ib_query_cong_counters(struct mlx5_ib_dev *dev,
- struct mlx5_ib_port *port,
- struct rdma_hw_stats *stats)
-{
- int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
- void *out;
- int ret, i;
- int offset = port->cnts.num_q_counters;
-
- out = kvzalloc(outlen, GFP_KERNEL);
- if (!out)
- return -ENOMEM;
-
- ret = mlx5_cmd_query_cong_counter(dev->mdev, false, out, outlen);
- if (ret)
- goto free;
-
- for (i = 0; i < port->cnts.num_cong_counters; i++) {
- stats->value[i + offset] =
- be64_to_cpup((__be64 *)(out +
- port->cnts.offsets[i + offset]));
- }
-
-free:
- kvfree(out);
- return ret;
-}
-
static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
struct rdma_hw_stats *stats,
u8 port_num, int index)
@@ -3782,7 +3762,12 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
num_counters = port->cnts.num_q_counters;
if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
- ret = mlx5_ib_query_cong_counters(dev, port, stats);
+ ret = mlx5_lag_query_cong_counters(dev->mdev,
+ stats->value +
+ port->cnts.num_q_counters,
+ port->cnts.num_cong_counters,
+ port->cnts.offsets +
+ port->cnts.num_q_counters);
if (ret)
return ret;
num_counters += port->cnts.num_cong_counters;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 6dd8cac78de2..2c5f3533bbc9 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -115,6 +115,8 @@ enum {
struct mlx5_ib_vma_private_data {
struct list_head list;
struct vm_area_struct *vma;
+ /* protect vma_private_list add/del */
+ struct mutex *vma_private_list_mutex;
};
struct mlx5_ib_ucontext {
@@ -129,6 +131,8 @@ struct mlx5_ib_ucontext {
/* Transport Domain number */
u32 tdn;
struct list_head vma_private_list;
+ /* protect vma_private_list add/del */
+ struct mutex vma_private_list_mutex;
unsigned long upd_xlt_page;
/* protect ODP/KSM */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index ee0ee1f9994b..d109fe8290a7 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1637,6 +1637,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
MLX5_SET(mkc, mkc, access_mode, mr->access_mode);
MLX5_SET(mkc, mkc, umr_en, 1);
+ mr->ibmr.device = pd->device;
err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
if (err)
goto err_destroy_psv;
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index 63bc2efc34eb..4f7bd3b6a315 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -94,7 +94,7 @@ struct pvrdma_cq {
u32 cq_handle;
bool is_kernel;
atomic_t refcnt;
- wait_queue_head_t wait;
+ struct completion free;
};
struct pvrdma_id_table {
@@ -175,7 +175,7 @@ struct pvrdma_srq {
u32 srq_handle;
int npages;
refcount_t refcnt;
- wait_queue_head_t wait;
+ struct completion free;
};
struct pvrdma_qp {
@@ -197,7 +197,7 @@ struct pvrdma_qp {
bool is_kernel;
struct mutex mutex; /* QP state mutex. */
atomic_t refcnt;
- wait_queue_head_t wait;
+ struct completion free;
};
struct pvrdma_dev {
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index 3562c0c30492..e529622cefad 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -179,7 +179,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
pvrdma_page_dir_insert_umem(&cq->pdir, cq->umem, 0);
atomic_set(&cq->refcnt, 1);
- init_waitqueue_head(&cq->wait);
+ init_completion(&cq->free);
spin_lock_init(&cq->cq_lock);
memset(cmd, 0, sizeof(*cmd));
@@ -230,8 +230,9 @@ err_cq:
static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
{
- atomic_dec(&cq->refcnt);
- wait_event(cq->wait, !atomic_read(&cq->refcnt));
+ if (atomic_dec_and_test(&cq->refcnt))
+ complete(&cq->free);
+ wait_for_completion(&cq->free);
if (!cq->is_kernel)
ib_umem_release(cq->umem);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 1f4e18717a00..e92681878c93 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -346,9 +346,8 @@ static void pvrdma_qp_event(struct pvrdma_dev *dev, u32 qpn, int type)
ibqp->event_handler(&e, ibqp->qp_context);
}
if (qp) {
- atomic_dec(&qp->refcnt);
- if (atomic_read(&qp->refcnt) == 0)
- wake_up(&qp->wait);
+ if (atomic_dec_and_test(&qp->refcnt))
+ complete(&qp->free);
}
}
@@ -373,9 +372,8 @@ static void pvrdma_cq_event(struct pvrdma_dev *dev, u32 cqn, int type)
ibcq->event_handler(&e, ibcq->cq_context);
}
if (cq) {
- atomic_dec(&cq->refcnt);
- if (atomic_read(&cq->refcnt) == 0)
- wake_up(&cq->wait);
+ if (atomic_dec_and_test(&cq->refcnt))
+ complete(&cq->free);
}
}
@@ -404,7 +402,7 @@ static void pvrdma_srq_event(struct pvrdma_dev *dev, u32 srqn, int type)
}
if (srq) {
if (refcount_dec_and_test(&srq->refcnt))
- wake_up(&srq->wait);
+ complete(&srq->free);
}
}
@@ -539,9 +537,8 @@ static irqreturn_t pvrdma_intrx_handler(int irq, void *dev_id)
if (cq && cq->ibcq.comp_handler)
cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
if (cq) {
- atomic_dec(&cq->refcnt);
- if (atomic_read(&cq->refcnt))
- wake_up(&cq->wait);
+ if (atomic_dec_and_test(&cq->refcnt))
+ complete(&cq->free);
}
pvrdma_idx_ring_inc(&ring->cons_head, ring_slots);
}
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index 10420a18d02f..4059308e1454 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -246,7 +246,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
spin_lock_init(&qp->rq.lock);
mutex_init(&qp->mutex);
atomic_set(&qp->refcnt, 1);
- init_waitqueue_head(&qp->wait);
+ init_completion(&qp->free);
qp->state = IB_QPS_RESET;
@@ -428,8 +428,16 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp)
pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags);
- atomic_dec(&qp->refcnt);
- wait_event(qp->wait, !atomic_read(&qp->refcnt));
+ if (atomic_dec_and_test(&qp->refcnt))
+ complete(&qp->free);
+ wait_for_completion(&qp->free);
+
+ if (!qp->is_kernel) {
+ if (qp->rumem)
+ ib_umem_release(qp->rumem);
+ if (qp->sumem)
+ ib_umem_release(qp->sumem);
+ }
pvrdma_page_dir_cleanup(dev, &qp->pdir);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index 826ccb864596..5acebb1ef631 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -149,7 +149,7 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
spin_lock_init(&srq->lock);
refcount_set(&srq->refcnt, 1);
- init_waitqueue_head(&srq->wait);
+ init_completion(&srq->free);
dev_dbg(&dev->pdev->dev,
"create shared receive queue from user space\n");
@@ -236,8 +236,9 @@ static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
dev->srq_tbl[srq->srq_handle] = NULL;
spin_unlock_irqrestore(&dev->srq_tbl_lock, flags);
- refcount_dec(&srq->refcnt);
- wait_event(srq->wait, !refcount_read(&srq->refcnt));
+ if (refcount_dec_and_test(&srq->refcnt))
+ complete(&srq->free);
+ wait_for_completion(&srq->free);
/* There is no support for kernel clients, so this is safe. */
ib_umem_release(srq->umem);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 3b96cdaf9a83..e6151a29c412 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -1236,13 +1236,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
ipoib_ib_dev_down(dev);
if (level == IPOIB_FLUSH_HEAVY) {
- rtnl_lock();
if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
ipoib_ib_dev_stop(dev);
- result = ipoib_ib_dev_open(dev);
- rtnl_unlock();
- if (result)
+ if (ipoib_ib_dev_open(dev))
return;
if (netif_queue_stopped(dev))
@@ -1282,7 +1279,9 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work)
struct ipoib_dev_priv *priv =
container_of(work, struct ipoib_dev_priv, flush_heavy);
+ rtnl_lock();
__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0);
+ rtnl_unlock();
}
void ipoib_ib_dev_cleanup(struct net_device *dev)
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index fd83c7f77a95..f3654fd2eaf3 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -186,7 +186,7 @@ void led_blink_set(struct led_classdev *led_cdev,
unsigned long *delay_on,
unsigned long *delay_off)
{
- del_timer_sync(&led_cdev->blink_timer);
+ led_stop_software_blink(led_cdev);
clear_bit(LED_BLINK_ONESHOT, &led_cdev->work_flags);
clear_bit(LED_BLINK_ONESHOT_STOP, &led_cdev->work_flags);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 4c739d5355d2..8ae269ec17a1 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3030,7 +3030,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
del_timer_sync(&bp->timer);
- if (IS_PF(bp)) {
+ if (IS_PF(bp) && !BP_NOMCP(bp)) {
/* Set ALWAYS_ALIVE bit in shmem */
bp->fw_drv_pulse_wr_seq |= DRV_PULSE_ALWAYS_ALIVE;
bnx2x_drv_pulse(bp);
@@ -3116,7 +3116,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, bool keep_link)
bp->cnic_loaded = false;
/* Clear driver version indication in shmem */
- if (IS_PF(bp))
+ if (IS_PF(bp) && !BP_NOMCP(bp))
bnx2x_update_mng_version(bp);
/* Check if there are pending parity attentions. If there are - set
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 91e2a7560b48..ddd5d3ebd201 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9578,6 +9578,15 @@ static int bnx2x_init_shmem(struct bnx2x *bp)
do {
bp->common.shmem_base = REG_RD(bp, MISC_REG_SHARED_MEM_ADDR);
+
+ /* If we read all 0xFFs, means we are in PCI error state and
+ * should bail out to avoid crashes on adapter's FW reads.
+ */
+ if (bp->common.shmem_base == 0xFFFFFFFF) {
+ bp->flags |= NO_MCP_FLAG;
+ return -ENODEV;
+ }
+
if (bp->common.shmem_base) {
val = SHMEM_RD(bp, validity_map[BP_PORT(bp)]);
if (val & SHR_MEM_VALIDITY_MB)
@@ -14320,7 +14329,10 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct pci_dev *pdev)
BNX2X_ERR("IO slot reset --> driver unload\n");
/* MCP should have been reset; Need to wait for validity */
- bnx2x_init_shmem(bp);
+ if (bnx2x_init_shmem(bp)) {
+ rtnl_unlock();
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
if (IS_PF(bp) && SHMEM2_HAS(bp, drv_capabilities_flag)) {
u32 v;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index d09c5a9c53b5..8995cfefbfcf 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -4,11 +4,13 @@
* Copyright (C) 2001, 2002, 2003, 2004 David S. Miller (davem@redhat.com)
* Copyright (C) 2001, 2002, 2003 Jeff Garzik (jgarzik@pobox.com)
* Copyright (C) 2004 Sun Microsystems Inc.
- * Copyright (C) 2005-2014 Broadcom Corporation.
+ * Copyright (C) 2005-2016 Broadcom Corporation.
+ * Copyright (C) 2016-2017 Broadcom Limited.
*
* Firmware is:
* Derived from proprietary unpublished source code,
- * Copyright (C) 2000-2003 Broadcom Corporation.
+ * Copyright (C) 2000-2016 Broadcom Corporation.
+ * Copyright (C) 2016-2017 Broadcom Ltd.
*
* Permission is hereby granted for the distribution of this firmware
* data in hexadecimal or equivalent format, provided this copyright
@@ -10052,6 +10054,16 @@ static int tg3_reset_hw(struct tg3 *tp, bool reset_phy)
tw32(GRC_MODE, tp->grc_mode | val);
+ /* On one of the AMD platform, MRRS is restricted to 4000 because of
+ * south bridge limitation. As a workaround, Driver is setting MRRS
+ * to 2048 instead of default 4096.
+ */
+ if (tp->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL &&
+ tp->pdev->subsystem_device == TG3PCI_SUBDEVICE_ID_DELL_5762) {
+ val = tr32(TG3PCI_DEV_STATUS_CTRL) & ~MAX_READ_REQ_MASK;
+ tw32(TG3PCI_DEV_STATUS_CTRL, val | MAX_READ_REQ_SIZE_2048);
+ }
+
/* Setup the timer prescalar register. Clock is always 66Mhz. */
val = tr32(GRC_MISC_CFG);
val &= ~0xff;
@@ -14227,7 +14239,8 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
*/
if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
tg3_asic_rev(tp) == ASIC_REV_5717 ||
- tg3_asic_rev(tp) == ASIC_REV_5719)
+ tg3_asic_rev(tp) == ASIC_REV_5719 ||
+ tg3_asic_rev(tp) == ASIC_REV_5720)
reset_phy = true;
err = tg3_restart_hw(tp, reset_phy);
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index c2d02d02d1e6..1f0271fa7c74 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -5,7 +5,8 @@
* Copyright (C) 2001, 2002, 2003, 2004 David S. Miller (davem@redhat.com)
* Copyright (C) 2001 Jeff Garzik (jgarzik@pobox.com)
* Copyright (C) 2004 Sun Microsystems Inc.
- * Copyright (C) 2007-2014 Broadcom Corporation.
+ * Copyright (C) 2007-2016 Broadcom Corporation.
+ * Copyright (C) 2016-2017 Broadcom Limited.
*/
#ifndef _T3_H
@@ -96,6 +97,7 @@
#define TG3PCI_SUBDEVICE_ID_DELL_JAGUAR 0x0106
#define TG3PCI_SUBDEVICE_ID_DELL_MERLOT 0x0109
#define TG3PCI_SUBDEVICE_ID_DELL_SLIM_MERLOT 0x010a
+#define TG3PCI_SUBDEVICE_ID_DELL_5762 0x07f0
#define TG3PCI_SUBVENDOR_ID_COMPAQ PCI_VENDOR_ID_COMPAQ
#define TG3PCI_SUBDEVICE_ID_COMPAQ_BANSHEE 0x007c
#define TG3PCI_SUBDEVICE_ID_COMPAQ_BANSHEE_2 0x009a
@@ -281,6 +283,9 @@
#define TG3PCI_STD_RING_PROD_IDX 0x00000098 /* 64-bit */
#define TG3PCI_RCV_RET_RING_CON_IDX 0x000000a0 /* 64-bit */
/* 0xa8 --> 0xb8 unused */
+#define TG3PCI_DEV_STATUS_CTRL 0x000000b4
+#define MAX_READ_REQ_SIZE_2048 0x00004000
+#define MAX_READ_REQ_MASK 0x00007000
#define TG3PCI_DUAL_MAC_CTRL 0x000000b8
#define DUAL_MAC_CTRL_CH_MASK 0x00000003
#define DUAL_MAC_CTRL_ID 0x00000004
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 610573855213..8184d2fca9be 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_device *dev)
for (i = 0; i < txq->bd.ring_size; i++) {
/* Initialize the BD for every fragment in the page. */
bdp->cbd_sc = cpu_to_fec16(0);
+ if (bdp->cbd_bufaddr &&
+ !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
+ dma_unmap_single(&fep->pdev->dev,
+ fec32_to_cpu(bdp->cbd_bufaddr),
+ fec16_to_cpu(bdp->cbd_datlen),
+ DMA_TO_DEVICE);
if (txq->tx_skbuff[i]) {
dev_kfree_skb_any(txq->tx_skbuff[i]);
txq->tx_skbuff[i] = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
index f26f97fe4666..582b2f18010a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -137,6 +137,17 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
}
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
+static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
+ bool reset, void *out, int out_size)
+{
+ u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { };
+
+ MLX5_SET(query_cong_statistics_in, in, opcode,
+ MLX5_CMD_OP_QUERY_CONG_STATISTICS);
+ MLX5_SET(query_cong_statistics_in, in, clear, reset);
+ return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
+}
+
static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev)
{
return dev->priv.lag;
@@ -633,3 +644,48 @@ bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv)
/* If bonded, we do not add an IB device for PF1. */
return false;
}
+
+int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
+ u64 *values,
+ int num_counters,
+ size_t *offsets)
+{
+ int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
+ struct mlx5_core_dev *mdev[MLX5_MAX_PORTS];
+ struct mlx5_lag *ldev;
+ int num_ports;
+ int ret, i, j;
+ void *out;
+
+ out = kvzalloc(outlen, GFP_KERNEL);
+ if (!out)
+ return -ENOMEM;
+
+ memset(values, 0, sizeof(*values) * num_counters);
+
+ mutex_lock(&lag_mutex);
+ ldev = mlx5_lag_dev_get(dev);
+ if (ldev && mlx5_lag_is_bonded(ldev)) {
+ num_ports = MLX5_MAX_PORTS;
+ mdev[0] = ldev->pf[0].dev;
+ mdev[1] = ldev->pf[1].dev;
+ } else {
+ num_ports = 1;
+ mdev[0] = dev;
+ }
+
+ for (i = 0; i < num_ports; ++i) {
+ ret = mlx5_cmd_query_cong_counter(mdev[i], false, out, outlen);
+ if (ret)
+ goto unlock;
+
+ for (j = 0; j < num_counters; ++j)
+ values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
+ }
+
+unlock:
+ mutex_unlock(&lag_mutex);
+ kvfree(out);
+ return ret;
+}
+EXPORT_SYMBOL(mlx5_lag_query_cong_counters);
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index ab4614113403..422ff6333c52 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -624,6 +624,7 @@ static int ksz9031_read_status(struct phy_device *phydev)
phydev->link = 0;
if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
phydev->drv->config_intr(phydev);
+ return genphy_config_aneg(phydev);
}
return 0;
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 5dc9668dde34..827f3f92560e 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -526,6 +526,7 @@ struct phylink *phylink_create(struct net_device *ndev, struct device_node *np,
pl->link_config.pause = MLO_PAUSE_AN;
pl->link_config.speed = SPEED_UNKNOWN;
pl->link_config.duplex = DUPLEX_UNKNOWN;
+ pl->link_config.an_enabled = true;
pl->ops = ops;
__set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
@@ -951,6 +952,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
mutex_lock(&pl->state_mutex);
/* Configure the MAC to match the new settings */
linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising);
+ pl->link_config.interface = config.interface;
pl->link_config.speed = our_kset.base.speed;
pl->link_config.duplex = our_kset.base.duplex;
pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE;
diff --git a/drivers/nvmem/meson-mx-efuse.c b/drivers/nvmem/meson-mx-efuse.c
index a346b4923550..41d3a3c1104e 100644
--- a/drivers/nvmem/meson-mx-efuse.c
+++ b/drivers/nvmem/meson-mx-efuse.c
@@ -156,8 +156,8 @@ static int meson_mx_efuse_read(void *context, unsigned int offset,
MESON_MX_EFUSE_CNTL1_AUTO_RD_ENABLE,
MESON_MX_EFUSE_CNTL1_AUTO_RD_ENABLE);
- for (i = offset; i < offset + bytes; i += efuse->config.word_size) {
- addr = i / efuse->config.word_size;
+ for (i = 0; i < bytes; i += efuse->config.word_size) {
+ addr = (offset + i) / efuse->config.word_size;
err = meson_mx_efuse_read_addr(efuse, addr, &tmp);
if (err)
diff --git a/drivers/phy/motorola/phy-cpcap-usb.c b/drivers/phy/motorola/phy-cpcap-usb.c
index accaaaccb662..6601ad0dfb3a 100644
--- a/drivers/phy/motorola/phy-cpcap-usb.c
+++ b/drivers/phy/motorola/phy-cpcap-usb.c
@@ -310,7 +310,7 @@ static int cpcap_usb_init_irq(struct platform_device *pdev,
int irq, error;
irq = platform_get_irq_byname(pdev, name);
- if (!irq)
+ if (irq < 0)
return -ENODEV;
error = devm_request_threaded_irq(ddata->dev, irq, NULL,
diff --git a/drivers/phy/renesas/Kconfig b/drivers/phy/renesas/Kconfig
index cb09245e9b4c..c845facacb06 100644
--- a/drivers/phy/renesas/Kconfig
+++ b/drivers/phy/renesas/Kconfig
@@ -12,7 +12,9 @@ config PHY_RCAR_GEN3_USB2
tristate "Renesas R-Car generation 3 USB 2.0 PHY driver"
depends on ARCH_RENESAS
depends on EXTCON
+ depends on USB_SUPPORT
select GENERIC_PHY
+ select USB_COMMON
help
Support for USB 2.0 PHY found on Renesas R-Car generation 3 SoCs.
diff --git a/drivers/phy/rockchip/phy-rockchip-typec.c b/drivers/phy/rockchip/phy-rockchip-typec.c
index ee85fa0ca4b0..7492c8978217 100644
--- a/drivers/phy/rockchip/phy-rockchip-typec.c
+++ b/drivers/phy/rockchip/phy-rockchip-typec.c
@@ -1137,6 +1137,7 @@ static int rockchip_typec_phy_probe(struct platform_device *pdev)
if (IS_ERR(phy)) {
dev_err(dev, "failed to create phy: %s\n",
child_np->name);
+ pm_runtime_disable(dev);
return PTR_ERR(phy);
}
@@ -1146,6 +1147,7 @@ static int rockchip_typec_phy_probe(struct platform_device *pdev)
phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
if (IS_ERR(phy_provider)) {
dev_err(dev, "Failed to register phy provider\n");
+ pm_runtime_disable(dev);
return PTR_ERR(phy_provider);
}
diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index 4307bf0013e1..63e916d4d069 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -75,14 +75,14 @@ MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
static struct device_node *
tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
{
- /*
- * of_find_node_by_name() drops a reference, so make sure to grab one.
- */
- struct device_node *np = of_node_get(padctl->dev->of_node);
+ struct device_node *pads, *np;
+
+ pads = of_get_child_by_name(padctl->dev->of_node, "pads");
+ if (!pads)
+ return NULL;
- np = of_find_node_by_name(np, "pads");
- if (np)
- np = of_find_node_by_name(np, name);
+ np = of_get_child_by_name(pads, name);
+ of_node_put(pads);
return np;
}
@@ -90,16 +90,16 @@ tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
static struct device_node *
tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index)
{
- /*
- * of_find_node_by_name() drops a reference, so make sure to grab one.
- */
- struct device_node *np = of_node_get(pad->dev.of_node);
+ struct device_node *np, *lanes;
- np = of_find_node_by_name(np, "lanes");
- if (!np)
+ lanes = of_get_child_by_name(pad->dev.of_node, "lanes");
+ if (!lanes)
return NULL;
- return of_find_node_by_name(np, pad->soc->lanes[index].name);
+ np = of_get_child_by_name(lanes, pad->soc->lanes[index].name);
+ of_node_put(lanes);
+
+ return np;
}
static int
@@ -195,7 +195,7 @@ int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
unsigned int i;
int err;
- children = of_find_node_by_name(pad->dev.of_node, "lanes");
+ children = of_get_child_by_name(pad->dev.of_node, "lanes");
if (!children)
return -ENODEV;
@@ -444,21 +444,21 @@ static struct device_node *
tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
unsigned int index)
{
- /*
- * of_find_node_by_name() drops a reference, so make sure to grab one.
- */
- struct device_node *np = of_node_get(padctl->dev->of_node);
+ struct device_node *ports, *np;
+ char *name;
- np = of_find_node_by_name(np, "ports");
- if (np) {
- char *name;
+ ports = of_get_child_by_name(padctl->dev->of_node, "ports");
+ if (!ports)
+ return NULL;
- name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
- if (!name)
- return ERR_PTR(-ENOMEM);
- np = of_find_node_by_name(np, name);
- kfree(name);
+ name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
+ if (!name) {
+ of_node_put(ports);
+ return ERR_PTR(-ENOMEM);
}
+ np = of_get_child_by_name(ports, name);
+ kfree(name);
+ of_node_put(ports);
return np;
}
@@ -847,7 +847,7 @@ static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
static int tegra_xusb_padctl_probe(struct platform_device *pdev)
{
- struct device_node *np = of_node_get(pdev->dev.of_node);
+ struct device_node *np = pdev->dev.of_node;
const struct tegra_xusb_padctl_soc *soc;
struct tegra_xusb_padctl *padctl;
const struct of_device_id *match;
@@ -855,7 +855,7 @@ static int tegra_xusb_padctl_probe(struct platform_device *pdev)
int err;
/* for backwards compatibility with old device trees */
- np = of_find_node_by_name(np, "pads");
+ np = of_get_child_by_name(np, "pads");
if (!np) {
dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n");
return tegra_xusb_padctl_legacy_probe(pdev);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index a9996c16f4ae..26ce17178401 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1415,7 +1415,10 @@ static void __scsi_remove_target(struct scsi_target *starget)
* check.
*/
if (sdev->channel != starget->channel ||
- sdev->id != starget->id ||
+ sdev->id != starget->id)
+ continue;
+ if (sdev->sdev_state == SDEV_DEL ||
+ sdev->sdev_state == SDEV_CANCEL ||
!get_device(&sdev->sdev_gendev))
continue;
spin_unlock_irqrestore(shost->host_lock, flags);
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 1b06cf0375dc..3b3d1d050cac 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -953,10 +953,11 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb,
case TEST_UNIT_READY:
break;
default:
- set_host_byte(scmnd, DID_TARGET_FAILURE);
+ set_host_byte(scmnd, DID_ERROR);
}
break;
case SRB_STATUS_INVALID_LUN:
+ set_host_byte(scmnd, DID_NO_CONNECT);
do_work = true;
process_err_fn = storvsc_remove_lun;
break;
diff --git a/drivers/staging/android/ion/Kconfig b/drivers/staging/android/ion/Kconfig
index a517b2d29f1b..8f6494158d3d 100644
--- a/drivers/staging/android/ion/Kconfig
+++ b/drivers/staging/android/ion/Kconfig
@@ -37,7 +37,7 @@ config ION_CHUNK_HEAP
config ION_CMA_HEAP
bool "Ion CMA heap support"
- depends on ION && CMA
+ depends on ION && DMA_CMA
help
Choose this option to enable CMA heaps with Ion. This heap is backed
by the Contiguous Memory Allocator (CMA). If your system has these
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index a7d9b0e98572..f480885e346b 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -346,7 +346,7 @@ static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
mutex_lock(&buffer->lock);
list_for_each_entry(a, &buffer->attachments, list) {
dma_sync_sg_for_cpu(a->dev, a->table->sgl, a->table->nents,
- DMA_BIDIRECTIONAL);
+ direction);
}
mutex_unlock(&buffer->lock);
@@ -368,7 +368,7 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
mutex_lock(&buffer->lock);
list_for_each_entry(a, &buffer->attachments, list) {
dma_sync_sg_for_device(a->dev, a->table->sgl, a->table->nents,
- DMA_BIDIRECTIONAL);
+ direction);
}
mutex_unlock(&buffer->lock);
diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c
index dd5545d9990a..86196ffd2faf 100644
--- a/drivers/staging/android/ion/ion_cma_heap.c
+++ b/drivers/staging/android/ion/ion_cma_heap.c
@@ -39,9 +39,15 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
struct ion_cma_heap *cma_heap = to_cma_heap(heap);
struct sg_table *table;
struct page *pages;
+ unsigned long size = PAGE_ALIGN(len);
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ unsigned long align = get_order(size);
int ret;
- pages = cma_alloc(cma_heap->cma, len, 0, GFP_KERNEL);
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL);
if (!pages)
return -ENOMEM;
@@ -53,7 +59,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
if (ret)
goto free_mem;
- sg_set_page(table->sgl, pages, len, 0);
+ sg_set_page(table->sgl, pages, size, 0);
buffer->priv_virt = pages;
buffer->sg_table = table;
@@ -62,7 +68,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
free_mem:
kfree(table);
err:
- cma_release(cma_heap->cma, pages, buffer->size);
+ cma_release(cma_heap->cma, pages, nr_pages);
return -ENOMEM;
}
@@ -70,9 +76,10 @@ static void ion_cma_free(struct ion_buffer *buffer)
{
struct ion_cma_heap *cma_heap = to_cma_heap(buffer->heap);
struct page *pages = buffer->priv_virt;
+ unsigned long nr_pages = PAGE_ALIGN(buffer->size) >> PAGE_SHIFT;
/* release memory */
- cma_release(cma_heap->cma, pages, buffer->size);
+ cma_release(cma_heap->cma, pages, nr_pages);
/* release sg table */
sg_free_table(buffer->sg_table);
kfree(buffer->sg_table);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
index 986c2a40d978..8267119ccc8e 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -487,21 +487,18 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr,
ksocknal_nid2peerlist(id.nid));
}
- route2 = NULL;
list_for_each_entry(route2, &peer->ksnp_routes, ksnr_list) {
- if (route2->ksnr_ipaddr == ipaddr)
- break;
-
- route2 = NULL;
- }
- if (!route2) {
- ksocknal_add_route_locked(peer, route);
- route->ksnr_share_count++;
- } else {
- ksocknal_route_decref(route);
- route2->ksnr_share_count++;
+ if (route2->ksnr_ipaddr == ipaddr) {
+ /* Route already exists, use the old one */
+ ksocknal_route_decref(route);
+ route2->ksnr_share_count++;
+ goto out;
+ }
}
-
+ /* Route doesn't already exist, add the new one */
+ ksocknal_add_route_locked(peer, route);
+ route->ksnr_share_count++;
+out:
write_unlock_bh(&ksocknal_data.ksnd_global_lock);
return 0;
diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 419a7a90bce0..f45bcbc63738 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -339,7 +339,7 @@ static void __ring_interrupt(struct tb_ring *ring)
return;
if (ring->start_poll) {
- __ring_interrupt_mask(ring, false);
+ __ring_interrupt_mask(ring, true);
ring->start_poll(ring->poll_data);
} else {
schedule_work(&ring->work);
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 427e0d5d8f13..539b49adb6af 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1762,7 +1762,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
{
struct n_tty_data *ldata = tty->disc_data;
- if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) {
+ if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) {
bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
ldata->line_start = ldata->read_tail;
if (!L_ICANON(tty) || !read_cnt(ldata)) {
@@ -2425,7 +2425,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
case TIOCINQ:
down_write(&tty->termios_rwsem);
- if (L_ICANON(tty))
+ if (L_ICANON(tty) && !L_EXTPROC(tty))
retval = inq_canon(ldata);
else
retval = read_cnt(ldata);
diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
index 3593ce0ec641..880009987460 100644
--- a/drivers/usb/chipidea/ci_hdrc_msm.c
+++ b/drivers/usb/chipidea/ci_hdrc_msm.c
@@ -247,7 +247,7 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
if (ret)
goto err_mux;
- ulpi_node = of_find_node_by_name(of_node_get(pdev->dev.of_node), "ulpi");
+ ulpi_node = of_get_child_by_name(pdev->dev.of_node, "ulpi");
if (ulpi_node) {
phy_node = of_get_next_available_child(ulpi_node, NULL);
ci->hsic = of_device_is_compatible(phy_node, "qcom,usb-hsic-phy");
diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index 78e92d29f8d9..c821b4b9647e 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -1007,7 +1007,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
case USB_SSP_CAP_TYPE:
ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
- USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
+ USB_SSP_SUBLINK_SPEED_ATTRIBS);
if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
dev->bos->ssp_cap = ssp_cap;
break;
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index a10b346b9777..4024926c1d68 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -52,10 +52,11 @@ static const struct usb_device_id usb_quirk_list[] = {
/* Microsoft LifeCam-VX700 v2.0 */
{ USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
- /* Logitech HD Pro Webcams C920, C920-C and C930e */
+ /* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */
{ USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT },
{ USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT },
{ USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT },
+ { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT },
/* Logitech ConferenceCam CC3000e */
{ USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },
@@ -149,6 +150,9 @@ static const struct usb_device_id usb_quirk_list[] = {
/* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */
{ USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM },
+ /* ELSA MicroLink 56K */
+ { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME },
+
/* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */
{ USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
diff --git a/drivers/usb/host/xhci-debugfs.c b/drivers/usb/host/xhci-debugfs.c
index 4f7895dbcf88..e26e685d8a57 100644
--- a/drivers/usb/host/xhci-debugfs.c
+++ b/drivers/usb/host/xhci-debugfs.c
@@ -162,7 +162,7 @@ static void xhci_debugfs_extcap_regset(struct xhci_hcd *xhci, int cap_id,
static int xhci_ring_enqueue_show(struct seq_file *s, void *unused)
{
dma_addr_t dma;
- struct xhci_ring *ring = s->private;
+ struct xhci_ring *ring = *(struct xhci_ring **)s->private;
dma = xhci_trb_virt_to_dma(ring->enq_seg, ring->enqueue);
seq_printf(s, "%pad\n", &dma);
@@ -173,7 +173,7 @@ static int xhci_ring_enqueue_show(struct seq_file *s, void *unused)
static int xhci_ring_dequeue_show(struct seq_file *s, void *unused)
{
dma_addr_t dma;
- struct xhci_ring *ring = s->private;
+ struct xhci_ring *ring = *(struct xhci_ring **)s->private;
dma = xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue);
seq_printf(s, "%pad\n", &dma);
@@ -183,7 +183,7 @@ static int xhci_ring_dequeue_show(struct seq_file *s, void *unused)
static int xhci_ring_cycle_show(struct seq_file *s, void *unused)
{
- struct xhci_ring *ring = s->private;
+ struct xhci_ring *ring = *(struct xhci_ring **)s->private;
seq_printf(s, "%d\n", ring->cycle_state);
@@ -346,7 +346,7 @@ static void xhci_debugfs_create_files(struct xhci_hcd *xhci,
}
static struct dentry *xhci_debugfs_create_ring_dir(struct xhci_hcd *xhci,
- struct xhci_ring *ring,
+ struct xhci_ring **ring,
const char *name,
struct dentry *parent)
{
@@ -387,7 +387,7 @@ void xhci_debugfs_create_endpoint(struct xhci_hcd *xhci,
snprintf(epriv->name, sizeof(epriv->name), "ep%02d", ep_index);
epriv->root = xhci_debugfs_create_ring_dir(xhci,
- dev->eps[ep_index].new_ring,
+ &dev->eps[ep_index].new_ring,
epriv->name,
spriv->root);
spriv->eps[ep_index] = epriv;
@@ -423,7 +423,7 @@ void xhci_debugfs_create_slot(struct xhci_hcd *xhci, int slot_id)
priv->dev = dev;
dev->debugfs_private = priv;
- xhci_debugfs_create_ring_dir(xhci, dev->eps[0].ring,
+ xhci_debugfs_create_ring_dir(xhci, &dev->eps[0].ring,
"ep00", priv->root);
xhci_debugfs_create_context_files(xhci, priv->root, slot_id);
@@ -488,11 +488,11 @@ void xhci_debugfs_init(struct xhci_hcd *xhci)
ARRAY_SIZE(xhci_extcap_dbc),
"reg-ext-dbc");
- xhci_debugfs_create_ring_dir(xhci, xhci->cmd_ring,
+ xhci_debugfs_create_ring_dir(xhci, &xhci->cmd_ring,
"command-ring",
xhci->debugfs_root);
- xhci_debugfs_create_ring_dir(xhci, xhci->event_ring,
+ xhci_debugfs_create_ring_dir(xhci, &xhci->event_ring,
"event-ring",
xhci->debugfs_root);
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 7ef1274ef7f7..1aad89b8aba0 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -178,6 +178,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
xhci->quirks |= XHCI_BROKEN_STREAMS;
}
if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
+ pdev->device == 0x0014)
+ xhci->quirks |= XHCI_TRUST_TX_LENGTH;
+ if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
pdev->device == 0x0015)
xhci->quirks |= XHCI_RESET_ON_RESUME;
if (pdev->vendor == PCI_VENDOR_ID_VIA)
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 2424d3020ca3..da6dbe3ebd8b 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -3525,8 +3525,6 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
struct xhci_slot_ctx *slot_ctx;
int i, ret;
- xhci_debugfs_remove_slot(xhci, udev->slot_id);
-
#ifndef CONFIG_USB_DEFAULT_PERSIST
/*
* We called pm_runtime_get_noresume when the device was attached.
@@ -3555,8 +3553,10 @@ static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
}
ret = xhci_disable_slot(xhci, udev->slot_id);
- if (ret)
+ if (ret) {
+ xhci_debugfs_remove_slot(xhci, udev->slot_id);
xhci_free_virt_device(xhci, udev->slot_id);
+ }
}
int xhci_disable_slot(struct xhci_hcd *xhci, u32 slot_id)
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 1aba9105b369..fc68952c994a 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1013,6 +1013,7 @@ static const struct usb_device_id id_table_combined[] = {
.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) },
{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) },
+ { USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) },
{ } /* Terminating entry */
};
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 4faa09fe308c..8b4ecd2bd297 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -915,6 +915,12 @@
#define ICPDAS_I7563U_PID 0x0105
/*
+ * Airbus Defence and Space
+ */
+#define AIRBUS_DS_VID 0x1e8e /* Vendor ID */
+#define AIRBUS_DS_P8GR 0x6001 /* Tetra P8GR */
+
+/*
* RT Systems programming cables for various ham radios
*/
#define RTSYSTEMS_VID 0x2100 /* Vendor ID */
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 3b3513874cfd..b6320e3be429 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -233,6 +233,8 @@ static void option_instat_callback(struct urb *urb);
/* These Quectel products use Qualcomm's vendor ID */
#define QUECTEL_PRODUCT_UC20 0x9003
#define QUECTEL_PRODUCT_UC15 0x9090
+/* These Yuga products use Qualcomm's vendor ID */
+#define YUGA_PRODUCT_CLM920_NC5 0x9625
#define QUECTEL_VENDOR_ID 0x2c7c
/* These Quectel products use Quectel's vendor ID */
@@ -280,6 +282,7 @@ static void option_instat_callback(struct urb *urb);
#define TELIT_PRODUCT_LE922_USBCFG3 0x1043
#define TELIT_PRODUCT_LE922_USBCFG5 0x1045
#define TELIT_PRODUCT_ME910 0x1100
+#define TELIT_PRODUCT_ME910_DUAL_MODEM 0x1101
#define TELIT_PRODUCT_LE920 0x1200
#define TELIT_PRODUCT_LE910 0x1201
#define TELIT_PRODUCT_LE910_USBCFG4 0x1206
@@ -645,6 +648,11 @@ static const struct option_blacklist_info telit_me910_blacklist = {
.reserved = BIT(1) | BIT(3),
};
+static const struct option_blacklist_info telit_me910_dual_modem_blacklist = {
+ .sendsetup = BIT(0),
+ .reserved = BIT(3),
+};
+
static const struct option_blacklist_info telit_le910_blacklist = {
.sendsetup = BIT(0),
.reserved = BIT(1) | BIT(2),
@@ -674,6 +682,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = {
.reserved = BIT(4) | BIT(5),
};
+static const struct option_blacklist_info yuga_clm920_nc5_blacklist = {
+ .reserved = BIT(1) | BIT(4),
+};
+
static const struct usb_device_id option_ids[] = {
{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
@@ -1178,6 +1190,9 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)},
{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20),
.driver_info = (kernel_ulong_t)&net_intf4_blacklist },
+ /* Yuga products use Qualcomm vendor ID */
+ { USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5),
+ .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist },
/* Quectel products using Quectel vendor ID */
{ USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21),
.driver_info = (kernel_ulong_t)&net_intf4_blacklist },
@@ -1244,6 +1259,8 @@ static const struct usb_device_id option_ids[] = {
.driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910),
.driver_info = (kernel_ulong_t)&telit_me910_blacklist },
+ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM),
+ .driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
.driver_info = (kernel_ulong_t)&telit_le910_blacklist },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),
diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
index e3892541a489..613f91add03d 100644
--- a/drivers/usb/serial/qcserial.c
+++ b/drivers/usb/serial/qcserial.c
@@ -162,6 +162,8 @@ static const struct usb_device_id id_table[] = {
{DEVICE_SWI(0x1199, 0x9079)}, /* Sierra Wireless EM74xx */
{DEVICE_SWI(0x1199, 0x907a)}, /* Sierra Wireless EM74xx QDL */
{DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */
+ {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */
+ {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */
{DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
{DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
{DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
@@ -342,6 +344,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id)
break;
case 2:
dev_dbg(dev, "NMEA GPS interface found\n");
+ sendsetup = true;
break;
case 3:
dev_dbg(dev, "Modem port found\n");
diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
index a3df8ee82faf..e31a6f204397 100644
--- a/drivers/usb/usbip/stub_dev.c
+++ b/drivers/usb/usbip/stub_dev.c
@@ -149,8 +149,7 @@ static void stub_shutdown_connection(struct usbip_device *ud)
* step 1?
*/
if (ud->tcp_socket) {
- dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n",
- ud->tcp_socket);
+ dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd);
kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
}
diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index 4f48b306713f..c31c8402a0c5 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -237,11 +237,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev)
struct stub_priv *priv;
struct urb *urb;
- dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev);
+ dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n");
while ((priv = stub_priv_pop(sdev))) {
urb = priv->urb;
- dev_dbg(&sdev->udev->dev, "free urb %p\n", urb);
+ dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n",
+ priv->seqnum);
usb_kill_urb(urb);
kmem_cache_free(stub_priv_cache, priv);
diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
index 493ac2928391..6c5a59313999 100644
--- a/drivers/usb/usbip/stub_rx.c
+++ b/drivers/usb/usbip/stub_rx.c
@@ -211,9 +211,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
if (priv->seqnum != pdu->u.cmd_unlink.seqnum)
continue;
- dev_info(&priv->urb->dev->dev, "unlink urb %p\n",
- priv->urb);
-
/*
* This matched urb is not completed yet (i.e., be in
* flight in usb hcd hardware/driver). Now we are
@@ -252,8 +249,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
ret = usb_unlink_urb(priv->urb);
if (ret != -EINPROGRESS)
dev_err(&priv->urb->dev->dev,
- "failed to unlink a urb %p, ret %d\n",
- priv->urb, ret);
+ "failed to unlink a urb # %lu, ret %d\n",
+ priv->seqnum, ret);
return 0;
}
@@ -342,14 +339,6 @@ static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu)
epd = &ep->desc;
- /* validate transfer_buffer_length */
- if (pdu->u.cmd_submit.transfer_buffer_length > INT_MAX) {
- dev_err(&sdev->udev->dev,
- "CMD_SUBMIT: -EMSGSIZE transfer_buffer_length %d\n",
- pdu->u.cmd_submit.transfer_buffer_length);
- return -1;
- }
-
if (usb_endpoint_xfer_control(epd)) {
if (dir == USBIP_DIR_OUT)
return usb_sndctrlpipe(udev, epnum);
@@ -482,8 +471,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev,
}
/* allocate urb transfer buffer, if needed */
- if (pdu->u.cmd_submit.transfer_buffer_length > 0 &&
- pdu->u.cmd_submit.transfer_buffer_length <= INT_MAX) {
+ if (pdu->u.cmd_submit.transfer_buffer_length > 0) {
priv->urb->transfer_buffer =
kzalloc(pdu->u.cmd_submit.transfer_buffer_length,
GFP_KERNEL);
diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c
index 53172b1f6257..f0ec41a50cbc 100644
--- a/drivers/usb/usbip/stub_tx.c
+++ b/drivers/usb/usbip/stub_tx.c
@@ -88,7 +88,7 @@ void stub_complete(struct urb *urb)
/* link a urb to the queue of tx. */
spin_lock_irqsave(&sdev->priv_lock, flags);
if (sdev->ud.tcp_socket == NULL) {
- usbip_dbg_stub_tx("ignore urb for closed connection %p", urb);
+ usbip_dbg_stub_tx("ignore urb for closed connection\n");
/* It will be freed in stub_device_cleanup_urbs(). */
} else if (priv->unlinking) {
stub_enqueue_ret_unlink(sdev, priv->seqnum, urb->status);
@@ -190,8 +190,8 @@ static int stub_send_ret_submit(struct stub_device *sdev)
/* 1. setup usbip_header */
setup_ret_submit_pdu(&pdu_header, urb);
- usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
- pdu_header.base.seqnum, urb);
+ usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
+ pdu_header.base.seqnum);
usbip_header_correct_endian(&pdu_header, 1);
iov[iovnum].iov_base = &pdu_header;
diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
index f7978933b402..7b219d9109b4 100644
--- a/drivers/usb/usbip/usbip_common.c
+++ b/drivers/usb/usbip/usbip_common.c
@@ -317,26 +317,20 @@ int usbip_recv(struct socket *sock, void *buf, int size)
struct msghdr msg = {.msg_flags = MSG_NOSIGNAL};
int total = 0;
+ if (!sock || !buf || !size)
+ return -EINVAL;
+
iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size);
usbip_dbg_xmit("enter\n");
- if (!sock || !buf || !size) {
- pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf,
- size);
- return -EINVAL;
- }
-
do {
- int sz = msg_data_left(&msg);
+ msg_data_left(&msg);
sock->sk->sk_allocation = GFP_NOIO;
result = sock_recvmsg(sock, &msg, MSG_WAITALL);
- if (result <= 0) {
- pr_debug("receive sock %p buf %p size %u ret %d total %d\n",
- sock, buf + total, sz, result, total);
+ if (result <= 0)
goto err;
- }
total += result;
} while (msg_data_left(&msg));
diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
index 6b3278c4b72a..c3e1008aa491 100644
--- a/drivers/usb/usbip/vhci_hcd.c
+++ b/drivers/usb/usbip/vhci_hcd.c
@@ -656,9 +656,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
struct vhci_device *vdev;
unsigned long flags;
- usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n",
- hcd, urb, mem_flags);
-
if (portnum > VHCI_HC_PORTS) {
pr_err("invalid port number %d\n", portnum);
return -ENODEV;
@@ -822,8 +819,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
struct vhci_device *vdev;
unsigned long flags;
- pr_info("dequeue a urb %p\n", urb);
-
spin_lock_irqsave(&vhci->lock, flags);
priv = urb->hcpriv;
@@ -851,7 +846,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
/* tcp connection is closed */
spin_lock(&vdev->priv_lock);
- pr_info("device %p seems to be disconnected\n", vdev);
list_del(&priv->list);
kfree(priv);
urb->hcpriv = NULL;
@@ -863,8 +857,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
* vhci_rx will receive RET_UNLINK and give back the URB.
* Otherwise, we give back it here.
*/
- pr_info("gives back urb %p\n", urb);
-
usb_hcd_unlink_urb_from_ep(hcd, urb);
spin_unlock_irqrestore(&vhci->lock, flags);
@@ -892,8 +884,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
unlink->unlink_seqnum = priv->seqnum;
- pr_info("device %p seems to be still connected\n", vdev);
-
/* send cmd_unlink and try to cancel the pending URB in the
* peer */
list_add_tail(&unlink->list, &vdev->unlink_tx);
@@ -975,7 +965,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
/* need this? see stub_dev.c */
if (ud->tcp_socket) {
- pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket);
+ pr_debug("shutdown tcp_socket %d\n", ud->sockfd);
kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
}
diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c
index 90577e8b2282..112ebb90d8c9 100644
--- a/drivers/usb/usbip/vhci_rx.c
+++ b/drivers/usb/usbip/vhci_rx.c
@@ -23,24 +23,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum)
urb = priv->urb;
status = urb->status;
- usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n",
- urb, priv, seqnum);
+ usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum);
switch (status) {
case -ENOENT:
/* fall through */
case -ECONNRESET:
- dev_info(&urb->dev->dev,
- "urb %p was unlinked %ssynchronuously.\n", urb,
- status == -ENOENT ? "" : "a");
+ dev_dbg(&urb->dev->dev,
+ "urb seq# %u was unlinked %ssynchronuously\n",
+ seqnum, status == -ENOENT ? "" : "a");
break;
case -EINPROGRESS:
/* no info output */
break;
default:
- dev_info(&urb->dev->dev,
- "urb %p may be in a error, status %d\n", urb,
- status);
+ dev_dbg(&urb->dev->dev,
+ "urb seq# %u may be in a error, status %d\n",
+ seqnum, status);
}
list_del(&priv->list);
@@ -67,8 +66,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
spin_unlock_irqrestore(&vdev->priv_lock, flags);
if (!urb) {
- pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum);
- pr_info("max seqnum %d\n",
+ pr_err("cannot find a urb of seqnum %u max seqnum %d\n",
+ pdu->base.seqnum,
atomic_read(&vhci_hcd->seqnum));
usbip_event_add(ud, VDEV_EVENT_ERROR_TCP);
return;
@@ -91,7 +90,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
if (usbip_dbg_flag_vhci_rx)
usbip_dump_urb(urb);
- usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+ usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum);
spin_lock_irqsave(&vhci->lock, flags);
usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb);
@@ -158,7 +157,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev,
pr_info("the urb (seqnum %d) was already given back\n",
pdu->base.seqnum);
} else {
- usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+ usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum);
/* If unlink is successful, status is -ECONNRESET */
urb->status = pdu->u.ret_unlink.status;
diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c
index d625a2ff4b71..9aed15a358b7 100644
--- a/drivers/usb/usbip/vhci_tx.c
+++ b/drivers/usb/usbip/vhci_tx.c
@@ -69,7 +69,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
memset(&msg, 0, sizeof(msg));
memset(&iov, 0, sizeof(iov));
- usbip_dbg_vhci_tx("setup txdata urb %p\n", urb);
+ usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n",
+ priv->seqnum);
/* 1. setup usbip_header */
setup_cmd_submit_pdu(&pdu_header, urb);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 57b109c6e422..1f509d072026 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1165,6 +1165,10 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
+int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
+ u64 *values,
+ int num_counters,
+ size_t *offsets);
struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev);
void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up);
diff --git a/include/linux/pti.h b/include/linux/pti.h
new file mode 100644
index 000000000000..0174883a935a
--- /dev/null
+++ b/include/linux/pti.h
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _INCLUDE_PTI_H
+#define _INCLUDE_PTI_H
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#include <asm/pti.h>
+#else
+static inline void pti_init(void) { }
+#endif
+
+#endif
diff --git a/include/linux/tick.h b/include/linux/tick.h
index f442d1a42025..7cc35921218e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
extern ktime_t tick_nohz_get_sleep_length(void);
extern unsigned long tick_nohz_get_idle_calls(void);
+extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
#else /* !CONFIG_NO_HZ_COMMON */
diff --git a/include/net/sock.h b/include/net/sock.h
index 9155da422692..7a7b14e9628a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1514,6 +1514,11 @@ static inline bool sock_owned_by_user(const struct sock *sk)
return sk->sk_lock.owned;
}
+static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
+{
+ return sk->sk_lock.owned;
+}
+
/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index dc28a98ce97c..ae35991b5877 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1570,6 +1570,9 @@ int xfrm_init_state(struct xfrm_state *x);
int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb);
int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
+int xfrm_trans_queue(struct sk_buff *skb,
+ int (*finish)(struct net *, struct sock *,
+ struct sk_buff *));
int xfrm_output_resume(struct sk_buff *skb, int err);
int xfrm_output(struct sock *sk, struct sk_buff *skb);
int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb);
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07cccca6cbf1..ab34c561f26b 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -25,6 +25,35 @@
tcp_state_name(TCP_CLOSING), \
tcp_state_name(TCP_NEW_SYN_RECV))
+#define TP_STORE_V4MAPPED(__entry, saddr, daddr) \
+ do { \
+ struct in6_addr *pin6; \
+ \
+ pin6 = (struct in6_addr *)__entry->saddr_v6; \
+ ipv6_addr_set_v4mapped(saddr, pin6); \
+ pin6 = (struct in6_addr *)__entry->daddr_v6; \
+ ipv6_addr_set_v4mapped(daddr, pin6); \
+ } while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \
+ do { \
+ if (sk->sk_family == AF_INET6) { \
+ struct in6_addr *pin6; \
+ \
+ pin6 = (struct in6_addr *)__entry->saddr_v6; \
+ *pin6 = saddr6; \
+ pin6 = (struct in6_addr *)__entry->daddr_v6; \
+ *pin6 = daddr6; \
+ } else { \
+ TP_STORE_V4MAPPED(__entry, saddr, daddr); \
+ } \
+ } while (0)
+#else
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \
+ TP_STORE_V4MAPPED(__entry, saddr, daddr)
+#endif
+
/*
* tcp event with arguments sk and skb
*
@@ -50,7 +79,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
TP_fast_assign(
struct inet_sock *inet = inet_sk(sk);
- struct in6_addr *pin6;
__be32 *p32;
__entry->skbaddr = skb;
@@ -65,20 +93,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
p32 = (__be32 *) __entry->daddr;
*p32 = inet->inet_daddr;
-#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- *pin6 = sk->sk_v6_rcv_saddr;
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- *pin6 = sk->sk_v6_daddr;
- } else
-#endif
- {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
- }
+ TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+ sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
),
TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -127,7 +143,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
TP_fast_assign(
struct inet_sock *inet = inet_sk(sk);
- struct in6_addr *pin6;
__be32 *p32;
__entry->skaddr = sk;
@@ -141,20 +156,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
p32 = (__be32 *) __entry->daddr;
*p32 = inet->inet_daddr;
-#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- *pin6 = sk->sk_v6_rcv_saddr;
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- *pin6 = sk->sk_v6_daddr;
- } else
-#endif
- {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
- }
+ TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+ sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
),
TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -197,7 +200,6 @@ TRACE_EVENT(tcp_set_state,
TP_fast_assign(
struct inet_sock *inet = inet_sk(sk);
- struct in6_addr *pin6;
__be32 *p32;
__entry->skaddr = sk;
@@ -213,20 +215,8 @@ TRACE_EVENT(tcp_set_state,
p32 = (__be32 *) __entry->daddr;
*p32 = inet->inet_daddr;
-#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- *pin6 = sk->sk_v6_rcv_saddr;
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- *pin6 = sk->sk_v6_daddr;
- } else
-#endif
- {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
- }
+ TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+ sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
),
TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
@@ -256,7 +246,6 @@ TRACE_EVENT(tcp_retransmit_synack,
TP_fast_assign(
struct inet_request_sock *ireq = inet_rsk(req);
- struct in6_addr *pin6;
__be32 *p32;
__entry->skaddr = sk;
@@ -271,20 +260,8 @@ TRACE_EVENT(tcp_retransmit_synack,
p32 = (__be32 *) __entry->daddr;
*p32 = ireq->ir_rmt_addr;
-#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- *pin6 = ireq->ir_v6_loc_addr;
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- *pin6 = ireq->ir_v6_rmt_addr;
- } else
-#endif
- {
- pin6 = (struct in6_addr *)__entry->saddr_v6;
- ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6);
- pin6 = (struct in6_addr *)__entry->daddr_v6;
- ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6);
- }
+ TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr,
+ ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr);
),
TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
diff --git a/init/main.c b/init/main.c
index 7b606fc48482..a8100b954839 100644
--- a/init/main.c
+++ b/init/main.c
@@ -75,6 +75,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
+#include <linux/pti.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sched_clock.h>
@@ -506,6 +507,8 @@ static void __init mm_init(void)
ioremap_huge_init();
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
+ /* Should be run after espfix64 is set up. */
+ pti_init();
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..d6717a3331a1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
- unsigned long idle_calls = tick_nohz_get_idle_calls();
+ unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
bool ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..77555faf6fbc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -986,6 +986,19 @@ ktime_t tick_nohz_get_sleep_length(void)
}
/**
+ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
+ * for a particular CPU.
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
+{
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+
+ return ts->idle_calls;
+}
+
+/**
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index c3e84edc47c9..2615074d3de5 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -346,7 +346,8 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
static void zap_modalias_env(struct kobj_uevent_env *env)
{
static const char modalias_prefix[] = "MODALIAS=";
- int i;
+ size_t len;
+ int i, j;
for (i = 0; i < env->envp_idx;) {
if (strncmp(env->envp[i], modalias_prefix,
@@ -355,11 +356,18 @@ static void zap_modalias_env(struct kobj_uevent_env *env)
continue;
}
- if (i != env->envp_idx - 1)
- memmove(&env->envp[i], &env->envp[i + 1],
- sizeof(env->envp[i]) * env->envp_idx - 1);
+ len = strlen(env->envp[i]) + 1;
+
+ if (i != env->envp_idx - 1) {
+ memmove(env->envp[i], env->envp[i + 1],
+ env->buflen - len);
+
+ for (j = i; j < env->envp_idx - 1; j++)
+ env->envp[j] = env->envp[j + 1] - len;
+ }
env->envp_idx--;
+ env->buflen -= len;
}
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a3cb0be4c6f3..08f574081315 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1177,12 +1177,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
int i, new_frags;
u32 d_off;
- if (!num_frags)
- goto release;
-
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;
+ if (!num_frags)
+ goto release;
+
new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < new_frags; i++) {
page = alloc_page(gfp_mask);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index e50b7fea57ee..bcfc00e88756 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -23,6 +23,12 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
return xfrm4_extract_header(skb);
}
+static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dst_input(skb);
+}
+
static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
@@ -33,7 +39,11 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
iph->tos, skb->dev))
goto drop;
}
- return dst_input(skb);
+
+ if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2))
+ goto drop;
+
+ return 0;
drop:
kfree_skb(skb);
return NET_RX_DROP;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 416c8913f132..772695960890 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1014,6 +1014,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
eth_random_addr(dev->perm_addr);
}
+#define GRE6_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
+
+static void ip6gre_tnl_init_features(struct net_device *dev)
+{
+ struct ip6_tnl *nt = netdev_priv(dev);
+
+ dev->features |= GRE6_FEATURES;
+ dev->hw_features |= GRE6_FEATURES;
+
+ if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported, nor
+ * can we support 2 levels of outer headers requiring
+ * an update.
+ */
+ if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
+ nt->encap.type == TUNNEL_ENCAP_NONE) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
+
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
+ dev->features |= NETIF_F_LLTX;
+ }
+}
+
static int ip6gre_tunnel_init_common(struct net_device *dev)
{
struct ip6_tnl *tunnel;
@@ -1048,6 +1078,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
+ ip6gre_tnl_init_features(dev);
+
return 0;
}
@@ -1298,11 +1330,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
.ndo_get_iflink = ip6_tnl_get_iflink,
};
-#define GRE6_FEATURES (NETIF_F_SG | \
- NETIF_F_FRAGLIST | \
- NETIF_F_HIGHDMA | \
- NETIF_F_HW_CSUM)
-
static void ip6gre_tap_setup(struct net_device *dev)
{
@@ -1383,26 +1410,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
nt->net = dev_net(dev);
ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
- dev->features |= GRE6_FEATURES;
- dev->hw_features |= GRE6_FEATURES;
-
- if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP offload with GRE SEQ is not supported, nor
- * can we support 2 levels of outer headers requiring
- * an update.
- */
- if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
- (nt->encap.type == TUNNEL_ENCAP_NONE)) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- }
-
- /* Can use a lockless transmit, unless we generate
- * output sequences
- */
- dev->features |= NETIF_F_LLTX;
- }
-
err = register_netdevice(dev);
if (err)
goto out;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index fe04e23af986..841f4a07438e 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -32,6 +32,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
}
EXPORT_SYMBOL(xfrm6_rcv_spi);
+static int xfrm6_transport_finish2(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (xfrm_trans_queue(skb, ip6_rcv_finish))
+ __kfree_skb(skb);
+ return -1;
+}
+
int xfrm6_transport_finish(struct sk_buff *skb, int async)
{
struct xfrm_offload *xo = xfrm_offload(skb);
@@ -56,7 +64,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
- ip6_rcv_finish);
+ xfrm6_transport_finish2);
return -1;
}
diff --git a/net/rds/send.c b/net/rds/send.c
index b52cdc8ae428..f72466c63f0c 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
continue;
if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+ if (cmsg->cmsg_len <
+ CMSG_LEN(sizeof(struct rds_rdma_args)))
+ return -EINVAL;
args = CMSG_DATA(cmsg);
*rdma_bytes += args->remote_vec.bytes;
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b91ea03e3afa..b9d63d2246e6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -379,6 +379,8 @@ void tcf_block_put(struct tcf_block *block)
{
struct tcf_block_ext_info ei = {0, };
+ if (!block)
+ return;
tcf_block_put_ext(block, block->q, &ei);
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cd1b200acae7..661c7144b53a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1040,6 +1040,8 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
if (!tp_head) {
RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
+ /* Wait for flying RCU callback before it is freed. */
+ rcu_barrier_bh();
return;
}
@@ -1055,7 +1057,7 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
rcu_assign_pointer(*miniqp->p_miniq, miniq);
if (miniq_old)
- /* This is counterpart of the rcu barrier above. We need to
+ /* This is counterpart of the rcu barriers above. We need to
* block potential new user of miniq_old until all readers
* are not seeing it.
*/
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3253f724a995..b4fb6e4886d2 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4498,7 +4498,7 @@ static int sctp_init_sock(struct sock *sk)
SCTP_DBG_OBJCNT_INC(sock);
local_bh_disable();
- percpu_counter_inc(&sctp_sockets_allocated);
+ sk_sockets_allocated_inc(sk);
sock_prot_inuse_add(net, sk->sk_prot, 1);
/* Nothing can fail after this block, otherwise
@@ -4542,7 +4542,7 @@ static void sctp_destroy_sock(struct sock *sk)
}
sctp_endpoint_free(sp->ep);
local_bh_disable();
- percpu_counter_dec(&sctp_sockets_allocated);
+ sk_sockets_allocated_dec(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
local_bh_enable();
}
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index c5fda15ba319..1fdab5c4eda8 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -401,7 +401,7 @@ void strp_data_ready(struct strparser *strp)
* allows a thread in BH context to safely check if the process
* lock is held. In this case, if the lock is held, queue work.
*/
- if (sock_owned_by_user(strp->sk)) {
+ if (sock_owned_by_user_nocheck(strp->sk)) {
queue_work(strp_wq, &strp->work);
return;
}
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 47ec121574ce..c8001471da6c 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -324,6 +324,7 @@ restart:
if (res) {
pr_warn("Bearer <%s> rejected, enable failure (%d)\n",
name, -res);
+ kfree(b);
return -EINVAL;
}
@@ -347,8 +348,10 @@ restart:
if (skb)
tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr);
- if (tipc_mon_create(net, bearer_id))
+ if (tipc_mon_create(net, bearer_id)) {
+ bearer_disable(net, b);
return -ENOMEM;
+ }
pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
name,
diff --git a/net/tipc/group.c b/net/tipc/group.c
index 7ebbdeb2a90e..8e12ab55346b 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -368,18 +368,20 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack)
u16 prev = grp->bc_snd_nxt - 1;
struct tipc_member *m;
struct rb_node *n;
+ u16 ackers = 0;
for (n = rb_first(&grp->members); n; n = rb_next(n)) {
m = container_of(n, struct tipc_member, tree_node);
if (tipc_group_is_enabled(m)) {
tipc_group_update_member(m, len);
m->bc_acked = prev;
+ ackers++;
}
}
/* Mark number of acknowledges to expect, if any */
if (ack)
- grp->bc_ackers = grp->member_cnt;
+ grp->bc_ackers = ackers;
grp->bc_snd_nxt++;
}
@@ -848,17 +850,26 @@ void tipc_group_member_evt(struct tipc_group *grp,
*usr_wakeup = true;
m->usr_pending = false;
node_up = tipc_node_is_up(net, node);
-
- /* Hold back event if more messages might be expected */
- if (m->state != MBR_LEAVING && node_up) {
- m->event_msg = skb;
- tipc_group_decr_active(grp, m);
- m->state = MBR_LEAVING;
- } else {
- if (node_up)
+ m->event_msg = NULL;
+
+ if (node_up) {
+ /* Hold back event if a LEAVE msg should be expected */
+ if (m->state != MBR_LEAVING) {
+ m->event_msg = skb;
+ tipc_group_decr_active(grp, m);
+ m->state = MBR_LEAVING;
+ } else {
msg_set_grp_bc_seqno(hdr, m->bc_syncpt);
- else
+ __skb_queue_tail(inputq, skb);
+ }
+ } else {
+ if (m->state != MBR_LEAVING) {
+ tipc_group_decr_active(grp, m);
+ m->state = MBR_LEAVING;
msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt);
+ } else {
+ msg_set_grp_bc_seqno(hdr, m->bc_syncpt);
+ }
__skb_queue_tail(inputq, skb);
}
list_del_init(&m->list);
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 8e884ed06d4b..32dc33a94bc7 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -642,9 +642,13 @@ void tipc_mon_delete(struct net *net, int bearer_id)
{
struct tipc_net *tn = tipc_net(net);
struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
- struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *self;
struct tipc_peer *peer, *tmp;
+ if (!mon)
+ return;
+
+ self = get_self(net, bearer_id);
write_lock_bh(&mon->lock);
tn->monitors[bearer_id] = NULL;
list_for_each_entry_safe(peer, tmp, &self->list, list) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 41127d0b925e..3b4084480377 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -727,11 +727,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
switch (sk->sk_state) {
case TIPC_ESTABLISHED:
+ case TIPC_CONNECTING:
if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
revents |= POLLOUT;
/* fall thru' */
case TIPC_LISTEN:
- case TIPC_CONNECTING:
if (!skb_queue_empty(&sk->sk_receive_queue))
revents |= POLLIN | POLLRDNORM;
break;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 347ab31574d5..3f6f6f8c9fa5 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -8,15 +8,29 @@
*
*/
+#include <linux/bottom_half.h>
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/netdevice.h>
+#include <linux/percpu.h>
#include <net/dst.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ip_tunnels.h>
#include <net/ip6_tunnel.h>
+struct xfrm_trans_tasklet {
+ struct tasklet_struct tasklet;
+ struct sk_buff_head queue;
+};
+
+struct xfrm_trans_cb {
+ int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb);
+};
+
+#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0]))
+
static struct kmem_cache *secpath_cachep __read_mostly;
static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
@@ -25,6 +39,8 @@ static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
static struct gro_cells gro_cells;
static struct net_device xfrm_napi_dev;
+static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet);
+
int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
{
int err = 0;
@@ -207,7 +223,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
xfrm_address_t *daddr;
struct xfrm_mode *inner_mode;
u32 mark = skb->mark;
- unsigned int family;
+ unsigned int family = AF_UNSPEC;
int decaps = 0;
int async = 0;
bool xfrm_gro = false;
@@ -216,6 +232,16 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
if (encap_type < 0) {
x = xfrm_input_state(skb);
+
+ if (unlikely(x->km.state != XFRM_STATE_VALID)) {
+ if (x->km.state == XFRM_STATE_ACQ)
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
+ else
+ XFRM_INC_STATS(net,
+ LINUX_MIB_XFRMINSTATEINVALID);
+ goto drop;
+ }
+
family = x->outer_mode->afinfo->family;
/* An encap_type of -1 indicates async resumption. */
@@ -467,9 +493,41 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr)
}
EXPORT_SYMBOL(xfrm_input_resume);
+static void xfrm_trans_reinject(unsigned long data)
+{
+ struct xfrm_trans_tasklet *trans = (void *)data;
+ struct sk_buff_head queue;
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(&queue);
+ skb_queue_splice_init(&trans->queue, &queue);
+
+ while ((skb = __skb_dequeue(&queue)))
+ XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb);
+}
+
+int xfrm_trans_queue(struct sk_buff *skb,
+ int (*finish)(struct net *, struct sock *,
+ struct sk_buff *))
+{
+ struct xfrm_trans_tasklet *trans;
+
+ trans = this_cpu_ptr(&xfrm_trans_tasklet);
+
+ if (skb_queue_len(&trans->queue) >= netdev_max_backlog)
+ return -ENOBUFS;
+
+ XFRM_TRANS_SKB_CB(skb)->finish = finish;
+ skb_queue_tail(&trans->queue, skb);
+ tasklet_schedule(&trans->tasklet);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_trans_queue);
+
void __init xfrm_input_init(void)
{
int err;
+ int i;
init_dummy_netdev(&xfrm_napi_dev);
err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
@@ -480,4 +538,13 @@ void __init xfrm_input_init(void)
sizeof(struct sec_path),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+
+ for_each_possible_cpu(i) {
+ struct xfrm_trans_tasklet *trans;
+
+ trans = &per_cpu(xfrm_trans_tasklet, i);
+ __skb_queue_head_init(&trans->queue);
+ tasklet_init(&trans->tasklet, xfrm_trans_reinject,
+ (unsigned long)trans);
+ }
}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 9542975eb2f9..70aa5cb0c659 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1168,9 +1168,15 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
again:
pol = rcu_dereference(sk->sk_policy[dir]);
if (pol != NULL) {
- bool match = xfrm_selector_match(&pol->selector, fl, family);
+ bool match;
int err = 0;
+ if (pol->family != family) {
+ pol = NULL;
+ goto out;
+ }
+
+ match = xfrm_selector_match(&pol->selector, fl, family);
if (match) {
if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
pol = NULL;
@@ -1833,6 +1839,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
sizeof(struct xfrm_policy *) * num_pols) == 0 &&
xfrm_xdst_can_reuse(xdst, xfrm, err)) {
dst_hold(&xdst->u.dst);
+ xfrm_pols_put(pols, num_pols);
while (err > 0)
xfrm_state_put(xfrm[--err]);
return xdst;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 065d89606888..500b3391f474 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1343,6 +1343,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
if (orig->aead) {
x->aead = xfrm_algo_aead_clone(orig->aead);
+ x->geniv = orig->geniv;
if (!x->aead)
goto error;
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 983b0233767b..bdb48e5dba04 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1419,11 +1419,14 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
{
+ u16 prev_family;
int i;
if (nr > XFRM_MAX_DEPTH)
return -EINVAL;
+ prev_family = family;
+
for (i = 0; i < nr; i++) {
/* We never validated the ut->family value, so many
* applications simply leave it at zero. The check was
@@ -1435,6 +1438,12 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
if (!ut[i].family)
ut[i].family = family;
+ if ((ut[i].mode == XFRM_MODE_TRANSPORT) &&
+ (ut[i].family != prev_family))
+ return -EINVAL;
+
+ prev_family = ut[i].family;
+
switch (ut[i].family) {
case AF_INET:
break;
@@ -1445,6 +1454,21 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
default:
return -EINVAL;
}
+
+ switch (ut[i].id.proto) {
+ case IPPROTO_AH:
+ case IPPROTO_ESP:
+ case IPPROTO_COMP:
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+#endif
+ case IPSEC_PROTO_ANY:
+ break;
+ default:
+ return -EINVAL;
+ }
+
}
return 0;
@@ -2470,7 +2494,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_PROTO] = { .type = NLA_U8 },
[XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) },
[XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) },
- [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 },
+ [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 },
};
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
diff --git a/security/Kconfig b/security/Kconfig
index e8e449444e65..a623d13bf288 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,16 @@ config SECURITY_NETWORK
implement socket and networking access controls.
If you are unsure how to answer this question, answer N.
+config PAGE_TABLE_ISOLATION
+ bool "Remove the kernel mapping in user mode"
+ depends on X86_64 && !UML
+ help
+ This feature reduces the number of hardware side channels by
+ ensuring that the majority of kernel addresses are not mapped
+ into userspace.
+
+ See Documentation/x86/pagetable-isolation.txt for more details.
+
config SECURITY_INFINIBAND
bool "Infiniband Security Hooks"
depends on SECURITY && INFINIBAND
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index e2450c8e88e6..a8c3a33dd185 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -523,21 +523,23 @@ static int do_show(int argc, char **argv)
break;
p_err("can't get next map: %s%s", strerror(errno),
errno == EINVAL ? " -- kernel too old?" : "");
- return -1;
+ break;
}
fd = bpf_map_get_fd_by_id(id);
if (fd < 0) {
+ if (errno == ENOENT)
+ continue;
p_err("can't get map by id (%u): %s",
id, strerror(errno));
- return -1;
+ break;
}
err = bpf_obj_get_info_by_fd(fd, &info, &len);
if (err) {
p_err("can't get map info: %s", strerror(errno));
close(fd);
- return -1;
+ break;
}
if (json_output)
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index ad619b96c276..dded77345bfb 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -382,6 +382,8 @@ static int do_show(int argc, char **argv)
fd = bpf_prog_get_fd_by_id(id);
if (fd < 0) {
+ if (errno == ENOENT)
+ continue;
p_err("can't get prog by id (%u): %s",
id, strerror(errno));
err = -1;
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 05fc4e2e7b3a..9316e648a880 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -39,7 +39,7 @@ $(BPFOBJ): force
CLANG ?= clang
LLC ?= llc
-PROBE := $(shell llc -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
# Let newer LLVM versions transparently probe the kernel for availability
# of full BPF instruction set.
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 0304ffb714f2..1aef72df20a1 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
* NB: Different Linux versions do different things with the
* accessed bit in set_thread_area().
*/
- if (ar != expected_ar &&
- (ldt || ar != (expected_ar | AR_ACCESSED))) {
+ if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
(ldt ? "LDT" : "GDT"), index, ar, expected_ar);
nerrs++;
diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c
index 2b3d6d235015..3d7b42e77299 100644
--- a/tools/usb/usbip/src/utils.c
+++ b/tools/usb/usbip/src/utils.c
@@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add)
char command[SYSFS_BUS_ID_SIZE + 4];
char match_busid_attr_path[SYSFS_PATH_MAX];
int rc;
+ int cmd_size;
snprintf(match_busid_attr_path, sizeof(match_busid_attr_path),
"%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME,
@@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add)
attr_name);
if (add)
- snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid);
+ cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s",
+ busid);
else
- snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid);
+ cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s",
+ busid);
rc = write_sysfs_attribute(match_busid_attr_path, command,
- sizeof(command));
+ cmd_size);
if (rc < 0) {
dbg("failed to write match_busid: %s", strerror(errno));
return -1;

Privacy Policy