aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c10
-rw-r--r--kernel/audit.c9
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c102
-rw-r--r--kernel/bpf/bpf_inode_storage.c272
-rw-r--r--kernel/bpf/bpf_iter.c62
-rw-r--r--kernel/bpf/bpf_local_storage.c600
-rw-r--r--kernel/bpf/bpf_lsm.c21
-rw-r--r--kernel/bpf/bpf_struct_ops.c6
-rw-r--r--kernel/bpf/btf.c1221
-rw-r--r--kernel/bpf/core.c31
-rw-r--r--kernel/bpf/cpumap.c17
-rw-r--r--kernel/bpf/devmap.c17
-rw-r--r--kernel/bpf/hashtab.c37
-rw-r--r--kernel/bpf/helpers.c58
-rw-r--r--kernel/bpf/inode.c120
-rw-r--r--kernel/bpf/lpm_trie.c1
-rw-r--r--kernel/bpf/map_in_map.c24
-rw-r--r--kernel/bpf/map_in_map.h2
-rw-r--r--kernel/bpf/map_iter.c15
-rw-r--r--kernel/bpf/percpu_freelist.c101
-rw-r--r--kernel/bpf/percpu_freelist.h1
-rw-r--r--kernel/bpf/preload/.gitignore4
-rw-r--r--kernel/bpf/preload/Kconfig26
-rw-r--r--kernel/bpf/preload/Makefile25
-rw-r--r--kernel/bpf/preload/bpf_preload.h16
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c91
-rw-r--r--kernel/bpf/preload/bpf_preload_umd_blob.S7
-rw-r--r--kernel/bpf/preload/iterators/.gitignore2
-rw-r--r--kernel/bpf/preload/iterators/Makefile57
-rw-r--r--kernel/bpf/preload/iterators/README4
-rw-r--r--kernel/bpf/preload/iterators/bpf_preload_common.h13
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c114
-rw-r--r--kernel/bpf/preload/iterators/iterators.c94
-rw-r--r--kernel/bpf/preload/iterators/iterators.skel.h412
-rw-r--r--kernel/bpf/queue_stack_maps.c2
-rw-r--r--kernel/bpf/reuseport_array.c3
-rw-r--r--kernel/bpf/ringbuf.c1
-rw-r--r--kernel/bpf/stackmap.c6
-rw-r--r--kernel/bpf/syscall.c333
-rw-r--r--kernel/bpf/sysfs_btf.c6
-rw-r--r--kernel/bpf/task_iter.c15
-rw-r--r--kernel/bpf/trampoline.c63
-rw-r--r--kernel/bpf/verifier.c1388
-rw-r--r--kernel/cgroup/cgroup.c4
-rw-r--r--kernel/cgroup/cpuset.c2
-rw-r--r--kernel/cpu_pm.c48
-rw-r--r--kernel/dma/Kconfig25
-rw-r--r--kernel/dma/Makefile1
-rw-r--r--kernel/dma/coherent.c25
-rw-r--r--kernel/dma/contiguous.c153
-rw-r--r--kernel/dma/debug.c19
-rw-r--r--kernel/dma/debug.h122
-rw-r--r--kernel/dma/direct.c270
-rw-r--r--kernel/dma/direct.h119
-rw-r--r--kernel/dma/dummy.c3
-rw-r--r--kernel/dma/mapping.c159
-rw-r--r--kernel/dma/ops_helpers.c85
-rw-r--r--kernel/dma/pool.c5
-rw-r--r--kernel/dma/swiotlb.c12
-rw-r--r--kernel/dma/virt.c4
-rw-r--r--kernel/entry/common.c43
-rw-r--r--kernel/events/core.c126
-rw-r--r--kernel/exit.c15
-rw-r--r--kernel/fork.c54
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/irq/chip.c36
-rw-r--r--kernel/irq/debugfs.c4
-rw-r--r--kernel/irq/internals.h9
-rw-r--r--kernel/irq/irqdomain.c99
-rw-r--r--kernel/irq/msi.c83
-rw-r--r--kernel/irq/pm.c34
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq/resend.c15
-rw-r--r--kernel/irq/settings.h7
-rw-r--r--kernel/irq/timings.c2
-rw-r--r--kernel/jump_label.c12
-rw-r--r--kernel/kcsan/core.c210
-rw-r--r--kernel/kcsan/debugfs.c130
-rw-r--r--kernel/kcsan/encoding.h2
-rw-r--r--kernel/kcsan/kcsan-test.c128
-rw-r--r--kernel/kcsan/kcsan.h12
-rw-r--r--kernel/kcsan/report.c10
-rw-r--r--kernel/kcsan/selftest.c8
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c21
-rw-r--r--kernel/kprobes.c168
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/livepatch/state.c2
-rw-r--r--kernel/locking/lockdep.c990
-rw-r--r--kernel/locking/lockdep_internals.h9
-rw-r--r--kernel/locking/percpu-rwsem.c4
-rw-r--r--kernel/module.c39
-rw-r--r--kernel/notifier.c144
-rw-r--r--kernel/padata.c5
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/params.c17
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/power/hibernate.c50
-rw-r--r--kernel/power/main.c8
-rw-r--r--kernel/power/power.h3
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/suspend.c14
-rw-r--r--kernel/power/swap.c36
-rw-r--r--kernel/power/user.c40
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h4
-rw-r--r--kernel/printk/printk.c1153
-rw-r--r--kernel/printk/printk_ringbuffer.c2083
-rw-r--r--kernel/printk/printk_ringbuffer.h382
-rw-r--r--kernel/printk/printk_safe.c4
-rw-r--r--kernel/range.c3
-rw-r--r--kernel/rcu/rcu.h2
-rw-r--r--kernel/rcu/tasks.h55
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/rcu/update.c2
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/resource.c121
-rw-r--r--kernel/sched/core.c13
-rw-r--r--kernel/sched/cpufreq_schedutil.c18
-rw-r--r--kernel/sched/deadline.c34
-rw-r--r--kernel/sched/debug.c56
-rw-r--r--kernel/sched/fair.c103
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/membarrier.c136
-rw-r--r--kernel/sched/topology.c69
-rw-r--r--kernel/seccomp.c88
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c1
-rw-r--r--kernel/stackleak.c2
-rw-r--r--kernel/stacktrace.c8
-rw-r--r--kernel/static_call.c482
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/taskstats.c40
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/sched_clock.c6
-rw-r--r--kernel/time/timekeeping.c119
-rw-r--r--kernel/time/timer.c6
-rw-r--r--kernel/trace/blktrace.c13
-rw-r--r--kernel/trace/bpf_trace.c180
-rw-r--r--kernel/trace/fgraph.c8
-rw-r--r--kernel/trace/ftrace.c36
-rw-r--r--kernel/trace/ring_buffer.c10
-rw-r--r--kernel/trace/synth_event_gen_test.c18
-rw-r--r--kernel/trace/trace.c431
-rw-r--r--kernel/trace/trace.h31
-rw-r--r--kernel/trace/trace_boot.c23
-rw-r--r--kernel/trace/trace_dynevent.c10
-rw-r--r--kernel/trace/trace_events.c142
-rw-r--r--kernel/trace/trace_events_hist.c46
-rw-r--r--kernel/trace/trace_events_synth.c413
-rw-r--r--kernel/trace/trace_functions.c22
-rw-r--r--kernel/trace/trace_functions_graph.c8
-rw-r--r--kernel/trace/trace_hwlat.c8
-rw-r--r--kernel/trace/trace_kprobe.c41
-rw-r--r--kernel/trace/trace_output.c12
-rw-r--r--kernel/trace/trace_preemptirq.c4
-rw-r--r--kernel/trace/trace_printk.c12
-rw-r--r--kernel/trace/trace_probe.h14
-rw-r--r--kernel/trace/trace_stack.c12
-rw-r--r--kernel/trace/trace_stat.c8
-rw-r--r--kernel/trace/trace_synth.h6
-rw-r--r--kernel/trace/trace_uprobe.c24
-rw-r--r--kernel/trace/tracing_map.c2
-rw-r--r--kernel/tracepoint.c39
-rw-r--r--kernel/umh.c9
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/workqueue.c4
176 files changed, 12698 insertions, 3017 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 9a20016d4900..e5bc66a94b70 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,7 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o regset.o
-obj-$(CONFIG_BPFILTER) += usermode_driver.o
+obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
@@ -111,6 +111,7 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/acct.c b/kernel/acct.c
index b0c5b3a9f5af..f175df8f6aa4 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -25,7 +25,7 @@
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
- * Fixed a nasty interaction with with sys_umount(). If the accointing
+ * Fixed a nasty interaction with sys_umount(). If the accounting
* was suspeneded we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
@@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex);
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
- * Returns 0 for success or negative errno values for failure.
- *
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
+ *
+ * Returns: 0 for success or negative errno values for failure.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
@@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns)
}
/**
- * acct_process
- *
- * handles process accounting for an exiting task
+ * acct_process - handles process accounting for an exiting task
*/
void acct_process(void)
{
diff --git a/kernel/audit.c b/kernel/audit.c
index 7efaece534a9..68cee3bc8cfe 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -123,9 +123,9 @@ static u32 audit_backlog_limit = 64;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
-kuid_t audit_sig_uid = INVALID_UID;
-pid_t audit_sig_pid = -1;
-u32 audit_sig_sid = 0;
+static kuid_t audit_sig_uid = INVALID_UID;
+static pid_t audit_sig_pid = -1;
+static u32 audit_sig_sid;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -934,8 +934,7 @@ static void audit_free_reply(struct audit_reply *reply)
if (!reply)
return;
- if (reply->skb)
- kfree_skb(reply->skb);
+ kfree_skb(reply->skb);
if (reply->net)
put_net(reply->net);
kfree(reply);
diff --git a/kernel/audit.h b/kernel/audit.h
index ddc22878433d..3b9c0945225a 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -327,10 +327,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
-extern pid_t audit_sig_pid;
-extern kuid_t audit_sig_uid;
-extern u32 audit_sig_sid;
-
extern int audit_filter(int msgtype, unsigned int listtype);
extern void audit_ctl_lock(void);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6eb9c0402da..bdc8cd1b6767 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -12,6 +13,7 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
@@ -29,3 +31,4 @@ ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
+obj-$(CONFIG_BPF_PRELOAD) += preload/
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8ff419b632a6..c6c81eceb68f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -10,11 +10,13 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
#include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)
+ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
+ BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
static void bpf_array_free_percpu(struct bpf_array *array)
{
@@ -60,7 +62,11 @@ int array_map_alloc_check(union bpf_attr *attr)
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
- attr->map_flags & BPF_F_MMAPABLE)
+ attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
+ return -EINVAL;
+
+ if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+ attr->map_flags & BPF_F_PRESERVE_ELEMS)
return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -208,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
-static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
@@ -217,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
if (!map->bypass_spec_v1) {
@@ -487,6 +496,15 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
vma->vm_pgoff + pgoff);
}
+static bool array_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ if (!bpf_map_meta_equal(meta0, meta1))
+ return false;
+ return meta0->map_flags & BPF_F_INNER_MAP ? true :
+ meta0->max_entries == meta1->max_entries;
+}
+
struct bpf_iter_seq_array_map_info {
struct bpf_map *map;
void *percpu_value_buf;
@@ -625,6 +643,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
+ .map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
@@ -647,6 +666,7 @@ const struct bpf_map_ops array_map_ops = {
static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
@@ -888,6 +908,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
+ u8 *old_addr, *new_addr, *old_bypass_addr;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
@@ -908,12 +929,13 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* there could be danger of use after free otherwise.
* 2) Initially when we start tracking aux, the program
* is not JITed yet and also does not have a kallsyms
- * entry. We skip these as poke->ip_stable is not
- * active yet. The JIT will do the final fixup before
- * setting it stable. The various poke->ip_stable are
- * successively activated, so tail call updates can
- * arrive from here while JIT is still finishing its
- * final fixup for non-activated poke entries.
+ * entry. We skip these as poke->tailcall_target_stable
+ * is not active yet. The JIT will do the final fixup
+ * before setting it stable. The various
+ * poke->tailcall_target_stable are successively
+ * activated, so tail call updates can arrive from here
+ * while JIT is still finishing its final fixup for
+ * non-activated poke entries.
* 3) On program teardown, the program's kallsym entry gets
* removed out of RCU callback, but we can only untrack
* from sleepable context, therefore bpf_arch_text_poke()
@@ -930,7 +952,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* 5) Any other error happening below from bpf_arch_text_poke()
* is a unexpected bug.
*/
- if (!READ_ONCE(poke->ip_stable))
+ if (!READ_ONCE(poke->tailcall_target_stable))
continue;
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
continue;
@@ -938,12 +960,39 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
poke->tail_call.key != key)
continue;
- ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
- old ? (u8 *)old->bpf_func +
- poke->adj_off : NULL,
- new ? (u8 *)new->bpf_func +
- poke->adj_off : NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
+ old_bypass_addr = old ? NULL : poke->bypass_addr;
+ old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+ new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+ if (new) {
+ ret = bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, new_addr);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ if (!old) {
+ ret = bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ poke->bypass_addr,
+ NULL);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ }
+ } else {
+ ret = bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ old_bypass_addr,
+ poke->bypass_addr);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ /* let other CPUs finish the execution of program
+ * so that it will not possible to expose them
+ * to invalid nop, stack unwind, nop state
+ */
+ if (!ret)
+ synchronize_rcu();
+ ret = bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, NULL);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ }
}
}
}
@@ -1003,6 +1052,11 @@ static void prog_array_map_free(struct bpf_map *map)
fd_array_map_free(map);
}
+/* prog_array->aux->{type,jited} is a runtime binding.
+ * Doing static check alone in the verifier is not enough.
+ * Thus, prog_array_map cannot be used as an inner_map
+ * and map_meta_equal is not implemented.
+ */
static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
@@ -1090,6 +1144,9 @@ static void perf_event_fd_array_release(struct bpf_map *map,
struct bpf_event_entry *ee;
int i;
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ return;
+
rcu_read_lock();
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
@@ -1099,11 +1156,19 @@ static void perf_event_fd_array_release(struct bpf_map *map,
rcu_read_unlock();
}
+static void perf_event_fd_array_map_free(struct bpf_map *map)
+{
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
- .map_free = fd_array_map_free,
+ .map_free = perf_event_fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
@@ -1137,6 +1202,7 @@ static void cgroup_fd_array_free(struct bpf_map *map)
static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
@@ -1190,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
-static u32 array_of_map_gen_lookup(struct bpf_map *map,
+static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
new file mode 100644
index 000000000000..6edff97ad594
--- /dev/null
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
+#include <linux/fdtable.h>
+
+DEFINE_BPF_STORAGE_CACHE(inode_cache);
+
+static struct bpf_local_storage __rcu **
+inode_storage_ptr(void *owner)
+{
+ struct inode *inode = owner;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+ return &bsb->storage;
+}
+
+static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
+ struct bpf_map *map,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage *inode_storage;
+ struct bpf_local_storage_map *smap;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+
+ inode_storage = rcu_dereference(bsb->storage);
+ if (!inode_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
+}
+
+void bpf_inode_storage_free(struct inode *inode)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *local_storage;
+ bool free_inode_storage = false;
+ struct bpf_storage_blob *bsb;
+ struct hlist_node *n;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return;
+
+ rcu_read_lock();
+
+ local_storage = rcu_dereference(bsb->storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Netiher the bpf_prog nor the bpf-map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added-to or deleted-from the
+ * local_storage->list by the bpf_prog or by the bpf-map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ raw_spin_lock_bh(&local_storage->lock);
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ free_inode_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, false);
+ }
+ raw_spin_unlock_bh(&local_storage->lock);
+ rcu_read_unlock();
+
+ /* free_inoode_storage should always be true as long as
+ * local_storage->list was non-empty.
+ */
+ if (free_inode_storage)
+ kfree_rcu(local_storage, rcu);
+}
+
+static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct file *f;
+ int fd;
+
+ fd = *(int *)key;
+ f = fget_raw(fd);
+ if (!f)
+ return NULL;
+
+ sdata = inode_storage_lookup(f->f_inode, map, true);
+ fput(f);
+ return sdata ? sdata->data : NULL;
+}
+
+static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct file *f;
+ int fd;
+
+ fd = *(int *)key;
+ f = fget_raw(fd);
+ if (!f || !inode_storage_ptr(f->f_inode))
+ return -EBADF;
+
+ sdata = bpf_local_storage_update(f->f_inode,
+ (struct bpf_local_storage_map *)map,
+ value, map_flags);
+ fput(f);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = inode_storage_lookup(inode, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata));
+
+ return 0;
+}
+
+static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct file *f;
+ int fd, err;
+
+ fd = *(int *)key;
+ f = fget_raw(fd);
+ if (!f)
+ return -EBADF;
+
+ err = inode_storage_delete(f->f_inode, map);
+ fput(f);
+ return err;
+}
+
+BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ /* explicitly check that the inode_storage_ptr is not
+ * NULL as inode_storage_lookup returns NULL in this case and
+ * bpf_local_storage_update expects the owner to have a
+ * valid storage pointer.
+ */
+ if (!inode_storage_ptr(inode))
+ return (unsigned long)NULL;
+
+ sdata = inode_storage_lookup(inode, map, true);
+ if (sdata)
+ return (unsigned long)sdata->data;
+
+ /* This helper must only called from where the inode is gurranteed
+ * to have a refcount and cannot be freed.
+ */
+ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+ sdata = bpf_local_storage_update(
+ inode, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST);
+ return IS_ERR(sdata) ? (unsigned long)NULL :
+ (unsigned long)sdata->data;
+ }
+
+ return (unsigned long)NULL;
+}
+
+BPF_CALL_2(bpf_inode_storage_delete,
+ struct bpf_map *, map, struct inode *, inode)
+{
+ /* This helper must only called from where the inode is gurranteed
+ * to have a refcount and cannot be freed.
+ */
+ return inode_storage_delete(inode, map);
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
+ return &smap->map;
+}
+
+static void inode_storage_map_free(struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
+ bpf_local_storage_map_free(smap);
+}
+
+static int inode_storage_map_btf_id;
+const struct bpf_map_ops inode_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = inode_storage_map_alloc,
+ .map_free = inode_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
+ .map_update_elem = bpf_fd_inode_storage_update_elem,
+ .map_delete_elem = bpf_fd_inode_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_name = "bpf_local_storage_map",
+ .map_btf_id = &inode_storage_map_btf_id,
+ .map_owner_storage_ptr = inode_storage_ptr,
+};
+
+BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode)
+
+const struct bpf_func_proto bpf_inode_storage_get_proto = {
+ .func = bpf_inode_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_inode_storage_delete_proto = {
+ .func = bpf_inode_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 8faa2ce89396..8f10e30ea0b0 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -88,8 +88,8 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
mutex_lock(&seq->lock);
if (!seq->buf) {
- seq->size = PAGE_SIZE;
- seq->buf = kmalloc(seq->size, GFP_KERNEL);
+ seq->size = PAGE_SIZE << 3;
+ seq->buf = kvmalloc(seq->size, GFP_KERNEL);
if (!seq->buf) {
err = -ENOMEM;
goto done;
@@ -390,10 +390,68 @@ out_unlock:
return ret;
}
+static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ bpf_iter_show_fdinfo_t show_fdinfo;
+
+ seq_printf(seq,
+ "target_name:\t%s\n",
+ iter_link->tinfo->reg_info->target);
+
+ show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
+ if (show_fdinfo)
+ show_fdinfo(&iter_link->aux, seq);
+}
+
+static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
+ bpf_iter_fill_link_info_t fill_link_info;
+ u32 ulen = info->iter.target_name_len;
+ const char *target_name;
+ u32 target_len;
+
+ if (!ulen ^ !ubuf)
+ return -EINVAL;
+
+ target_name = iter_link->tinfo->reg_info->target;
+ target_len = strlen(target_name);
+ info->iter.target_name_len = target_len + 1;
+
+ if (ubuf) {
+ if (ulen >= target_len + 1) {
+ if (copy_to_user(ubuf, target_name, target_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, target_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+ }
+
+ fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
+ if (fill_link_info)
+ return fill_link_info(&iter_link->aux, info);
+
+ return 0;
+}
+
static const struct bpf_link_ops bpf_iter_link_lops = {
.release = bpf_iter_link_release,
.dealloc = bpf_iter_link_dealloc,
.update_prog = bpf_iter_link_replace,
+ .show_fdinfo = bpf_iter_link_show_fdinfo,
+ .fill_link_info = bpf_iter_link_fill_link_info,
};
bool bpf_link_is_iter(struct bpf_link *link)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
new file mode 100644
index 000000000000..5d3a7af9ba9b
--- /dev/null
+++ b/kernel/bpf/bpf_local_storage.c
@@ -0,0 +1,600 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+
+#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
+static struct bpf_local_storage_map_bucket *
+select_bucket(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
+}
+
+static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (!map->ops->map_local_storage_charge)
+ return 0;
+
+ return map->ops->map_local_storage_charge(smap, owner, size);
+}
+
+static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
+ u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (map->ops->map_local_storage_uncharge)
+ map->ops->map_local_storage_uncharge(smap, owner, size);
+}
+
+static struct bpf_local_storage __rcu **
+owner_storage(struct bpf_local_storage_map *smap, void *owner)
+{
+ struct bpf_map *map = &smap->map;
+
+ return map->ops->map_owner_storage_ptr(owner);
+}
+
+static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->snode);
+}
+
+static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->map_node);
+}
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
+ void *value, bool charge_mem)
+{
+ struct bpf_local_storage_elem *selem;
+
+ if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+ return NULL;
+
+ selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (selem) {
+ if (value)
+ memcpy(SDATA(selem)->data, value, smap->map.value_size);
+ return selem;
+ }
+
+ if (charge_mem)
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ return NULL;
+}
+
+/* local_storage->lock must be held and selem->local_storage == local_storage.
+ * The caller must ensure selem->smap is still valid to be
+ * dereferenced for its smap->elem_size and smap->cache_idx.
+ */
+bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_mem)
+{
+ struct bpf_local_storage_map *smap;
+ bool free_local_storage;
+ void *owner;
+
+ smap = rcu_dereference(SDATA(selem)->smap);
+ owner = local_storage->owner;
+
+ /* All uncharging on the owner must be done first.
+ * The owner may be freed once the last selem is unlinked
+ * from local_storage.
+ */
+ if (uncharge_mem)
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ free_local_storage = hlist_is_singular_node(&selem->snode,
+ &local_storage->list);
+ if (free_local_storage) {
+ mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
+ local_storage->owner = NULL;
+
+ /* After this RCU_INIT, owner may be freed and cannot be used */
+ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+
+ /* local_storage is not freed now. local_storage->lock is
+ * still held and raw_spin_unlock_bh(&local_storage->lock)
+ * will be done by the caller.
+ *
+ * Although the unlock will be done under
+ * rcu_read_lock(), it is more intutivie to
+ * read if kfree_rcu(local_storage, rcu) is done
+ * after the raw_spin_unlock_bh(&local_storage->lock).
+ *
+ * Hence, a "bool free_local_storage" is returned
+ * to the caller which then calls the kfree_rcu()
+ * after unlock.
+ */
+ }
+ hlist_del_init_rcu(&selem->snode);
+ if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
+ SDATA(selem))
+ RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
+
+ kfree_rcu(selem, rcu);
+
+ return free_local_storage;
+}
+
+static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage *local_storage;
+ bool free_local_storage = false;
+
+ if (unlikely(!selem_linked_to_storage(selem)))
+ /* selem has already been unlinked from sk */
+ return;
+
+ local_storage = rcu_dereference(selem->local_storage);
+ raw_spin_lock_bh(&local_storage->lock);
+ if (likely(selem_linked_to_storage(selem)))
+ free_local_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, true);
+ raw_spin_unlock_bh(&local_storage->lock);
+
+ if (free_local_storage)
+ kfree_rcu(local_storage, rcu);
+}
+
+void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ RCU_INIT_POINTER(selem->local_storage, local_storage);
+ hlist_add_head_rcu(&selem->snode, &local_storage->list);
+}
+
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map *smap;
+ struct bpf_local_storage_map_bucket *b;
+
+ if (unlikely(!selem_linked_to_map(selem)))
+ /* selem has already be unlinked from smap */
+ return;
+
+ smap = rcu_dereference(SDATA(selem)->smap);
+ b = select_bucket(smap, selem);
+ raw_spin_lock_bh(&b->lock);
+ if (likely(selem_linked_to_map(selem)))
+ hlist_del_init_rcu(&selem->map_node);
+ raw_spin_unlock_bh(&b->lock);
+}
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+
+ raw_spin_lock_bh(&b->lock);
+ RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+ raw_spin_unlock_bh(&b->lock);
+}
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+{
+ /* Always unlink from map before unlinking from local_storage
+ * because selem will be freed after successfully unlinked from
+ * the local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ __bpf_selem_unlink_storage(selem);
+}
+
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage_data *sdata;
+ struct bpf_local_storage_elem *selem;
+
+ /* Fast path (cache hit) */
+ sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+ if (sdata && rcu_access_pointer(sdata->smap) == smap)
+ return sdata;
+
+ /* Slow path (cache miss) */
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+ if (rcu_access_pointer(SDATA(selem)->smap) == smap)
+ break;
+
+ if (!selem)
+ return NULL;
+
+ sdata = SDATA(selem);
+ if (cacheit_lockit) {
+ /* spinlock is needed to avoid racing with the
+ * parallel delete. Otherwise, publishing an already
+ * deleted sdata to the cache will become a use-after-free
+ * problem in the next bpf_local_storage_lookup().
+ */
+ raw_spin_lock_bh(&local_storage->lock);
+ if (selem_linked_to_storage(selem))
+ rcu_assign_pointer(local_storage->cache[smap->cache_idx],
+ sdata);
+ raw_spin_unlock_bh(&local_storage->lock);
+ }
+
+ return sdata;
+}
+
+static int check_flags(const struct bpf_local_storage_data *old_sdata,
+ u64 map_flags)
+{
+ if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
+int bpf_local_storage_alloc(void *owner,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *first_selem)
+{
+ struct bpf_local_storage *prev_storage, *storage;
+ struct bpf_local_storage **owner_storage_ptr;
+ int err;
+
+ err = mem_charge(smap, owner, sizeof(*storage));
+ if (err)
+ return err;
+
+ storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN);
+ if (!storage) {
+ err = -ENOMEM;
+ goto uncharge;
+ }
+
+ INIT_HLIST_HEAD(&storage->list);
+ raw_spin_lock_init(&storage->lock);
+ storage->owner = owner;
+
+ bpf_selem_link_storage_nolock(storage, first_selem);
+ bpf_selem_link_map(smap, first_selem);
+
+ owner_storage_ptr =
+ (struct bpf_local_storage **)owner_storage(smap, owner);
+ /* Publish storage to the owner.
+ * Instead of using any lock of the kernel object (i.e. owner),
+ * cmpxchg will work with any kernel object regardless what
+ * the running context is, bh, irq...etc.
+ *
+ * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
+ * is protected by the storage->lock. Hence, when freeing
+ * the owner->storage, the storage->lock must be held before
+ * setting owner->storage ptr to NULL.
+ */
+ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
+ if (unlikely(prev_storage)) {
+ bpf_selem_unlink_map(first_selem);
+ err = -EAGAIN;
+ goto uncharge;
+
+ /* Note that even first_selem was linked to smap's
+ * bucket->list, first_selem can be freed immediately
+ * (instead of kfree_rcu) because
+ * bpf_local_storage_map_free() does a
+ * synchronize_rcu() before walking the bucket->list.
+ * Hence, no one is accessing selem from the
+ * bucket->list under rcu_read_lock().
+ */
+ }
+
+ return 0;
+
+uncharge:
+ kfree(storage);
+ mem_uncharge(smap, owner, sizeof(*storage));
+ return err;
+}
+
+/* sk cannot be going away because it is linking new elem
+ * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
+ * Otherwise, it will become a leak (and other memory issues
+ * during map destruction).
+ */
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *old_sdata = NULL;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *local_storage;
+ int err;
+
+ /* BPF_EXIST and BPF_NOEXIST cannot be both set */
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
+ /* BPF_F_LOCK can only be used in a value with spin_lock */
+ unlikely((map_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(&smap->map)))
+ return ERR_PTR(-EINVAL);
+
+ local_storage = rcu_dereference(*owner_storage(smap, owner));
+ if (!local_storage || hlist_empty(&local_storage->list)) {
+ /* Very first elem for the owner */
+ err = check_flags(NULL, map_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ selem = bpf_selem_alloc(smap, owner, value, true);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+
+ err = bpf_local_storage_alloc(owner, smap, selem);
+ if (err) {
+ kfree(selem);
+ mem_uncharge(smap, owner, smap->elem_size);
+ return ERR_PTR(err);
+ }
+
+ return SDATA(selem);
+ }
+
+ if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
+ /* Hoping to find an old_sdata to do inline update
+ * such that it can avoid taking the local_storage->lock
+ * and changing the lists.
+ */
+ old_sdata =
+ bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ return ERR_PTR(err);
+ if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
+ copy_map_value_locked(&smap->map, old_sdata->data,
+ value, false);
+ return old_sdata;
+ }
+ }
+
+ raw_spin_lock_bh(&local_storage->lock);
+
+ /* Recheck local_storage->list under local_storage->lock */
+ if (unlikely(hlist_empty(&local_storage->list))) {
+ /* A parallel del is happening and local_storage is going
+ * away. It has just been checked before, so very
+ * unlikely. Return instead of retry to keep things
+ * simple.
+ */
+ err = -EAGAIN;
+ goto unlock_err;
+ }
+
+ old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ goto unlock_err;
+
+ if (old_sdata && (map_flags & BPF_F_LOCK)) {
+ copy_map_value_locked(&smap->map, old_sdata->data, value,
+ false);
+ selem = SELEM(old_sdata);
+ goto unlock;
+ }
+
+ /* local_storage->lock is held. Hence, we are sure
+ * we can unlink and uncharge the old_sdata successfully
+ * later. Hence, instead of charging the new selem now
+ * and then uncharge the old selem later (which may cause
+ * a potential but unnecessary charge failure), avoid taking
+ * a charge at all here (the "!old_sdata" check) and the
+ * old_sdata will not be uncharged later during
+ * bpf_selem_unlink_storage_nolock().
+ */
+ selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
+ if (!selem) {
+ err = -ENOMEM;
+ goto unlock_err;
+ }
+
+ /* First, link the new selem to the map */
+ bpf_selem_link_map(smap, selem);
+
+ /* Second, link (and publish) the new selem to local_storage */
+ bpf_selem_link_storage_nolock(local_storage, selem);
+
+ /* Third, remove old selem, SELEM(old_sdata) */
+ if (old_sdata) {
+ bpf_selem_unlink_map(SELEM(old_sdata));
+ bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
+ false);
+ }
+
+unlock:
+ raw_spin_unlock_bh(&local_storage->lock);
+ return SDATA(selem);
+
+unlock_err:
+ raw_spin_unlock_bh(&local_storage->lock);
+ return ERR_PTR(err);
+}
+
+u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+{
+ u64 min_usage = U64_MAX;
+ u16 i, res = 0;
+
+ spin_lock(&cache->idx_lock);
+
+ for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
+ if (cache->idx_usage_counts[i] < min_usage) {
+ min_usage = cache->idx_usage_counts[i];
+ res = i;
+
+ /* Found a free cache_idx */
+ if (!min_usage)
+ break;
+ }
+ }
+ cache->idx_usage_counts[res]++;
+
+ spin_unlock(&cache->idx_lock);
+
+ return res;
+}
+
+void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
+{
+ spin_lock(&cache->idx_lock);
+ cache->idx_usage_counts[idx]--;
+ spin_unlock(&cache->idx_lock);
+}
+
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned int i;
+
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
+ synchronize_rcu();
+
+ /* bpf prog and the userspace can no longer access this map
+ * now. No new selem (of this map) can be added
+ * to the owner->storage or to the map bucket's list.
+ *
+ * The elem of this map can be cleaned up here
+ * or when the storage is freed e.g.
+ * by bpf_sk_storage_free() during __sk_destruct().
+ */
+ for (i = 0; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+
+ rcu_read_lock();
+ /* No one is adding to b->list now */
+ while ((selem = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(&b->list)),
+ struct bpf_local_storage_elem, map_node))) {
+ bpf_selem_unlink(selem);
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+ }
+
+ /* While freeing the storage we may still need to access the map.
+ *
+ * e.g. when bpf_sk_storage_free() has unlinked selem from the map
+ * which then made the above while((selem = ...)) loop
+ * exit immediately.
+ *
+ * However, while freeing the storage one still needs to access the
+ * smap->elem_size to do the uncharging in
+ * bpf_selem_unlink_storage_nolock().
+ *
+ * Hence, wait another rcu grace period for the storage to be freed.
+ */
+ synchronize_rcu();
+
+ kvfree(smap->buckets);
+ kfree(smap);
+}
+
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
+ attr->key_size != sizeof(int) || !attr->value_size ||
+ /* Enforce BTF for userspace sk dumping */
+ !attr->btf_key_type_id || !attr->btf_value_type_id)
+ return -EINVAL;
+
+ if (!bpf_capable())
+ return -EPERM;
+
+ if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
+
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ u64 cost;
+ int ret;
+
+ smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+ cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
+
+ ret = bpf_map_charge_init(&smap->map.memory, cost);
+ if (ret < 0) {
+ kfree(smap);
+ return ERR_PTR(ret);
+ }
+
+ smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
+ GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ bpf_map_charge_finish(&smap->map.memory);
+ kfree(smap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size =
+ sizeof(struct bpf_local_storage_elem) + attr->value_size;
+
+ return smap;
+}
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ u32 int_data;
+
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
+
+ int_data = *(u32 *)(key_type + 1);
+ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+
+ return 0;
+}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index fb278144e9fd..78ea8a7bd27f 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -11,6 +11,8 @@
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
#include <linux/bpf_verifier.h>
+#include <net/bpf_sk_storage.h>
+#include <linux/bpf_local_storage.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@@ -45,10 +47,27 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
return 0;
}
+static const struct bpf_func_proto *
+bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_inode_storage_get:
+ return &bpf_inode_storage_get_proto;
+ case BPF_FUNC_inode_storage_delete:
+ return &bpf_inode_storage_delete_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
const struct bpf_prog_ops lsm_prog_ops = {
};
const struct bpf_verifier_ops lsm_verifier_ops = {
- .get_func_proto = tracing_prog_func_proto,
+ .get_func_proto = bpf_lsm_func_proto,
.is_valid_access = btf_ctx_access,
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 969c5d47f81f..4c3b543bb33b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -298,8 +298,7 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return -EINVAL;
mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- NULL, NULL);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype))
return PTR_ERR(mtype);
prev_mend = moff + msize;
@@ -396,8 +395,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
u32 msize;
mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- NULL, NULL);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype)) {
err = PTR_ERR(mtype);
goto reset_unlock;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 91afdd4c82e3..ed7d02e8bc93 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -21,6 +21,8 @@
#include <linux/btf_ids.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/btf_ids.h>
#include <net/sock.h>
/* BTF (BPF Type Format) is the meta data format which describes
@@ -186,11 +188,6 @@
i < btf_type_vlen(struct_type); \
i++, member++)
-#define for_each_vsi(i, struct_type, member) \
- for (i = 0, member = btf_type_var_secinfo(struct_type); \
- i < btf_type_vlen(struct_type); \
- i++, member++)
-
#define for_each_vsi_from(i, from, struct_type, member) \
for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@@ -282,6 +279,91 @@ static const char *btf_type_str(const struct btf_type *t)
return btf_kind_str[BTF_INFO_KIND(t->info)];
}
+/* Chunk size we use in safe copy of data to be shown. */
+#define BTF_SHOW_OBJ_SAFE_SIZE 32
+
+/*
+ * This is the maximum size of a base type value (equivalent to a
+ * 128-bit int); if we are at the end of our safe buffer and have
+ * less than 16 bytes space we can't be assured of being able
+ * to copy the next type safely, so in such cases we will initiate
+ * a new copy.
+ */
+#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16
+
+/* Type name size */
+#define BTF_SHOW_NAME_SIZE 80
+
+/*
+ * Common data to all BTF show operations. Private show functions can add
+ * their own data to a structure containing a struct btf_show and consult it
+ * in the show callback. See btf_type_show() below.
+ *
+ * One challenge with showing nested data is we want to skip 0-valued
+ * data, but in order to figure out whether a nested object is all zeros
+ * we need to walk through it. As a result, we need to make two passes
+ * when handling structs, unions and arrays; the first path simply looks
+ * for nonzero data, while the second actually does the display. The first
+ * pass is signalled by show->state.depth_check being set, and if we
+ * encounter a non-zero value we set show->state.depth_to_show to
+ * the depth at which we encountered it. When we have completed the
+ * first pass, we will know if anything needs to be displayed if
+ * depth_to_show > depth. See btf_[struct,array]_show() for the
+ * implementation of this.
+ *
+ * Another problem is we want to ensure the data for display is safe to
+ * access. To support this, the anonymous "struct {} obj" tracks the data
+ * object and our safe copy of it. We copy portions of the data needed
+ * to the object "copy" buffer, but because its size is limited to
+ * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we
+ * traverse larger objects for display.
+ *
+ * The various data type show functions all start with a call to
+ * btf_show_start_type() which returns a pointer to the safe copy
+ * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the
+ * raw data itself). btf_show_obj_safe() is responsible for
+ * using copy_from_kernel_nofault() to update the safe data if necessary
+ * as we traverse the object's data. skbuff-like semantics are
+ * used:
+ *
+ * - obj.head points to the start of the toplevel object for display
+ * - obj.size is the size of the toplevel object
+ * - obj.data points to the current point in the original data at
+ * which our safe data starts. obj.data will advance as we copy
+ * portions of the data.
+ *
+ * In most cases a single copy will suffice, but larger data structures
+ * such as "struct task_struct" will require many copies. The logic in
+ * btf_show_obj_safe() handles the logic that determines if a new
+ * copy_from_kernel_nofault() is needed.
+ */
+struct btf_show {
+ u64 flags;
+ void *target; /* target of show operation (seq file, buffer) */
+ void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ const struct btf *btf;
+ /* below are used during iteration */
+ struct {
+ u8 depth;
+ u8 depth_to_show;
+ u8 depth_check;
+ u8 array_member:1,
+ array_terminated:1;
+ u16 array_encoding;
+ u32 type_id;
+ int status; /* non-zero for error */
+ const struct btf_type *type;
+ const struct btf_member *member;
+ char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */
+ } state;
+ struct {
+ u32 size;
+ void *head;
+ void *data;
+ u8 safe[BTF_SHOW_OBJ_SAFE_SIZE];
+ } obj;
+};
+
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
@@ -298,9 +380,9 @@ struct btf_kind_operations {
const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
- void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+ void (*show)(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offsets,
- struct seq_file *m);
+ struct btf_show *show);
};
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
@@ -353,16 +435,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
-/* union is only a special case of struct:
- * all its offsetof(member) == 0
- */
-static bool btf_type_is_struct(const struct btf_type *t)
-{
- u8 kind = BTF_INFO_KIND(t->info);
-
- return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
-}
-
static bool __btf_type_is_struct(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
@@ -373,11 +445,6 @@ static bool btf_type_is_array(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
}
-static bool btf_type_is_var(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
-}
-
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -526,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}
-static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t)
-{
- return (const struct btf_var_secinfo *)(t + 1);
-}
-
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -677,6 +739,488 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
return true;
}
+/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */
+static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
+ u32 id)
+{
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t) &&
+ BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
+ id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ return t;
+}
+
+#define BTF_SHOW_MAX_ITER 10
+
+#define BTF_KIND_BIT(kind) (1ULL << kind)
+
+/*
+ * Populate show->state.name with type name information.
+ * Format of type name is
+ *
+ * [.member_name = ] (type_name)
+ */
+static const char *btf_show_name(struct btf_show *show)
+{
+ /* BTF_MAX_ITER array suffixes "[]" */
+ const char *array_suffixes = "[][][][][][][][][][]";
+ const char *array_suffix = &array_suffixes[strlen(array_suffixes)];
+ /* BTF_MAX_ITER pointer suffixes "*" */
+ const char *ptr_suffixes = "**********";
+ const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
+ const char *name = NULL, *prefix = "", *parens = "";
+ const struct btf_member *m = show->state.member;
+ const struct btf_type *t = show->state.type;
+ const struct btf_array *array;
+ u32 id = show->state.type_id;
+ const char *member = NULL;
+ bool show_member = false;
+ u64 kinds = 0;
+ int i;
+
+ show->state.name[0] = '\0';
+
+ /*
+ * Don't show type name if we're showing an array member;
+ * in that case we show the array type so don't need to repeat
+ * ourselves for each member.
+ */
+ if (show->state.array_member)
+ return "";
+
+ /* Retrieve member name, if any. */
+ if (m) {
+ member = btf_name_by_offset(show->btf, m->name_off);
+ show_member = strlen(member) > 0;
+ id = m->type;
+ }
+
+ /*
+ * Start with type_id, as we have resolved the struct btf_type *
+ * via btf_modifier_show() past the parent typedef to the child
+ * struct, int etc it is defined as. In such cases, the type_id
+ * still represents the starting type while the struct btf_type *
+ * in our show->state points at the resolved type of the typedef.
+ */
+ t = btf_type_by_id(show->btf, id);
+ if (!t)
+ return "";
+
+ /*
+ * The goal here is to build up the right number of pointer and
+ * array suffixes while ensuring the type name for a typedef
+ * is represented. Along the way we accumulate a list of
+ * BTF kinds we have encountered, since these will inform later
+ * display; for example, pointer types will not require an
+ * opening "{" for struct, we will just display the pointer value.
+ *
+ * We also want to accumulate the right number of pointer or array
+ * indices in the format string while iterating until we get to
+ * the typedef/pointee/array member target type.
+ *
+ * We start by pointing at the end of pointer and array suffix
+ * strings; as we accumulate pointers and arrays we move the pointer
+ * or array string backwards so it will show the expected number of
+ * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers
+ * and/or arrays and typedefs are supported as a precaution.
+ *
+ * We also want to get typedef name while proceeding to resolve
+ * type it points to so that we can add parentheses if it is a
+ * "typedef struct" etc.
+ */
+ for (i = 0; i < BTF_SHOW_MAX_ITER; i++) {
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ if (!name)
+ name = btf_name_by_offset(show->btf,
+ t->name_off);
+ kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF);
+ id = t->type;
+ break;
+ case BTF_KIND_ARRAY:
+ kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY);
+ parens = "[";
+ if (!t)
+ return "";
+ array = btf_type_array(t);
+ if (array_suffix > array_suffixes)
+ array_suffix -= 2;
+ id = array->type;
+ break;
+ case BTF_KIND_PTR:
+ kinds |= BTF_KIND_BIT(BTF_KIND_PTR);
+ if (ptr_suffix > ptr_suffixes)
+ ptr_suffix -= 1;
+ id = t->type;
+ break;
+ default:
+ id = 0;
+ break;
+ }
+ if (!id)
+ break;
+ t = btf_type_skip_qualifiers(show->btf, id);
+ }
+ /* We may not be able to represent this type; bail to be safe */
+ if (i == BTF_SHOW_MAX_ITER)
+ return "";
+
+ if (!name)
+ name = btf_name_by_offset(show->btf, t->name_off);
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ?
+ "struct" : "union";
+ /* if it's an array of struct/union, parens is already set */
+ if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY))))
+ parens = "{";
+ break;
+ case BTF_KIND_ENUM:
+ prefix = "enum";
+ break;
+ default:
+ break;
+ }
+
+ /* pointer does not require parens */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_PTR))
+ parens = "";
+ /* typedef does not require struct/union/enum prefix */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF))
+ prefix = "";
+
+ if (!name)
+ name = "";
+
+ /* Even if we don't want type name info, we want parentheses etc */
+ if (show->flags & BTF_SHOW_NONAME)
+ snprintf(show->state.name, sizeof(show->state.name), "%s",
+ parens);
+ else
+ snprintf(show->state.name, sizeof(show->state.name),
+ "%s%s%s(%s%s%s%s%s%s)%s",
+ /* first 3 strings comprise ".member = " */
+ show_member ? "." : "",
+ show_member ? member : "",
+ show_member ? " = " : "",
+ /* ...next is our prefix (struct, enum, etc) */
+ prefix,
+ strlen(prefix) > 0 && strlen(name) > 0 ? " " : "",
+ /* ...this is the type name itself */
+ name,
+ /* ...suffixed by the appropriate '*', '[]' suffixes */
+ strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix,
+ array_suffix, parens);
+
+ return show->state.name;
+}
+
+static const char *__btf_show_indent(struct btf_show *show)
+{
+ const char *indents = " ";
+ const char *indent = &indents[strlen(indents)];
+
+ if ((indent - show->state.depth) >= indents)
+ return indent - show->state.depth;
+ return indents;
+}
+
+static const char *btf_show_indent(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show);
+}
+
+static const char *btf_show_newline(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : "\n";
+}
+
+static const char *btf_show_delim(struct btf_show *show)
+{
+ if (show->state.depth == 0)
+ return "";
+
+ if ((show->flags & BTF_SHOW_COMPACT) && show->state.type &&
+ BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION)
+ return "|";
+
+ return ",";
+}
+
+__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!show->state.depth_check) {
+ va_start(args, fmt);
+ show->showfn(show, fmt, args);
+ va_end(args);
+ }
+}
+
+/* Macros are used here as btf_show_type_value[s]() prepends and appends
+ * format specifiers to the format specifier passed in; these do the work of
+ * adding indentation, delimiters etc while the caller simply has to specify
+ * the type value(s) in the format specifier + value(s).
+ */
+#define btf_show_type_value(show, fmt, value) \
+ do { \
+ if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \
+ show->state.depth == 0) { \
+ btf_show(show, "%s%s" fmt "%s%s", \
+ btf_show_indent(show), \
+ btf_show_name(show), \
+ value, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } \
+ } while (0)
+
+#define btf_show_type_values(show, fmt, ...) \
+ do { \
+ btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \
+ btf_show_name(show), \
+ __VA_ARGS__, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } while (0)
+
+/* How much is left to copy to safe buffer after @data? */
+static int btf_show_obj_size_left(struct btf_show *show, void *data)
+{
+ return show->obj.head + show->obj.size - data;
+}
+
+/* Is object pointed to by @data of @size already copied to our safe buffer? */
+static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size)
+{
+ return data >= show->obj.data &&
+ (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE);
+}
+
+/*
+ * If object pointed to by @data of @size falls within our safe buffer, return
+ * the equivalent pointer to the same safe data. Assumes
+ * copy_from_kernel_nofault() has already happened and our safe buffer is
+ * populated.
+ */
+static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size)
+{
+ if (btf_show_obj_is_safe(show, data, size))
+ return show->obj.safe + (data - show->obj.data);
+ return NULL;
+}
+
+/*
+ * Return a safe-to-access version of data pointed to by @data.
+ * We do this by copying the relevant amount of information
+ * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault().
+ *
+ * If BTF_SHOW_UNSAFE is specified, just return data as-is; no
+ * safe copy is needed.
+ *
+ * Otherwise we need to determine if we have the required amount
+ * of data (determined by the @data pointer and the size of the
+ * largest base type we can encounter (represented by
+ * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures
+ * that we will be able to print some of the current object,
+ * and if more is needed a copy will be triggered.
+ * Some objects such as structs will not fit into the buffer;
+ * in such cases additional copies when we iterate over their
+ * members may be needed.
+ *
+ * btf_show_obj_safe() is used to return a safe buffer for
+ * btf_show_start_type(); this ensures that as we recurse into
+ * nested types we always have safe data for the given type.
+ * This approach is somewhat wasteful; it's possible for example
+ * that when iterating over a large union we'll end up copying the
+ * same data repeatedly, but the goal is safety not performance.
+ * We use stack data as opposed to per-CPU buffers because the
+ * iteration over a type can take some time, and preemption handling
+ * would greatly complicate use of the safe buffer.
+ */
+static void *btf_show_obj_safe(struct btf_show *show,
+ const struct btf_type *t,
+ void *data)
+{
+ const struct btf_type *rt;
+ int size_left, size;
+ void *safe = NULL;
+
+ if (show->flags & BTF_SHOW_UNSAFE)
+ return data;
+
+ rt = btf_resolve_size(show->btf, t, &size);
+ if (IS_ERR(rt)) {
+ show->state.status = PTR_ERR(rt);
+ return NULL;
+ }
+
+ /*
+ * Is this toplevel object? If so, set total object size and
+ * initialize pointers. Otherwise check if we still fall within
+ * our safe object data.
+ */
+ if (show->state.depth == 0) {
+ show->obj.size = size;
+ show->obj.head = data;
+ } else {
+ /*
+ * If the size of the current object is > our remaining
+ * safe buffer we _may_ need to do a new copy. However
+ * consider the case of a nested struct; it's size pushes
+ * us over the safe buffer limit, but showing any individual
+ * struct members does not. In such cases, we don't need
+ * to initiate a fresh copy yet; however we definitely need
+ * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left
+ * in our buffer, regardless of the current object size.
+ * The logic here is that as we resolve types we will
+ * hit a base type at some point, and we need to be sure
+ * the next chunk of data is safely available to display
+ * that type info safely. We cannot rely on the size of
+ * the current object here because it may be much larger
+ * than our current buffer (e.g. task_struct is 8k).
+ * All we want to do here is ensure that we can print the
+ * next basic type, which we can if either
+ * - the current type size is within the safe buffer; or
+ * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in
+ * the safe buffer.
+ */
+ safe = __btf_show_obj_safe(show, data,
+ min(size,
+ BTF_SHOW_OBJ_BASE_TYPE_SIZE));
+ }
+
+ /*
+ * We need a new copy to our safe object, either because we haven't
+ * yet copied and are intializing safe data, or because the data
+ * we want falls outside the boundaries of the safe object.
+ */
+ if (!safe) {
+ size_left = btf_show_obj_size_left(show, data);
+ if (size_left > BTF_SHOW_OBJ_SAFE_SIZE)
+ size_left = BTF_SHOW_OBJ_SAFE_SIZE;
+ show->state.status = copy_from_kernel_nofault(show->obj.safe,
+ data, size_left);
+ if (!show->state.status) {
+ show->obj.data = data;
+ safe = show->obj.safe;
+ }
+ }
+
+ return safe;
+}
+
+/*
+ * Set the type we are starting to show and return a safe data pointer
+ * to be used for showing the associated data.
+ */
+static void *btf_show_start_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ show->state.type = t;
+ show->state.type_id = type_id;
+ show->state.name[0] = '\0';
+
+ return btf_show_obj_safe(show, t, data);
+}
+
+static void btf_show_end_type(struct btf_show *show)
+{
+ show->state.type = NULL;
+ show->state.type_id = 0;
+ show->state.name[0] = '\0';
+}
+
+static void *btf_show_start_aggr_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ void *safe_data = btf_show_start_type(show, t, type_id, data);
+
+ if (!safe_data)
+ return safe_data;
+
+ btf_show(show, "%s%s%s", btf_show_indent(show),
+ btf_show_name(show),
+ btf_show_newline(show));
+ show->state.depth++;
+ return safe_data;
+}
+
+static void btf_show_end_aggr_type(struct btf_show *show,
+ const char *suffix)
+{
+ show->state.depth--;
+ btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix,
+ btf_show_delim(show), btf_show_newline(show));
+ btf_show_end_type(show);
+}
+
+static void btf_show_start_member(struct btf_show *show,
+ const struct btf_member *m)
+{
+ show->state.member = m;
+}
+
+static void btf_show_start_array_member(struct btf_show *show)
+{
+ show->state.array_member = 1;
+ btf_show_start_member(show, NULL);
+}
+
+static void btf_show_end_member(struct btf_show *show)
+{
+ show->state.member = NULL;
+}
+
+static void btf_show_end_array_member(struct btf_show *show)
+{
+ show->state.array_member = 0;
+ btf_show_end_member(show);
+}
+
+static void *btf_show_start_array_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ u16 array_encoding,
+ void *data)
+{
+ show->state.array_encoding = array_encoding;
+ show->state.array_terminated = 0;
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_array_type(struct btf_show *show)
+{
+ show->state.array_encoding = 0;
+ show->state.array_terminated = 0;
+ btf_show_end_aggr_type(show, "]");
+}
+
+static void *btf_show_start_struct_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ void *data)
+{
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_struct_type(struct btf_show *show)
+{
+ btf_show_end_aggr_type(show, "}");
+}
+
__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
const char *fmt, ...)
{
@@ -1079,23 +1623,27 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
* *type_size: (x * y * sizeof(u32)). Hence, *type_size always
* corresponds to the return type.
* *elem_type: u32
+ * *elem_id: id of u32
* *total_nelems: (x * y). Hence, individual elem size is
* (*type_size / *total_nelems)
+ * *type_id: id of type if it's changed within the function, 0 if not
*
* type: is not an array (e.g. const struct X)
* return type: type "struct X"
* *type_size: sizeof(struct X)
* *elem_type: same as return type ("struct X")
+ * *elem_id: 0
* *total_nelems: 1
+ * *type_id: id of type if it's changed within the function, 0 if not
*/
-const struct btf_type *
-btf_resolve_size(const struct btf *btf, const struct btf_type *type,
- u32 *type_size, const struct btf_type **elem_type,
- u32 *total_nelems)
+static const struct btf_type *
+__btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size, const struct btf_type **elem_type,
+ u32 *elem_id, u32 *total_nelems, u32 *type_id)
{
const struct btf_type *array_type = NULL;
- const struct btf_array *array;
- u32 i, size, nelems = 1;
+ const struct btf_array *array = NULL;
+ u32 i, size, nelems = 1, id = 0;
for (i = 0; i < MAX_RESOLVE_DEPTH; i++) {
switch (BTF_INFO_KIND(type->info)) {
@@ -1116,6 +1664,7 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@@ -1146,10 +1695,21 @@ resolved:
*total_nelems = nelems;
if (elem_type)
*elem_type = type;
+ if (elem_id)
+ *elem_id = array ? array->type : 0;
+ if (type_id && id)
+ *type_id = id;
return array_type ? : type;
}
+const struct btf_type *
+btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size)
+{
+ return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL);
+}
+
/* The input param "type_id" must point to a needs_resolve type */
static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
u32 *type_id)
@@ -1250,11 +1810,11 @@ static int btf_df_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
-static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offsets,
- struct seq_file *m)
+static void btf_df_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct btf_show *show)
{
- seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+ btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
}
static int btf_int_check_member(struct btf_verifier_env *env,
@@ -1427,7 +1987,7 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
-static void btf_int128_print(struct seq_file *m, void *data)
+static void btf_int128_print(struct btf_show *show, void *data)
{
/* data points to a __int128 number.
* Suppose
@@ -1446,9 +2006,10 @@ static void btf_int128_print(struct seq_file *m, void *data)
lower_num = *(u64 *)data;
#endif
if (upper_num == 0)
- seq_printf(m, "0x%llx", lower_num);
+ btf_show_type_value(show, "0x%llx", lower_num);
else
- seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+ btf_show_type_values(show, "0x%llx%016llx", upper_num,
+ lower_num);
}
static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
@@ -1492,8 +2053,8 @@ static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
#endif
}
-static void btf_bitfield_seq_show(void *data, u8 bits_offset,
- u8 nr_bits, struct seq_file *m)
+static void btf_bitfield_show(void *data, u8 bits_offset,
+ u8 nr_bits, struct btf_show *show)
{
u16 left_shift_bits, right_shift_bits;
u8 nr_copy_bytes;
@@ -1513,14 +2074,14 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset,
right_shift_bits = BITS_PER_U128 - nr_bits;
btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
- btf_int128_print(m, print_num);
+ btf_int128_print(show, print_num);
}
-static void btf_int_bits_seq_show(const struct btf *btf,
- const struct btf_type *t,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_int_bits_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 nr_bits = BTF_INT_BITS(int_data);
@@ -1533,55 +2094,77 @@ static void btf_int_bits_seq_show(const struct btf *btf,
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
- btf_bitfield_seq_show(data, bits_offset, nr_bits, m);
+ btf_bitfield_show(data, bits_offset, nr_bits, show);
}
-static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_int_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 encoding = BTF_INT_ENCODING(int_data);
bool sign = encoding & BTF_INT_SIGNED;
u8 nr_bits = BTF_INT_BITS(int_data);
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
if (bits_offset || BTF_INT_OFFSET(int_data) ||
BITS_PER_BYTE_MASKED(nr_bits)) {
- btf_int_bits_seq_show(btf, t, data, bits_offset, m);
- return;
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ goto out;
}
switch (nr_bits) {
case 128:
- btf_int128_print(m, data);
+ btf_int128_print(show, safe_data);
break;
case 64:
if (sign)
- seq_printf(m, "%lld", *(s64 *)data);
+ btf_show_type_value(show, "%lld", *(s64 *)safe_data);
else
- seq_printf(m, "%llu", *(u64 *)data);
+ btf_show_type_value(show, "%llu", *(u64 *)safe_data);
break;
case 32:
if (sign)
- seq_printf(m, "%d", *(s32 *)data);
+ btf_show_type_value(show, "%d", *(s32 *)safe_data);
else
- seq_printf(m, "%u", *(u32 *)data);
+ btf_show_type_value(show, "%u", *(u32 *)safe_data);
break;
case 16:
if (sign)
- seq_printf(m, "%d", *(s16 *)data);
+ btf_show_type_value(show, "%d", *(s16 *)safe_data);
else
- seq_printf(m, "%u", *(u16 *)data);
+ btf_show_type_value(show, "%u", *(u16 *)safe_data);
break;
case 8:
+ if (show->state.array_encoding == BTF_INT_CHAR) {
+ /* check for null terminator */
+ if (show->state.array_terminated)
+ break;
+ if (*(char *)data == '\0') {
+ show->state.array_terminated = 1;
+ break;
+ }
+ if (isprint(*(char *)data)) {
+ btf_show_type_value(show, "'%c'",
+ *(char *)safe_data);
+ break;
+ }
+ }
if (sign)
- seq_printf(m, "%d", *(s8 *)data);
+ btf_show_type_value(show, "%d", *(s8 *)safe_data);
else
- seq_printf(m, "%u", *(u8 *)data);
+ btf_show_type_value(show, "%u", *(u8 *)safe_data);
break;
default:
- btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ break;
}
+out:
+ btf_show_end_type(show);
}
static const struct btf_kind_operations int_ops = {
@@ -1590,7 +2173,7 @@ static const struct btf_kind_operations int_ops = {
.check_member = btf_int_check_member,
.check_kflag_member = btf_int_check_kflag_member,
.log_details = btf_int_log,
- .seq_show = btf_int_seq_show,
+ .show = btf_int_show,
};
static int btf_modifier_check_member(struct btf_verifier_env *env,
@@ -1854,34 +2437,44 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
return 0;
}
-static void btf_modifier_seq_show(const struct btf *btf,
- const struct btf_type *t,
- u32 type_id, void *data,
- u8 bits_offset, struct seq_file *m)
+static void btf_modifier_show(const struct btf *btf,
+ const struct btf_type *t,
+ u32 type_id, void *data,
+ u8 bits_offset, struct btf_show *show)
{
if (btf->resolved_ids)
t = btf_type_id_resolve(btf, &type_id);
else
t = btf_type_skip_modifiers(btf, type_id, NULL);
- btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
-static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_var_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
t = btf_type_id_resolve(btf, &type_id);
- btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
-static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_ptr_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
- /* It is a hashed value */
- seq_printf(m, "%p", *(void **)data);
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */
+ if (show->flags & BTF_SHOW_PTR_RAW)
+ btf_show_type_value(show, "0x%px", *(void **)safe_data);
+ else
+ btf_show_type_value(show, "0x%p", *(void **)safe_data);
+ btf_show_end_type(show);
}
static void btf_ref_type_log(struct btf_verifier_env *env,
@@ -1896,7 +2489,7 @@ static struct btf_kind_operations modifier_ops = {
.check_member = btf_modifier_check_member,
.check_kflag_member = btf_modifier_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_modifier_seq_show,
+ .show = btf_modifier_show,
};
static struct btf_kind_operations ptr_ops = {
@@ -1905,7 +2498,7 @@ static struct btf_kind_operations ptr_ops = {
.check_member = btf_ptr_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_ptr_seq_show,
+ .show = btf_ptr_show,
};
static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
@@ -1946,7 +2539,7 @@ static struct btf_kind_operations fwd_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_fwd_type_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static int btf_array_check_member(struct btf_verifier_env *env,
@@ -2105,28 +2698,90 @@ static void btf_array_log(struct btf_verifier_env *env,
array->type, array->index_type, array->nelems);
}
-static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void __btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_array *array = btf_type_array(t);
const struct btf_kind_operations *elem_ops;
const struct btf_type *elem_type;
- u32 i, elem_size, elem_type_id;
+ u32 i, elem_size = 0, elem_type_id;
+ u16 encoding = 0;
elem_type_id = array->type;
- elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL);
+ if (elem_type && btf_type_has_size(elem_type))
+ elem_size = elem_type->size;
+
+ if (elem_type && btf_type_is_int(elem_type)) {
+ u32 int_type = btf_type_int(elem_type);
+
+ encoding = BTF_INT_ENCODING(int_type);
+
+ /*
+ * BTF_INT_CHAR encoding never seems to be set for
+ * char arrays, so if size is 1 and element is
+ * printable as a char, we'll do that.
+ */
+ if (elem_size == 1)
+ encoding = BTF_INT_CHAR;
+ }
+
+ if (!btf_show_start_array_type(show, t, type_id, encoding, data))
+ return;
+
+ if (!elem_type)
+ goto out;
elem_ops = btf_type_ops(elem_type);
- seq_puts(m, "[");
+
for (i = 0; i < array->nelems; i++) {
- if (i)
- seq_puts(m, ",");
- elem_ops->seq_show(btf, elem_type, elem_type_id, data,
- bits_offset, m);
+ btf_show_start_array_member(show);
+
+ elem_ops->show(btf, elem_type, elem_type_id, data,
+ bits_offset, show);
data += elem_size;
+
+ btf_show_end_array_member(show);
+
+ if (show->state.array_terminated)
+ break;
+ }
+out:
+ btf_show_end_array_type(show);
+}
+
+static void btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high-level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
+ show->state.member = m;
+
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero array member(s).
+ */
}
- seq_puts(m, "]");
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations array_ops = {
@@ -2135,7 +2790,7 @@ static struct btf_kind_operations array_ops = {
.check_member = btf_array_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_array_log,
- .seq_show = btf_array_seq_show,
+ .show = btf_array_show,
};
static int btf_struct_check_member(struct btf_verifier_env *env,
@@ -2358,15 +3013,18 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
return off;
}
-static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
- const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
const struct btf_member *member;
+ void *safe_data;
u32 i;
- seq_puts(m, "{");
+ safe_data = btf_show_start_struct_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
@@ -2375,23 +3033,65 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
u32 bytes_offset;
u8 bits8_offset;
- if (i)
- seq_puts(m, seq);
+ btf_show_start_member(show, member);
member_offset = btf_member_bit_offset(t, member);
bitfield_size = btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
- btf_bitfield_seq_show(data + bytes_offset, bits8_offset,
- bitfield_size, m);
+ safe_data = btf_show_start_type(show, member_type,
+ member->type,
+ data + bytes_offset);
+ if (safe_data)
+ btf_bitfield_show(safe_data,
+ bits8_offset,
+ bitfield_size, show);
+ btf_show_end_type(show);
} else {
ops = btf_type_ops(member_type);
- ops->seq_show(btf, member_type, member->type,
- data + bytes_offset, bits8_offset, m);
+ ops->show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, show);
}
+
+ btf_show_end_member(show);
}
- seq_puts(m, "}");
+
+ btf_show_end_struct_type(show);
+}
+
+static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high-level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
+ /* Restore saved member data here */
+ show->state.member = m;
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero child values.
+ */
+ }
+
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations struct_ops = {
@@ -2400,7 +3100,7 @@ static struct btf_kind_operations struct_ops = {
.check_member = btf_struct_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_struct_log,
- .seq_show = btf_struct_seq_show,
+ .show = btf_struct_show,
};
static int btf_enum_check_member(struct btf_verifier_env *env,
@@ -2531,24 +3231,35 @@ static void btf_enum_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_enum *enums = btf_type_enum(t);
u32 i, nr_enums = btf_type_vlen(t);
- int v = *(int *)data;
+ void *safe_data;
+ int v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(int *)safe_data;
for (i = 0; i < nr_enums; i++) {
- if (v == enums[i].val) {
- seq_printf(m, "%s",
- __btf_name_by_offset(btf,
- enums[i].name_off));
- return;
- }
+ if (v != enums[i].val)
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
}
- seq_printf(m, "%d", v);
+ btf_show_type_value(show, "%d", v);
+ btf_show_end_type(show);
}
static struct btf_kind_operations enum_ops = {
@@ -2557,7 +3268,7 @@ static struct btf_kind_operations enum_ops = {
.check_member = btf_enum_check_member,
.check_kflag_member = btf_enum_check_kflag_member,
.log_details = btf_enum_log,
- .seq_show = btf_enum_seq_show,
+ .show = btf_enum_show,
};
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
@@ -2644,7 +3355,7 @@ static struct btf_kind_operations func_proto_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_func_proto_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static s32 btf_func_check_meta(struct btf_verifier_env *env,
@@ -2678,7 +3389,7 @@ static struct btf_kind_operations func_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static s32 btf_var_check_meta(struct btf_verifier_env *env,
@@ -2742,7 +3453,7 @@ static const struct btf_kind_operations var_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_var_log,
- .seq_show = btf_var_seq_show,
+ .show = btf_var_show,
};
static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
@@ -2868,24 +3579,28 @@ static void btf_datasec_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-static void btf_datasec_seq_show(const struct btf *btf,
- const struct btf_type *t, u32 type_id,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_datasec_show(const struct btf *btf,
+ const struct btf_type *t, u32 type_id,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *var;
u32 i;
- seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off));
+ if (!btf_show_start_type(show, t, type_id, data))
+ return;
+
+ btf_show_type_value(show, "section (\"%s\") = {",
+ __btf_name_by_offset(btf, t->name_off));
for_each_vsi(i, t, vsi) {
var = btf_type_by_id(btf, vsi->type);
if (i)
- seq_puts(m, ",");
- btf_type_ops(var)->seq_show(btf, var, vsi->type,
- data + vsi->offset, bits_offset, m);
+ btf_show(show, ",");
+ btf_type_ops(var)->show(btf, var, vsi->type,
+ data + vsi->offset, bits_offset, show);
}
- seq_puts(m, "}");
+ btf_show_end_type(show);
}
static const struct btf_kind_operations datasec_ops = {
@@ -2894,7 +3609,7 @@ static const struct btf_kind_operations datasec_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_datasec_log,
- .seq_show = btf_datasec_seq_show,
+ .show = btf_datasec_show,
};
static int btf_func_proto_check(struct btf_verifier_env *env,
@@ -3688,7 +4403,7 @@ errout:
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
{
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
if (tgt_prog) {
return tgt_prog->aux->btf;
@@ -3715,7 +4430,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const struct btf_type *t = prog->aux->attach_func_proto;
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
struct btf *btf = bpf_prog_get_target_btf(prog);
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
@@ -3842,7 +4557,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->reg_type = PTR_TO_BTF_ID;
if (tgt_prog) {
- ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
+ enum bpf_prog_type tgt_type;
+
+ if (tgt_prog->type == BPF_PROG_TYPE_EXT)
+ tgt_type = tgt_prog->aux->saved_dst_prog_type;
+ else
+ tgt_type = tgt_prog->type;
+
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg);
if (ret > 0) {
info->btf_id = ret;
return true;
@@ -3870,16 +4592,22 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return true;
}
-int btf_struct_access(struct bpf_verifier_log *log,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype,
- u32 *next_btf_id)
+enum bpf_struct_walk_result {
+ /* < 0 error */
+ WALK_SCALAR = 0,
+ WALK_PTR,
+ WALK_STRUCT,
+};
+
+static int btf_struct_walk(struct bpf_verifier_log *log,
+ const struct btf_type *t, int off, int size,
+ u32 *next_btf_id)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
const char *tname, *mname;
- u32 vlen;
+ u32 vlen, elem_id, mid;
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
@@ -3915,14 +4643,13 @@ again:
/* Only allow structure for now, can be relaxed for
* other types later.
*/
- elem_type = btf_type_skip_modifiers(btf_vmlinux,
- array_elem->type, NULL);
- if (!btf_type_is_struct(elem_type))
+ t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type,
+ NULL);
+ if (!btf_type_is_struct(t))
goto error;
- off = (off - moff) % elem_type->size;
- return btf_struct_access(log, elem_type, off, size, atype,
- next_btf_id);
+ off = (off - moff) % t->size;
+ goto again;
error:
bpf_log(log, "access beyond struct %s at off %u size %u\n",
@@ -3951,7 +4678,7 @@ error:
*/
if (off <= moff &&
BITS_ROUNDUP_BYTES(end_bit) <= off + size)
- return SCALAR_VALUE;
+ return WALK_SCALAR;
/* off may be accessing a following member
*
@@ -3973,11 +4700,13 @@ error:
break;
/* type of the field */
+ mid = member->type;
mtype = btf_type_by_id(btf_vmlinux, member->type);
mname = __btf_name_by_offset(btf_vmlinux, member->name_off);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- &elem_type, &total_nelems);
+ mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize,
+ &elem_type, &elem_id, &total_nelems,
+ &mid);
if (IS_ERR(mtype)) {
bpf_log(log, "field %s doesn't have size\n", mname);
return -EFAULT;
@@ -3991,7 +4720,7 @@ error:
if (btf_type_is_array(mtype)) {
u32 elem_idx;
- /* btf_resolve_size() above helps to
+ /* __btf_resolve_size() above helps to
* linearize a multi-dimensional array.
*
* The logic here is treating an array
@@ -4039,6 +4768,7 @@ error:
elem_idx = (off - moff) / msize;
moff += elem_idx * msize;
mtype = elem_type;
+ mid = elem_id;
}
/* the 'off' we're looking for is either equal to start
@@ -4048,6 +4778,12 @@ error:
/* our field must be inside that union or struct */
t = mtype;
+ /* return if the offset matches the member offset */
+ if (off == moff) {
+ *next_btf_id = mid;
+ return WALK_STRUCT;
+ }
+
/* adjust offset we're looking for */
off -= moff;
goto again;
@@ -4063,11 +4799,10 @@ error:
mname, moff, tname, off, size);
return -EACCES;
}
-
stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
- return PTR_TO_BTF_ID;
+ return WALK_PTR;
}
}
@@ -4084,23 +4819,82 @@ error:
return -EACCES;
}
- return SCALAR_VALUE;
+ return WALK_SCALAR;
}
bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off);
return -EINVAL;
}
-int btf_resolve_helper_id(struct bpf_verifier_log *log,
- const struct bpf_func_proto *fn, int arg)
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf_type *t, int off, int size,
+ enum bpf_access_type atype __maybe_unused,
+ u32 *next_btf_id)
+{
+ int err;
+ u32 id;
+
+ do {
+ err = btf_struct_walk(log, t, off, size, &id);
+
+ switch (err) {
+ case WALK_PTR:
+ /* If we found the pointer or scalar on t+off,
+ * we're done.
+ */
+ *next_btf_id = id;
+ return PTR_TO_BTF_ID;
+ case WALK_SCALAR:
+ return SCALAR_VALUE;
+ case WALK_STRUCT:
+ /* We found nested struct, so continue the search
+ * by diving in it. At this point the offset is
+ * aligned with the new type, so set it to 0.
+ */
+ t = btf_type_by_id(btf_vmlinux, id);
+ off = 0;
+ break;
+ default:
+ /* It's either error or unknown return value..
+ * scream and leave.
+ */
+ if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value"))
+ return -EINVAL;
+ return err;
+ }
+ } while (t);
+
+ return -EINVAL;
+}
+
+bool btf_struct_ids_match(struct bpf_verifier_log *log,
+ int off, u32 id, u32 need_type_id)
{
- int id;
+ const struct btf_type *type;
+ int err;
- if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux)
- return -EINVAL;
- id = fn->btf_id[arg];
- if (!id || id > btf_vmlinux->nr_types)
- return -EINVAL;
- return id;
+ /* Are we already done? */
+ if (need_type_id == id && off == 0)
+ return true;
+
+again:
+ type = btf_type_by_id(btf_vmlinux, id);
+ if (!type)
+ return false;
+ err = btf_struct_walk(log, type, off, 1, &id);
+ if (err != WALK_STRUCT)
+ return false;
+
+ /* We found nested struct object. If it matches
+ * the requested ID, we're done. Otherwise let's
+ * continue the search with offset 0 in the new
+ * type.
+ */
+ if (need_type_id != id) {
+ off = 0;
+ goto again;
+ }
+
+ return true;
}
static int __get_type_size(struct btf *btf, u32 btf_id,
@@ -4298,7 +5092,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
}
/* Compare BTFs of given program with BTF of target program */
-int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf2, const struct btf_type *t2)
{
struct btf *btf1 = prog->aux->btf;
@@ -4306,7 +5100,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
u32 btf_id = 0;
if (!prog->aux->func_info) {
- bpf_log(&env->log, "Program extension requires BTF\n");
+ bpf_log(log, "Program extension requires BTF\n");
return -EINVAL;
}
@@ -4318,7 +5112,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
if (!t1 || !btf_type_is_func(t1))
return -EFAULT;
- return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+ return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
/* Compare BTF of a function with given bpf_reg_state.
@@ -4469,7 +5263,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
- prog_type = prog->aux->linked_prog->type;
+ prog_type = prog->aux->dst_prog->type;
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
@@ -4516,12 +5310,93 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return 0;
}
+static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
+ struct btf_show *show)
+{
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ show->btf = btf;
+ memset(&show->state, 0, sizeof(show->state));
+ memset(&show->obj, 0, sizeof(show->obj));
+
+ btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
+}
+
+static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ seq_vprintf((struct seq_file *)show->target, fmt, args);
+}
+
+int btf_type_seq_show_flags(const struct btf *btf, u32 type_id,
+ void *obj, struct seq_file *m, u64 flags)
+{
+ struct btf_show sseq;
+
+ sseq.target = m;
+ sseq.showfn = btf_seq_show;
+ sseq.flags = flags;
+
+ btf_type_show(btf, type_id, obj, &sseq);
+
+ return sseq.state.status;
+}
+
void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
struct seq_file *m)
{
- const struct btf_type *t = btf_type_by_id(btf, type_id);
+ (void) btf_type_seq_show_flags(btf, type_id, obj, m,
+ BTF_SHOW_NONAME | BTF_SHOW_COMPACT |
+ BTF_SHOW_ZERO | BTF_SHOW_UNSAFE);
+}
+
+struct btf_show_snprintf {
+ struct btf_show show;
+ int len_left; /* space left in string */
+ int len; /* length we would have written */
+};
+
+static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
+ int len;
- btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+ len = vsnprintf(show->target, ssnprintf->len_left, fmt, args);
+
+ if (len < 0) {
+ ssnprintf->len_left = 0;
+ ssnprintf->len = len;
+ } else if (len > ssnprintf->len_left) {
+ /* no space, drive on to get length we would have written */
+ ssnprintf->len_left = 0;
+ ssnprintf->len += len;
+ } else {
+ ssnprintf->len_left -= len;
+ ssnprintf->len += len;
+ show->target += len;
+ }
+}
+
+int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
+ char *buf, int len, u64 flags)
+{
+ struct btf_show_snprintf ssnprintf;
+
+ ssnprintf.show.target = buf;
+ ssnprintf.show.flags = flags;
+ ssnprintf.show.showfn = btf_snprintf_show;
+ ssnprintf.len_left = len;
+ ssnprintf.len = 0;
+
+ btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
+
+ /* If we encontered an error, return it. */
+ if (ssnprintf.show.state.status)
+ return ssnprintf.show.state.status;
+
+ /* Otherwise return length we would have written */
+ return ssnprintf.len;
}
#ifdef CONFIG_PROC_FS
@@ -4661,3 +5536,15 @@ u32 btf_id(const struct btf *btf)
{
return btf->id;
}
+
+static int btf_id_cmp_func(const void *a, const void *b)
+{
+ const int *pa = a, *pb = b;
+
+ return *pa - *pb;
+}
+
+bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
+{
+ return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ed0b3578867c..9268d77898b7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -25,7 +25,7 @@
#include <linux/moduleloader.h>
#include <linux/bpf.h>
#include <linux/btf.h>
-#include <linux/frame.h>
+#include <linux/objtool.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -98,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->jit_requested = ebpf_jit_enabled();
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+ mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->dst_mutex);
return fp;
}
@@ -253,6 +255,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
void __bpf_prog_free(struct bpf_prog *fp)
{
if (fp->aux) {
+ mutex_destroy(&fp->aux->used_maps_mutex);
+ mutex_destroy(&fp->aux->dst_mutex);
free_percpu(fp->aux->stats);
kfree(fp->aux->poke_tab);
kfree(fp->aux);
@@ -773,7 +777,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
if (size > poke_tab_max)
return -ENOSPC;
- if (poke->ip || poke->ip_stable || poke->adj_off)
+ if (poke->tailcall_target || poke->tailcall_target_stable ||
+ poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
return -EINVAL;
switch (poke->reason) {
@@ -1747,8 +1752,9 @@ bool bpf_prog_array_compatible(struct bpf_array *array,
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
- int i;
+ int i, ret = 0;
+ mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
struct bpf_array *array;
@@ -1757,11 +1763,15 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
continue;
array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp))
- return -EINVAL;
+ if (!bpf_prog_array_compatible(array, fp)) {
+ ret = -EINVAL;
+ goto out;
+ }
}
- return 0;
+out:
+ mutex_unlock(&aux->used_maps_mutex);
+ return ret;
}
static void bpf_prog_select_func(struct bpf_prog *fp)
@@ -2130,7 +2140,8 @@ static void bpf_prog_free_deferred(struct work_struct *work)
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
#endif
- bpf_trampoline_put(aux->trampoline);
+ if (aux->dst_trampoline)
+ bpf_trampoline_put(aux->dst_trampoline);
for (i = 0; i < aux->func_cnt; i++)
bpf_jit_free(aux->func[i]);
if (aux->func_cnt) {
@@ -2146,8 +2157,8 @@ void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
- if (aux->linked_prog)
- bpf_prog_put(aux->linked_prog);
+ if (aux->dst_prog)
+ bpf_prog_put(aux->dst_prog);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
@@ -2208,6 +2219,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
+const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 6386b7bb98f2..c61a23b564aa 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -79,8 +79,6 @@ struct bpf_cpu_map {
static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -157,8 +155,7 @@ static void cpu_map_kthread_stop(struct work_struct *work)
kthread_stop(rcpu->kthread);
}
-static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_frame *xdpf,
+static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf,
struct sk_buff *skb)
{
unsigned int hard_start_headroom;
@@ -367,7 +364,7 @@ static int cpu_map_kthread_run(void *data)
struct sk_buff *skb = skbs[i];
int ret;
- skb = cpu_map_build_skb(rcpu, xdpf, skb);
+ skb = cpu_map_build_skb(xdpf, skb);
if (!skb) {
xdp_return_frame(xdpf);
continue;
@@ -658,6 +655,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
static int cpu_map_btf_id;
const struct bpf_map_ops cpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = cpu_map_alloc,
.map_free = cpu_map_free,
.map_delete_elem = cpu_map_delete_elem,
@@ -669,7 +667,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_btf_id = &cpu_map_btf_id,
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
+static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
@@ -678,7 +676,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
int i;
if (unlikely(!bq->count))
- return 0;
+ return;
q = rcpu->queue;
spin_lock(&q->producer_lock);
@@ -701,13 +699,12 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
/* Feedback loop via tracepoints */
trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
- return 0;
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
+static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
@@ -728,8 +725,6 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
if (!bq->flush_node.prev)
list_add(&bq->flush_node, flush_list);
-
- return 0;
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 10abb06065bb..2b5ca93c17de 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -341,14 +341,14 @@ bool dev_map_can_have_prog(struct bpf_map *map)
return false;
}
-static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
+static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
int sent = 0, drops = 0, err = 0;
int i;
if (unlikely(!bq->count))
- return 0;
+ return;
for (i = 0; i < bq->count; i++) {
struct xdp_frame *xdpf = bq->q[i];
@@ -369,7 +369,7 @@ out:
trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node);
- return 0;
+ return;
error:
/* If ndo_xdp_xmit fails with an errno, no frames have been
* xmit'ed and it's our responsibility to them free all.
@@ -421,8 +421,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
- struct net_device *dev_rx)
+static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -441,8 +441,6 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
if (!bq->flush_node.prev)
list_add(&bq->flush_node, flush_list);
-
- return 0;
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
@@ -462,7 +460,8 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- return bq_enqueue(dev, xdpf, dev_rx);
+ bq_enqueue(dev, xdpf, dev_rx);
+ return 0;
}
static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
@@ -751,6 +750,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
static int dev_map_btf_id;
const struct bpf_map_ops dev_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -764,6 +764,7 @@ const struct bpf_map_ops dev_map_ops = {
static int dev_map_hash_map_btf_id;
const struct bpf_map_ops dev_map_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 78dfff6a501b..1815e97d4c9c 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -9,6 +9,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
@@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- /* Must be called with rcu_read_lock. */
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
* bpf_prog
* __htab_map_lookup_elem
*/
-static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
return __htab_lru_map_lookup_elem(map, key, false);
}
-static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+static int htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
@@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info {
struct bpf_map *map;
struct bpf_htab *htab;
void *percpu_value_buf; // non-zero means percpu hash
- unsigned long flags;
u32 bucket_id;
u32 skip_elems;
};
@@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
struct htab_elem *prev_elem)
{
const struct bpf_htab *htab = info->htab;
- unsigned long flags = info->flags;
u32 skip_elems = info->skip_elems;
u32 bucket_id = info->bucket_id;
struct hlist_nulls_head *head;
@@ -1656,19 +1654,18 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
/* not found, unlock and go to the next bucket */
b = &htab->buckets[bucket_id++];
- htab_unlock_bucket(htab, b, flags);
+ rcu_read_unlock();
skip_elems = 0;
}
for (i = bucket_id; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
- flags = htab_lock_bucket(htab, b);
+ rcu_read_lock();
count = 0;
head = &b->head;
hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
if (count >= skip_elems) {
- info->flags = flags;
info->bucket_id = i;
info->skip_elems = count;
return elem;
@@ -1676,7 +1673,7 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
count++;
}
- htab_unlock_bucket(htab, b, flags);
+ rcu_read_unlock();
skip_elems = 0;
}
@@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
{
- struct bpf_iter_seq_hash_map_info *info = seq->private;
-
if (!v)
(void)__bpf_hash_map_seq_show(seq, NULL);
else
- htab_unlock_bucket(info->htab,
- &info->htab->buckets[info->bucket_id],
- info->flags);
+ rcu_read_unlock();
}
static int bpf_iter_init_hash_map(void *priv_data,
@@ -1810,6 +1803,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
static int htab_map_btf_id;
const struct bpf_map_ops htab_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -1827,6 +1821,7 @@ const struct bpf_map_ops htab_map_ops = {
static int htab_lru_map_btf_id;
const struct bpf_map_ops htab_lru_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -1947,6 +1942,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -1963,6 +1959,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
@@ -2073,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
-static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+static int htab_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index be43ab3e619f..25520f5eeaf6 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -601,6 +601,56 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
+ const void __user *, user_ptr)
+{
+ int ret = copy_from_user(dst, user_ptr, size);
+
+ if (unlikely(ret)) {
+ memset(dst, 0, size);
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_proto = {
+ .func = bpf_copy_from_user,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
+{
+ if (cpu >= nr_cpu_ids)
+ return (unsigned long)NULL;
+
+ return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
+}
+
+const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
+ .func = bpf_per_cpu_ptr,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
+{
+ return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
+}
+
+const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
+ .func = bpf_this_cpu_ptr,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -661,8 +711,16 @@ bpf_base_func_proto(enum bpf_func_id func_id)
if (!perfmon_capable())
return NULL;
return bpf_get_trace_printk_proto();
+ case BPF_FUNC_snprintf_btf:
+ if (!perfmon_capable())
+ return NULL;
+ return &bpf_snprintf_btf_proto;
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
+ case BPF_FUNC_bpf_per_cpu_ptr:
+ return &bpf_per_cpu_ptr_proto;
+ case BPF_FUNC_bpf_this_cpu_ptr:
+ return &bpf_this_cpu_ptr_proto;
default:
break;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index fb878ba3f22f..dd4b7fd60ee7 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include "preload/bpf_preload.h"
enum bpf_type {
BPF_TYPE_UNSPEC = 0,
@@ -226,10 +227,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
else
prev_key = key;
+ rcu_read_lock();
if (map->ops->map_get_next_key(map, prev_key, key)) {
map_iter(m)->done = true;
- return NULL;
+ key = NULL;
}
+ rcu_read_unlock();
return key;
}
@@ -369,9 +372,10 @@ static struct dentry *
bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
{
/* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
- * extensions.
+ * extensions. That allows popoulate_bpffs() create special files.
*/
- if (strchr(dentry->d_name.name, '.'))
+ if ((dir->i_mode & S_IALLUGO) &&
+ strchr(dentry->d_name.name, '.'))
return ERR_PTR(-EPERM);
return simple_lookup(dir, dentry, flags);
@@ -409,6 +413,27 @@ static const struct inode_operations bpf_dir_iops = {
.unlink = simple_unlink,
};
+/* pin iterator link into bpffs */
+static int bpf_iter_link_pin_kernel(struct dentry *parent,
+ const char *name, struct bpf_link *link)
+{
+ umode_t mode = S_IFREG | S_IRUSR;
+ struct dentry *dentry;
+ int ret;
+
+ inode_lock(parent->d_inode);
+ dentry = lookup_one_len(name, parent, strlen(name));
+ if (IS_ERR(dentry)) {
+ inode_unlock(parent->d_inode);
+ return PTR_ERR(dentry);
+ }
+ ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
+ &bpf_iter_fops);
+ dput(dentry);
+ inode_unlock(parent->d_inode);
+ return ret;
+}
+
static int bpf_obj_do_pin(const char __user *pathname, void *raw,
enum bpf_type type)
{
@@ -638,6 +663,91 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
return 0;
}
+struct bpf_preload_ops *bpf_preload_ops;
+EXPORT_SYMBOL_GPL(bpf_preload_ops);
+
+static bool bpf_preload_mod_get(void)
+{
+ /* If bpf_preload.ko wasn't loaded earlier then load it now.
+ * When bpf_preload is built into vmlinux the module's __init
+ * function will populate it.
+ */
+ if (!bpf_preload_ops) {
+ request_module("bpf_preload");
+ if (!bpf_preload_ops)
+ return false;
+ }
+ /* And grab the reference, so the module doesn't disappear while the
+ * kernel is interacting with the kernel module and its UMD.
+ */
+ if (!try_module_get(bpf_preload_ops->owner)) {
+ pr_err("bpf_preload module get failed.\n");
+ return false;
+ }
+ return true;
+}
+
+static void bpf_preload_mod_put(void)
+{
+ if (bpf_preload_ops)
+ /* now user can "rmmod bpf_preload" if necessary */
+ module_put(bpf_preload_ops->owner);
+}
+
+static DEFINE_MUTEX(bpf_preload_lock);
+
+static int populate_bpffs(struct dentry *parent)
+{
+ struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
+ struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
+ int err = 0, i;
+
+ /* grab the mutex to make sure the kernel interactions with bpf_preload
+ * UMD are serialized
+ */
+ mutex_lock(&bpf_preload_lock);
+
+ /* if bpf_preload.ko wasn't built into vmlinux then load it */
+ if (!bpf_preload_mod_get())
+ goto out;
+
+ if (!bpf_preload_ops->info.tgid) {
+ /* preload() will start UMD that will load BPF iterator programs */
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ links[i] = bpf_link_by_id(objs[i].link_id);
+ if (IS_ERR(links[i])) {
+ err = PTR_ERR(links[i]);
+ goto out_put;
+ }
+ }
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, links[i]);
+ if (err)
+ goto out_put;
+ /* do not unlink successfully pinned links even
+ * if later link fails to pin
+ */
+ links[i] = NULL;
+ }
+ /* finish() will tell UMD process to exit */
+ err = bpf_preload_ops->finish();
+ if (err)
+ goto out_put;
+ }
+out_put:
+ bpf_preload_mod_put();
+out:
+ mutex_unlock(&bpf_preload_lock);
+ for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
+ if (!IS_ERR_OR_NULL(links[i]))
+ bpf_link_put(links[i]);
+ return err;
+}
+
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr bpf_rfiles[] = { { "" } };
@@ -654,8 +764,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
inode = sb->s_root->d_inode;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
+ populate_bpffs(sb->s_root);
inode->i_mode |= S_ISVTX | opts->mode;
-
return 0;
}
@@ -705,6 +815,8 @@ static int __init bpf_init(void)
{
int ret;
+ mutex_init(&bpf_preload_lock);
+
ret = sysfs_create_mount_point(fs_kobj, "bpf");
if (ret)
return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 44474bf3ab7a..00e32f2ec3e6 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -732,6 +732,7 @@ static int trie_check_btf(const struct bpf_map *map,
static int trie_map_btf_id;
const struct bpf_map_ops trie_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = trie_alloc,
.map_free = trie_free,
.map_get_next_key = trie_get_next_key,
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 17738c93bec8..39ab0b68cade 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -17,23 +17,17 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
if (IS_ERR(inner_map))
return inner_map;
- /* prog_array->aux->{type,jited} is a runtime binding.
- * Doing static check alone in the verifier is not enough.
- */
- if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
- inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
- inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
- }
-
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta) {
fdput(f);
return ERR_PTR(-EINVAL);
}
+ if (!inner_map->ops->map_meta_equal) {
+ fdput(f);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
if (map_value_has_spin_lock(inner_map)) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
@@ -81,15 +75,14 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
- meta0->map_flags == meta1->map_flags &&
- meta0->max_entries == meta1->max_entries;
+ meta0->map_flags == meta1->map_flags;
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
struct file *map_file /* not used */,
int ufd)
{
- struct bpf_map *inner_map;
+ struct bpf_map *inner_map, *inner_map_meta;
struct fd f;
f = fdget(ufd);
@@ -97,7 +90,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
if (IS_ERR(inner_map))
return inner_map;
- if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
+ inner_map_meta = map->inner_map_meta;
+ if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map))
bpf_map_inc(inner_map);
else
inner_map = ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index a507bf6ef8b9..bcb7534afb3c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -11,8 +11,6 @@ struct bpf_map;
struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
void bpf_map_meta_free(struct bpf_map *map_meta);
-bool bpf_map_meta_equal(const struct bpf_map *meta0,
- const struct bpf_map *meta1);
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
void bpf_map_fd_put_ptr(void *ptr);
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index af86048e5afd..6a9542af4212 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -149,6 +149,19 @@ static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
bpf_map_put_with_uref(aux->map);
}
+void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_printf(seq, "map_id:\t%u\n", aux->map->id);
+}
+
+int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.map.map_id = aux->map->id;
+ return 0;
+}
+
DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
struct bpf_map *map, void *key, void *value)
@@ -156,6 +169,8 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.target = "bpf_map_elem",
.attach_target = bpf_iter_attach_map,
.detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index b367430e611c..3d897de89061 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
raw_spin_lock_init(&head->lock);
head->first = NULL;
}
+ raw_spin_lock_init(&s->extralist.lock);
+ s->extralist.first = NULL;
return 0;
}
@@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
raw_spin_unlock(&head->lock);
}
+static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ if (!raw_spin_trylock(&s->extralist.lock))
+ return false;
+
+ pcpu_freelist_push_node(&s->extralist, node);
+ raw_spin_unlock(&s->extralist.lock);
+ return true;
+}
+
+static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ int cpu, orig_cpu;
+
+ orig_cpu = cpu = raw_smp_processor_id();
+ while (1) {
+ struct pcpu_freelist_head *head;
+
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_spin_trylock(&head->lock)) {
+ pcpu_freelist_push_node(head, node);
+ raw_spin_unlock(&head->lock);
+ return;
+ }
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = 0;
+
+ /* cannot lock any per cpu lock, try extralist */
+ if (cpu == orig_cpu &&
+ pcpu_freelist_try_push_extra(s, node))
+ return;
+ }
+}
+
void __pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
- struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
-
- ___pcpu_freelist_push(head, node);
+ if (in_nmi())
+ ___pcpu_freelist_push_nmi(s, node);
+ else
+ ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
}
void pcpu_freelist_push(struct pcpu_freelist *s,
@@ -81,7 +121,7 @@ again:
}
}
-struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
@@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
if (cpu >= nr_cpu_ids)
cpu = 0;
if (cpu == orig_cpu)
- return NULL;
+ break;
+ }
+
+ /* per cpu lists are all empty, try extralist */
+ raw_spin_lock(&s->extralist.lock);
+ node = s->extralist.first;
+ if (node)
+ s->extralist.first = node->next;
+ raw_spin_unlock(&s->extralist.lock);
+ return node;
+}
+
+static struct pcpu_freelist_node *
+___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_head *head;
+ struct pcpu_freelist_node *node;
+ int orig_cpu, cpu;
+
+ orig_cpu = cpu = raw_smp_processor_id();
+ while (1) {
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_spin_trylock(&head->lock)) {
+ node = head->first;
+ if (node) {
+ head->first = node->next;
+ raw_spin_unlock(&head->lock);
+ return node;
+ }
+ raw_spin_unlock(&head->lock);
+ }
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = 0;
+ if (cpu == orig_cpu)
+ break;
}
+
+ /* cannot pop from per cpu lists, try extralist */
+ if (!raw_spin_trylock(&s->extralist.lock))
+ return NULL;
+ node = s->extralist.first;
+ if (node)
+ s->extralist.first = node->next;
+ raw_spin_unlock(&s->extralist.lock);
+ return node;
+}
+
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ if (in_nmi())
+ return ___pcpu_freelist_pop_nmi(s);
+ return ___pcpu_freelist_pop(s);
}
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index fbf8a8a28979..3c76553cfe57 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -13,6 +13,7 @@ struct pcpu_freelist_head {
struct pcpu_freelist {
struct pcpu_freelist_head __percpu *freelist;
+ struct pcpu_freelist_head extralist;
};
struct pcpu_freelist_node {
diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore
new file mode 100644
index 000000000000..856a4c5ad0dd
--- /dev/null
+++ b/kernel/bpf/preload/.gitignore
@@ -0,0 +1,4 @@
+/FEATURE-DUMP.libbpf
+/bpf_helper_defs.h
+/feature
+/bpf_preload_umd
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
new file mode 100644
index 000000000000..ace49111d3a3
--- /dev/null
+++ b/kernel/bpf/preload/Kconfig
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config USERMODE_DRIVER
+ bool
+ default n
+
+menuconfig BPF_PRELOAD
+ bool "Preload BPF file system with kernel specific program and map iterators"
+ depends on BPF
+ # The dependency on !COMPILE_TEST prevents it from being enabled
+ # in allmodconfig or allyesconfig configurations
+ depends on !COMPILE_TEST
+ select USERMODE_DRIVER
+ help
+ This builds kernel module with several embedded BPF programs that are
+ pinned into BPF FS mount point as human readable files that are
+ useful in debugging and introspection of BPF programs and maps.
+
+if BPF_PRELOAD
+config BPF_PRELOAD_UMD
+ tristate "bpf_preload kernel module with user mode driver"
+ depends on CC_CAN_LINK
+ depends on m || CC_CAN_LINK_STATIC
+ default m
+ help
+ This builds bpf_preload kernel module with embedded user mode driver.
+endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
new file mode 100644
index 000000000000..23ee310b6eb4
--- /dev/null
+++ b/kernel/bpf/preload/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+
+LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
+LIBBPF_A = $(obj)/libbpf.a
+LIBBPF_OUT = $(abspath $(obj))
+
+$(LIBBPF_A):
+ $(Q)$(MAKE) -C $(LIBBPF_SRCS) OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a
+
+userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
+ -I $(srctree)/tools/lib/ -Wno-unused-result
+
+userprogs := bpf_preload_umd
+
+clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/
+
+bpf_preload_umd-objs := iterators/iterators.o
+bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
+
+$(obj)/bpf_preload_umd: $(LIBBPF_A)
+
+$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
+
+obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
+bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
new file mode 100644
index 000000000000..2f9932276f2e
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_PRELOAD_H
+#define _BPF_PRELOAD_H
+
+#include <linux/usermode_driver.h>
+#include "iterators/bpf_preload_common.h"
+
+struct bpf_preload_ops {
+ struct umd_info info;
+ int (*preload)(struct bpf_preload_info *);
+ int (*finish)(void);
+ struct module *owner;
+};
+extern struct bpf_preload_ops *bpf_preload_ops;
+#define BPF_PRELOAD_LINKS 2
+#endif
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
new file mode 100644
index 000000000000..79c5772465f1
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pid.h>
+#include <linux/fs.h>
+#include <linux/sched/signal.h>
+#include "bpf_preload.h"
+
+extern char bpf_preload_umd_start;
+extern char bpf_preload_umd_end;
+
+static int preload(struct bpf_preload_info *obj);
+static int finish(void);
+
+static struct bpf_preload_ops umd_ops = {
+ .info.driver_name = "bpf_preload",
+ .preload = preload,
+ .finish = finish,
+ .owner = THIS_MODULE,
+};
+
+static int preload(struct bpf_preload_info *obj)
+{
+ int magic = BPF_PRELOAD_START;
+ loff_t pos = 0;
+ int i, err;
+ ssize_t n;
+
+ err = fork_usermode_driver(&umd_ops.info);
+ if (err)
+ return err;
+
+ /* send the start magic to let UMD proceed with loading BPF progs */
+ n = kernel_write(umd_ops.info.pipe_to_umh,
+ &magic, sizeof(magic), &pos);
+ if (n != sizeof(magic))
+ return -EPIPE;
+
+ /* receive bpf_link IDs and names from UMD */
+ pos = 0;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ n = kernel_read(umd_ops.info.pipe_from_umh,
+ &obj[i], sizeof(*obj), &pos);
+ if (n != sizeof(*obj))
+ return -EPIPE;
+ }
+ return 0;
+}
+
+static int finish(void)
+{
+ int magic = BPF_PRELOAD_END;
+ struct pid *tgid;
+ loff_t pos = 0;
+ ssize_t n;
+
+ /* send the last magic to UMD. It will do a normal exit. */
+ n = kernel_write(umd_ops.info.pipe_to_umh,
+ &magic, sizeof(magic), &pos);
+ if (n != sizeof(magic))
+ return -EPIPE;
+ tgid = umd_ops.info.tgid;
+ wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+ umd_ops.info.tgid = NULL;
+ return 0;
+}
+
+static int __init load_umd(void)
+{
+ int err;
+
+ err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
+ &bpf_preload_umd_end - &bpf_preload_umd_start);
+ if (err)
+ return err;
+ bpf_preload_ops = &umd_ops;
+ return err;
+}
+
+static void __exit fini_umd(void)
+{
+ bpf_preload_ops = NULL;
+ /* kill UMD in case it's still there due to earlier error */
+ kill_pid(umd_ops.info.tgid, SIGKILL, 1);
+ umd_ops.info.tgid = NULL;
+ umd_unload_blob(&umd_ops.info);
+}
+late_initcall(load_umd);
+module_exit(fini_umd);
+MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S
new file mode 100644
index 000000000000..f1f40223b5c3
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload_umd_blob.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .section .init.rodata, "a"
+ .global bpf_preload_umd_start
+bpf_preload_umd_start:
+ .incbin "kernel/bpf/preload/bpf_preload_umd"
+ .global bpf_preload_umd_end
+bpf_preload_umd_end:
diff --git a/kernel/bpf/preload/iterators/.gitignore b/kernel/bpf/preload/iterators/.gitignore
new file mode 100644
index 000000000000..ffdb70230c8b
--- /dev/null
+++ b/kernel/bpf/preload/iterators/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/.output
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
new file mode 100644
index 000000000000..28fa8c1440f4
--- /dev/null
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0
+OUTPUT := .output
+CLANG ?= clang
+LLC ?= llc
+LLVM_STRIP ?= llvm-strip
+DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf)
+BPFOBJ := $(OUTPUT)/libbpf.a
+BPF_INCLUDE := $(OUTPUT)
+INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \
+ -I$(abspath ../../../../tools/include/uapi)
+CFLAGS := -g -Wall
+
+abs_out := $(abspath $(OUTPUT))
+ifeq ($(V),1)
+Q =
+msg =
+else
+Q = @
+msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))";
+MAKEFLAGS += --no-print-directory
+submake_extras := feature_display=0
+endif
+
+.DELETE_ON_ERROR:
+
+.PHONY: all clean
+
+all: iterators.skel.h
+
+clean:
+ $(call msg,CLEAN)
+ $(Q)rm -rf $(OUTPUT) iterators
+
+iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+ $(call msg,GEN-SKEL,$@)
+ $(Q)$(BPFTOOL) gen skeleton $< > $@
+
+
+$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
+ $(call msg,BPF,$@)
+ $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \
+ -c $(filter %.c,$^) -o $@ && \
+ $(LLVM_STRIP) -g $@
+
+$(OUTPUT):
+ $(call msg,MKDIR,$@)
+ $(Q)mkdir -p $(OUTPUT)
+
+$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \
+ OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
+
+$(DEFAULT_BPFTOOL):
+ $(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \
+ prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install
diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README
new file mode 100644
index 000000000000..7fd6d39a9ad2
--- /dev/null
+++ b/kernel/bpf/preload/iterators/README
@@ -0,0 +1,4 @@
+WARNING:
+If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h".
+Make sure to have clang 10 installed.
+See Documentation/bpf/bpf_devel_QA.rst
diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h
new file mode 100644
index 000000000000..8464d1a48c05
--- /dev/null
+++ b/kernel/bpf/preload/iterators/bpf_preload_common.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_PRELOAD_COMMON_H
+#define _BPF_PRELOAD_COMMON_H
+
+#define BPF_PRELOAD_START 0x5555
+#define BPF_PRELOAD_END 0xAAAA
+
+struct bpf_preload_info {
+ char link_name[16];
+ int link_id;
+};
+
+#endif
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
new file mode 100644
index 000000000000..52aa7b38e8b8
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
+struct seq_file;
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+};
+
+struct bpf_map {
+ __u32 id;
+ char name[16];
+ __u32 max_entries;
+};
+
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+};
+
+struct btf_type {
+ __u32 name_off;
+};
+
+struct btf_header {
+ __u32 str_len;
+};
+
+struct btf {
+ const char *strings;
+ struct btf_type **types;
+ struct btf_header hdr;
+};
+
+struct bpf_prog_aux {
+ __u32 id;
+ char name[16];
+ const char *attach_func_name;
+ struct bpf_prog *dst_prog;
+ struct bpf_func_info *func_info;
+ struct btf *btf;
+};
+
+struct bpf_prog {
+ struct bpf_prog_aux *aux;
+};
+
+struct bpf_iter__bpf_prog {
+ struct bpf_iter_meta *meta;
+ struct bpf_prog *prog;
+};
+#pragma clang attribute pop
+
+static const char *get_name(struct btf *btf, long btf_id, const char *fallback)
+{
+ struct btf_type **types, *t;
+ unsigned int name_off;
+ const char *str;
+
+ if (!btf)
+ return fallback;
+ str = btf->strings;
+ types = btf->types;
+ bpf_probe_read_kernel(&t, sizeof(t), types + btf_id);
+ name_off = BPF_CORE_READ(t, name_off);
+ if (name_off >= btf->hdr.str_len)
+ return fallback;
+ return str + name_off;
+}
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ __u64 seq_num = ctx->meta->seq_num;
+ struct bpf_map *map = ctx->map;
+
+ if (!map)
+ return 0;
+
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " id name max_entries\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries);
+ return 0;
+}
+
+SEC("iter/bpf_prog")
+int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ __u64 seq_num = ctx->meta->seq_num;
+ struct bpf_prog *prog = ctx->prog;
+ struct bpf_prog_aux *aux;
+
+ if (!prog)
+ return 0;
+
+ aux = prog->aux;
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " id name attached\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %s %s\n", aux->id,
+ get_name(aux->btf, aux->func_info[0].type_id, aux->name),
+ aux->attach_func_name, aux->dst_prog->aux->name);
+ return 0;
+}
+char LICENSE[] SEC("license") = "GPL";
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
new file mode 100644
index 000000000000..b7ff87939172
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <argp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include <sys/mount.h>
+#include "iterators.skel.h"
+#include "bpf_preload_common.h"
+
+int to_kernel = -1;
+int from_kernel = 0;
+
+static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
+{
+ struct bpf_preload_info obj = {};
+ struct bpf_link_info info = {};
+ __u32 info_len = sizeof(info);
+ int err;
+
+ err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
+ if (err)
+ return err;
+ obj.link_id = info.id;
+ if (strlen(link_name) >= sizeof(obj.link_name))
+ return -E2BIG;
+ strcpy(obj.link_name, link_name);
+ if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj))
+ return -EPIPE;
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
+ struct iterators_bpf *skel;
+ int err, magic;
+ int debug_fd;
+
+ debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
+ if (debug_fd < 0)
+ return 1;
+ to_kernel = dup(1);
+ close(1);
+ dup(debug_fd);
+ /* now stdin and stderr point to /dev/console */
+
+ read(from_kernel, &magic, sizeof(magic));
+ if (magic != BPF_PRELOAD_START) {
+ printf("bad start magic %d\n", magic);
+ return 1;
+ }
+ setrlimit(RLIMIT_MEMLOCK, &rlim);
+ /* libbpf opens BPF object and loads it into the kernel */
+ skel = iterators_bpf__open_and_load();
+ if (!skel) {
+ /* iterators.skel.h is little endian.
+ * libbpf doesn't support automatic little->big conversion
+ * of BPF bytecode yet.
+ * The program load will fail in such case.
+ */
+ printf("Failed load could be due to wrong endianness\n");
+ return 1;
+ }
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto cleanup;
+
+ /* send two bpf_link IDs with names to the kernel */
+ err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
+ if (err)
+ goto cleanup;
+ err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
+ if (err)
+ goto cleanup;
+
+ /* The kernel will proceed with pinnging the links in bpffs.
+ * UMD will wait on read from pipe.
+ */
+ read(from_kernel, &magic, sizeof(magic));
+ if (magic != BPF_PRELOAD_END) {
+ printf("bad final magic %d\n", magic);
+ err = -EINVAL;
+ }
+cleanup:
+ iterators_bpf__destroy(skel);
+
+ return err != 0;
+}
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
new file mode 100644
index 000000000000..cf9a6a94b3a4
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.skel.h
@@ -0,0 +1,412 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/* THIS FILE IS AUTOGENERATED! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <stdlib.h>
+#include <bpf/libbpf.h>
+
+struct iterators_bpf {
+ struct bpf_object_skeleton *skeleton;
+ struct bpf_object *obj;
+ struct {
+ struct bpf_map *rodata;
+ } maps;
+ struct {
+ struct bpf_program *dump_bpf_map;
+ struct bpf_program *dump_bpf_prog;
+ } progs;
+ struct {
+ struct bpf_link *dump_bpf_map;
+ struct bpf_link *dump_bpf_prog;
+ } links;
+ struct iterators_bpf__rodata {
+ char dump_bpf_map____fmt[35];
+ char dump_bpf_map____fmt_1[14];
+ char dump_bpf_prog____fmt[32];
+ char dump_bpf_prog____fmt_2[17];
+ } *rodata;
+};
+
+static void
+iterators_bpf__destroy(struct iterators_bpf *obj)
+{
+ if (!obj)
+ return;
+ if (obj->skeleton)
+ bpf_object__destroy_skeleton(obj->skeleton);
+ free(obj);
+}
+
+static inline int
+iterators_bpf__create_skeleton(struct iterators_bpf *obj);
+
+static inline struct iterators_bpf *
+iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
+{
+ struct iterators_bpf *obj;
+
+ obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
+ if (!obj)
+ return NULL;
+ if (iterators_bpf__create_skeleton(obj))
+ goto err;
+ if (bpf_object__open_skeleton(obj->skeleton, opts))
+ goto err;
+
+ return obj;
+err:
+ iterators_bpf__destroy(obj);
+ return NULL;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ return iterators_bpf__open_opts(NULL);
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *obj)
+{
+ return bpf_object__load_skeleton(obj->skeleton);
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *obj;
+
+ obj = iterators_bpf__open();
+ if (!obj)
+ return NULL;
+ if (iterators_bpf__load(obj)) {
+ iterators_bpf__destroy(obj);
+ return NULL;
+ }
+ return obj;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *obj)
+{
+ return bpf_object__attach_skeleton(obj->skeleton);
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *obj)
+{
+ return bpf_object__detach_skeleton(obj->skeleton);
+}
+
+static inline int
+iterators_bpf__create_skeleton(struct iterators_bpf *obj)
+{
+ struct bpf_object_skeleton *s;
+
+ s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
+ if (!s)
+ return -1;
+ obj->skeleton = s;
+
+ s->sz = sizeof(*s);
+ s->name = "iterators_bpf";
+ s->obj = &obj->obj;
+
+ /* maps */
+ s->map_cnt = 1;
+ s->map_skel_sz = sizeof(*s->maps);
+ s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
+ if (!s->maps)
+ goto err;
+
+ s->maps[0].name = "iterator.rodata";
+ s->maps[0].map = &obj->maps.rodata;
+ s->maps[0].mmaped = (void **)&obj->rodata;
+
+ /* programs */
+ s->prog_cnt = 2;
+ s->prog_skel_sz = sizeof(*s->progs);
+ s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
+ if (!s->progs)
+ goto err;
+
+ s->progs[0].name = "dump_bpf_map";
+ s->progs[0].prog = &obj->progs.dump_bpf_map;
+ s->progs[0].link = &obj->links.dump_bpf_map;
+
+ s->progs[1].name = "dump_bpf_prog";
+ s->progs[1].prog = &obj->progs.dump_bpf_prog;
+ s->progs[1].link = &obj->links.dump_bpf_prog;
+
+ s->data_sz = 7176;
+ s->data = (void *)"\
+\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
+\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
+\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
+\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
+\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
+\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
+\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
+\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
+\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
+\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
+\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
+\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
+\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
+\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
+\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
+\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
+\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
+\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
+\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
+\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
+\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
+\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
+\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
+\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
+\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
+\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
+\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
+\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
+\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
+\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
+\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
+\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
+\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
+\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
+\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
+\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
+\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
+\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
+\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
+\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
+\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
+\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
+\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
+\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
+\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
+\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
+\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
+\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
+\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
+\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
+\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
+\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
+\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
+\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
+\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
+\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
+\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
+\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
+\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
+\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
+\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
+\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
+\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
+\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
+\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
+\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
+\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
+\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
+\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
+\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
+\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
+\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
+\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
+\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
+\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
+\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
+\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
+\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
+\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
+\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
+\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
+\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
+\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
+\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
+\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
+\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
+\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
+\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
+\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
+\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
+\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
+\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
+\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
+\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
+\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
+\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
+\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
+\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
+\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
+\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
+\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
+\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
+\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
+\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
+\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
+\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
+\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
+\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
+\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
+\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
+\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
+\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
+\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
+\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
+\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
+\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
+\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
+\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
+\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
+\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
+\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
+\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
+\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
+\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
+\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
+\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
+\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
+\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
+\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
+\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
+\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
+\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
+\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
+\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
+\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
+\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
+\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
+\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
+\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
+\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
+\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
+\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
+\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
+\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
+\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
+\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
+\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
+\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
+\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
+\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
+\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
+\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
+\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
+\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
+\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
+\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
+\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
+\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
+\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
+\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
+\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
+\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
+\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
+\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
+\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
+\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
+\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
+\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
+\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
+\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
+\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
+\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
+\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
+\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
+\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
+\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
+\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
+\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
+\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
+\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
+\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
+\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
+\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
+\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
+\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
+\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
+\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
+\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
+\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
+\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
+\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
+\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
+\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
+\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+
+ return 0;
+err:
+ bpf_object__destroy_skeleton(s);
+ return -1;
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 44184f82916a..0ee2347ba510 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -257,6 +257,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
static int queue_map_btf_id;
const struct bpf_map_ops queue_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
.map_alloc = queue_stack_map_alloc,
.map_free = queue_stack_map_free,
@@ -273,6 +274,7 @@ const struct bpf_map_ops queue_map_ops = {
static int stack_map_btf_id;
const struct bpf_map_ops stack_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = queue_stack_map_alloc_check,
.map_alloc = queue_stack_map_alloc,
.map_free = queue_stack_map_free,
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 90b29c5b1da7..a55cd542f2ce 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -191,7 +191,7 @@ int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
rcu_read_lock();
sk = reuseport_array_lookup_elem(map, key);
if (sk) {
- *(u64 *)value = sock_gen_cookie(sk);
+ *(u64 *)value = __sock_gen_cookie(sk);
err = 0;
} else {
err = -ENOENT;
@@ -351,6 +351,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
static int reuseport_array_map_btf_id;
const struct bpf_map_ops reuseport_array_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = reuseport_array_alloc_check,
.map_alloc = reuseport_array_alloc,
.map_free = reuseport_array_free,
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 002f8a5c9e51..31cb04a4dd2d 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -287,6 +287,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
static int ringbuf_map_btf_id;
const struct bpf_map_ops ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
.map_mmap = ringbuf_map_mmap,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index cfed0ac44d38..06065fa27124 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -665,18 +665,17 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
return __bpf_get_stack(regs, task, NULL, buf, size, flags);
}
-BTF_ID_LIST(bpf_get_task_stack_btf_ids)
-BTF_ID(struct, task_struct)
+BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_get_task_stack_btf_ids[0],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
- .btf_id = bpf_get_task_stack_btf_ids,
};
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
@@ -839,6 +838,7 @@ static void stack_map_free(struct bpf_map *map)
static int stack_trace_map_btf_id;
const struct bpf_map_ops stack_trace_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = stack_map_alloc,
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1bf960aa615c..1110ecd7d1f3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4,6 +4,7 @@
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
+#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
@@ -29,6 +30,7 @@
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/bpf-netns.h>
+#include <linux/rcupdate_trace.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -90,6 +92,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
}
const struct bpf_map_ops bpf_map_offload_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
@@ -157,10 +160,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
if (bpf_map_is_dev_bound(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
- map->map_type == BPF_MAP_TYPE_SOCKHASH ||
- map->map_type == BPF_MAP_TYPE_SOCKMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
return bpf_fd_array_map_update_elem(map, f.file, key, value,
flags);
@@ -768,7 +772,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
- map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
return -ENOTSUPP;
if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
map->value_size) {
@@ -1728,10 +1733,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
btf_put(prog->aux->btf);
bpf_prog_free_linfo(prog);
- if (deferred)
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
- else
+ if (deferred) {
+ if (prog->aux->sleepable)
+ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ } else {
__bpf_prog_put_rcu(&prog->aux->rcu);
+ }
}
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
@@ -2101,6 +2110,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
+ BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32))
return -EINVAL;
@@ -2145,17 +2155,18 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
prog->expected_attach_type = attr->expected_attach_type;
prog->aux->attach_btf_id = attr->attach_btf_id;
if (attr->attach_prog_fd) {
- struct bpf_prog *tgt_prog;
+ struct bpf_prog *dst_prog;
- tgt_prog = bpf_prog_get(attr->attach_prog_fd);
- if (IS_ERR(tgt_prog)) {
- err = PTR_ERR(tgt_prog);
+ dst_prog = bpf_prog_get(attr->attach_prog_fd);
+ if (IS_ERR(dst_prog)) {
+ err = PTR_ERR(dst_prog);
goto free_prog_nouncharge;
}
- prog->aux->linked_prog = tgt_prog;
+ prog->aux->dst_prog = dst_prog;
}
prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
err = security_bpf_prog_alloc(prog->aux);
if (err)
@@ -2488,11 +2499,23 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
struct bpf_tracing_link {
struct bpf_link link;
enum bpf_attach_type attach_type;
+ struct bpf_trampoline *trampoline;
+ struct bpf_prog *tgt_prog;
};
static void bpf_tracing_link_release(struct bpf_link *link)
{
- WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog));
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link);
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog,
+ tr_link->trampoline));
+
+ bpf_trampoline_put(tr_link->trampoline);
+
+ /* tgt_prog is NULL if target is a kernel function */
+ if (tr_link->tgt_prog)
+ bpf_prog_put(tr_link->tgt_prog);
}
static void bpf_tracing_link_dealloc(struct bpf_link *link)
@@ -2532,10 +2555,15 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
.fill_link_info = bpf_tracing_link_fill_link_info,
};
-static int bpf_tracing_prog_attach(struct bpf_prog *prog)
+static int bpf_tracing_prog_attach(struct bpf_prog *prog,
+ int tgt_prog_fd,
+ u32 btf_id)
{
struct bpf_link_primer link_primer;
+ struct bpf_prog *tgt_prog = NULL;
+ struct bpf_trampoline *tr = NULL;
struct bpf_tracing_link *link;
+ u64 key = 0;
int err;
switch (prog->type) {
@@ -2564,6 +2592,28 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
goto out_put_prog;
}
+ if (!!tgt_prog_fd != !!btf_id) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ if (tgt_prog_fd) {
+ /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
+ if (prog->type != BPF_PROG_TYPE_EXT) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ tgt_prog = bpf_prog_get(tgt_prog_fd);
+ if (IS_ERR(tgt_prog)) {
+ err = PTR_ERR(tgt_prog);
+ tgt_prog = NULL;
+ goto out_put_prog;
+ }
+
+ key = bpf_trampoline_compute_key(tgt_prog, btf_id);
+ }
+
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
@@ -2573,20 +2623,100 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog)
&bpf_tracing_link_lops, prog);
link->attach_type = prog->expected_attach_type;
- err = bpf_link_prime(&link->link, &link_primer);
- if (err) {
- kfree(link);
- goto out_put_prog;
+ mutex_lock(&prog->aux->dst_mutex);
+
+ /* There are a few possible cases here:
+ *
+ * - if prog->aux->dst_trampoline is set, the program was just loaded
+ * and not yet attached to anything, so we can use the values stored
+ * in prog->aux
+ *
+ * - if prog->aux->dst_trampoline is NULL, the program has already been
+ * attached to a target and its initial target was cleared (below)
+ *
+ * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
+ * target_btf_id using the link_create API.
+ *
+ * - if tgt_prog == NULL when this function was called using the old
+ * raw_tracepoint_open API, and we need a target from prog->aux
+ *
+ * The combination of no saved target in prog->aux, and no target
+ * specified on load is illegal, and we reject that here.
+ */
+ if (!prog->aux->dst_trampoline && !tgt_prog) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (!prog->aux->dst_trampoline ||
+ (key && key != prog->aux->dst_trampoline->key)) {
+ /* If there is no saved target, or the specified target is
+ * different from the destination specified at load time, we
+ * need a new trampoline and a check for compatibility
+ */
+ struct bpf_attach_target_info tgt_info = {};
+
+ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
+ &tgt_info);
+ if (err)
+ goto out_unlock;
+
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ } else {
+ /* The caller didn't specify a target, or the target was the
+ * same as the destination supplied during program load. This
+ * means we can reuse the trampoline and reference from program
+ * load time, and there is no need to allocate a new one. This
+ * can only happen once for any program, as the saved values in
+ * prog->aux are cleared below.
+ */
+ tr = prog->aux->dst_trampoline;
+ tgt_prog = prog->aux->dst_prog;
}
- err = bpf_trampoline_link_prog(prog);
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto out_unlock;
+
+ err = bpf_trampoline_link_prog(prog, tr);
if (err) {
bpf_link_cleanup(&link_primer);
- goto out_put_prog;
+ link = NULL;
+ goto out_unlock;
}
+ link->tgt_prog = tgt_prog;
+ link->trampoline = tr;
+
+ /* Always clear the trampoline and target prog from prog->aux to make
+ * sure the original attach destination is not kept alive after a
+ * program is (re-)attached to another target.
+ */
+ if (prog->aux->dst_prog &&
+ (tgt_prog_fd || tr != prog->aux->dst_trampoline))
+ /* got extra prog ref from syscall, or attaching to different prog */
+ bpf_prog_put(prog->aux->dst_prog);
+ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
+ /* we allocated a new trampoline, so free the old one */
+ bpf_trampoline_put(prog->aux->dst_trampoline);
+
+ prog->aux->dst_prog = NULL;
+ prog->aux->dst_trampoline = NULL;
+ mutex_unlock(&prog->aux->dst_mutex);
+
return bpf_link_settle(&link_primer);
+out_unlock:
+ if (tr && tr != prog->aux->dst_trampoline)
+ bpf_trampoline_put(tr);
+ mutex_unlock(&prog->aux->dst_mutex);
+ kfree(link);
out_put_prog:
+ if (tgt_prog_fd && tgt_prog)
+ bpf_prog_put(tgt_prog);
bpf_prog_put(prog);
return err;
}
@@ -2634,7 +2764,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
u32 ulen = info->raw_tracepoint.tp_name_len;
size_t tp_len = strlen(tp_name);
- if (ulen && !ubuf)
+ if (!ulen ^ !ubuf)
return -EINVAL;
info->raw_tracepoint.tp_name_len = tp_len + 1;
@@ -2700,7 +2830,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
tp_name = prog->aux->attach_func_name;
break;
}
- return bpf_tracing_prog_attach(prog);
+ return bpf_tracing_prog_attach(prog, 0, 0);
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
if (strncpy_from_user(buf,
@@ -2969,7 +3099,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
}
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3152,21 +3282,25 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
const struct bpf_map *map;
int i;
+ mutex_lock(&prog->aux->used_maps_mutex);
for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
map = prog->aux->used_maps[i];
if (map == (void *)addr) {
*type = BPF_PSEUDO_MAP_FD;
- return map;
+ goto out;
}
if (!map->ops->map_direct_value_meta)
continue;
if (!map->ops->map_direct_value_meta(map, addr, off)) {
*type = BPF_PSEUDO_MAP_VALUE;
- return map;
+ goto out;
}
}
+ map = NULL;
- return NULL;
+out:
+ mutex_unlock(&prog->aux->used_maps_mutex);
+ return map;
}
static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
@@ -3284,6 +3418,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
memcpy(info.tag, prog->tag, sizeof(prog->tag));
memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+ mutex_lock(&prog->aux->used_maps_mutex);
ulen = info.nr_map_ids;
info.nr_map_ids = prog->aux->used_map_cnt;
ulen = min_t(u32, info.nr_map_ids, ulen);
@@ -3293,9 +3428,12 @@ static int bpf_prog_get_info_by_fd(struct file *file,
for (i = 0; i < ulen; i++)
if (put_user(prog->aux->used_maps[i]->id,
- &user_map_ids[i]))
+ &user_map_ids[i])) {
+ mutex_unlock(&prog->aux->used_maps_mutex);
return -EFAULT;
+ }
}
+ mutex_unlock(&prog->aux->used_maps_mutex);
err = set_info_rec_size(&info);
if (err)
@@ -3876,10 +4014,15 @@ err_put:
static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
- if (attr->link_create.attach_type == BPF_TRACE_ITER &&
- prog->expected_attach_type == BPF_TRACE_ITER)
- return bpf_iter_link_attach(attr, prog);
+ if (attr->link_create.attach_type != prog->expected_attach_type)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_ITER)
+ return bpf_iter_link_attach(attr, prog);
+ else if (prog->type == BPF_PROG_TYPE_EXT)
+ return bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id);
return -EINVAL;
}
@@ -3893,18 +4036,25 @@ static int link_create(union bpf_attr *attr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC)
- return -EINVAL;
-
- prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype);
+ prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
ret = bpf_prog_attach_check_attach_type(prog,
attr->link_create.attach_type);
if (ret)
- goto err_out;
+ goto out;
+
+ if (prog->type == BPF_PROG_TYPE_EXT) {
+ ret = tracing_bpf_link_attach(attr, prog);
+ goto out;
+ }
+
+ ptype = attach_type_to_prog_type(attr->link_create.attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
+ ret = -EINVAL;
+ goto out;
+ }
switch (ptype) {
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -3932,7 +4082,7 @@ static int link_create(union bpf_attr *attr)
ret = -EINVAL;
}
-err_out:
+out:
if (ret < 0)
bpf_prog_put(prog);
return ret;
@@ -4014,40 +4164,50 @@ static int link_detach(union bpf_attr *attr)
return ret;
}
-static int bpf_link_inc_not_zero(struct bpf_link *link)
+static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
- return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT;
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
-#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
-
-static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+struct bpf_link *bpf_link_by_id(u32 id)
{
struct bpf_link *link;
- u32 id = attr->link_id;
- int fd, err;
- if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!id)
+ return ERR_PTR(-ENOENT);
spin_lock_bh(&link_idr_lock);
- link = idr_find(&link_idr, id);
/* before link is "settled", ID is 0, pretend it doesn't exist yet */
+ link = idr_find(&link_idr, id);
if (link) {
if (link->id)
- err = bpf_link_inc_not_zero(link);
+ link = bpf_link_inc_not_zero(link);
else
- err = -EAGAIN;
+ link = ERR_PTR(-EAGAIN);
} else {
- err = -ENOENT;
+ link = ERR_PTR(-ENOENT);
}
spin_unlock_bh(&link_idr_lock);
+ return link;
+}
- if (err)
- return err;
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ u32 id = attr->link_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ link = bpf_link_by_id(id);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
fd = bpf_link_new_fd(link);
if (fd < 0)
@@ -4133,6 +4293,68 @@ static int bpf_iter_create(union bpf_attr *attr)
return err;
}
+#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
+
+static int bpf_prog_bind_map(union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct bpf_map **used_maps_old, **used_maps_new;
+ int i, ret = 0;
+
+ if (CHECK_ATTR(BPF_PROG_BIND_MAP))
+ return -EINVAL;
+
+ if (attr->prog_bind_map.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ map = bpf_map_get(attr->prog_bind_map.map_fd);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
+ goto out_prog_put;
+ }
+
+ mutex_lock(&prog->aux->used_maps_mutex);
+
+ used_maps_old = prog->aux->used_maps;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++)
+ if (used_maps_old[i] == map) {
+ bpf_map_put(map);
+ goto out_unlock;
+ }
+
+ used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
+ sizeof(used_maps_new[0]),
+ GFP_KERNEL);
+ if (!used_maps_new) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(used_maps_new, used_maps_old,
+ sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
+ used_maps_new[prog->aux->used_map_cnt] = map;
+
+ prog->aux->used_map_cnt++;
+ prog->aux->used_maps = used_maps_new;
+
+ kfree(used_maps_old);
+
+out_unlock:
+ mutex_unlock(&prog->aux->used_maps_mutex);
+
+ if (ret)
+ bpf_map_put(map);
+out_prog_put:
+ bpf_prog_put(prog);
+ return ret;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr;
@@ -4266,6 +4488,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_LINK_DETACH:
err = link_detach(&attr);
break;
+ case BPF_PROG_BIND_MAP:
+ err = bpf_prog_bind_map(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 3b495773de5a..11b3380887fa 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -30,15 +30,15 @@ static struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
- if (!__start_BTF)
+ bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
+
+ if (!__start_BTF || bin_attr_btf_vmlinux.size == 0)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
if (!btf_kobj)
return -ENOMEM;
- bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
-
return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux);
}
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 99af4cea1102..5b6af30bfbcd 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -22,7 +22,8 @@ struct bpf_iter_seq_task_info {
};
static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
- u32 *tid)
+ u32 *tid,
+ bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
@@ -36,6 +37,12 @@ retry:
if (!task) {
++*tid;
goto retry;
+ } else if (skip_if_dup_files && task->tgid != task->pid &&
+ task->files == task->group_leader->files) {
+ put_task_struct(task);
+ task = NULL;
+ ++*tid;
+ goto retry;
}
}
rcu_read_unlock();
@@ -48,7 +55,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid);
+ task = task_seq_get_next(info->common.ns, &info->tid, false);
if (!task)
return NULL;
@@ -65,7 +72,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid);
+ task = task_seq_get_next(info->common.ns, &info->tid, false);
if (!task)
return NULL;
@@ -148,7 +155,7 @@ again:
curr_files = *fstruct;
curr_fd = info->fd;
} else {
- curr_task = task_seq_get_next(ns, &curr_tid);
+ curr_task = task_seq_get_next(ns, &curr_tid, true);
if (!curr_task)
return NULL;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 9be85aa4ec5f..35c5887d82ff 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -7,6 +7,8 @@
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -63,7 +65,7 @@ static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
bpf_image_ksym_add(tr->image, ksym);
}
-struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
@@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
* updates to trampoline would change the code from underneath the
* preempted task. Hence wait for tasks to voluntarily schedule or go
* to userspace.
+ * The same trampoline can hold both sleepable and non-sleepable progs.
+ * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
+ * programs finish executing.
+ * Wait for these two grace periods together.
*/
-
- synchronize_rcu_tasks();
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
&tr->func.model, flags, tprogs,
@@ -256,14 +261,12 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-int bpf_trampoline_link_prog(struct bpf_prog *prog)
+int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
- struct bpf_trampoline *tr;
int err = 0;
int cnt;
- tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog);
mutex_lock(&tr->mutex);
if (tr->extension_prog) {
@@ -296,7 +299,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog)
}
hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
tr->progs_cnt[kind]++;
- err = bpf_trampoline_update(prog->aux->trampoline);
+ err = bpf_trampoline_update(tr);
if (err) {
hlist_del(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
@@ -307,13 +310,11 @@ out:
}
/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
+int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
- struct bpf_trampoline *tr;
int err;
- tr = prog->aux->trampoline;
kind = bpf_attach_type_to_tramp(prog);
mutex_lock(&tr->mutex);
if (kind == BPF_TRAMP_REPLACE) {
@@ -325,12 +326,32 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
}
hlist_del(&prog->aux->tramp_hlist);
tr->progs_cnt[kind]--;
- err = bpf_trampoline_update(prog->aux->trampoline);
+ err = bpf_trampoline_update(tr);
out:
mutex_unlock(&tr->mutex);
return err;
}
+struct bpf_trampoline *bpf_trampoline_get(u64 key,
+ struct bpf_attach_target_info *tgt_info)
+{
+ struct bpf_trampoline *tr;
+
+ tr = bpf_trampoline_lookup(key);
+ if (!tr)
+ return NULL;
+
+ mutex_lock(&tr->mutex);
+ if (tr->func.addr)
+ goto out;
+
+ memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
+ tr->func.addr = (void *)tgt_info->tgt_addr;
+out:
+ mutex_unlock(&tr->mutex);
+ return tr;
+}
+
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
if (!tr)
@@ -344,7 +365,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
bpf_image_ksym_del(&tr->ksym);
- /* wait for tasks to get out of trampoline before freeing it */
+ /* This code will be executed when all bpf progs (both sleepable and
+ * non-sleepable) went through
+ * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
+ * Hence no need for another synchronize_rcu_tasks_trace() here,
+ * but synchronize_rcu_tasks() is still needed, since trampoline
+ * may not have had any sleepable programs and we need to wait
+ * for tasks to get out of trampoline code before freeing it.
+ */
synchronize_rcu_tasks();
bpf_jit_free_exec(tr->image);
hlist_del(&tr->hlist);
@@ -394,6 +422,17 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
rcu_read_unlock();
}
+void notrace __bpf_prog_enter_sleepable(void)
+{
+ rcu_read_lock_trace();
+ might_fault();
+}
+
+void notrace __bpf_prog_exit_sleepable(void)
+{
+ rcu_read_unlock_trace();
+}
+
int __weak
arch_prepare_bpf_trampoline(void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47e74f09fa37..39d7f44e7c92 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,7 @@
#include <linux/ctype.h>
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
#include "disasm.h"
@@ -238,6 +239,7 @@ struct bpf_call_arg_meta {
int ref_obj_id;
int func_id;
u32 btf_id;
+ u32 ret_btf_id;
};
struct btf *btf_vmlinux;
@@ -435,6 +437,15 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
return type == ARG_PTR_TO_SOCK_COMMON;
}
+static bool arg_type_may_be_null(enum bpf_arg_type type)
+{
+ return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
+ type == ARG_PTR_TO_MEM_OR_NULL ||
+ type == ARG_PTR_TO_CTX_OR_NULL ||
+ type == ARG_PTR_TO_SOCKET_OR_NULL ||
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+}
+
/* Determine whether the function releases some resources allocated by another
* function call. The first reference type argument will be assumed to be
* released by release_reference().
@@ -477,7 +488,12 @@ static bool is_acquire_function(enum bpf_func_id func_id,
static bool is_ptr_cast_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_tcp_sock ||
- func_id == BPF_FUNC_sk_fullsock;
+ func_id == BPF_FUNC_sk_fullsock ||
+ func_id == BPF_FUNC_skc_to_tcp_sock ||
+ func_id == BPF_FUNC_skc_to_tcp6_sock ||
+ func_id == BPF_FUNC_skc_to_udp6_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
/* string representation of 'enum bpf_reg_type' */
@@ -503,6 +519,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
[PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
+ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
[PTR_TO_MEM] = "mem",
[PTR_TO_MEM_OR_NULL] = "mem_or_null",
[PTR_TO_RDONLY_BUF] = "rdonly_buf",
@@ -569,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
+ if (t == PTR_TO_BTF_ID ||
+ t == PTR_TO_BTF_ID_OR_NULL ||
+ t == PTR_TO_PERCPU_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@@ -991,14 +1010,9 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
-/* Mark the unknown part of a register (variable offset or scalar value) as
- * known to have the value @imm.
- */
-static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
+/* This helper doesn't clear reg->id */
+static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
- /* Clear id, off, and union(map_ptr, range) */
- memset(((u8 *)reg) + sizeof(reg->type), 0,
- offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
reg->var_off = tnum_const(imm);
reg->smin_value = (s64)imm;
reg->smax_value = (s64)imm;
@@ -1011,6 +1025,17 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
reg->u32_max_value = (u32)imm;
}
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
+{
+ /* Clear id, off, and union(map_ptr, range) */
+ memset(((u8 *)reg) + sizeof(reg->type), 0,
+ offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
+ ___mark_reg_known(reg, imm);
+}
+
static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{
reg->var_off = tnum_const_subreg(reg->var_off, imm);
@@ -1489,6 +1514,13 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
+ if (code == (BPF_JMP | BPF_CALL) &&
+ insn[i].imm == BPF_FUNC_tail_call &&
+ insn[i].src_reg != BPF_PSEUDO_CALL)
+ subprog[cur_subprog].has_tail_call = true;
+ if (BPF_CLASS(code) == BPF_LD &&
+ (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
+ subprog[cur_subprog].has_ld_abs = true;
if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
@@ -2183,6 +2215,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_RDONLY_BUF_OR_NULL:
case PTR_TO_RDWR_BUF:
case PTR_TO_RDWR_BUF_OR_NULL:
+ case PTR_TO_PERCPU_BTF_ID:
return true;
default:
return false;
@@ -2200,6 +2233,20 @@ static bool register_is_const(struct bpf_reg_state *reg)
return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
}
+static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
+{
+ return tnum_is_unknown(reg->var_off) &&
+ reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
+ reg->umin_value == 0 && reg->umax_value == U64_MAX &&
+ reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
+ reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
+}
+
+static bool register_is_bounded(struct bpf_reg_state *reg)
+{
+ return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
+}
+
static bool __is_pointer_value(bool allow_ptr_leaks,
const struct bpf_reg_state *reg)
{
@@ -2251,7 +2298,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
if (value_regno >= 0)
reg = &cur->regs[value_regno];
- if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
+ if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
@@ -2625,11 +2672,18 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
#define MAX_PACKET_OFF 0xffff
+static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
+{
+ return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
+}
+
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
{
- switch (env->prog->type) {
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
/* Program types only with direct read access go here! */
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
@@ -2639,7 +2693,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_CGROUP_SKB:
if (t == BPF_WRITE)
return false;
- /* fallthrough */
+ fallthrough;
/* Program types with direct read + write access go here! */
case BPF_PROG_TYPE_SCHED_CLS:
@@ -2970,10 +3024,37 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
+ bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
+ int j;
process_func:
+ /* protect against potential stack overflow that might happen when
+ * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
+ * depth for such case down to 256 so that the worst case scenario
+ * would result in 8k stack size (32 which is tailcall limit * 256 =
+ * 8k).
+ *
+ * To get the idea what might happen, see an example:
+ * func1 -> sub rsp, 128
+ * subfunc1 -> sub rsp, 256
+ * tailcall1 -> add rsp, 256
+ * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320)
+ * subfunc2 -> sub rsp, 64
+ * subfunc22 -> sub rsp, 128
+ * tailcall2 -> add rsp, 128
+ * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416)
+ *
+ * tailcall will unwind the current stack frame but it will not get rid
+ * of caller's stack as shown on the example above.
+ */
+ if (idx && subprog[idx].has_tail_call && depth >= 256) {
+ verbose(env,
+ "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
+ depth);
+ return -EACCES;
+ }
/* round up to 32-bytes, since this is granularity
* of interpreter stack size
*/
@@ -3002,6 +3083,10 @@ continue_func:
i);
return -EFAULT;
}
+
+ if (subprog[idx].has_tail_call)
+ tail_call_reachable = true;
+
frame++;
if (frame >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep !\n",
@@ -3010,6 +3095,15 @@ continue_func:
}
goto process_func;
}
+ /* if tail call got detected across bpf2bpf calls then mark each of the
+ * currently present subprog frames as tail call reachable subprogs;
+ * this info will be utilized by JIT so that we will be preserving the
+ * tail call counter throughout bpf2bpf calls combined with tailcalls
+ */
+ if (tail_call_reachable)
+ for (j = 0; j < frame; j++)
+ subprog[ret_prog[j]].tail_call_reachable = true;
+
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
*/
@@ -3585,18 +3679,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
struct bpf_func_state *state = func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
- if (reg->type != PTR_TO_STACK) {
- /* Allow zero-byte read from NULL, regardless of pointer type */
- if (zero_size_allowed && access_size == 0 &&
- register_is_null(reg))
- return 0;
-
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type],
- reg_type_str[PTR_TO_STACK]);
- return -EACCES;
- }
-
if (tnum_is_const(reg->var_off)) {
min_off = max_off = reg->var_off.value + reg->off;
err = __check_stack_boundary(env, regno, min_off, access_size,
@@ -3741,9 +3823,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
access_size, zero_size_allowed,
"rdwr",
&env->prog->aux->max_rdwr_access);
- default: /* scalar_value|ptr_to_stack or invalid ptr */
+ case PTR_TO_STACK:
return check_stack_boundary(env, regno, access_size,
zero_size_allowed, meta);
+ default: /* scalar_value or invalid ptr */
+ /* Allow zero-byte read from NULL, regardless of pointer type */
+ if (zero_size_allowed && access_size == 0 &&
+ register_is_null(reg))
+ return 0;
+
+ verbose(env, "R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type],
+ reg_type_str[PTR_TO_STACK]);
+ return -EACCES;
}
}
@@ -3775,10 +3867,6 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
- if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "R%d is not a pointer to map_value\n", regno);
- return -EINVAL;
- }
if (!is_const) {
verbose(env,
"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
@@ -3845,12 +3933,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
-static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type)
-{
- return type == ARG_PTR_TO_ALLOC_MEM ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
-}
-
static bool arg_type_is_alloc_size(enum bpf_arg_type type)
{
return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
@@ -3872,14 +3954,194 @@ static int int_ptr_type_to_size(enum bpf_arg_type type)
return -EINVAL;
}
+static int resolve_map_arg_type(struct bpf_verifier_env *env,
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_arg_type *arg_type)
+{
+ if (!meta->map_ptr) {
+ /* kernel subsystem misconfigured verifier */
+ verbose(env, "invalid map_ptr to access map->type\n");
+ return -EACCES;
+ }
+
+ switch (meta->map_ptr->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
+ *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON;
+ } else {
+ verbose(env, "invalid arg_type for sockmap/sockhash\n");
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
+struct bpf_reg_types {
+ const enum bpf_reg_type types[10];
+ u32 *btf_id;
+};
+
+static const struct bpf_reg_types map_key_value_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_PACKET,
+ PTR_TO_PACKET_META,
+ PTR_TO_MAP_VALUE,
+ },
+};
+
+static const struct bpf_reg_types sock_types = {
+ .types = {
+ PTR_TO_SOCK_COMMON,
+ PTR_TO_SOCKET,
+ PTR_TO_TCP_SOCK,
+ PTR_TO_XDP_SOCK,
+ },
+};
+
+#ifdef CONFIG_NET
+static const struct bpf_reg_types btf_id_sock_common_types = {
+ .types = {
+ PTR_TO_SOCK_COMMON,
+ PTR_TO_SOCKET,
+ PTR_TO_TCP_SOCK,
+ PTR_TO_XDP_SOCK,
+ PTR_TO_BTF_ID,
+ },
+ .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+};
+#endif
+
+static const struct bpf_reg_types mem_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_PACKET,
+ PTR_TO_PACKET_META,
+ PTR_TO_MAP_VALUE,
+ PTR_TO_MEM,
+ PTR_TO_RDONLY_BUF,
+ PTR_TO_RDWR_BUF,
+ },
+};
+
+static const struct bpf_reg_types int_ptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_PACKET,
+ PTR_TO_PACKET_META,
+ PTR_TO_MAP_VALUE,
+ },
+};
+
+static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
+static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
+static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
+static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } };
+static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
+static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
+static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+
+static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
+ [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
+ [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
+ [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
+ [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
+ [ARG_CONST_SIZE] = &scalar_types,
+ [ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
+ [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
+ [ARG_CONST_MAP_PTR] = &const_map_ptr_types,
+ [ARG_PTR_TO_CTX] = &context_types,
+ [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
+ [ARG_PTR_TO_SOCK_COMMON] = &sock_types,
+#ifdef CONFIG_NET
+ [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
+#endif
+ [ARG_PTR_TO_SOCKET] = &fullsock_types,
+ [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
+ [ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
+ [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
+ [ARG_PTR_TO_MEM] = &mem_types,
+ [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
+ [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
+ [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
+ [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
+ [ARG_PTR_TO_INT] = &int_ptr_types,
+ [ARG_PTR_TO_LONG] = &int_ptr_types,
+ [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
+};
+
+static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type,
+ const u32 *arg_btf_id)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ enum bpf_reg_type expected, type = reg->type;
+ const struct bpf_reg_types *compatible;
+ int i, j;
+
+ compatible = compatible_reg_types[arg_type];
+ if (!compatible) {
+ verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
+ expected = compatible->types[i];
+ if (expected == NOT_INIT)
+ break;
+
+ if (type == expected)
+ goto found;
+ }
+
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
+ for (j = 0; j + 1 < i; j++)
+ verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
+ verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
+ return -EACCES;
+
+found:
+ if (type == PTR_TO_BTF_ID) {
+ if (!arg_btf_id) {
+ if (!compatible->btf_id) {
+ verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
+ return -EFAULT;
+ }
+ arg_btf_id = compatible->btf_id;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id,
+ *arg_btf_id)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, kernel_type_name(reg->btf_id),
+ kernel_type_name(*arg_btf_id));
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
+ regno);
+ return -EACCES;
+ }
+ }
+
+ return 0;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn)
{
u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- enum bpf_reg_type expected_type, type = reg->type;
enum bpf_arg_type arg_type = fn->arg_type[arg];
+ enum bpf_reg_type type = reg->type;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -3904,120 +4166,32 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EACCES;
}
- if (arg_type == ARG_PTR_TO_MAP_KEY ||
- arg_type == ARG_PTR_TO_MAP_VALUE ||
+ if (arg_type == ARG_PTR_TO_MAP_VALUE ||
arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
- expected_type = PTR_TO_STACK;
- if (register_is_null(reg) &&
- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL)
- /* final test in check_stack_boundary() */;
- else if (!type_is_pkt_pointer(type) &&
- type != PTR_TO_MAP_VALUE &&
- type != expected_type)
- goto err_type;
- } else if (arg_type == ARG_CONST_SIZE ||
- arg_type == ARG_CONST_SIZE_OR_ZERO ||
- arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) {
- expected_type = SCALAR_VALUE;
- if (type != expected_type)
- goto err_type;
- } else if (arg_type == ARG_CONST_MAP_PTR) {
- expected_type = CONST_PTR_TO_MAP;
- if (type != expected_type)
- goto err_type;
- } else if (arg_type == ARG_PTR_TO_CTX ||
- arg_type == ARG_PTR_TO_CTX_OR_NULL) {
- expected_type = PTR_TO_CTX;
- if (!(register_is_null(reg) &&
- arg_type == ARG_PTR_TO_CTX_OR_NULL)) {
- if (type != expected_type)
- goto err_type;
- err = check_ctx_reg(env, reg, regno);
- if (err < 0)
- return err;
- }
- } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
- expected_type = PTR_TO_SOCK_COMMON;
- /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
- if (!type_is_sk_pointer(type))
- goto err_type;
- if (reg->ref_obj_id) {
- if (meta->ref_obj_id) {
- verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id,
- meta->ref_obj_id);
- return -EFAULT;
- }
- meta->ref_obj_id = reg->ref_obj_id;
- }
- } else if (arg_type == ARG_PTR_TO_SOCKET ||
- arg_type == ARG_PTR_TO_SOCKET_OR_NULL) {
- expected_type = PTR_TO_SOCKET;
- if (!(register_is_null(reg) &&
- arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) {
- if (type != expected_type)
- goto err_type;
- }
- } else if (arg_type == ARG_PTR_TO_BTF_ID) {
- expected_type = PTR_TO_BTF_ID;
- if (type != expected_type)
- goto err_type;
- if (!fn->check_btf_id) {
- if (reg->btf_id != meta->btf_id) {
- verbose(env, "Helper has type %s got %s in R%d\n",
- kernel_type_name(meta->btf_id),
- kernel_type_name(reg->btf_id), regno);
-
- return -EACCES;
- }
- } else if (!fn->check_btf_id(reg->btf_id, arg)) {
- verbose(env, "Helper does not support %s in R%d\n",
- kernel_type_name(reg->btf_id), regno);
+ err = resolve_map_arg_type(env, meta, &arg_type);
+ if (err)
+ return err;
+ }
- return -EACCES;
- }
- if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) {
- verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
- regno);
- return -EACCES;
- }
- } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
- if (meta->func_id == BPF_FUNC_spin_lock) {
- if (process_spin_lock(env, regno, true))
- return -EACCES;
- } else if (meta->func_id == BPF_FUNC_spin_unlock) {
- if (process_spin_lock(env, regno, false))
- return -EACCES;
- } else {
- verbose(env, "verifier internal error\n");
- return -EFAULT;
- }
- } else if (arg_type_is_mem_ptr(arg_type)) {
- expected_type = PTR_TO_STACK;
- /* One exception here. In case function allows for NULL to be
- * passed in as argument, it's a SCALAR_VALUE type. Final test
- * happens during stack boundary checking.
+ if (register_is_null(reg) && arg_type_may_be_null(arg_type))
+ /* A NULL register has a SCALAR_VALUE type, so skip
+ * type checking.
*/
- if (register_is_null(reg) &&
- (arg_type == ARG_PTR_TO_MEM_OR_NULL ||
- arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL))
- /* final test in check_stack_boundary() */;
- else if (!type_is_pkt_pointer(type) &&
- type != PTR_TO_MAP_VALUE &&
- type != PTR_TO_MEM &&
- type != PTR_TO_RDONLY_BUF &&
- type != PTR_TO_RDWR_BUF &&
- type != expected_type)
- goto err_type;
- meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
- } else if (arg_type_is_alloc_mem_ptr(arg_type)) {
- expected_type = PTR_TO_MEM;
- if (register_is_null(reg) &&
- arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)
- /* final test in check_stack_boundary() */;
- else if (type != expected_type)
- goto err_type;
+ goto skip_type_check;
+
+ err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]);
+ if (err)
+ return err;
+
+ if (type == PTR_TO_CTX) {
+ err = check_ctx_reg(env, reg, regno);
+ if (err < 0)
+ return err;
+ }
+
+skip_type_check:
+ if (reg->ref_obj_id) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
regno, reg->ref_obj_id,
@@ -4025,15 +4199,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EFAULT;
}
meta->ref_obj_id = reg->ref_obj_id;
- } else if (arg_type_is_int_ptr(arg_type)) {
- expected_type = PTR_TO_STACK;
- if (!type_is_pkt_pointer(type) &&
- type != PTR_TO_MAP_VALUE &&
- type != expected_type)
- goto err_type;
- } else {
- verbose(env, "unsupported arg_type %d\n", arg_type);
- return -EFAULT;
}
if (arg_type == ARG_CONST_MAP_PTR) {
@@ -4072,6 +4237,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
err = check_helper_mem_access(env, regno,
meta->map_ptr->value_size, false,
meta);
+ } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
+ if (!reg->btf_id) {
+ verbose(env, "Helper has invalid btf_id in R%d\n", regno);
+ return -EACCES;
+ }
+ meta->ret_btf_id = reg->btf_id;
+ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ if (meta->func_id == BPF_FUNC_spin_lock) {
+ if (process_spin_lock(env, regno, true))
+ return -EACCES;
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+ if (process_spin_lock(env, regno, false))
+ return -EACCES;
+ } else {
+ verbose(env, "verifier internal error\n");
+ return -EFAULT;
+ }
+ } else if (arg_type_is_mem_ptr(arg_type)) {
+ /* The access to this pointer is only checked when we hit the
+ * next is_mem_size argument below.
+ */
+ meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM);
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
@@ -4137,10 +4324,43 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
}
return err;
-err_type:
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[type], reg_type_str[expected_type]);
- return -EACCES;
+}
+
+static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
+{
+ enum bpf_attach_type eatype = env->prog->expected_attach_type;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+
+ if (func_id != BPF_FUNC_map_update_elem)
+ return false;
+
+ /* It's not possible to get access to a locked struct sock in these
+ * contexts, so updating is safe.
+ */
+ switch (type) {
+ case BPF_PROG_TYPE_TRACING:
+ if (eatype == BPF_TRACE_ITER)
+ return true;
+ break;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return true;
+ default:
+ break;
+ }
+
+ verbose(env, "cannot update sockmap in this context\n");
+ return false;
+}
+
+static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
+{
+ return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
}
static int check_map_func_compatibility(struct bpf_verifier_env *env,
@@ -4214,7 +4434,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_map &&
func_id != BPF_FUNC_sk_select_reuseport &&
- func_id != BPF_FUNC_map_lookup_elem)
+ func_id != BPF_FUNC_map_lookup_elem &&
+ !may_update_sockmap(env, func_id))
goto error;
break;
case BPF_MAP_TYPE_SOCKHASH:
@@ -4223,7 +4444,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_delete_elem &&
func_id != BPF_FUNC_msg_redirect_hash &&
func_id != BPF_FUNC_sk_select_reuseport &&
- func_id != BPF_FUNC_map_lookup_elem)
+ func_id != BPF_FUNC_map_lookup_elem &&
+ !may_update_sockmap(env, func_id))
goto error;
break;
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
@@ -4242,6 +4464,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_sk_storage_delete)
goto error;
break;
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ if (func_id != BPF_FUNC_inode_storage_get &&
+ func_id != BPF_FUNC_inode_storage_delete)
+ goto error;
+ break;
default:
break;
}
@@ -4251,8 +4478,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_tail_call:
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
- if (env->subprog_cnt > 1) {
- verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
+ if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
return -EINVAL;
}
break;
@@ -4315,6 +4542,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
goto error;
break;
+ case BPF_FUNC_inode_storage_get:
+ case BPF_FUNC_inode_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -4402,10 +4634,26 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
return count <= 1;
}
+static bool check_btf_id_ok(const struct bpf_func_proto *fn)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
+ if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
+ return false;
+
+ if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+ return false;
+ }
+
+ return true;
+}
+
static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
+ check_btf_id_ok(fn) &&
check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
}
@@ -4775,6 +5023,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
+ if (fn->allowed && !fn->allowed(env->prog)) {
+ verbose(env, "helper call is not allowed in probe\n");
+ return -EINVAL;
+ }
+
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -4796,11 +5049,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
meta.func_id = func_id;
/* check args */
for (i = 0; i < 5; i++) {
- if (!fn->check_btf_id) {
- err = btf_resolve_helper_id(&env->log, fn, i);
- if (err > 0)
- meta.btf_id = err;
- }
err = check_func_arg(env, i, &meta, fn);
if (err)
return err;
@@ -4904,6 +5152,35 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
regs[BPF_REG_0].mem_size = meta.mem_size;
+ } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
+ const struct btf_type *t;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL);
+ if (!btf_type_is_struct(t)) {
+ u32 tsize;
+ const struct btf_type *ret;
+ const char *tname;
+
+ /* resolve the type size of ksym. */
+ ret = btf_resolve_size(btf_vmlinux, t, &tsize);
+ if (IS_ERR(ret)) {
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+ verbose(env, "unable to resolve the size of type '%s': %ld\n",
+ tname, PTR_ERR(ret));
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].type =
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
+ PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].mem_size = tsize;
+ } else {
+ regs[BPF_REG_0].type =
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
+ PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ }
} else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) {
int ret_btf_id;
@@ -5219,6 +5496,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
case CONST_PTR_TO_MAP:
+ /* smin_val represents the known value */
+ if (known && smin_val == 0 && opcode == BPF_ADD)
+ break;
+ fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
@@ -5634,8 +5915,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
u64 umax_val = src_reg->umax_value;
if (src_known && dst_known) {
- __mark_reg_known(dst_reg, dst_reg->var_off.value &
- src_reg->var_off.value);
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}
@@ -5667,8 +5947,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->smin_value;
- u32 umin_val = src_reg->umin_value;
+ s32 smin_val = src_reg->s32_min_value;
+ u32 umin_val = src_reg->u32_min_value;
/* Assuming scalar64_min_max_or will be called so it is safe
* to skip updating register for known case.
@@ -5691,8 +5971,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
/* ORing two positives gives a positive, so safe to
* cast result into s64.
*/
- dst_reg->s32_min_value = dst_reg->umin_value;
- dst_reg->s32_max_value = dst_reg->umax_value;
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
}
}
@@ -5705,8 +5985,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
u64 umin_val = src_reg->umin_value;
if (src_known && dst_known) {
- __mark_reg_known(dst_reg, dst_reg->var_off.value |
- src_reg->var_off.value);
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
return;
}
@@ -5732,6 +6011,67 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
+ s32 smin_val = src_reg->s32_min_value;
+
+ /* Assuming scalar64_min_max_xor will be called so it is safe
+ * to skip updating register for known case.
+ */
+ if (src_known && dst_known)
+ return;
+
+ /* We get both minimum and maximum from the var32_off. */
+ dst_reg->u32_min_value = var32_off.value;
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
+
+ if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
+ /* XORing two positive sign numbers gives a positive,
+ * so safe to cast u32 result into s32.
+ */
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ }
+}
+
+static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_is_const(src_reg->var_off);
+ bool dst_known = tnum_is_const(dst_reg->var_off);
+ s64 smin_val = src_reg->smin_value;
+
+ if (src_known && dst_known) {
+ /* dst_reg->var_off.value has been updated earlier */
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
+ return;
+ }
+
+ /* We get both minimum and maximum from the var_off. */
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
+
+ if (dst_reg->smin_value >= 0 && smin_val >= 0) {
+ /* XORing two positive sign numbers gives a positive,
+ * so safe to cast u64 result into s64.
+ */
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
+
+ __update_reg_bounds(dst_reg);
+}
+
static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
u64 umin_val, u64 umax_val)
{
@@ -6040,6 +6380,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg);
break;
+ case BPF_XOR:
+ dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
+ scalar32_min_max_xor(dst_reg, &src_reg);
+ scalar_min_max_xor(dst_reg, &src_reg);
+ break;
case BPF_LSH:
if (umax_val >= insn_bitness) {
/* Shifts greater than 31 or 63 are undefined.
@@ -6111,6 +6456,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
src_reg = NULL;
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
+ else
+ /* Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by find_equal_scalars()
+ */
+ dst_reg->id = 0;
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@@ -6244,6 +6594,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* case: R1 = R2
* copy register state to dest reg
*/
+ if (src_reg->type == SCALAR_VALUE && !src_reg->id)
+ /* Assign src and dst registers the same ID
+ * that will be used by find_equal_scalars()
+ * to propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
*dst_reg = *src_reg;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
@@ -6256,6 +6612,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
*dst_reg = *src_reg;
+ /* Make sure ID is cleared otherwise
+ * dst_reg min/max could be incorrectly
+ * propagated into src_reg by find_equal_scalars()
+ */
+ dst_reg->id = 0;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else {
@@ -6646,14 +7007,18 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *reg =
opcode == BPF_JEQ ? true_reg : false_reg;
- /* For BPF_JEQ, if this is false we know nothing Jon Snow, but
- * if it is true we know the value for sure. Likewise for
- * BPF_JNE.
+ /* JEQ/JNE comparison doesn't change the register equivalence.
+ * r1 = r2;
+ * if (r1 == 42) goto label;
+ * ...
+ * label: // here both r1 and r2 are known to be 42.
+ *
+ * Hence when marking register as known preserve it's ID.
*/
if (is_jmp32)
__mark_reg32_known(reg, val32);
else
- __mark_reg_known(reg, val);
+ ___mark_reg_known(reg, val);
break;
}
case BPF_JSET:
@@ -7044,6 +7409,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
return true;
}
+static void find_equal_scalars(struct bpf_verifier_state *vstate,
+ struct bpf_reg_state *known_reg)
+{
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= vstate->curframe; i++) {
+ state = vstate->frame[i];
+ for (j = 0; j < MAX_BPF_REG; j++) {
+ reg = &state->regs[j];
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ *reg = *known_reg;
+ }
+
+ bpf_for_each_spilled_reg(j, state, reg) {
+ if (!reg)
+ continue;
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ *reg = *known_reg;
+ }
+ }
+}
+
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
@@ -7172,6 +7561,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
reg_combine_min_max(&other_branch_regs[insn->src_reg],
&other_branch_regs[insn->dst_reg],
src_reg, dst_reg, opcode);
+ if (src_reg->id &&
+ !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
+ find_equal_scalars(this_branch, src_reg);
+ find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+ }
+
}
} else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch_regs[insn->dst_reg],
@@ -7179,6 +7574,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
opcode, is_jmp32);
}
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
+ !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
+ find_equal_scalars(this_branch, dst_reg);
+ find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
+ }
+
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
@@ -7210,6 +7611,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_insn_aux_data *aux = cur_aux(env);
struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *dst_reg;
struct bpf_map *map;
int err;
@@ -7226,25 +7628,45 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ dst_reg = &regs[insn->dst_reg];
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- regs[insn->dst_reg].type = SCALAR_VALUE;
+ dst_reg->type = SCALAR_VALUE;
__mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
+ mark_reg_known_zero(env, regs, insn->dst_reg);
+
+ dst_reg->type = aux->btf_var.reg_type;
+ switch (dst_reg->type) {
+ case PTR_TO_MEM:
+ dst_reg->mem_size = aux->btf_var.mem_size;
+ break;
+ case PTR_TO_BTF_ID:
+ case PTR_TO_PERCPU_BTF_ID:
+ dst_reg->btf_id = aux->btf_var.btf_id;
+ break;
+ default:
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EFAULT;
+ }
+ return 0;
+ }
+
map = env->used_maps[aux->map_index];
mark_reg_known_zero(env, regs, insn->dst_reg);
- regs[insn->dst_reg].map_ptr = map;
+ dst_reg->map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- regs[insn->dst_reg].off = aux->map_off;
+ dst_reg->type = PTR_TO_MAP_VALUE;
+ dst_reg->off = aux->map_off;
if (map_value_has_spin_lock(map))
- regs[insn->dst_reg].id = ++env->id_gen;
+ dst_reg->id = ++env->id_gen;
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
- regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ dst_reg->type = CONST_PTR_TO_MAP;
} else {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
@@ -7287,7 +7709,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
u8 mode = BPF_MODE(insn->code);
int i, err;
- if (!may_access_skb(env->prog->type)) {
+ if (!may_access_skb(resolve_prog_type(env->prog))) {
verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
@@ -7297,18 +7719,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
- if (env->subprog_cnt > 1) {
- /* when program has LD_ABS insn JITs and interpreter assume
- * that r1 == ctx == skb which is not the case for callees
- * that can have arbitrary arguments. It's problematic
- * for main prog as well since JITs would need to analyze
- * all functions in order to make proper register save/restore
- * decisions in the main prog. Hence disallow LD_ABS with calls
- */
- verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
- return -EINVAL;
- }
-
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
@@ -7375,11 +7785,12 @@ static int check_return_code(struct bpf_verifier_env *env)
const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
- env->prog->type == BPF_PROG_TYPE_LSM) &&
+ if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
+ prog_type == BPF_PROG_TYPE_LSM) &&
!prog->aux->attach_func_proto->type)
return 0;
@@ -7398,7 +7809,7 @@ static int check_return_code(struct bpf_verifier_env *env)
return -EACCES;
}
- switch (env->prog->type) {
+ switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
@@ -7718,6 +8129,23 @@ err_free:
return ret;
}
+static int check_abnormal_return(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ if (env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
/* The minimum supported BTF func info size */
#define MIN_BPF_FUNCINFO_SIZE 8
#define MAX_FUNCINFO_REC_SIZE 252
@@ -7726,20 +8154,24 @@ static int check_btf_func(struct bpf_verifier_env *env,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
+ const struct btf_type *type, *func_proto, *ret_type;
u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
struct bpf_func_info *krecord;
struct bpf_func_info_aux *info_aux = NULL;
- const struct btf_type *type;
struct bpf_prog *prog;
const struct btf *btf;
void __user *urecord;
u32 prev_offset = 0;
+ bool scalar_return;
int ret = -ENOMEM;
nfuncs = attr->func_info_cnt;
- if (!nfuncs)
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
return 0;
+ }
if (nfuncs != env->subprog_cnt) {
verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
@@ -7787,25 +8219,23 @@ static int check_btf_func(struct bpf_verifier_env *env,
}
/* check insn_off */
+ ret = -EINVAL;
if (i == 0) {
if (krecord[i].insn_off) {
verbose(env,
"nonzero insn_off %u for the first func info record",
krecord[i].insn_off);
- ret = -EINVAL;
goto err_free;
}
} else if (krecord[i].insn_off <= prev_offset) {
verbose(env,
"same or smaller insn offset (%u) than previous func info record (%u)",
krecord[i].insn_off, prev_offset);
- ret = -EINVAL;
goto err_free;
}
if (env->subprog_info[i].start != krecord[i].insn_off) {
verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
- ret = -EINVAL;
goto err_free;
}
@@ -7814,10 +8244,26 @@ static int check_btf_func(struct bpf_verifier_env *env,
if (!type || !btf_type_is_func(type)) {
verbose(env, "invalid type id %d in func info",
krecord[i].type_id);
- ret = -EINVAL;
goto err_free;
}
info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+
+ func_proto = btf_type_by_id(btf, type->type);
+ if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
+ /* btf_func_check() already verified it during BTF load */
+ goto err_free;
+ ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ scalar_return =
+ btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
+ if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+ if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+
prev_offset = krecord[i].insn_off;
urecord += urec_size;
}
@@ -7978,8 +8424,11 @@ static int check_btf_info(struct bpf_verifier_env *env,
struct btf *btf;
int err;
- if (!attr->func_info_cnt && !attr->line_info_cnt)
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
return 0;
+ }
btf = btf_get_by_fd(attr->prog_btf_fd);
if (IS_ERR(btf))
@@ -9119,6 +9568,92 @@ process_bpf_exit:
return 0;
}
+/* replace pseudo btf_id with kernel symbol address */
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux)
+{
+ u32 datasec_id, type, id = insn->imm;
+ const struct btf_var_secinfo *vsi;
+ const struct btf_type *datasec;
+ const struct btf_type *t;
+ const char *sym_name;
+ bool percpu = false;
+ u64 addr;
+ int i;
+
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
+ }
+
+ if (insn[1].imm != 0) {
+ verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n");
+ return -EINVAL;
+ }
+
+ t = btf_type_by_id(btf_vmlinux, id);
+ if (!t) {
+ verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
+ return -ENOENT;
+ }
+
+ if (!btf_type_is_var(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n",
+ id);
+ return -EINVAL;
+ }
+
+ sym_name = btf_name_by_offset(btf_vmlinux, t->name_off);
+ addr = kallsyms_lookup_name(sym_name);
+ if (!addr) {
+ verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
+ sym_name);
+ return -ENOENT;
+ }
+
+ datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu",
+ BTF_KIND_DATASEC);
+ if (datasec_id > 0) {
+ datasec = btf_type_by_id(btf_vmlinux, datasec_id);
+ for_each_vsi(i, datasec, vsi) {
+ if (vsi->type == id) {
+ percpu = true;
+ break;
+ }
+ }
+ }
+
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ type = t->type;
+ t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
+ if (percpu) {
+ aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+ aux->btf_var.btf_id = type;
+ } else if (!btf_type_is_struct(t)) {
+ const struct btf_type *ret;
+ const char *tname;
+ u32 tsize;
+
+ /* resolve the type size of ksym. */
+ ret = btf_resolve_size(btf_vmlinux, t, &tsize);
+ if (IS_ERR(ret)) {
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+ verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
+ tname, PTR_ERR(ret));
+ return -EINVAL;
+ }
+ aux->btf_var.reg_type = PTR_TO_MEM;
+ aux->btf_var.mem_size = tsize;
+ } else {
+ aux->btf_var.reg_type = PTR_TO_BTF_ID;
+ aux->btf_var.btf_id = type;
+ }
+ return 0;
+}
+
static int check_map_prealloc(struct bpf_map *map)
{
return (map->map_type != BPF_MAP_TYPE_HASH &&
@@ -9154,6 +9689,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_prog *prog)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
/*
* Validate that trace type programs use preallocated hash maps.
*
@@ -9171,8 +9707,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
* now, but warnings are emitted so developers are made aware of
* the unsafety and can fix their programs before this is enforced.
*/
- if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) {
- if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
+ if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
+ if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
@@ -9184,8 +9720,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
- if ((is_tracing_prog_type(prog->type) ||
- prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+ if ((is_tracing_prog_type(prog_type) ||
+ prog_type == BPF_PROG_TYPE_SOCKET_FILTER) &&
map_value_has_spin_lock(map)) {
verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -9202,6 +9738,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (prog->aux->sleepable)
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_ARRAY:
+ if (!is_preallocated_map(map)) {
+ verbose(env,
+ "Sleepable programs can only use preallocated hash maps\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ verbose(env,
+ "Sleepable programs can only use array and hash maps\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -9211,10 +9764,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
}
-/* look for pseudo eBPF instructions that access map FDs and
- * replace them with actual map pointers
+/* find and rewrite pseudo imm in ld_imm64 instructions:
+ *
+ * 1. if it accesses map FD, replace it with actual map pointer.
+ * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
+ *
+ * NOTE: btf_vmlinux is required for converting pseudo btf_id.
*/
-static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
+static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -9255,6 +9812,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
/* valid generic load 64-bit imm */
goto next_insn;
+ if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
+ aux = &env->insn_aux_data[i];
+ err = check_pseudo_btf_id(env, insn, aux);
+ if (err)
+ return err;
+ goto next_insn;
+ }
+
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
@@ -9436,6 +10001,18 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
}
}
+static void adjust_poke_descs(struct bpf_prog *prog, u32 len)
+{
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+ int i, sz = prog->aux->size_poke_tab;
+ struct bpf_jit_poke_descriptor *desc;
+
+ for (i = 0; i < sz; i++) {
+ desc = &tab[i];
+ desc->insn_idx += len - 1;
+ }
+}
+
static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
const struct bpf_insn *patch, u32 len)
{
@@ -9452,6 +10029,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
if (adjust_insn_aux_data(env, new_prog, off, len))
return NULL;
adjust_subprog_starts(env, off, len);
+ adjust_poke_descs(new_prog, len);
return new_prog;
}
@@ -9897,7 +10475,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
- } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
+ } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
verbose(env, "Writes through BTF pointers are not allowed\n");
return -EINVAL;
}
@@ -9982,6 +10560,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog, **func, *tmp;
int i, j, subprog_start, subprog_end = 0, len, subprog;
+ struct bpf_map *map_ptr;
struct bpf_insn *insn;
void *old_bpf_func;
int err, num_exentries;
@@ -10049,6 +10628,31 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;
+ for (j = 0; j < prog->aux->size_poke_tab; j++) {
+ u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
+ int ret;
+
+ if (!(insn_idx >= subprog_start &&
+ insn_idx <= subprog_end))
+ continue;
+
+ ret = bpf_jit_add_poke_descriptor(func[i],
+ &prog->aux->poke_tab[j]);
+ if (ret < 0) {
+ verbose(env, "adding tail call poke descriptor failed\n");
+ goto out_free;
+ }
+
+ func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
+
+ map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
+ ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
+ if (ret < 0) {
+ verbose(env, "tracking tail call prog failed\n");
+ goto out_free;
+ }
+ }
+
/* Use bpf_prog_F_tag to indicate functions in stack traces.
* Long term would need debug info to populate names
*/
@@ -10067,6 +10671,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
+ func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
func[i] = bpf_int_jit_compile(func[i]);
if (!func[i]->jited) {
err = -ENOTSUPP;
@@ -10074,6 +10679,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
}
cond_resched();
}
+
+ /* Untrack main program's aux structs so that during map_poke_run()
+ * we will not stumble upon the unfilled poke descriptors; each
+ * of the main program's poke descs got distributed across subprogs
+ * and got tracked onto map, so we are sure that none of them will
+ * be missed after the operation below
+ */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+ }
+
/* at this point all bpf functions were successfully JITed
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
@@ -10142,9 +10760,16 @@ static int jit_subprogs(struct bpf_verifier_env *env)
bpf_prog_free_unused_jited_linfo(prog);
return 0;
out_free:
- for (i = 0; i < env->subprog_cnt; i++)
- if (func[i])
- bpf_jit_free(func[i]);
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (!func[i])
+ continue;
+
+ for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
+ map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
+ map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
+ }
+ bpf_jit_free(func[i]);
+ }
kfree(func);
out_undo_insn:
/* cleanup main prog to be interpreted */
@@ -10178,6 +10803,13 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
+ /* When JIT fails the progs with bpf2bpf calls and tail_calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+ return -EINVAL;
+ }
for (i = 0; i < prog->len; i++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
@@ -10341,8 +10973,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* the program array.
*/
prog->cb_access = 1;
- env->prog->aux->stack_depth = MAX_BPF_STACK;
- env->prog->aux->max_pkt_offset = MAX_PACKET_OFF;
+ if (!allow_tail_call_in_subprogs(env))
+ prog->aux->stack_depth = MAX_BPF_STACK;
+ prog->aux->max_pkt_offset = MAX_PACKET_OFF;
/* mark bpf_tail_call as different opcode to avoid
* conditional branch in the interpeter for every normal
@@ -10362,6 +10995,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
.reason = BPF_POKE_REASON_TAIL_CALL,
.tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
.tail_call.key = bpf_map_key_immediate(aux),
+ .insn_idx = i + delta,
};
ret = bpf_jit_add_poke_descriptor(prog, &desc);
@@ -10427,7 +11061,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ if (cnt == -EOPNOTSUPP)
+ goto patch_map_ops_generic;
+ if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -10457,7 +11093,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
-
+patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
@@ -10810,59 +11446,79 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
}
#define SECURITY_PREFIX "security_"
-static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr)
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
{
if (within_error_injection_list(addr) ||
- !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name,
- sizeof(SECURITY_PREFIX) - 1))
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
return 0;
return -EINVAL;
}
-static int check_attach_btf_id(struct bpf_verifier_env *env)
+/* non exhaustive list of sleepable bpf_lsm_*() functions */
+BTF_SET_START(btf_sleepable_lsm_hooks)
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
+#else
+BTF_ID_UNUSED
+#endif
+BTF_SET_END(btf_sleepable_lsm_hooks)
+
+static int check_sleepable_lsm_hook(u32 btf_id)
+{
+ return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id);
+}
+
+/* list of non-sleepable functions that are otherwise on
+ * ALLOW_ERROR_INJECTION list
+ */
+BTF_SET_START(btf_non_sleepable_error_inject)
+/* Three functions below can be called from sleepable and non-sleepable context.
+ * Assume non-sleepable from bpf safety point of view.
+ */
+BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, should_fail_alloc_page)
+BTF_ID(func, should_failslab)
+BTF_SET_END(btf_non_sleepable_error_inject)
+
+static int check_non_sleepable_error_inject(u32 btf_id)
+{
+ return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
+}
+
+int bpf_check_attach_target(struct bpf_verifier_log *log,
+ const struct bpf_prog *prog,
+ const struct bpf_prog *tgt_prog,
+ u32 btf_id,
+ struct bpf_attach_target_info *tgt_info)
{
- struct bpf_prog *prog = env->prog;
bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
- u32 btf_id = prog->aux->attach_btf_id;
const char prefix[] = "btf_trace_";
- struct btf_func_model fmodel;
int ret = 0, subprog = -1, i;
- struct bpf_trampoline *tr;
const struct btf_type *t;
bool conservative = true;
const char *tname;
struct btf *btf;
- long addr;
- u64 key;
-
- if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
- return check_struct_ops_btf_id(env);
-
- if (prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM &&
- !prog_extension)
- return 0;
+ long addr = 0;
if (!btf_id) {
- verbose(env, "Tracing programs must provide btf_id\n");
+ bpf_log(log, "Tracing programs must provide btf_id\n");
return -EINVAL;
}
- btf = bpf_prog_get_target_btf(prog);
+ btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux;
if (!btf) {
- verbose(env,
+ bpf_log(log,
"FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
return -EINVAL;
}
t = btf_type_by_id(btf, btf_id);
if (!t) {
- verbose(env, "attach_btf_id %u is invalid\n", btf_id);
+ bpf_log(log, "attach_btf_id %u is invalid\n", btf_id);
return -EINVAL;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
- verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id);
+ bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id);
return -EINVAL;
}
if (tgt_prog) {
@@ -10874,26 +11530,24 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
break;
}
if (subprog == -1) {
- verbose(env, "Subprog %s doesn't exist\n", tname);
+ bpf_log(log, "Subprog %s doesn't exist\n", tname);
return -EINVAL;
}
conservative = aux->func_info_aux[subprog].unreliable;
if (prog_extension) {
if (conservative) {
- verbose(env,
+ bpf_log(log,
"Cannot replace static functions\n");
return -EINVAL;
}
if (!prog->jit_requested) {
- verbose(env,
+ bpf_log(log,
"Extension programs should be JITed\n");
return -EINVAL;
}
- env->ops = bpf_verifier_ops[tgt_prog->type];
- prog->expected_attach_type = tgt_prog->expected_attach_type;
}
if (!tgt_prog->jited) {
- verbose(env, "Can attach to only JITed progs\n");
+ bpf_log(log, "Can attach to only JITed progs\n");
return -EINVAL;
}
if (tgt_prog->type == prog->type) {
@@ -10901,7 +11555,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
* Cannot attach program extension to another extension.
* It's ok to attach fentry/fexit to extension program.
*/
- verbose(env, "Cannot recursively attach\n");
+ bpf_log(log, "Cannot recursively attach\n");
return -EINVAL;
}
if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
@@ -10923,32 +11577,30 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
* reasonable stack size. Hence extending fentry is not
* allowed.
*/
- verbose(env, "Cannot extend fentry/fexit\n");
+ bpf_log(log, "Cannot extend fentry/fexit\n");
return -EINVAL;
}
- key = ((u64)aux->id) << 32 | btf_id;
} else {
if (prog_extension) {
- verbose(env, "Cannot replace kernel functions\n");
+ bpf_log(log, "Cannot replace kernel functions\n");
return -EINVAL;
}
- key = btf_id;
}
switch (prog->expected_attach_type) {
case BPF_TRACE_RAW_TP:
if (tgt_prog) {
- verbose(env,
+ bpf_log(log,
"Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
return -EINVAL;
}
if (!btf_type_is_typedef(t)) {
- verbose(env, "attach_btf_id %u is not a typedef\n",
+ bpf_log(log, "attach_btf_id %u is not a typedef\n",
btf_id);
return -EINVAL;
}
if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
- verbose(env, "attach_btf_id %u points to wrong type name %s\n",
+ bpf_log(log, "attach_btf_id %u points to wrong type name %s\n",
btf_id, tname);
return -EINVAL;
}
@@ -10962,29 +11614,20 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
/* should never happen in valid vmlinux build */
return -EINVAL;
- /* remember two read only pointers that are valid for
- * the life time of the kernel
- */
- prog->aux->attach_func_name = tname;
- prog->aux->attach_func_proto = t;
- prog->aux->attach_btf_trace = true;
- return 0;
+ break;
case BPF_TRACE_ITER:
if (!btf_type_is_func(t)) {
- verbose(env, "attach_btf_id %u is not a function\n",
+ bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
return -EINVAL;
}
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
- prog->aux->attach_func_name = tname;
- prog->aux->attach_func_proto = t;
- if (!bpf_iter_prog_supported(prog))
- return -EINVAL;
- ret = btf_distill_func_proto(&env->log, btf, t,
- tname, &fmodel);
- return ret;
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
+ if (ret)
+ return ret;
+ break;
default:
if (!prog_extension)
return -EINVAL;
@@ -10993,42 +11636,30 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
case BPF_LSM_MAC:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- prog->aux->attach_func_name = tname;
- if (prog->type == BPF_PROG_TYPE_LSM) {
- ret = bpf_lsm_verify_prog(&env->log, prog);
- if (ret < 0)
- return ret;
- }
-
if (!btf_type_is_func(t)) {
- verbose(env, "attach_btf_id %u is not a function\n",
+ bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
return -EINVAL;
}
if (prog_extension &&
- btf_check_type_match(env, prog, btf, t))
+ btf_check_type_match(log, prog, btf, t))
return -EINVAL;
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_func_proto(t))
return -EINVAL;
- tr = bpf_trampoline_lookup(key);
- if (!tr)
- return -ENOMEM;
- /* t is either vmlinux type or another program's type */
- prog->aux->attach_func_proto = t;
- mutex_lock(&tr->mutex);
- if (tr->func.addr) {
- prog->aux->trampoline = tr;
- goto out;
- }
- if (tgt_prog && conservative) {
- prog->aux->attach_func_proto = NULL;
+
+ if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) &&
+ (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type ||
+ prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type))
+ return -EINVAL;
+
+ if (tgt_prog && conservative)
t = NULL;
- }
- ret = btf_distill_func_proto(&env->log, btf, t,
- tname, &tr->func.model);
+
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
if (ret < 0)
- goto out;
+ return ret;
+
if (tgt_prog) {
if (subprog == 0)
addr = (long) tgt_prog->bpf_func;
@@ -11037,31 +11668,137 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
} else {
addr = kallsyms_lookup_name(tname);
if (!addr) {
- verbose(env,
+ bpf_log(log,
"The address of function %s cannot be found\n",
tname);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
}
- if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
- ret = check_attach_modify_return(prog, addr);
- if (ret)
- verbose(env, "%s() is not modifiable\n",
- prog->aux->attach_func_name);
+ if (prog->aux->sleepable) {
+ ret = -EINVAL;
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
+ */
+ if (!check_non_sleepable_error_inject(btf_id) &&
+ within_error_injection_list(addr))
+ ret = 0;
+ break;
+ case BPF_PROG_TYPE_LSM:
+ /* LSM progs check that they are attached to bpf_lsm_*() funcs.
+ * Only some of them are sleepable.
+ */
+ if (check_sleepable_lsm_hook(btf_id))
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+ if (ret) {
+ bpf_log(log, "%s is not sleepable\n", tname);
+ return ret;
+ }
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
+ if (tgt_prog) {
+ bpf_log(log, "can't modify return codes of BPF programs\n");
+ return -EINVAL;
+ }
+ ret = check_attach_modify_return(addr, tname);
+ if (ret) {
+ bpf_log(log, "%s() is not modifiable\n", tname);
+ return ret;
+ }
}
- if (ret)
- goto out;
- tr->func.addr = (void *)addr;
- prog->aux->trampoline = tr;
-out:
- mutex_unlock(&tr->mutex);
- if (ret)
- bpf_trampoline_put(tr);
+ break;
+ }
+ tgt_info->tgt_addr = addr;
+ tgt_info->tgt_name = tname;
+ tgt_info->tgt_type = t;
+ return 0;
+}
+
+static int check_attach_btf_id(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
+ struct bpf_attach_target_info tgt_info = {};
+ u32 btf_id = prog->aux->attach_btf_id;
+ struct bpf_trampoline *tr;
+ int ret;
+ u64 key;
+
+ if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM) {
+ verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+ return -EINVAL;
+ }
+
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
+ return check_struct_ops_btf_id(env);
+
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM &&
+ prog->type != BPF_PROG_TYPE_EXT)
+ return 0;
+
+ ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
+ if (ret)
return ret;
+
+ if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) {
+ /* to make freplace equivalent to their targets, they need to
+ * inherit env->ops and expected_attach_type for the rest of the
+ * verification
+ */
+ env->ops = bpf_verifier_ops[tgt_prog->type];
+ prog->expected_attach_type = tgt_prog->expected_attach_type;
+ }
+
+ /* store info about the attachment target that will be used later */
+ prog->aux->attach_func_proto = tgt_info.tgt_type;
+ prog->aux->attach_func_name = tgt_info.tgt_name;
+
+ if (tgt_prog) {
+ prog->aux->saved_dst_prog_type = tgt_prog->type;
+ prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type;
}
+
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ prog->aux->attach_btf_trace = true;
+ return 0;
+ } else if (prog->expected_attach_type == BPF_TRACE_ITER) {
+ if (!bpf_iter_prog_supported(prog))
+ return -EINVAL;
+ return 0;
+ }
+
+ if (prog->type == BPF_PROG_TYPE_LSM) {
+ ret = bpf_lsm_verify_prog(&env->log, prog);
+ if (ret < 0)
+ return ret;
+ }
+
+ key = bpf_trampoline_compute_key(tgt_prog, btf_id);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ prog->aux->dst_trampoline = tr;
+ return 0;
+}
+
+struct btf *bpf_get_btf_vmlinux(void)
+{
+ if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ mutex_lock(&bpf_verifier_lock);
+ if (!btf_vmlinux)
+ btf_vmlinux = btf_parse_vmlinux();
+ mutex_unlock(&bpf_verifier_lock);
+ }
+ return btf_vmlinux;
}
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
@@ -11097,12 +11834,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->ops = bpf_verifier_ops[env->prog->type];
is_priv = bpf_capable();
- if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
- mutex_lock(&bpf_verifier_lock);
- if (!btf_vmlinux)
- btf_vmlinux = btf_parse_vmlinux();
- mutex_unlock(&bpf_verifier_lock);
- }
+ bpf_get_btf_vmlinux();
/* grab the mutex to protect few globals used by verifier */
if (!is_priv)
@@ -11145,10 +11877,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
- ret = replace_map_fd_with_map_ptr(env);
- if (ret < 0)
- goto skip_full_check;
-
if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
@@ -11174,6 +11902,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret)
goto skip_full_check;
+ ret = resolve_pseudo_ldimm64(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dd247747ec14..e41c21819ba0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2006,7 +2006,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- kernfs_activate(root_cgrp->kn);
ret = 0;
goto out;
@@ -3682,6 +3681,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
struct cgroup_subsys_state *css;
int ret;
+ if (!nbytes)
+ return 0;
+
/*
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 642415b8c3c9..57b5b5d0a5fd 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* The top cpuset doesn't have any online cpu as a
* consequence of a race between cpuset_hotplug_work
* and cpu hotplug notifier. But we know the top
- * cpuset's effective_cpus is on its way to to be
+ * cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
cpumask_copy(pmask, cpu_online_mask);
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 44a259338e33..f7e1d0eccdbc 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -15,18 +15,28 @@
static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
-static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
/*
- * __atomic_notifier_call_chain has a RCU read critical section, which
+ * atomic_notifier_call_chain has a RCU read critical section, which
* could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let
* RCU know this.
*/
rcu_irq_enter_irqson();
- ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
- nr_to_call, nr_calls);
+ ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL);
+ rcu_irq_exit_irqson();
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
+{
+ int ret;
+
+ rcu_irq_enter_irqson();
+ ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL);
rcu_irq_exit_irqson();
return notifier_to_errno(ret);
@@ -80,18 +90,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
*/
int cpu_pm_enter(void)
{
- int nr_calls = 0;
- int ret = 0;
-
- ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
- if (ret)
- /*
- * Inform listeners (nr_calls - 1) about failure of CPU PM
- * PM entry who are notified earlier to prepare for it.
- */
- cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
-
- return ret;
+ return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED);
}
EXPORT_SYMBOL_GPL(cpu_pm_enter);
@@ -109,7 +108,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+ return cpu_pm_notify(CPU_PM_EXIT);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -131,18 +130,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*/
int cpu_cluster_pm_enter(void)
{
- int nr_calls = 0;
- int ret = 0;
-
- ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
- if (ret)
- /*
- * Inform listeners (nr_calls - 1) about failure of CPU cluster
- * PM entry who are notified earlier to prepare for it.
- */
- cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
-
- return ret;
+ return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
@@ -163,7 +151,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 847a9d1fa634..c99de4a21458 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -9,6 +9,7 @@ config HAS_DMA
default y
config DMA_OPS
+ depends on HAS_DMA
bool
#
@@ -43,6 +44,12 @@ config ARCH_HAS_DMA_SET_MASK
config ARCH_HAS_DMA_WRITE_COMBINE
bool
+#
+# Select if the architectures provides the arch_dma_mark_clean hook
+#
+config ARCH_HAS_DMA_MARK_CLEAN
+ bool
+
config DMA_DECLARE_COHERENT
bool
@@ -68,9 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
-config DMA_NONCOHERENT_CACHE_SYNC
- bool
-
config DMA_VIRT_OPS
bool
depends on HAS_DMA
@@ -114,10 +118,21 @@ config DMA_CMA
You can disable CMA by specifying "cma=0" on the kernel's command
line.
- For more information see <include/linux/dma-contiguous.h>.
+ For more information see <kernel/dma/contiguous.c>.
If unsure, say "n".
if DMA_CMA
+
+config DMA_PERNUMA_CMA
+ bool "Enable separate DMA Contiguous Memory Area for each NUMA Node"
+ default NUMA && ARM64
+ help
+ Enable this option to get pernuma CMA areas so that devices like
+ ARM64 SMMU can get local memory by DMA coherent APIs.
+
+ You can set the size of pernuma CMA by specifying "cma_pernuma=size"
+ on the kernel's command line.
+
comment "Default contiguous memory area size:"
config CMA_SIZE_MBYTES
@@ -162,7 +177,7 @@ endchoice
config CMA_ALIGNMENT
int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
- range 4 12
+ range 2 12
default 8
help
DMA mapping framework by default aligns all buffers to the smallest
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 32c7c1942bbd..dc755ab68aab 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_HAS_DMA) += mapping.o direct.o
+obj-$(CONFIG_DMA_OPS) += ops_helpers.o
obj-$(CONFIG_DMA_OPS) += dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 2a0c4985f38e..5b5b6c7ec7f2 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -7,7 +7,8 @@
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
struct dma_coherent_mem {
void *virt_base;
@@ -32,9 +33,8 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
struct dma_coherent_mem * mem)
{
if (mem->use_dev_dma_pfn_offset)
- return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT;
- else
- return mem->device_base;
+ return phys_to_dma(dev, PFN_PHYS(mem->pfn_base));
+ return mem->device_base;
}
static int dma_init_coherent_memory(phys_addr_t phys_addr,
@@ -107,6 +107,23 @@ static int dma_assign_coherent_memory(struct device *dev,
return 0;
}
+/*
+ * Declare a region of memory to be handed out by dma_alloc_coherent() when it
+ * is asked for coherent memory for this device. This shall only be used
+ * from platform code, usually based on the device tree description.
+ *
+ * phys_addr is the CPU physical address to which the memory is currently
+ * assigned (this will be ioremapped so the CPU can access the region).
+ *
+ * device_addr is the DMA address the device needs to be programmed with to
+ * actually address this memory (this will be handed out as the dma_addr_t in
+ * dma_alloc_coherent()).
+ *
+ * size is the size of the area (must be a multiple of PAGE_SIZE).
+ *
+ * As a simplification for the platforms, only *one* such region of memory may
+ * be declared per device.
+ */
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_addr_t device_addr, size_t size)
{
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index cff7e60968b9..16b95ff12e4d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -5,6 +5,34 @@
* Written by:
* Marek Szyprowski <m.szyprowski@samsung.com>
* Michal Nazarewicz <mina86@mina86.com>
+ *
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
*/
#define pr_fmt(fmt) "cma: " fmt
@@ -16,12 +44,11 @@
#endif
#include <asm/page.h>
-#include <asm/dma-contiguous.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/sizes.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-map-ops.h>
#include <linux/cma.h>
#ifdef CONFIG_CMA_SIZE_MBYTES
@@ -69,20 +96,24 @@ static int __init early_cma(char *p)
}
early_param("cma", early_cma);
+#ifdef CONFIG_DMA_PERNUMA_CMA
+
+static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES];
+static phys_addr_t pernuma_size_bytes __initdata;
+
+static int __init early_cma_pernuma(char *p)
+{
+ pernuma_size_bytes = memparse(p, &p);
+ return 0;
+}
+early_param("cma_pernuma", early_cma_pernuma);
+#endif
+
#ifdef CONFIG_CMA_SIZE_PERCENTAGE
static phys_addr_t __init __maybe_unused cma_early_percent_memory(void)
{
- struct memblock_region *reg;
- unsigned long total_pages = 0;
-
- /*
- * We cannot use memblock_phys_mem_size() here, because
- * memblock_analyze() has not been called yet.
- */
- for_each_memblock(memory, reg)
- total_pages += memblock_region_memory_end_pfn(reg) -
- memblock_region_memory_base_pfn(reg);
+ unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size());
return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
}
@@ -96,6 +127,34 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
#endif
+#ifdef CONFIG_DMA_PERNUMA_CMA
+void __init dma_pernuma_cma_reserve(void)
+{
+ int nid;
+
+ if (!pernuma_size_bytes)
+ return;
+
+ for_each_online_node(nid) {
+ int ret;
+ char name[CMA_MAX_NAME];
+ struct cma **cma = &dma_contiguous_pernuma_area[nid];
+
+ snprintf(name, sizeof(name), "pernuma%d", nid);
+ ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
+ 0, false, name, cma, nid);
+ if (ret) {
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ continue;
+ }
+
+ pr_debug("%s: reserved %llu MiB on node %d\n", __func__,
+ (unsigned long long)pernuma_size_bytes / SZ_1M, nid);
+ }
+}
+#endif
+
/**
* dma_contiguous_reserve() - reserve area(s) for contiguous memory handling
* @limit: End address of the reserved memory (optional, 0 for any).
@@ -143,6 +202,11 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
}
}
+void __weak
+dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
+{
+}
+
/**
* dma_contiguous_reserve_area() - reserve custom contiguous area
* @size: Size of the reserved area (in bytes),
@@ -228,23 +292,44 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
* @size: Requested allocation size.
* @gfp: Allocation flags.
*
- * This function allocates contiguous memory buffer for specified device. It
- * tries to use device specific contiguous memory area if available, or the
- * default global one.
+ * tries to use device specific contiguous memory area if available, or it
+ * tries to use per-numa cma, if the allocation fails, it will fallback to
+ * try default global one.
*
- * Note that it byapss one-page size of allocations from the global area as
- * the addresses within one page are always contiguous, so there is no need
- * to waste CMA pages for that kind; it also helps reduce fragmentations.
+ * Note that it bypass one-page size of allocations from the per-numa and
+ * global area as the addresses within one page are always contiguous, so
+ * there is no need to waste CMA pages for that kind; it also helps reduce
+ * fragmentations.
*/
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
+#ifdef CONFIG_DMA_PERNUMA_CMA
+ int nid = dev_to_node(dev);
+#endif
+
/* CMA can be used only in the context which permits sleeping */
if (!gfpflags_allow_blocking(gfp))
return NULL;
if (dev->cma_area)
return cma_alloc_aligned(dev->cma_area, size, gfp);
- if (size <= PAGE_SIZE || !dma_contiguous_default_area)
+ if (size <= PAGE_SIZE)
+ return NULL;
+
+#ifdef CONFIG_DMA_PERNUMA_CMA
+ if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) {
+ struct cma *cma = dma_contiguous_pernuma_area[nid];
+ struct page *page;
+
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
+ }
+#endif
+ if (!dma_contiguous_default_area)
return NULL;
+
return cma_alloc_aligned(dma_contiguous_default_area, size, gfp);
}
@@ -261,9 +346,27 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
*/
void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
{
- if (!cma_release(dev_get_cma_area(dev), page,
- PAGE_ALIGN(size) >> PAGE_SHIFT))
- __free_pages(page, get_order(size));
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ /* if dev has its own cma, free page from there */
+ if (dev->cma_area) {
+ if (cma_release(dev->cma_area, page, count))
+ return;
+ } else {
+ /*
+ * otherwise, page is from either per-numa cma or default cma
+ */
+#ifdef CONFIG_DMA_PERNUMA_CMA
+ if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)],
+ page, count))
+ return;
+#endif
+ if (cma_release(dma_contiguous_default_area, page, count))
+ return;
+ }
+
+ /* not in any cma, free from buddy */
+ __free_pages(page, get_order(size));
}
/*
@@ -279,14 +382,14 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev)
{
- dev_set_cma_area(dev, rmem->priv);
+ dev->cma_area = rmem->priv;
return 0;
}
static void rmem_cma_device_release(struct reserved_mem *rmem,
struct device *dev)
{
- dev_set_cma_area(dev, NULL);
+ dev->cma_area = NULL;
}
static const struct reserved_mem_ops rmem_cma_ops = {
@@ -327,7 +430,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
dma_contiguous_early_fixup(rmem->base, rmem->size);
if (default_cma)
- dma_contiguous_set_default(cma);
+ dma_contiguous_default_area = cma;
rmem->ops = &rmem_cma_ops;
rmem->priv = cma;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 8e9f7b301c6d..14de1271463f 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -9,10 +9,9 @@
#include <linux/sched/task_stack.h>
#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-map-ops.h>
#include <linux/sched/task.h>
#include <linux/stacktrace.h>
-#include <linux/dma-debug.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/debugfs.h>
@@ -24,8 +23,8 @@
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/slab.h>
-
#include <asm/sections.h>
+#include "debug.h"
#define HASH_SIZE 16384ULL
#define HASH_FN_SHIFT 13
@@ -1219,7 +1218,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
entry->dev = dev;
entry->type = dma_debug_single;
entry->pfn = page_to_pfn(page);
- entry->offset = offset,
+ entry->offset = offset;
entry->dev_addr = dma_addr;
entry->size = size;
entry->direction = direction;
@@ -1235,7 +1234,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
add_dma_entry(entry);
}
-EXPORT_SYMBOL(debug_dma_map_page);
void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
@@ -1290,7 +1288,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
return;
check_unmap(&ref);
}
-EXPORT_SYMBOL(debug_dma_unmap_page);
void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
int nents, int mapped_ents, int direction)
@@ -1310,7 +1307,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->type = dma_debug_sg;
entry->dev = dev;
entry->pfn = page_to_pfn(sg_page(s));
- entry->offset = s->offset,
+ entry->offset = s->offset;
entry->size = sg_dma_len(s);
entry->dev_addr = sg_dma_address(s);
entry->direction = direction;
@@ -1328,7 +1325,6 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
add_dma_entry(entry);
}
}
-EXPORT_SYMBOL(debug_dma_map_sg);
static int get_nr_mapped_entries(struct device *dev,
struct dma_debug_entry *ref)
@@ -1380,7 +1376,6 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
check_unmap(&ref);
}
}
-EXPORT_SYMBOL(debug_dma_unmap_sg);
void debug_dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t dma_addr, void *virt)
@@ -1466,7 +1461,6 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
add_dma_entry(entry);
}
-EXPORT_SYMBOL(debug_dma_map_resource);
void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
@@ -1484,7 +1478,6 @@ void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
check_unmap(&ref);
}
-EXPORT_SYMBOL(debug_dma_unmap_resource);
void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
size_t size, int direction)
@@ -1503,7 +1496,6 @@ void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
check_sync(dev, &ref, true);
}
-EXPORT_SYMBOL(debug_dma_sync_single_for_cpu);
void debug_dma_sync_single_for_device(struct device *dev,
dma_addr_t dma_handle, size_t size,
@@ -1523,7 +1515,6 @@ void debug_dma_sync_single_for_device(struct device *dev,
check_sync(dev, &ref, false);
}
-EXPORT_SYMBOL(debug_dma_sync_single_for_device);
void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
@@ -1556,7 +1547,6 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
check_sync(dev, &ref, true);
}
}
-EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
@@ -1588,7 +1578,6 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
check_sync(dev, &ref, false);
}
}
-EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
static int __init dma_debug_driver_setup(char *str)
{
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
new file mode 100644
index 000000000000..83643b3010b2
--- /dev/null
+++ b/kernel/dma/debug.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2008 Advanced Micro Devices, Inc.
+ *
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ */
+
+#ifndef _KERNEL_DMA_DEBUG_H
+#define _KERNEL_DMA_DEBUG_H
+
+#ifdef CONFIG_DMA_API_DEBUG
+extern void debug_dma_map_page(struct device *dev, struct page *page,
+ size_t offset, size_t size,
+ int direction, dma_addr_t dma_addr);
+
+extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, int direction);
+
+extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction);
+
+extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir);
+
+extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt);
+
+extern void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t addr);
+
+extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
+ size_t size, int direction,
+ dma_addr_t dma_addr);
+
+extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int direction);
+
+extern void debug_dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+
+extern void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction);
+
+extern void debug_dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction);
+
+extern void debug_dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction);
+#else /* CONFIG_DMA_API_DEBUG */
+static inline void debug_dma_map_page(struct device *dev, struct page *page,
+ size_t offset, size_t size,
+ int direction, dma_addr_t dma_addr)
+{
+}
+
+static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction)
+{
+}
+
+static inline void debug_dma_unmap_sg(struct device *dev,
+ struct scatterlist *sglist,
+ int nelems, int dir)
+{
+}
+
+static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt)
+{
+}
+
+static inline void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t addr)
+{
+}
+
+static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+}
+
+static inline void debug_dma_unmap_resource(struct device *dev,
+ dma_addr_t dma_addr, size_t size,
+ int direction)
+{
+}
+
+static inline void debug_dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+}
+
+static inline void debug_dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+}
+#endif /* CONFIG_DMA_API_DEBUG */
+#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index db6ef07aec3b..06c111544f61 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -1,21 +1,22 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (C) 2018 Christoph Hellwig.
+ * Copyright (C) 2018-2020 Christoph Hellwig.
*
* DMA operations that map physical memory directly without using an IOMMU.
*/
#include <linux/memblock.h> /* for max_pfn */
#include <linux/export.h>
#include <linux/mm.h>
-#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
#include <linux/scatterlist.h>
-#include <linux/dma-contiguous.h>
#include <linux/pfn.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
+#include <linux/slab.h>
+#include "direct.h"
/*
- * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
+ * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use
* it for entirely different regions. In that case the arch code needs to
* override the variable below for dma-direct to work properly.
*/
@@ -25,7 +26,7 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
{
if (force_dma_unencrypted(dev))
- return __phys_to_dma(dev, phys);
+ return phys_to_dma_unencrypted(dev, phys);
return phys_to_dma(dev, phys);
}
@@ -48,11 +49,6 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
{
u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
- if (force_dma_unencrypted(dev))
- *phys_limit = __dma_to_phys(dev, dma_limit);
- else
- *phys_limit = dma_to_phys(dev, dma_limit);
-
/*
* Optimistically try the zone that the physical address mask falls
* into first. If that returns memory that isn't actually addressable
@@ -61,6 +57,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
* Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
* zones.
*/
+ *phys_limit = dma_to_phys(dev, dma_limit);
if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
return GFP_DMA;
if (*phys_limit <= DMA_BIT_MASK(32))
@@ -70,45 +67,16 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
- return phys_to_dma_direct(dev, phys) + size - 1 <=
- min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
-}
-
-/*
- * Decrypting memory is allowed to block, so if this device requires
- * unencrypted memory it must come from atomic pools.
- */
-static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp,
- unsigned long attrs)
-{
- if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
- return false;
- if (gfpflags_allow_blocking(gfp))
- return false;
- if (force_dma_unencrypted(dev))
- return true;
- if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
- return false;
- if (dma_alloc_need_uncached(dev, attrs))
- return true;
- return false;
-}
+ dma_addr_t dma_addr = phys_to_dma_direct(dev, phys);
-static inline bool dma_should_free_from_pool(struct device *dev,
- unsigned long attrs)
-{
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
- return true;
- if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev))
+ if (dma_addr == DMA_MAPPING_ERROR)
return false;
- if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
- return true;
- return false;
+ return dma_addr + size - 1 <=
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
- gfp_t gfp, unsigned long attrs)
+ gfp_t gfp)
{
int node = dev_to_node(dev);
struct page *page = NULL;
@@ -116,11 +84,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
- if (attrs & DMA_ATTR_NO_WARN)
- gfp |= __GFP_NOWARN;
-
- /* we always manually zero the memory once we are done: */
- gfp &= ~__GFP_ZERO;
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
@@ -151,7 +114,23 @@ again:
return page;
}
-void *dma_direct_alloc_pages(struct device *dev, size_t size,
+static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+ u64 phys_mask;
+ void *ret;
+
+ gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+ &phys_mask);
+ page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
+ if (!page)
+ return NULL;
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return ret;
+}
+
+void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
struct page *page;
@@ -159,35 +138,44 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
int err;
size = PAGE_ALIGN(size);
-
- if (dma_should_alloc_from_pool(dev, gfp, attrs)) {
- u64 phys_mask;
-
- gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_mask);
- page = dma_alloc_from_pool(dev, size, &ret, gfp,
- dma_coherent_ok);
- if (!page)
- return NULL;
- goto done;
- }
-
- page = __dma_direct_alloc_pages(dev, size, gfp, attrs);
- if (!page)
- return NULL;
+ if (attrs & DMA_ATTR_NO_WARN)
+ gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ if (!page)
+ return NULL;
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
/* return the page pointer as the opaque cookie */
- ret = page;
- goto done;
+ return page;
}
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !dev_is_dma_coherent(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+
+ /*
+ * Remapping or decrypting memory may block. If either is required and
+ * we can't block, allocate the memory from the atomic pools.
+ */
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ !gfpflags_allow_blocking(gfp) &&
+ (force_dma_unencrypted(dev) ||
+ (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
+ return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+
+ /* we always manually zero the memory once we are done */
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ if (!page)
+ return NULL;
+
if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_alloc_need_uncached(dev, attrs)) ||
+ !dev_is_dma_coherent(dev)) ||
(IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
@@ -230,17 +218,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
memset(ret, 0, size);
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- dma_alloc_need_uncached(dev, attrs)) {
+ !dev_is_dma_coherent(dev)) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
goto out_encrypt_pages;
}
done:
- if (force_dma_unencrypted(dev))
- *dma_handle = __phys_to_dma(dev, page_to_phys(page));
- else
- *dma_handle = phys_to_dma(dev, page_to_phys(page));
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
out_encrypt_pages:
@@ -256,16 +241,11 @@ out_free_pages:
return NULL;
}
-void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
- dma_addr_t dma_addr, unsigned long attrs)
+void dma_direct_free(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
unsigned int page_order = get_order(size);
- /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
- if (dma_should_free_from_pool(dev, attrs) &&
- dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
- return;
-
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
@@ -273,6 +253,18 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
return;
}
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !dev_is_dma_coherent(dev)) {
+ arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
+ return;
+ }
+
+ /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
+ return;
+
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
@@ -284,25 +276,60 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
}
-void *dma_direct_alloc(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
{
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_alloc_need_uncached(dev, attrs))
- return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
- return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
+ struct page *page;
+ void *ret;
+
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
+ return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+
+ page = __dma_direct_alloc_pages(dev, size, gfp);
+ if (!page)
+ return NULL;
+ if (PageHighMem(page)) {
+ /*
+ * Depending on the cma= arguments and per-arch setup
+ * dma_alloc_contiguous could return highmem pages.
+ * Without remapping there is no way to return them here,
+ * so log an error and fail.
+ */
+ dev_info(dev, "Rejecting highmem page from CMA.\n");
+ goto out_free_pages;
+ }
+
+ ret = page_address(page);
+ if (force_dma_unencrypted(dev)) {
+ if (set_memory_decrypted((unsigned long)ret,
+ 1 << get_order(size)))
+ goto out_free_pages;
+ }
+ memset(ret, 0, size);
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+out_free_pages:
+ dma_free_contiguous(dev, page, size);
+ return NULL;
}
-void dma_direct_free(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
+void dma_direct_free_pages(struct device *dev, size_t size,
+ struct page *page, dma_addr_t dma_addr,
+ enum dma_data_direction dir)
{
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- dma_alloc_need_uncached(dev, attrs))
- arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
- else
- dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
+ unsigned int page_order = get_order(size);
+ void *vaddr = page_address(page);
+
+ /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(dev, vaddr, size))
+ return;
+
+ if (force_dma_unencrypted(dev))
+ set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
+
+ dma_free_contiguous(dev, page, size);
}
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
@@ -345,6 +372,9 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
if (unlikely(is_swiotlb_buffer(paddr)))
swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
SYNC_FOR_CPU);
+
+ if (dir == DMA_FROM_DEVICE)
+ arch_dma_mark_clean(paddr, sg->length);
}
if (!dev_is_dma_coherent(dev))
@@ -453,13 +483,13 @@ int dma_direct_supported(struct device *dev, u64 mask)
return 1;
/*
- * This check needs to be against the actual bit mask value, so
- * use __phys_to_dma() here so that the SME encryption mask isn't
+ * This check needs to be against the actual bit mask value, so use
+ * phys_to_dma_unencrypted() here so that the SME encryption mask isn't
* part of the check.
*/
if (IS_ENABLED(CONFIG_ZONE_DMA))
min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
- return mask >= __phys_to_dma(dev, min_mask);
+ return mask >= phys_to_dma_unencrypted(dev, min_mask);
}
size_t dma_direct_max_mapping_size(struct device *dev)
@@ -476,3 +506,45 @@ bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
return !dev_is_dma_coherent(dev) ||
is_swiotlb_buffer(dma_to_phys(dev, dma_addr));
}
+
+/**
+ * dma_direct_set_offset - Assign scalar offset for a single DMA range.
+ * @dev: device pointer; needed to "own" the alloced memory.
+ * @cpu_start: beginning of memory region covered by this offset.
+ * @dma_start: beginning of DMA/PCI region covered by this offset.
+ * @size: size of the region.
+ *
+ * This is for the simple case of a uniform offset which cannot
+ * be discovered by "dma-ranges".
+ *
+ * It returns -ENOMEM if out of memory, -EINVAL if a map
+ * already exists, 0 otherwise.
+ *
+ * Note: any call to this from a driver is a bug. The mapping needs
+ * to be described by the device tree or other firmware interfaces.
+ */
+int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
+ dma_addr_t dma_start, u64 size)
+{
+ struct bus_dma_region *map;
+ u64 offset = (u64)cpu_start - (u64)dma_start;
+
+ if (dev->dma_range_map) {
+ dev_err(dev, "attempt to add DMA range to existing map\n");
+ return -EINVAL;
+ }
+
+ if (!offset)
+ return 0;
+
+ map = kcalloc(2, sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+ map[0].cpu_start = cpu_start;
+ map[0].dma_start = dma_start;
+ map[0].offset = offset;
+ map[0].size = size;
+ dev->dma_range_map = map;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dma_direct_set_offset);
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
new file mode 100644
index 000000000000..b98615578737
--- /dev/null
+++ b/kernel/dma/direct.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without using an IOMMU.
+ */
+#ifndef _KERNEL_DMA_DIRECT_H
+#define _KERNEL_DMA_DIRECT_H
+
+#include <linux/dma-direct.h>
+
+int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs);
+bool dma_direct_can_mmap(struct device *dev);
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs);
+bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs);
+size_t dma_direct_max_mapping_size(struct device *dev);
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_unmap_sg(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+}
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(paddr, size, dir);
+}
+
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev)) {
+ arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_all();
+ }
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
+
+ if (dir == DMA_FROM_DEVICE)
+ arch_dma_mark_clean(paddr, size);
+}
+
+static inline dma_addr_t dma_direct_map_page(struct device *dev,
+ struct page *page, unsigned long offset, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dma_addr = phys_to_dma(dev, phys);
+
+ if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+ return swiotlb_map(dev, phys, size, dir, attrs);
+
+ if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+ if (swiotlb_force != SWIOTLB_NO_FORCE)
+ return swiotlb_map(dev, phys, size, dir, attrs);
+
+ dev_WARN_ONCE(dev, 1,
+ "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(phys, size, dir);
+ return dma_addr;
+}
+
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys = dma_to_phys(dev, addr);
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+
+ if (unlikely(is_swiotlb_buffer(phys)))
+ swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+}
+#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index 05607642c888..eacd4c5b10bf 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -2,7 +2,7 @@
/*
* Dummy DMA ops that always fail.
*/
-#include <linux/dma-mapping.h>
+#include <linux/dma-map-ops.h>
static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
@@ -36,4 +36,3 @@ const struct dma_map_ops dma_dummy_ops = {
.map_sg = dma_dummy_map_sg,
.dma_supported = dma_dummy_supported,
};
-EXPORT_SYMBOL(dma_dummy_ops);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 0d129421e75f..51bb8fa8eb89 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -7,13 +7,14 @@
*/
#include <linux/memblock.h> /* for max_pfn */
#include <linux/acpi.h>
-#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
+#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include "debug.h"
+#include "direct.h"
/*
* Managed DMA API
@@ -144,6 +145,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
dma_addr_t addr;
BUG_ON(!valid_dma_direction(dir));
+
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return DMA_MAPPING_ERROR;
+
if (dma_map_direct(dev, ops))
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
@@ -179,6 +184,10 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
int ents;
BUG_ON(!valid_dma_direction(dir));
+
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return 0;
+
if (dma_map_direct(dev, ops))
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
@@ -213,6 +222,9 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
BUG_ON(!valid_dma_direction(dir));
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return DMA_MAPPING_ERROR;
+
/* Don't allow RAM to be mapped */
if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
return DMA_MAPPING_ERROR;
@@ -296,22 +308,6 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
EXPORT_SYMBOL(dma_sync_sg_for_device);
/*
- * Create scatter-list for the already allocated DMA buffer.
- */
-int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
- void *cpu_addr, dma_addr_t dma_addr, size_t size,
- unsigned long attrs)
-{
- struct page *page = virt_to_page(cpu_addr);
- int ret;
-
- ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
- if (!ret)
- sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
- return ret;
-}
-
-/*
* The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
* that the intention is to allow exporting memory allocated via the
* coherent DMA APIs through the dma_buf API, which only accepts a
@@ -346,9 +342,7 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
if (force_dma_unencrypted(dev))
prot = pgprot_decrypted(prot);
- if (dev_is_dma_coherent(dev) ||
- (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
- (attrs & DMA_ATTR_NON_CONSISTENT)))
+ if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
if (attrs & DMA_ATTR_WRITE_COMBINE)
@@ -358,35 +352,6 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
}
#endif /* CONFIG_MMU */
-/*
- * Create userspace mapping for the DMA-coherent memory.
- */
-int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
- void *cpu_addr, dma_addr_t dma_addr, size_t size,
- unsigned long attrs)
-{
-#ifdef CONFIG_MMU
- unsigned long user_count = vma_pages(vma);
- unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long off = vma->vm_pgoff;
- int ret = -ENXIO;
-
- vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
-
- if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
- return ret;
-
- if (off >= count || user_count > count - off)
- return -ENXIO;
-
- return remap_pfn_range(vma, vma->vm_start,
- page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
- user_count << PAGE_SHIFT, vma->vm_page_prot);
-#else
- return -ENXIO;
-#endif /* CONFIG_MMU */
-}
-
/**
* dma_can_mmap - check if a given device supports dma_mmap_*
* @dev: device to check
@@ -506,6 +471,86 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
}
EXPORT_SYMBOL(dma_free_attrs);
+struct page *dma_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ struct page *page;
+
+ if (WARN_ON_ONCE(!dev->coherent_dma_mask))
+ return NULL;
+ if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
+ return NULL;
+
+ size = PAGE_ALIGN(size);
+ if (dma_alloc_direct(dev, ops))
+ page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
+ else if (ops->alloc_pages)
+ page = ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+ else
+ return NULL;
+
+ debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
+
+ return page;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_pages);
+
+void dma_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ size = PAGE_ALIGN(size);
+ debug_dma_unmap_page(dev, dma_handle, size, dir);
+
+ if (dma_alloc_direct(dev, ops))
+ dma_direct_free_pages(dev, size, page, dma_handle, dir);
+ else if (ops->free_pages)
+ ops->free_pages(dev, size, page, dma_handle, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_pages);
+
+void *dma_alloc_noncoherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ void *vaddr;
+
+ if (!ops || !ops->alloc_noncoherent) {
+ struct page *page;
+
+ page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (!page)
+ return NULL;
+ return page_address(page);
+ }
+
+ size = PAGE_ALIGN(size);
+ vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp);
+ if (vaddr)
+ debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir,
+ *dma_handle);
+ return vaddr;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_noncoherent);
+
+void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (!ops || !ops->free_noncoherent) {
+ dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
+ return;
+ }
+
+ size = PAGE_ALIGN(size);
+ debug_dma_unmap_page(dev, dma_handle, size, dir);
+ ops->free_noncoherent(dev, size, vaddr, dma_handle, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_noncoherent);
+
int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -563,20 +608,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
EXPORT_SYMBOL(dma_set_coherent_mask);
#endif
-void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
- enum dma_data_direction dir)
-{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- BUG_ON(!valid_dma_direction(dir));
-
- if (dma_alloc_direct(dev, ops))
- arch_dma_cache_sync(dev, vaddr, size, dir);
- else if (ops->cache_sync)
- ops->cache_sync(dev, vaddr, size, dir);
-}
-EXPORT_SYMBOL(dma_cache_sync);
-
size_t dma_max_mapping_size(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
new file mode 100644
index 000000000000..910ae69cae77
--- /dev/null
+++ b/kernel/dma/ops_helpers.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for DMA ops implementations. These generally rely on the fact that
+ * the allocated memory contains normal pages in the direct kernel mapping.
+ */
+#include <linux/dma-map-ops.h>
+
+/*
+ * Create scatter-list for the already allocated DMA buffer.
+ */
+int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ struct page *page = virt_to_page(cpu_addr);
+ int ret;
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
+}
+
+/*
+ * Create userspace mapping for the DMA-coherent memory.
+ */
+int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+#ifdef CONFIG_MMU
+ unsigned long user_count = vma_pages(vma);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long off = vma->vm_pgoff;
+ int ret = -ENXIO;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (off >= count || user_count > count - off)
+ return -ENXIO;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
+ user_count << PAGE_SHIFT, vma->vm_page_prot);
+#else
+ return -ENXIO;
+#endif /* CONFIG_MMU */
+}
+
+struct page *dma_common_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ struct page *page;
+
+ page = dma_alloc_contiguous(dev, size, gfp);
+ if (!page)
+ page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size));
+ if (!page)
+ return NULL;
+
+ *dma_handle = ops->map_page(dev, page, 0, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ if (*dma_handle == DMA_MAPPING_ERROR) {
+ dma_free_contiguous(dev, page, size);
+ return NULL;
+ }
+
+ memset(page_address(page), 0, size);
+ return page;
+}
+
+void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (ops->unmap_page)
+ ops->unmap_page(dev, dma_handle, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ dma_free_contiguous(dev, page, size);
+}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 1281c0f0442b..d4637f72239b 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -5,9 +5,8 @@
*/
#include <linux/cma.h>
#include <linux/debugfs.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-map-ops.h>
#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
#include <linux/init.h>
#include <linux/genalloc.h>
#include <linux/set_memory.h>
@@ -115,7 +114,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
#endif
/*
* Memory in the atomic DMA pools must be unencrypted, the pools do not
- * shrink so no re-encryption occurs in dma_direct_free_pages().
+ * shrink so no re-encryption occurs in dma_direct_free().
*/
ret = set_memory_decrypted((unsigned long)page_to_virt(page),
1 << order);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c19379fabd20..b4eea0abc3f0 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -22,7 +22,7 @@
#include <linux/cache.h>
#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
+#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/spinlock.h>
@@ -93,7 +93,7 @@ static unsigned int io_tlb_index;
* Max segment that we can provide which (if pages are contingous) will
* not be bounced (unless SWIOTLB_FORCE is set).
*/
-unsigned int max_segment;
+static unsigned int max_segment;
/*
* We need to save away the original address corresponding to a mapped entry
@@ -172,9 +172,7 @@ void swiotlb_print_info(void)
return;
}
- pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n",
- (unsigned long long)io_tlb_start,
- (unsigned long long)io_tlb_end,
+ pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end,
bytes >> 20);
}
@@ -670,13 +668,13 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
swiotlb_force);
swiotlb_addr = swiotlb_tbl_map_single(dev,
- __phys_to_dma(dev, io_tlb_start),
+ phys_to_dma_unencrypted(dev, io_tlb_start),
paddr, size, size, dir, attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
/* Ensure that the address returned is DMA'ble */
- dma_addr = __phys_to_dma(dev, swiotlb_addr);
+ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
index ebe128833af7..59d32317dd57 100644
--- a/kernel/dma/virt.c
+++ b/kernel/dma/virt.c
@@ -4,7 +4,7 @@
*/
#include <linux/export.h>
#include <linux/mm.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-map-ops.h>
#include <linux/scatterlist.h>
static void *dma_virt_alloc(struct device *dev, size_t size,
@@ -55,5 +55,7 @@ const struct dma_map_ops dma_virt_ops = {
.free = dma_virt_free,
.map_page = dma_virt_map_page,
.map_sg = dma_virt_map_sg,
+ .alloc_pages = dma_common_alloc_pages,
+ .free_pages = dma_common_free_pages,
};
EXPORT_SYMBOL(dma_virt_ops);
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index fcae019158ca..145ab11b8318 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -60,31 +60,56 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
return ret;
}
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, syscall);
syscall_enter_audit(regs, syscall);
- /* The above might have changed the syscall number */
- return ret ? : syscall_get_nr(current, regs);
+ return ret ? : syscall;
}
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+static __always_inline long
+__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
unsigned long ti_work;
- enter_from_user_mode(regs);
- instrumentation_begin();
-
- local_irq_enable();
ti_work = READ_ONCE(current_thread_info()->flags);
if (ti_work & SYSCALL_ENTER_WORK)
syscall = syscall_trace_enter(regs, syscall, ti_work);
- instrumentation_end();
return syscall;
}
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+ return __syscall_enter_from_user_work(regs, syscall);
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+ long ret;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ local_irq_enable();
+ ret = __syscall_enter_from_user_work(regs, syscall);
+ instrumentation_end();
+
+ return ret;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+ local_irq_enable();
+ instrumentation_end();
+}
+
/**
* exit_to_user_mode - Fixup state when exiting to user mode
*
@@ -183,7 +208,7 @@ static inline bool report_single_step(unsigned long ti_work)
/*
* If TIF_SYSCALL_EMU is set, then the only reason to report is when
* TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_usermode().
+ * instruction has been already reported in syscall_enter_from_user_mode().
*/
#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7ed5248f0445..da467e1dd49a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -99,7 +99,7 @@ static void remote_function(void *data)
* retry due to any failures in smp_call_function_single(), such as if the
* task_cpu() goes offline concurrently.
*
- * returns @func return value or -ESRCH when the process isn't running
+ * returns @func return value or -ESRCH or -ENXIO when the process isn't running
*/
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
@@ -115,7 +115,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
for (;;) {
ret = smp_call_function_single(task_cpu(p), remote_function,
&data, 1);
- ret = !ret ? data.ret : -EAGAIN;
+ if (!ret)
+ ret = data.ret;
if (ret != -EAGAIN)
break;
@@ -382,7 +383,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -2133,8 +2133,24 @@ static inline struct list_head *get_event_list(struct perf_event *event)
return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
}
+/*
+ * Events that have PERF_EV_CAP_SIBLING require being part of a group and
+ * cannot exist on their own, schedule them out and move them into the ERROR
+ * state. Also see _perf_event_enable(), it will not be able to recover
+ * this ERROR state.
+ */
+static inline void perf_remove_sibling_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ event_sched_out(event, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+}
+
static void perf_group_detach(struct perf_event *event)
{
+ struct perf_event *leader = event->group_leader;
struct perf_event *sibling, *tmp;
struct perf_event_context *ctx = event->ctx;
@@ -2153,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event)
/*
* If this is a sibling, remove it from its group.
*/
- if (event->group_leader != event) {
+ if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
goto out;
@@ -2166,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
+ perf_remove_sibling_event(sibling);
+
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2183,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event)
}
out:
- perf_event__header_size(event->group_leader);
-
- for_each_sibling_event(tmp, event->group_leader)
+ for_each_sibling_event(tmp, leader)
perf_event__header_size(tmp);
+
+ perf_event__header_size(leader);
}
static bool is_orphaned_event(struct perf_event *event)
@@ -2979,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE ||
event->state < PERF_EVENT_STATE_ERROR) {
+out:
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -2990,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event)
* has gone back into error state, as distinct from the task having
* been scheduled away before the cross-call arrived.
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state == PERF_EVENT_STATE_ERROR) {
+ /*
+ * Detached SIBLING events cannot leave ERROR state.
+ */
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
+ event->group_leader == event)
+ goto out;
+
event->state = PERF_EVENT_STATE_OFF;
+ }
raw_spin_unlock_irq(&ctx->lock);
event_function_call(event, __perf_event_enable, NULL);
@@ -3356,10 +3384,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
+ struct pmu *pmu;
if (likely(!ctx))
return;
+ pmu = ctx->pmu;
cpuctx = __get_cpu_context(ctx);
if (!cpuctx->task_ctx)
return;
@@ -3389,11 +3419,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- struct pmu *pmu = ctx->pmu;
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
+ perf_pmu_disable(pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(ctx, false);
+
/*
* PMU specific parts of task perf context can require
* additional synchronization. As an example of such
@@ -3405,6 +3439,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
else
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ perf_pmu_enable(pmu);
+
/*
* RCU_INIT_POINTER here is safe because we've not
* modified the ctx and the above modification of
@@ -3427,21 +3463,22 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
+ perf_pmu_disable(pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(ctx, false);
task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+
+ perf_pmu_enable(pmu);
raw_spin_unlock(&ctx->lock);
}
}
-static DEFINE_PER_CPU(struct list_head, sched_cb_list);
-
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- this_cpu_dec(perf_sched_cb_usages);
-
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ --cpuctx->sched_cb_usage;
}
@@ -3449,10 +3486,7 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
-
- this_cpu_inc(perf_sched_cb_usages);
+ cpuctx->sched_cb_usage++;
}
/*
@@ -3463,30 +3497,22 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void perf_pmu_sched_task(struct task_struct *prev,
- struct task_struct *next,
- bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
{
- struct perf_cpu_context *cpuctx;
struct pmu *pmu;
- if (prev == next)
- return;
-
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
- if (WARN_ON_ONCE(!pmu->sched_task))
- continue;
+ if (WARN_ON_ONCE(!pmu->sched_task))
+ return;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(pmu);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
- perf_pmu_enable(pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static void perf_event_switch(struct task_struct *task,
@@ -3511,9 +3537,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
- if (__this_cpu_read(perf_sched_cb_usages))
- perf_pmu_sched_task(task, next, false);
-
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3745,10 +3768,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
+ struct pmu *pmu = ctx->pmu;
cpuctx = __get_cpu_context(ctx);
- if (cpuctx->task_ctx == ctx)
+ if (cpuctx->task_ctx == ctx) {
+ if (cpuctx->sched_cb_usage)
+ __perf_pmu_sched_task(cpuctx, true);
return;
+ }
perf_ctx_lock(cpuctx, ctx);
/*
@@ -3758,7 +3785,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(pmu);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3770,7 +3797,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
- perf_pmu_enable(ctx->pmu);
+
+ if (cpuctx->sched_cb_usage && pmu->sched_task)
+ pmu->sched_task(cpuctx->task_ctx, true);
+
+ perf_pmu_enable(pmu);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -3813,9 +3844,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
-
- if (__this_cpu_read(perf_sched_cb_usages))
- perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -5868,11 +5896,11 @@ static void perf_pmu_output_stop(struct perf_event *event);
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
-
struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ bool detach_rest = false;
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5903,7 +5931,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
}
- atomic_dec(&rb->mmap_count);
+ if (atomic_dec_and_test(&rb->mmap_count))
+ detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
@@ -5912,7 +5941,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count))
+ if (!detach_rest)
goto out_put;
/*
@@ -12828,7 +12857,6 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
- INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 733e80f334e7..1f51c27bae59 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1474,7 +1474,7 @@ end:
return retval;
}
-static struct pid *pidfd_get_pid(unsigned int fd)
+static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
struct fd f;
struct pid *pid;
@@ -1484,8 +1484,10 @@ static struct pid *pidfd_get_pid(unsigned int fd)
return ERR_PTR(-EBADF);
pid = pidfd_pid(f.file);
- if (!IS_ERR(pid))
+ if (!IS_ERR(pid)) {
get_pid(pid);
+ *flags = f.file->f_flags;
+ }
fdput(f);
return pid;
@@ -1498,6 +1500,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
struct pid *pid = NULL;
enum pid_type type;
long ret;
+ unsigned int f_flags = 0;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
@@ -1531,9 +1534,10 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
if (upid < 0)
return -EINVAL;
- pid = pidfd_get_pid(upid);
+ pid = pidfd_get_pid(upid, &f_flags);
if (IS_ERR(pid))
return PTR_ERR(pid);
+
break;
default:
return -EINVAL;
@@ -1544,7 +1548,12 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
wo.wo_flags = options;
wo.wo_info = infop;
wo.wo_rusage = ru;
+ if (f_flags & O_NONBLOCK)
+ wo.wo_flags |= WNOHANG;
+
ret = do_wait(&wo);
+ if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+ ret = -EAGAIN;
put_pid(pid);
return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d32190861bd..32083db7a2a2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -95,6 +95,7 @@
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
+#include <linux/io_uring.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -555,10 +556,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
- atomic_dec(&inode->i_writecount);
+ put_write_access(inode);
i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
+ mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_interval_tree_insert_after(tmp, mpnt,
@@ -589,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(mm, oldmm, mpnt);
+ retval = copy_page_range(tmp, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -728,6 +729,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
@@ -1011,6 +1013,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
+ atomic_set(&mm->has_pinned, 0);
atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
@@ -1809,6 +1812,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
free_task(tsk);
}
+static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
+{
+ /* Skip if kernel thread */
+ if (!tsk->mm)
+ return;
+
+ /* Skip if spawning a thread or using vfork */
+ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
+ return;
+
+ /* We need to synchronize with __set_oom_adj */
+ mutex_lock(&oom_adj_mutex);
+ set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
+ /* Update the values in case they were changed after copy_signal */
+ tsk->signal->oom_score_adj = current->signal->oom_score_adj;
+ tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
+ mutex_unlock(&oom_adj_mutex);
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1982,6 +2004,10 @@ static __latent_entropy struct task_struct *copy_process(
p->vtime.state = VTIME_INACTIVE;
#endif
+#ifdef CONFIG_IO_URING
+ p->io_uring = NULL;
+#endif
+
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
@@ -2163,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process(
/*
* Ensure that the cgroup subsystem policies allow the new process to be
- * forked. It should be noted the the new process's css_set can be changed
+ * forked. It should be noted that the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
@@ -2281,6 +2307,8 @@ static __latent_entropy struct task_struct *copy_process(
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ copy_oom_score_adj(clone_flags, p);
+
return p;
bad_fork_cancel_cgroup:
@@ -2384,14 +2412,14 @@ struct mm_struct *copy_init_mm(void)
*
* args->exit_signal is expected to be checked for sanity by the caller.
*/
-long _do_fork(struct kernel_clone_args *args)
+pid_t kernel_clone(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
int trace = 0;
- long nr;
+ pid_t nr;
/*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
@@ -2477,7 +2505,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
.stack_size = (unsigned long)arg,
};
- return _do_fork(&args);
+ return kernel_clone(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
@@ -2488,7 +2516,7 @@ SYSCALL_DEFINE0(fork)
.exit_signal = SIGCHLD,
};
- return _do_fork(&args);
+ return kernel_clone(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -2504,7 +2532,7 @@ SYSCALL_DEFINE0(vfork)
.exit_signal = SIGCHLD,
};
- return _do_fork(&args);
+ return kernel_clone(&args);
}
#endif
@@ -2542,7 +2570,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
.tls = tls,
};
- return _do_fork(&args);
+ return kernel_clone(&args);
}
#endif
@@ -2700,7 +2728,7 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
if (!clone3_args_valid(&kargs))
return -EINVAL;
- return _do_fork(&kargs);
+ return kernel_clone(&kargs);
}
#endif
@@ -2863,7 +2891,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
- * functions used by _do_fork() cannot be used here directly
+ * functions used by kernel_clone() cannot be used here directly
* because they modify an inactive task_struct that is being
* constructed. Here we are modifying the current, active,
* task_struct.
@@ -3014,7 +3042,7 @@ int unshare_files(struct files_struct **displaced)
}
int sysctl_max_threads(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int ret;
diff --git a/kernel/futex.c b/kernel/futex.c
index a5876694a60e..680854dcf156 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -916,7 +916,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
* [10] Found | Found | task | !=taskTID | 0/1 | Invalid
*
* [1] Indicates that the kernel can acquire the futex atomically. We
- * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
*
* [2] Valid, if TID does not belong to a kernel thread. If no matching
* thread is found then it indicates that the owner TID has died.
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 908fdf5098c3..53c67c87f141 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -19,7 +19,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if (__GNUC__ >= 7)
+#if (__GNUC__ >= 10)
+#define GCOV_COUNTERS 8
+#elif (__GNUC__ >= 7)
#define GCOV_COUNTERS 9
#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 857f5f4c8098..b9b9618e1aca 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -945,6 +945,33 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
}
/**
+ * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu
+ * dev ids
+ * @desc: the interrupt description structure for this irq
+ *
+ * The biggest difference with the IRQ version is that the interrupt is
+ * EOIed early, as the IPI could result in a context switch, and we need to
+ * make sure the IPI can fire again. We also assume that the arch code has
+ * registered an action. If not, we are positively doomed.
+ */
+void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ __kstat_incr_irqs_this_cpu(desc);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+}
+
+/**
* handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
* dev ids
* @desc: the interrupt description structure for this irq
@@ -1541,18 +1568,17 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
*/
int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct irq_data *pos = NULL;
+ struct irq_data *pos;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- for (; data; data = data->parent_data)
-#endif
+ for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) {
if (data->chip && data->chip->irq_compose_msi_msg)
pos = data;
+ }
+
if (!pos)
return -ENOSYS;
pos->chip->irq_compose_msi_msg(pos, msg);
-
return 0;
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index b95ff5d5f4bd..e4cff358b437 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -57,6 +57,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
+ BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
};
static void
@@ -125,6 +126,8 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),
BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
+
+ BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
};
static const struct irq_bit_descr irqdesc_states[] = {
@@ -136,6 +139,7 @@ static const struct irq_bit_descr irqdesc_states[] = {
BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
BIT_MASK_DESCR(_IRQ_IS_POLLED),
BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
+ BIT_MASK_DESCR(_IRQ_HIDDEN),
};
static const struct irq_bit_descr irqdesc_istates[] = {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 7db284b10ac9..54363527feea 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -473,6 +473,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data)
}
#endif
+static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irqd->parent_data;
+#else
+ return NULL;
+#endif
+}
+
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 76cd7ebd1178..cf8b374b892d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1136,6 +1136,17 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
return irq_data;
}
+static void __irq_domain_free_hierarchy(struct irq_data *irq_data)
+{
+ struct irq_data *tmp;
+
+ while (irq_data) {
+ tmp = irq_data;
+ irq_data = irq_data->parent_data;
+ kfree(tmp);
+ }
+}
+
static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *irq_data, *tmp;
@@ -1147,12 +1158,83 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
irq_data->parent_data = NULL;
irq_data->domain = NULL;
- while (tmp) {
- irq_data = tmp;
- tmp = tmp->parent_data;
- kfree(irq_data);
+ __irq_domain_free_hierarchy(tmp);
+ }
+}
+
+/**
+ * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy
+ * @domain: IRQ domain from which the hierarchy is to be disconnected
+ * @virq: IRQ number where the hierarchy is to be trimmed
+ *
+ * Marks the @virq level belonging to @domain as disconnected.
+ * Returns -EINVAL if @virq doesn't have a valid irq_data pointing
+ * to @domain.
+ *
+ * Its only use is to be able to trim levels of hierarchy that do not
+ * have any real meaning for this interrupt, and that the driver marks
+ * as such from its .alloc() callback.
+ */
+int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irqd;
+
+ irqd = irq_domain_get_irq_data(domain, virq);
+ if (!irqd)
+ return -EINVAL;
+
+ irqd->chip = ERR_PTR(-ENOTCONN);
+ return 0;
+}
+
+static int irq_domain_trim_hierarchy(unsigned int virq)
+{
+ struct irq_data *tail, *irqd, *irq_data;
+
+ irq_data = irq_get_irq_data(virq);
+ tail = NULL;
+
+ /* The first entry must have a valid irqchip */
+ if (!irq_data->chip || IS_ERR(irq_data->chip))
+ return -EINVAL;
+
+ /*
+ * Validate that the irq_data chain is sane in the presence of
+ * a hierarchy trimming marker.
+ */
+ for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) {
+ /* Can't have a valid irqchip after a trim marker */
+ if (irqd->chip && tail)
+ return -EINVAL;
+
+ /* Can't have an empty irqchip before a trim marker */
+ if (!irqd->chip && !tail)
+ return -EINVAL;
+
+ if (IS_ERR(irqd->chip)) {
+ /* Only -ENOTCONN is a valid trim marker */
+ if (PTR_ERR(irqd->chip) != -ENOTCONN)
+ return -EINVAL;
+
+ tail = irq_data;
}
}
+
+ /* No trim marker, nothing to do */
+ if (!tail)
+ return 0;
+
+ pr_info("IRQ%d: trimming hierarchy from %s\n",
+ virq, tail->parent_data->domain->name);
+
+ /* Sever the inner part of the hierarchy... */
+ irqd = tail;
+ tail = tail->parent_data;
+ irqd->parent_data = NULL;
+ __irq_domain_free_hierarchy(tail);
+
+ return 0;
}
static int irq_domain_alloc_irq_data(struct irq_domain *domain,
@@ -1362,6 +1444,15 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
mutex_unlock(&irq_domain_mutex);
goto out_free_irq_data;
}
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = irq_domain_trim_hierarchy(virq + i);
+ if (ret) {
+ mutex_unlock(&irq_domain_mutex);
+ goto out_free_irq_data;
+ }
+ }
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
mutex_unlock(&irq_domain_mutex);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index eb95f6106a1e..2c0c4d6d0f83 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -187,7 +187,6 @@ static const struct irq_domain_ops msi_domain_ops = {
.deactivate = msi_domain_deactivate,
};
-#ifdef GENERIC_MSI_DOMAIN_OPS
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
msi_alloc_info_t *arg)
{
@@ -206,11 +205,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
{
arg->desc = desc;
}
-#else
-#define msi_domain_ops_get_hwirq NULL
-#define msi_domain_ops_prepare NULL
-#define msi_domain_ops_set_desc NULL
-#endif /* !GENERIC_MSI_DOMAIN_OPS */
static int msi_domain_ops_init(struct irq_domain *domain,
struct msi_domain_info *info,
@@ -235,11 +229,13 @@ static int msi_domain_ops_check(struct irq_domain *domain,
}
static struct msi_domain_ops msi_domain_ops_default = {
- .get_hwirq = msi_domain_ops_get_hwirq,
- .msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
- .msi_prepare = msi_domain_ops_prepare,
- .set_desc = msi_domain_ops_set_desc,
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_check = msi_domain_ops_check,
+ .msi_prepare = msi_domain_ops_prepare,
+ .set_desc = msi_domain_ops_set_desc,
+ .domain_alloc_irqs = __msi_domain_alloc_irqs,
+ .domain_free_irqs = __msi_domain_free_irqs,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -251,6 +247,14 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
+ if (ops->domain_alloc_irqs == NULL)
+ ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
+ if (ops->domain_free_irqs == NULL)
+ ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;
+
+ if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
+ return;
+
if (ops->get_hwirq == NULL)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
@@ -284,8 +288,7 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
{
struct irq_domain *domain;
- if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
- msi_domain_update_dom_ops(info);
+ msi_domain_update_dom_ops(info);
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
msi_domain_update_chip_ops(info);
@@ -370,8 +373,13 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
{
struct msi_desc *desc;
- if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_VMD_MSI:
+ break;
+ default:
return false;
+ }
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
@@ -387,17 +395,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
}
-/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
- * @dev: Pointer to device struct of the device for which the interrupts
- * are allocated
- * @nvec: The number of interrupts to allocate
- *
- * Returns 0 on success or an error code.
- */
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
@@ -491,12 +490,24 @@ cleanup:
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Returns 0 on success or an error code.
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+
+ return ops->domain_alloc_irqs(domain, dev, nvec);
+}
+
+void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
struct msi_desc *desc;
@@ -514,6 +525,20 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
}
/**
+ * __msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
+ * @domain: The domain to managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are free
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+
+ return ops->domain_free_irqs(domain, dev);
+}
+
+/**
* msi_get_domain_info - Get the MSI interrupt domain info for @domain
* @domain: The interrupt domain to retrieve data from
*
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index c6c7e187ae74..ce0adb22ee96 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -69,12 +69,26 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
static bool suspend_device_irq(struct irq_desc *desc)
{
+ unsigned long chipflags = irq_desc_get_chip(desc)->flags;
+ struct irq_data *irqd = &desc->irq_data;
+
if (!desc->action || irq_desc_is_chained(desc) ||
desc->no_suspend_depth)
return false;
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ if (irqd_is_wakeup_set(irqd)) {
+ irqd_set(irqd, IRQD_WAKEUP_ARMED);
+
+ if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) &&
+ irqd_irq_disabled(irqd)) {
+ /*
+ * Interrupt marked for wakeup is in disabled state.
+ * Enable interrupt here to unmask/enable in irqchip
+ * to be able to resume with such interrupts.
+ */
+ __enable_irq(desc);
+ irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
/*
* We return true here to force the caller to issue
* synchronize_irq(). We need to make sure that the
@@ -93,7 +107,7 @@ static bool suspend_device_irq(struct irq_desc *desc)
* chip level. The chip implementation indicates that with
* IRQCHIP_MASK_ON_SUSPEND.
*/
- if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ if (chipflags & IRQCHIP_MASK_ON_SUSPEND)
mask_irq(desc);
return true;
}
@@ -137,7 +151,19 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
static void resume_irq(struct irq_desc *desc)
{
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ struct irq_data *irqd = &desc->irq_data;
+
+ irqd_clear(irqd, IRQD_WAKEUP_ARMED);
+
+ if (irqd_is_enabled_on_suspend(irqd)) {
+ /*
+ * Interrupt marked for wakeup was enabled during suspend
+ * entry. Disable such interrupts to restore them back to
+ * original state.
+ */
+ __disable_irq(desc);
+ irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
if (desc->istate & IRQS_SUSPENDED)
goto resume;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 32c071d7bc03..72513ed2a5fc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -485,7 +485,7 @@ int show_interrupts(struct seq_file *p, void *v)
rcu_read_lock();
desc = irq_to_desc(i);
- if (!desc)
+ if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
if (desc->kstat_irqs)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index c48ce19a257f..8ccd32a0cc80 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -86,6 +86,18 @@ static int irq_sw_resend(struct irq_desc *desc)
}
#endif
+static int try_retrigger(struct irq_desc *desc)
+{
+ if (desc->irq_data.chip->irq_retrigger)
+ return desc->irq_data.chip->irq_retrigger(&desc->irq_data);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irq_chip_retrigger_hierarchy(&desc->irq_data);
+#else
+ return 0;
+#endif
+}
+
/*
* IRQ resend
*
@@ -113,8 +125,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
desc->istate &= ~IRQS_PENDING;
- if (!desc->irq_data.chip->irq_retrigger ||
- !desc->irq_data.chip->irq_retrigger(&desc->irq_data))
+ if (!try_retrigger(desc))
err = irq_sw_resend(desc);
/* If the retrigger was successfull, mark it with the REPLAY bit */
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index e43795cd2ccf..403378b9947b 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -17,6 +17,7 @@ enum {
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
+ _IRQ_HIDDEN = IRQ_HIDDEN,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -31,6 +32,7 @@ enum {
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#define IRQ_IS_POLLED GOT_YOU_MORON
#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
+#define IRQ_HIDDEN GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -167,3 +169,8 @@ static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
{
desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
}
+
+static inline bool irq_settings_is_hidden(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_HIDDEN;
+}
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index e960d7ce7bcc..773b6105c4ae 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -604,7 +604,7 @@ int irq_timings_alloc(int irq)
/*
* Some platforms can have the same private interrupt per cpu,
- * so this function may be be called several times with the
+ * so this function may be called several times with the
* same interrupt number. Just bail out in case the per cpu
* stat structure is already allocated.
*/
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index cdb3ffab128b..015ef903ce8c 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -19,7 +19,7 @@
#include <linux/cpu.h>
#include <asm/sections.h>
-/* mutex to protect coming/going of the the jump_label table */
+/* mutex to protect coming/going of the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);
void jump_label_lock(void)
@@ -539,19 +539,25 @@ static void static_key_set_mod(struct static_key *key,
static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
+ int ret;
preempt_disable();
mod = __module_text_address((unsigned long)start);
WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
preempt_enable();
if (!mod)
return 0;
-
- return __jump_label_text_reserved(mod->jump_entries,
+ ret = __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
start, end);
+
+ module_put(mod);
+
+ return ret;
}
static void __jump_label_mod_update(struct static_key *key)
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 9147ff6a12e5..3994a217bde7 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "kcsan: " fmt
+
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/delay.h>
@@ -98,6 +100,9 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
*/
static DEFINE_PER_CPU(long, kcsan_skip);
+/* For kcsan_prandom_u32_max(). */
+static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state);
+
static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
size_t size,
bool expect_write,
@@ -223,7 +228,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) &&
(type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) &&
- IS_ALIGNED((unsigned long)ptr, size))
+ !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size))
return true; /* Assume aligned writes up to word size are atomic. */
if (ctx->atomic_next > 0) {
@@ -269,11 +274,28 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
return true;
}
+/*
+ * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max()
+ * for more details.
+ *
+ * The open-coded version here is using only safe primitives for all contexts
+ * where we can have KCSAN instrumentation. In particular, we cannot use
+ * prandom_u32() directly, as its tracepoint could cause recursion.
+ */
+static u32 kcsan_prandom_u32_max(u32 ep_ro)
+{
+ struct rnd_state *state = &get_cpu_var(kcsan_rand_state);
+ const u32 res = prandom_u32_state(state);
+
+ put_cpu_var(kcsan_rand_state);
+ return (u32)(((u64) res * ep_ro) >> 32);
+}
+
static inline void reset_kcsan_skip(void)
{
long skip_count = kcsan_skip_watch -
(IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ?
- prandom_u32_max(kcsan_skip_watch) :
+ kcsan_prandom_u32_max(kcsan_skip_watch) :
0);
this_cpu_write(kcsan_skip, skip_count);
}
@@ -283,12 +305,18 @@ static __always_inline bool kcsan_is_enabled(void)
return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0;
}
-static inline unsigned int get_delay(void)
+/* Introduce delay depending on context and configuration. */
+static void delay_access(int type)
{
unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt;
- return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
- prandom_u32_max(delay) :
- 0);
+ /* For certain access types, skew the random delay to be longer. */
+ unsigned int skew_delay_order =
+ (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0;
+
+ delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
+ kcsan_prandom_u32_max(delay >> skew_delay_order) :
+ 0;
+ udelay(delay);
}
void kcsan_save_irqtrace(struct task_struct *task)
@@ -361,13 +389,13 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
* already removed the watchpoint, or another thread consumed
* the watchpoint before this thread.
*/
- kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]);
}
if ((type & KCSAN_ACCESS_ASSERT) != 0)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
else
- kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]);
user_access_restore(flags);
}
@@ -408,7 +436,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
goto out;
if (!check_encodable((unsigned long)ptr, size)) {
- kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]);
goto out;
}
@@ -428,12 +456,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* with which should_watch() returns true should be tweaked so
* that this case happens very rarely.
*/
- kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]);
goto out_unlock;
}
- kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS);
- kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
/*
* Read the current value, to later check and infer a race if the data
@@ -459,7 +487,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) {
kcsan_disable_current();
- pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
+ pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
is_write ? "write" : "read", size, ptr,
watchpoint_slot((unsigned long)ptr),
encode_watchpoint((unsigned long)ptr, size, is_write));
@@ -470,7 +498,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Delay this thread, to increase probability of observing a racy
* conflicting access.
*/
- udelay(get_delay());
+ delay_access(type);
/*
* Re-read value, and check if it is as expected; if not, we infer a
@@ -535,16 +563,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* increment this counter.
*/
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL,
watchpoint - watchpoints);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
- kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]);
if (is_assert)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
@@ -557,7 +585,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* reused after this point.
*/
remove_watchpoint(watchpoint);
- kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
+ atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
out_unlock:
if (!kcsan_interrupt_watcher)
local_irq_restore(irq_flags);
@@ -614,13 +642,16 @@ void __init kcsan_init(void)
BUG_ON(!in_task());
kcsan_debugfs_init();
+ prandom_seed_full_state(&kcsan_rand_state);
/*
* We are in the init task, and no other tasks should be running;
* WRITE_ONCE without memory barrier is sufficient.
*/
- if (kcsan_early_enable)
+ if (kcsan_early_enable) {
+ pr_info("enabled early\n");
WRITE_ONCE(kcsan_enabled, true);
+ }
}
/* === Exported interface =================================================== */
@@ -793,7 +824,17 @@ EXPORT_SYMBOL(__kcsan_check_access);
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
__alias(__tsan_write##size); \
- EXPORT_SYMBOL(__tsan_unaligned_write##size)
+ EXPORT_SYMBOL(__tsan_unaligned_write##size); \
+ void __tsan_read_write##size(void *ptr); \
+ void __tsan_read_write##size(void *ptr) \
+ { \
+ check_access(ptr, size, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \
+ } \
+ EXPORT_SYMBOL(__tsan_read_write##size); \
+ void __tsan_unaligned_read_write##size(void *ptr) \
+ __alias(__tsan_read_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_read_write##size)
DEFINE_TSAN_READ_WRITE(1);
DEFINE_TSAN_READ_WRITE(2);
@@ -879,3 +920,130 @@ void __tsan_init(void)
{
}
EXPORT_SYMBOL(__tsan_init);
+
+/*
+ * Instrumentation for atomic builtins (__atomic_*, __sync_*).
+ *
+ * Normal kernel code _should not_ be using them directly, but some
+ * architectures may implement some or all atomics using the compilers'
+ * builtins.
+ *
+ * Note: If an architecture decides to fully implement atomics using the
+ * builtins, because they are implicitly instrumented by KCSAN (and KASAN,
+ * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via
+ * atomic-instrumented) is no longer necessary.
+ *
+ * TSAN instrumentation replaces atomic accesses with calls to any of the below
+ * functions, whose job is to also execute the operation itself.
+ */
+
+#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
+ } \
+ return __atomic_load_n(ptr, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_load); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+ } \
+ __atomic_store_n(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_store)
+
+#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC); \
+ } \
+ return __atomic_##op##suffix(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_##op)
+
+/*
+ * Note: CAS operations are always classified as write, even in case they
+ * fail. We cannot perform check_access() after a write, as it might lead to
+ * false positives, in cases such as:
+ *
+ * T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...)
+ *
+ * T1: if (__atomic_load_n(&p->flag, ...)) {
+ * modify *p;
+ * p->flag = 0;
+ * }
+ *
+ * The only downside is that, if there are 3 threads, with one CAS that
+ * succeeds, another CAS that fails, and an unmarked racing operation, we may
+ * point at the wrong CAS as the source of the race. However, if we assume that
+ * all CAS can succeed in some other execution, the data race is still valid.
+ */
+#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo); \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC); \
+ } \
+ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength)
+
+#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo); \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC); \
+ } \
+ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
+ return exp; \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val)
+
+#define DEFINE_TSAN_ATOMIC_OPS(bits) \
+ DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \
+ DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits)
+
+DEFINE_TSAN_ATOMIC_OPS(8);
+DEFINE_TSAN_ATOMIC_OPS(16);
+DEFINE_TSAN_ATOMIC_OPS(32);
+DEFINE_TSAN_ATOMIC_OPS(64);
+
+void __tsan_atomic_thread_fence(int memorder);
+void __tsan_atomic_thread_fence(int memorder)
+{
+ __atomic_thread_fence(memorder);
+}
+EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+
+void __tsan_atomic_signal_fence(int memorder);
+void __tsan_atomic_signal_fence(int memorder) { }
+EXPORT_SYMBOL(__tsan_atomic_signal_fence);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 023e49c58d55..3c8093a371b1 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "kcsan: " fmt
+
#include <linux/atomic.h>
#include <linux/bsearch.h>
#include <linux/bug.h>
@@ -15,10 +17,19 @@
#include "kcsan.h"
-/*
- * Statistics counters.
- */
-static atomic_long_t counters[KCSAN_COUNTER_COUNT];
+atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints",
+ [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints",
+ [KCSAN_COUNTER_DATA_RACES] = "data_races",
+ [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures",
+ [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity",
+ [KCSAN_COUNTER_REPORT_RACES] = "report_races",
+ [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin",
+ [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses",
+ [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives",
+};
+static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT);
/*
* Addresses for filtering functions from reporting. This list can be used as a
@@ -39,34 +50,6 @@ static struct {
};
static DEFINE_SPINLOCK(report_filterlist_lock);
-static const char *counter_to_name(enum kcsan_counter_id id)
-{
- switch (id) {
- case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints";
- case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints";
- case KCSAN_COUNTER_DATA_RACES: return "data_races";
- case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures";
- case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity";
- case KCSAN_COUNTER_REPORT_RACES: return "report_races";
- case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin";
- case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses";
- case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives";
- case KCSAN_COUNTER_COUNT:
- BUG();
- }
- return NULL;
-}
-
-void kcsan_counter_inc(enum kcsan_counter_id id)
-{
- atomic_long_inc(&counters[id]);
-}
-
-void kcsan_counter_dec(enum kcsan_counter_id id)
-{
- atomic_long_dec(&counters[id]);
-}
-
/*
* The microbenchmark allows benchmarking KCSAN core runtime only. To run
* multiple threads, pipe 'microbench=<iters>' from multiple tasks into the
@@ -86,7 +69,7 @@ static noinline void microbenchmark(unsigned long iters)
*/
WRITE_ONCE(kcsan_enabled, false);
- pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
+ pr_info("%s begin | iters: %lu\n", __func__, iters);
cycles = get_cycles();
while (iters--) {
@@ -97,73 +80,13 @@ static noinline void microbenchmark(unsigned long iters)
}
cycles = get_cycles() - cycles;
- pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
+ pr_info("%s end | cycles: %llu\n", __func__, cycles);
WRITE_ONCE(kcsan_enabled, was_enabled);
/* restore context */
current->kcsan_ctx = ctx_save;
}
-/*
- * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's
- * debugfs file from multiple tasks to generate real conflicts and show reports.
- */
-static long test_dummy;
-static long test_flags;
-static long test_scoped;
-static noinline void test_thread(unsigned long iters)
-{
- const long CHANGE_BITS = 0xff00ff00ff00ff00L;
- const struct kcsan_ctx ctx_save = current->kcsan_ctx;
- cycles_t cycles;
-
- /* We may have been called from an atomic region; reset context. */
- memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
-
- pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
- pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n",
- &test_dummy, &test_flags, &test_scoped);
-
- cycles = get_cycles();
- while (iters--) {
- /* These all should generate reports. */
- __kcsan_check_read(&test_dummy, sizeof(test_dummy));
- ASSERT_EXCLUSIVE_WRITER(test_dummy);
- ASSERT_EXCLUSIVE_ACCESS(test_dummy);
-
- ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */
- __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
-
- ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */
- __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
-
- /* not actually instrumented */
- WRITE_ONCE(test_dummy, iters); /* to observe value-change */
- __kcsan_check_write(&test_dummy, sizeof(test_dummy));
-
- test_flags ^= CHANGE_BITS; /* generate value-change */
- __kcsan_check_write(&test_flags, sizeof(test_flags));
-
- BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
- {
- /* Should generate reports anywhere in this block. */
- ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped);
- ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped);
- BUG_ON(!current->kcsan_ctx.scoped_accesses.prev);
- /* Unrelated accesses. */
- __kcsan_check_access(&cycles, sizeof(cycles), 0);
- __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC);
- }
- BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
- }
- cycles = get_cycles() - cycles;
-
- pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
-
- /* restore context */
- current->kcsan_ctx = ctx_save;
-}
-
static int cmp_filterlist_addrs(const void *rhs, const void *lhs)
{
const unsigned long a = *(const unsigned long *)rhs;
@@ -220,7 +143,7 @@ static ssize_t insert_report_filterlist(const char *func)
ssize_t ret = 0;
if (!addr) {
- pr_err("KCSAN: could not find function: '%s'\n", func);
+ pr_err("could not find function: '%s'\n", func);
return -ENOENT;
}
@@ -270,9 +193,10 @@ static int show_info(struct seq_file *file, void *v)
/* show stats */
seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled));
- for (i = 0; i < KCSAN_COUNTER_COUNT; ++i)
- seq_printf(file, "%s: %ld\n", counter_to_name(i),
- atomic_long_read(&counters[i]));
+ for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) {
+ seq_printf(file, "%s: %ld\n", counter_names[i],
+ atomic_long_read(&kcsan_counters[i]));
+ }
/* show filter functions, and filter type */
spin_lock_irqsave(&report_filterlist_lock, flags);
@@ -307,18 +231,12 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o
WRITE_ONCE(kcsan_enabled, true);
} else if (!strcmp(arg, "off")) {
WRITE_ONCE(kcsan_enabled, false);
- } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) {
+ } else if (str_has_prefix(arg, "microbench=")) {
unsigned long iters;
- if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters))
+ if (kstrtoul(&arg[strlen("microbench=")], 0, &iters))
return -EINVAL;
microbenchmark(iters);
- } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) {
- unsigned long iters;
-
- if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters))
- return -EINVAL;
- test_thread(iters);
} else if (!strcmp(arg, "whitelist")) {
set_report_filterlist_whitelist(true);
} else if (!strcmp(arg, "blacklist")) {
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
index f03562aaf2eb..1a6db2f797ac 100644
--- a/kernel/kcsan/encoding.h
+++ b/kernel/kcsan/encoding.h
@@ -32,7 +32,7 @@
* 1. different addresses but with the same encoded address race;
* 2. and both map onto the same watchpoint slots;
*
- * Both these are assumed to be very unlikely. However, in case it still happens
+ * Both these are assumed to be very unlikely. However, in case it still
* happens, the report logic will filter out the false positive (see report.c).
*/
#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS)
diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
index fed6fcb5768c..ebe7fd245104 100644
--- a/kernel/kcsan/kcsan-test.c
+++ b/kernel/kcsan/kcsan-test.c
@@ -27,6 +27,12 @@
#include <linux/types.h>
#include <trace/events/printk.h>
+#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
+#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
+#else
+#define __KCSAN_ACCESS_RW(alt) (alt)
+#endif
+
/* Points to current test-case memory access "kernels". */
static void (*access_kernels[2])(void);
@@ -186,20 +192,21 @@ static bool report_matches(const struct expect_report *r)
/* Access 1 & 2 */
for (i = 0; i < 2; ++i) {
+ const int ty = r->access[i].type;
const char *const access_type =
- (r->access[i].type & KCSAN_ACCESS_ASSERT) ?
- ((r->access[i].type & KCSAN_ACCESS_WRITE) ?
- "assert no accesses" :
- "assert no writes") :
- ((r->access[i].type & KCSAN_ACCESS_WRITE) ?
- "write" :
- "read");
+ (ty & KCSAN_ACCESS_ASSERT) ?
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ "assert no accesses" :
+ "assert no writes") :
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ ((ty & KCSAN_ACCESS_COMPOUND) ?
+ "read-write" :
+ "write") :
+ "read");
const char *const access_type_aux =
- (r->access[i].type & KCSAN_ACCESS_ATOMIC) ?
- " (marked)" :
- ((r->access[i].type & KCSAN_ACCESS_SCOPED) ?
- " (scoped)" :
- "");
+ (ty & KCSAN_ACCESS_ATOMIC) ?
+ " (marked)" :
+ ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : "");
if (i == 1) {
/* Access 2 */
@@ -277,6 +284,12 @@ static noinline void test_kernel_write_atomic(void)
WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1);
}
+static noinline void test_kernel_atomic_rmw(void)
+{
+ /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */
+ __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED);
+}
+
__no_kcsan
static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
@@ -390,6 +403,15 @@ static noinline void test_kernel_seqlock_writer(void)
write_sequnlock_irqrestore(&test_seqlock, flags);
}
+static noinline void test_kernel_atomic_builtins(void)
+{
+ /*
+ * Generate concurrent accesses, expecting no reports, ensuring KCSAN
+ * treats builtin atomics as actually atomic.
+ */
+ __atomic_load_n(&test_var, __ATOMIC_RELAXED);
+}
+
/* ===== Test cases ===== */
/* Simple test with normal data race. */
@@ -430,8 +452,8 @@ static void test_concurrent_races(struct kunit *test)
const struct expect_report expect = {
.access = {
/* NULL will match any address. */
- { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE },
- { test_kernel_rmw_array, NULL, 0, 0 },
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
},
};
static const struct expect_report never = {
@@ -620,6 +642,29 @@ static void test_read_plain_atomic_write(struct kunit *test)
KUNIT_EXPECT_TRUE(test, match_expect);
}
+/* Test that atomic RMWs generate correct report. */
+__no_kcsan
+static void test_read_plain_atomic_rmw(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_atomic_rmw, &test_var, sizeof(test_var),
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+ },
+ };
+ bool match_expect = false;
+
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
+ return;
+
+ begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
/* Zero-sized accesses should never cause data race reports. */
__no_kcsan
static void test_zero_size_access(struct kunit *test)
@@ -853,6 +898,59 @@ static void test_seqlock_noreport(struct kunit *test)
}
/*
+ * Test atomic builtins work and required instrumentation functions exist. We
+ * also test that KCSAN understands they're atomic by racing with them via
+ * test_kernel_atomic_builtins(), and expect no reports.
+ *
+ * The atomic builtins _SHOULD NOT_ be used in normal kernel code!
+ */
+static void test_atomic_builtins(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins);
+ do {
+ long tmp;
+
+ kcsan_enable_current();
+
+ __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED);
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED));
+
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 20L, test_var);
+
+ tmp = 20L;
+ KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L,
+ 0, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 20L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+ KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L,
+ 1, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 30L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, -2L, test_var);
+
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+ __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+ kcsan_disable_current();
+
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
* Each test case is run with different numbers of threads. Until KUnit supports
* passing arguments for each test case, we encode #threads in the test case
* name (read by get_num_threads()). [The '-' was chosen as a stylistic
@@ -880,6 +978,7 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_write_write_struct_part),
KCSAN_KUNIT_CASE(test_read_atomic_write_atomic),
KCSAN_KUNIT_CASE(test_read_plain_atomic_write),
+ KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
KCSAN_KUNIT_CASE(test_zero_size_access),
KCSAN_KUNIT_CASE(test_data_race),
KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
@@ -891,6 +990,7 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
KCSAN_KUNIT_CASE(test_jiffies_noreport),
KCSAN_KUNIT_CASE(test_seqlock_noreport),
+ KCSAN_KUNIT_CASE(test_atomic_builtins),
{},
};
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 29480010dc30..8d4bf3431b3c 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -8,6 +8,7 @@
#ifndef _KERNEL_KCSAN_KCSAN_H
#define _KERNEL_KCSAN_KCSAN_H
+#include <linux/atomic.h>
#include <linux/kcsan.h>
#include <linux/sched.h>
@@ -34,6 +35,10 @@ void kcsan_restore_irqtrace(struct task_struct *task);
*/
void kcsan_debugfs_init(void);
+/*
+ * Statistics counters displayed via debugfs; should only be modified in
+ * slow-paths.
+ */
enum kcsan_counter_id {
/*
* Number of watchpoints currently in use.
@@ -86,12 +91,7 @@ enum kcsan_counter_id {
KCSAN_COUNTER_COUNT, /* number of counters */
};
-
-/*
- * Increment/decrement counter with given id; avoid calling these in fast-path.
- */
-extern void kcsan_counter_inc(enum kcsan_counter_id id);
-extern void kcsan_counter_dec(enum kcsan_counter_id id);
+extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
/*
* Returns true if data races in the function symbol that maps to func_addr
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 9d07e175de0f..d3bf87e6007c 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -228,6 +228,10 @@ static const char *get_access_type(int type)
return "write";
case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "write (marked)";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked)";
case KCSAN_ACCESS_SCOPED:
return "read (scoped)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
@@ -275,8 +279,8 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
cur = strnstr(buf, "kcsan_", len);
if (cur) {
- cur += sizeof("kcsan_") - 1;
- if (strncmp(cur, "test", sizeof("test") - 1))
+ cur += strlen("kcsan_");
+ if (!str_has_prefix(cur, "test"))
continue; /* KCSAN runtime function. */
/* KCSAN related test. */
}
@@ -555,7 +559,7 @@ static bool prepare_report_consumer(unsigned long *flags,
* If the actual accesses to not match, this was a false
* positive due to watchpoint encoding.
*/
- kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]);
goto discard;
}
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index d26a052d3383..d98bc208d06d 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "kcsan: " fmt
+
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/printk.h>
@@ -116,16 +118,16 @@ static int __init kcsan_selftest(void)
if (do_test()) \
++passed; \
else \
- pr_err("KCSAN selftest: " #do_test " failed"); \
+ pr_err("selftest: " #do_test " failed"); \
} while (0)
RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
- pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total);
+ pr_info("selftest: %d/%d tests passed\n", passed, total);
if (passed != total)
- panic("KCSAN selftests failed");
+ panic("selftests failed");
return 0;
}
postcore_initcall(kcsan_selftest);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f977786fe498..c82c6c06f051 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -205,7 +205,7 @@ static inline int kexec_load_check(unsigned long nr_segments,
return -EPERM;
/* Permit LSMs and IMA to fail the kexec */
- result = security_kernel_load_data(LOADING_KEXEC_IMAGE);
+ result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false);
if (result < 0)
return result;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c19c0dad1ebe..8798a8183974 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -36,7 +36,7 @@
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
-#include <linux/frame.h>
+#include <linux/objtool.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded);
* defined more restrictively in <asm/kexec.h>.
*
* The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
+ * new kernel is placed in the control_code_buffer, whose size
* is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
* page of memory is necessary, but some architectures require more.
* Because this memory must be identity mapped in the transition from
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index ca40bef75a61..e21f6b9234f7 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -24,6 +24,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
#include "kexec_internal.h"
@@ -219,13 +220,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
{
int ret;
void *ldata;
- loff_t size;
- ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf,
- &size, INT_MAX, READING_KEXEC_IMAGE);
- if (ret)
+ ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf,
+ INT_MAX, NULL, READING_KEXEC_IMAGE);
+ if (ret < 0)
return ret;
- image->kernel_buf_len = size;
+ image->kernel_buf_len = ret;
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
@@ -241,12 +241,13 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
#endif
/* It is possible that there no initramfs is being loaded */
if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf,
- &size, INT_MAX,
+ ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf,
+ INT_MAX, NULL,
READING_KEXEC_INITRAMFS);
- if (ret)
+ if (ret < 0)
goto out;
- image->initrd_buf_len = size;
+ image->initrd_buf_len = ret;
+ ret = 0;
}
if (cmdline_len) {
@@ -520,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
/* Returning 0 will take to next memory range */
/* Don't use memory that will be detected and handled by a driver. */
- if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED)
+ if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
return 0;
if (sz < kbuf->memsz)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 287b263c9cb9..8a12a25fa40d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,6 +36,7 @@
#include <linux/cpu.h>
#include <linux/jump_label.h>
#include <linux/perf_event.h>
+#include <linux/static_call.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -1223,8 +1224,7 @@ void kprobes_inc_nmissed_count(struct kprobe *p)
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-void recycle_rp_inst(struct kretprobe_instance *ri,
- struct hlist_head *head)
+static void recycle_rp_inst(struct kretprobe_instance *ri)
{
struct kretprobe *rp = ri->rp;
@@ -1236,12 +1236,11 @@ void recycle_rp_inst(struct kretprobe_instance *ri,
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock(&rp->lock);
} else
- /* Unregistering */
- hlist_add_head(&ri->hlist, head);
+ kfree_rcu(ri, rcu);
}
NOKPROBE_SYMBOL(recycle_rp_inst);
-void kretprobe_hash_lock(struct task_struct *tsk,
+static void kretprobe_hash_lock(struct task_struct *tsk,
struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
@@ -1263,7 +1262,7 @@ __acquires(hlist_lock)
}
NOKPROBE_SYMBOL(kretprobe_table_lock);
-void kretprobe_hash_unlock(struct task_struct *tsk,
+static void kretprobe_hash_unlock(struct task_struct *tsk,
unsigned long *flags)
__releases(hlist_lock)
{
@@ -1284,7 +1283,7 @@ __releases(hlist_lock)
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);
-struct kprobe kprobe_busy = {
+static struct kprobe kprobe_busy = {
.addr = (void *) get_kprobe,
};
@@ -1313,7 +1312,7 @@ void kprobe_busy_end(void)
void kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
- struct hlist_head *head, empty_rp;
+ struct hlist_head *head;
struct hlist_node *tmp;
unsigned long hash, flags = 0;
@@ -1323,19 +1322,14 @@ void kprobe_flush_task(struct task_struct *tk)
kprobe_busy_begin();
- INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
kretprobe_table_lock(hash, &flags);
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task == tk)
- recycle_rp_inst(ri, &empty_rp);
+ recycle_rp_inst(ri);
}
kretprobe_table_unlock(hash, &flags);
- hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
kprobe_busy_end();
}
@@ -1359,7 +1353,8 @@ static void cleanup_rp_inst(struct kretprobe *rp)
struct hlist_node *next;
struct hlist_head *head;
- /* No race here */
+ /* To avoid recursive kretprobe by NMI, set kprobe busy here */
+ kprobe_busy_begin();
for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
kretprobe_table_lock(hash, &flags);
head = &kretprobe_inst_table[hash];
@@ -1369,6 +1364,8 @@ static void cleanup_rp_inst(struct kretprobe *rp)
}
kretprobe_table_unlock(hash, &flags);
}
+ kprobe_busy_end();
+
free_rp_inst(rp);
}
NOKPROBE_SYMBOL(cleanup_rp_inst);
@@ -1634,6 +1631,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
if (!kernel_text_address((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
+ static_call_text_reserved(p->addr, p->addr) ||
find_bug((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
@@ -1927,6 +1925,97 @@ unsigned long __weak arch_deref_entry_point(void *entry)
}
#ifdef CONFIG_KRETPROBES
+
+unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
+ void *trampoline_address,
+ void *frame_pointer)
+{
+ struct kretprobe_instance *ri = NULL, *last = NULL;
+ struct hlist_head *head;
+ struct hlist_node *tmp;
+ unsigned long flags;
+ kprobe_opcode_t *correct_ret_addr = NULL;
+ bool skipped = false;
+
+ kretprobe_hash_lock(current, &head, &flags);
+
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task either because multiple functions in the call path have
+ * return probes installed on them, and/or more than one
+ * return probe was registered for a target function.
+ *
+ * We can handle this because:
+ * - instances are always pushed into the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the (chronologically) first instance's ret_addr
+ * will be the real return address, and all the rest will
+ * point to kretprobe_trampoline.
+ */
+ hlist_for_each_entry(ri, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+ /*
+ * Return probes must be pushed on this hash list correct
+ * order (same as return order) so that it can be popped
+ * correctly. However, if we find it is pushed it incorrect
+ * order, this means we find a function which should not be
+ * probed, because the wrong order entry is pushed on the
+ * path of processing other kretprobe itself.
+ */
+ if (ri->fp != frame_pointer) {
+ if (!skipped)
+ pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
+ skipped = true;
+ continue;
+ }
+
+ correct_ret_addr = ri->ret_addr;
+ if (skipped)
+ pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
+ ri->rp->kp.addr);
+
+ if (correct_ret_addr != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address));
+ last = ri;
+
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+ if (ri->fp != frame_pointer)
+ continue;
+
+ if (ri->rp && ri->rp->handler) {
+ struct kprobe *prev = kprobe_running();
+
+ __this_cpu_write(current_kprobe, &ri->rp->kp);
+ ri->ret_addr = correct_ret_addr;
+ ri->rp->handler(ri, regs);
+ __this_cpu_write(current_kprobe, prev);
+ }
+
+ recycle_rp_inst(ri);
+
+ if (ri == last)
+ break;
+ }
+
+ kretprobe_hash_unlock(current, &flags);
+
+ return (unsigned long)correct_ret_addr;
+}
+NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
+
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
@@ -1937,17 +2026,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
unsigned long hash, flags = 0;
struct kretprobe_instance *ri;
- /*
- * To avoid deadlocks, prohibit return probing in NMI contexts,
- * just skip the probe and increase the (inexact) 'nmissed'
- * statistical counter, so that the user is informed that
- * something happened:
- */
- if (unlikely(in_nmi())) {
- rp->nmissed++;
- return 0;
- }
-
/* TODO: consider to only swap the RA after the last pre_handler fired */
hash = hash_ptr(current, KPROBE_HASH_BITS);
raw_spin_lock_irqsave(&rp->lock, flags);
@@ -2140,6 +2218,9 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ if (WARN_ON_ONCE(kprobe_gone(p)))
+ return;
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2159,9 +2240,10 @@ static void kill_kprobe(struct kprobe *p)
/*
* The module is going away. We should disarm the kprobe which
- * is using ftrace.
+ * is using ftrace, because ftrace framework is still available at
+ * MODULE_STATE_GOING notification.
*/
- if (kprobe_ftrace(p))
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
disarm_kprobe_ftrace(p);
}
@@ -2419,7 +2501,10 @@ static int kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist) {
+ if (kprobe_gone(p))
+ continue;
+
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2436,6 +2521,7 @@ static int kprobes_module_callback(struct notifier_block *nb,
*/
kill_kprobe(p);
}
+ }
}
if (val == MODULE_STATE_GOING)
remove_module_kprobe_blacklist(mod);
@@ -2452,6 +2538,28 @@ static struct notifier_block kprobe_module_nb = {
extern unsigned long __start_kprobe_blacklist[];
extern unsigned long __stop_kprobe_blacklist[];
+void kprobe_free_init_mem(void)
+{
+ void *start = (void *)(&__init_begin);
+ void *end = (void *)(&__init_end);
+ struct hlist_head *head;
+ struct kprobe *p;
+ int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Kill all kprobes on initmem */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry(p, head, hlist) {
+ if (start <= (void *)p->addr && (void *)p->addr < end)
+ kill_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&kprobe_mutex);
+}
+
static int __init init_kprobes(void)
{
int i, err = 0;
@@ -2506,7 +2614,7 @@ static int __init init_kprobes(void)
init_test_probes();
return err;
}
-subsys_initcall(init_kprobes);
+early_initcall(init_kprobes);
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3edaa380dc7b..e29773c82b70 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker);
/**
* kthread_create_worker_on_cpu - create a kthread worker and bind it
- * it to a given CPU and the associated NUMA node.
+ * to a given CPU and the associated NUMA node.
* @cpu: CPU number
* @flags: flags modifying the default behavior of the worker
* @namefmt: printf-style name for the kthread worker (task).
diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c
index 7ee19476de9d..2565d039ade0 100644
--- a/kernel/livepatch/state.c
+++ b/kernel/livepatch/state.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state);
*
* The function can be called only during transition when a new
* livepatch is being enabled or when such a transition is reverted.
- * It is typically called only from from pre/post (un)patch
+ * It is typically called only from pre/post (un)patch
* callbacks.
*
* Return: pointer to the latest struct klp_state from already
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 54b74fabf40c..3e99dfef8408 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -76,6 +76,23 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+DEFINE_PER_CPU(unsigned int, lockdep_recursion);
+EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
+
+static inline bool lockdep_enabled(void)
+{
+ if (!debug_locks)
+ return false;
+
+ if (raw_cpu_read(lockdep_recursion))
+ return false;
+
+ if (current->lockdep_recursion)
+ return false;
+
+ return true;
+}
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -93,7 +110,7 @@ static inline void lockdep_lock(void)
arch_spin_lock(&__lock);
__owner = current;
- current->lockdep_recursion++;
+ __this_cpu_inc(lockdep_recursion);
}
static inline void lockdep_unlock(void)
@@ -101,7 +118,7 @@ static inline void lockdep_unlock(void)
if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))
return;
- current->lockdep_recursion--;
+ __this_cpu_dec(lockdep_recursion);
__owner = NULL;
arch_spin_unlock(&__lock);
}
@@ -372,6 +389,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE];
static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
+ * the id of held_lock
+ */
+static inline u16 hlock_id(struct held_lock *hlock)
+{
+ BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16);
+
+ return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS));
+}
+
+static inline unsigned int chain_hlock_class_idx(u16 hlock_id)
+{
+ return hlock_id & (MAX_LOCKDEP_KEYS - 1);
+}
+
+/*
* The hash key of the lock dependency chains is a hash itself too:
* it's a hash of all locks taken up to that lock, including that lock.
* It's a 64-bit hash, because it's important for the keys to be
@@ -393,10 +425,15 @@ void lockdep_init_task(struct task_struct *task)
task->lockdep_recursion = 0;
}
+static __always_inline void lockdep_recursion_inc(void)
+{
+ __this_cpu_inc(lockdep_recursion);
+}
+
static __always_inline void lockdep_recursion_finish(void)
{
- if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK))
- current->lockdep_recursion = 0;
+ if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion)))
+ __this_cpu_write(lockdep_recursion, 0);
}
void lockdep_set_selftest_task(struct task_struct *task)
@@ -585,6 +622,8 @@ static const char *usage_str[] =
#include "lockdep_states.h"
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
+ [LOCK_USED_READ] = "INITIAL READ USE",
+ /* abused as string storage for verify_lock_unused() */
[LOCK_USAGE_STATES] = "IN-NMI",
};
#endif
@@ -1320,7 +1359,7 @@ static struct lock_list *alloc_list_entry(void)
*/
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
- unsigned long ip, int distance,
+ unsigned long ip, u16 distance, u8 dep,
const struct lock_trace *trace)
{
struct lock_list *entry;
@@ -1334,6 +1373,7 @@ static int add_lock_to_list(struct lock_class *this,
entry->class = this;
entry->links_to = links_to;
+ entry->dep = dep;
entry->distance = distance;
entry->trace = trace;
/*
@@ -1421,23 +1461,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
return (cq->rear - cq->front) & CQ_MASK;
}
-static inline void mark_lock_accessed(struct lock_list *lock,
- struct lock_list *parent)
+static inline void mark_lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
- nr = lock - list_entries;
- WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
+static inline void visit_lock_entry(struct lock_list *lock,
+ struct lock_list *parent)
+{
lock->parent = parent;
- lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
static inline unsigned long lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
-
- nr = lock - list_entries;
- WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -1471,85 +1507,283 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
return lock_class + offset;
}
+/*
+ * Return values of a bfs search:
+ *
+ * BFS_E* indicates an error
+ * BFS_R* indicates a result (match or not)
+ *
+ * BFS_EINVALIDNODE: Find a invalid node in the graph.
+ *
+ * BFS_EQUEUEFULL: The queue is full while doing the bfs.
+ *
+ * BFS_RMATCH: Find the matched node in the graph, and put that node into
+ * *@target_entry.
+ *
+ * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry
+ * _unchanged_.
+ */
+enum bfs_result {
+ BFS_EINVALIDNODE = -2,
+ BFS_EQUEUEFULL = -1,
+ BFS_RMATCH = 0,
+ BFS_RNOMATCH = 1,
+};
/*
- * Forward- or backward-dependency search, used for both circular dependency
- * checking and hardirq-unsafe/softirq-unsafe checking.
+ * bfs_result < 0 means error
+ */
+static inline bool bfs_error(enum bfs_result res)
+{
+ return res < 0;
+}
+
+/*
+ * DEP_*_BIT in lock_list::dep
+ *
+ * For dependency @prev -> @next:
+ *
+ * SR: @prev is shared reader (->read != 0) and @next is recursive reader
+ * (->read == 2)
+ * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader
+ * SN: @prev is shared reader and @next is non-recursive locker (->read != 2)
+ * EN: @prev is exclusive locker and @next is non-recursive locker
+ *
+ * Note that we define the value of DEP_*_BITs so that:
+ * bit0 is prev->read == 0
+ * bit1 is next->read != 2
*/
-static int __bfs(struct lock_list *source_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry,
- int offset)
+#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */
+#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */
+#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */
+#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */
+
+#define DEP_SR_MASK (1U << (DEP_SR_BIT))
+#define DEP_ER_MASK (1U << (DEP_ER_BIT))
+#define DEP_SN_MASK (1U << (DEP_SN_BIT))
+#define DEP_EN_MASK (1U << (DEP_EN_BIT))
+
+static inline unsigned int
+__calc_dep_bit(struct held_lock *prev, struct held_lock *next)
+{
+ return (prev->read == 0) + ((next->read != 2) << 1);
+}
+
+static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next)
{
+ return 1U << __calc_dep_bit(prev, next);
+}
+
+/*
+ * calculate the dep_bit for backwards edges. We care about whether @prev is
+ * shared and whether @next is recursive.
+ */
+static inline unsigned int
+__calc_dep_bitb(struct held_lock *prev, struct held_lock *next)
+{
+ return (next->read != 2) + ((prev->read == 0) << 1);
+}
+
+static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bitb(prev, next);
+}
+
+/*
+ * Initialize a lock_list entry @lock belonging to @class as the root for a BFS
+ * search.
+ */
+static inline void __bfs_init_root(struct lock_list *lock,
+ struct lock_class *class)
+{
+ lock->class = class;
+ lock->parent = NULL;
+ lock->only_xr = 0;
+}
+
+/*
+ * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the
+ * root for a BFS search.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure
+ * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)->
+ * and -(S*)->.
+ */
+static inline void bfs_init_root(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read == 2);
+}
+
+/*
+ * Similar to bfs_init_root() but initialize the root for backwards BFS.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure
+ * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not
+ * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->).
+ */
+static inline void bfs_init_rootb(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read != 0);
+}
+
+static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
+{
+ if (!lock || !lock->parent)
+ return NULL;
+
+ return list_next_or_null_rcu(get_dep_list(lock->parent, offset),
+ &lock->entry, struct lock_list, entry);
+}
+
+/*
+ * Breadth-First Search to find a strong path in the dependency graph.
+ *
+ * @source_entry: the source of the path we are searching for.
+ * @data: data used for the second parameter of @match function
+ * @match: match function for the search
+ * @target_entry: pointer to the target of a matched path
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ *
+ * We may have multiple edges (considering different kinds of dependencies,
+ * e.g. ER and SN) between two nodes in the dependency graph. But
+ * only the strong dependency path in the graph is relevant to deadlocks. A
+ * strong dependency path is a dependency path that doesn't have two adjacent
+ * dependencies as -(*R)-> -(S*)->, please see:
+ *
+ * Documentation/locking/lockdep-design.rst
+ *
+ * for more explanation of the definition of strong dependency paths
+ *
+ * In __bfs(), we only traverse in the strong dependency path:
+ *
+ * In lock_list::only_xr, we record whether the previous dependency only
+ * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we
+ * filter out any -(S*)-> in the current dependency and after that, the
+ * ->only_xr is set according to whether we only have -(*R)-> left.
+ */
+static enum bfs_result __bfs(struct lock_list *source_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int offset)
+{
+ struct circular_queue *cq = &lock_cq;
+ struct lock_list *lock = NULL;
struct lock_list *entry;
- struct lock_list *lock;
struct list_head *head;
- struct circular_queue *cq = &lock_cq;
- int ret = 1;
+ unsigned int cq_depth;
+ bool first;
lockdep_assert_locked();
- if (match(source_entry, data)) {
- *target_entry = source_entry;
- ret = 0;
- goto exit;
- }
-
- head = get_dep_list(source_entry, offset);
- if (list_empty(head))
- goto exit;
-
__cq_init(cq);
__cq_enqueue(cq, source_entry);
- while ((lock = __cq_dequeue(cq))) {
+ while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) {
+ if (!lock->class)
+ return BFS_EINVALIDNODE;
+
+ /*
+ * Step 1: check whether we already finish on this one.
+ *
+ * If we have visited all the dependencies from this @lock to
+ * others (iow, if we have visited all lock_list entries in
+ * @lock->class->locks_{after,before}) we skip, otherwise go
+ * and visit all the dependencies in the list and mark this
+ * list accessed.
+ */
+ if (lock_accessed(lock))
+ continue;
+ else
+ mark_lock_accessed(lock);
+
+ /*
+ * Step 2: check whether prev dependency and this form a strong
+ * dependency path.
+ */
+ if (lock->parent) { /* Parent exists, check prev dependency */
+ u8 dep = lock->dep;
+ bool prev_only_xr = lock->parent->only_xr;
+
+ /*
+ * Mask out all -(S*)-> if we only have *R in previous
+ * step, because -(*R)-> -(S*)-> don't make up a strong
+ * dependency.
+ */
+ if (prev_only_xr)
+ dep &= ~(DEP_SR_MASK | DEP_SN_MASK);
- if (!lock->class) {
- ret = -2;
- goto exit;
+ /* If nothing left, we skip */
+ if (!dep)
+ continue;
+
+ /* If there are only -(*R)-> left, set that for the next step */
+ lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK));
}
- head = get_dep_list(lock, offset);
+ /*
+ * Step 3: we haven't visited this and there is a strong
+ * dependency path to this, so check with @match.
+ */
+ if (match(lock, data)) {
+ *target_entry = lock;
+ return BFS_RMATCH;
+ }
+ /*
+ * Step 4: if not match, expand the path by adding the
+ * forward or backwards dependencis in the search
+ *
+ */
+ first = true;
+ head = get_dep_list(lock, offset);
list_for_each_entry_rcu(entry, head, entry) {
- if (!lock_accessed(entry)) {
- unsigned int cq_depth;
- mark_lock_accessed(entry, lock);
- if (match(entry, data)) {
- *target_entry = entry;
- ret = 0;
- goto exit;
- }
+ visit_lock_entry(entry, lock);
- if (__cq_enqueue(cq, entry)) {
- ret = -1;
- goto exit;
- }
- cq_depth = __cq_get_elem_count(cq);
- if (max_bfs_queue_depth < cq_depth)
- max_bfs_queue_depth = cq_depth;
- }
+ /*
+ * Note we only enqueue the first of the list into the
+ * queue, because we can always find a sibling
+ * dependency from one (see __bfs_next()), as a result
+ * the space of queue is saved.
+ */
+ if (!first)
+ continue;
+
+ first = false;
+
+ if (__cq_enqueue(cq, entry))
+ return BFS_EQUEUEFULL;
+
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
}
}
-exit:
- return ret;
+
+ return BFS_RNOMATCH;
}
-static inline int __bfs_forwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
return __bfs(src_entry, data, match, target_entry,
offsetof(struct lock_class, locks_after));
}
-static inline int __bfs_backwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
return __bfs(src_entry, data, match, target_entry,
offsetof(struct lock_class, locks_before));
@@ -1659,15 +1893,72 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
print_circular_bug_entry(entry, depth);
}
-static inline int class_equal(struct lock_list *entry, void *data)
+/*
+ * We are about to add A -> B into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
+ * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
+ * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
+ * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
+ * dependency graph, as any strong path ..-> A -> B ->.. we can get with
+ * having dependency A -> B, we could already get a equivalent path ..-> A ->
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant.
+ *
+ * We need to make sure both the start and the end of A -> .. -> B is not
+ * weaker than A -> B. For the start part, please see the comment in
+ * check_redundant(). For the end part, we need:
+ *
+ * Either
+ *
+ * a) A -> B is -(*R)-> (everything is not weaker than that)
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
+ *
+ */
+static inline bool hlock_equal(struct lock_list *entry, void *data)
+{
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 2 || /* A -> B is -(*R)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
+}
+
+/*
+ * We are about to add B -> A into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong
+ * dependency cycle, that means:
+ *
+ * Either
+ *
+ * a) B -> A is -(E*)->
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B)
+ *
+ * as then we don't have -(*R)-> -(S*)-> in the cycle.
+ */
+static inline bool hlock_conflict(struct lock_list *entry, void *data)
{
- return entry->class == data;
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 0 || /* B -> A is -(E*)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
static noinline void print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt)
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1714,10 +2005,10 @@ static noinline void print_bfs_bug(int ret)
WARN(1, "lockdep bfs error:%d\n", ret);
}
-static int noop_count(struct lock_list *entry, void *data)
+static bool noop_count(struct lock_list *entry, void *data)
{
(*(unsigned long *)data)++;
- return 0;
+ return false;
}
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
@@ -1734,8 +2025,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
raw_local_irq_save(flags);
lockdep_lock();
@@ -1761,8 +2051,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
raw_local_irq_save(flags);
lockdep_lock();
@@ -1775,18 +2064,18 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
/*
* Check that the dependency graph starting at <src> can lead to
- * <target> or not. Print an error and return 0 if it does.
+ * <target> or not.
*/
-static noinline int
-check_path(struct lock_class *target, struct lock_list *src_entry,
+static noinline enum bfs_result
+check_path(struct held_lock *target, struct lock_list *src_entry,
+ bool (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- int ret;
+ enum bfs_result ret;
- ret = __bfs_forwards(src_entry, (void *)target, class_equal,
- target_entry);
+ ret = __bfs_forwards(src_entry, target, match, target_entry);
- if (unlikely(ret < 0))
+ if (unlikely(bfs_error(ret)))
print_bfs_bug(ret);
return ret;
@@ -1797,24 +2086,23 @@ check_path(struct lock_class *target, struct lock_list *src_entry,
* lead to <target>. If it can, there is a circle when adding
* <target> -> <src> dependency.
*
- * Print an error and return 0 if it does.
+ * Print an error and return BFS_RMATCH if it does.
*/
-static noinline int
+static noinline enum bfs_result
check_noncircular(struct held_lock *src, struct held_lock *target,
struct lock_trace **const trace)
{
- int ret;
+ enum bfs_result ret;
struct lock_list *target_entry;
- struct lock_list src_entry = {
- .class = hlock_class(src),
- .parent = NULL,
- };
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
debug_atomic_inc(nr_cyclic_checks);
- ret = check_path(hlock_class(target), &src_entry, &target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, &target_entry);
- if (unlikely(!ret)) {
+ if (unlikely(ret == BFS_RMATCH)) {
if (!*trace) {
/*
* If save_trace fails here, the printing might
@@ -1836,27 +2124,35 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
* <target> or not. If it can, <src> -> <target> dependency is already
* in the graph.
*
- * Print an error and return 2 if it does or 1 if it does not.
+ * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
*/
-static noinline int
+static noinline enum bfs_result
check_redundant(struct held_lock *src, struct held_lock *target)
{
- int ret;
+ enum bfs_result ret;
struct lock_list *target_entry;
- struct lock_list src_entry = {
- .class = hlock_class(src),
- .parent = NULL,
- };
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
debug_atomic_inc(nr_redundant_checks);
- ret = check_path(hlock_class(target), &src_entry, &target_entry);
+ ret = check_path(target, &src_entry, hlock_equal, &target_entry);
- if (!ret) {
+ if (ret == BFS_RMATCH)
debug_atomic_inc(nr_redundant);
- ret = 2;
- } else if (ret < 0)
- ret = 0;
return ret;
}
@@ -1864,39 +2160,86 @@ check_redundant(struct held_lock *src, struct held_lock *target)
#ifdef CONFIG_TRACE_IRQFLAGS
-static inline int usage_accumulate(struct lock_list *entry, void *mask)
-{
- *(unsigned long *)mask |= entry->class->usage_mask;
-
- return 0;
-}
-
/*
* Forwards and backwards subgraph searching, for the purposes of
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ *
+ * A irq safe->unsafe deadlock happens with the following conditions:
+ *
+ * 1) We have a strong dependency path A -> ... -> B
+ *
+ * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore
+ * irq can create a new dependency B -> A (consider the case that a holder
+ * of B gets interrupted by an irq whose handler will try to acquire A).
+ *
+ * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a
+ * strong circle:
+ *
+ * For the usage bits of B:
+ * a) if A -> B is -(*N)->, then B -> A could be any type, so any
+ * ENABLED_IRQ usage suffices.
+ * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only
+ * ENABLED_IRQ_*_READ usage suffices.
+ *
+ * For the usage bits of A:
+ * c) if A -> B is -(E*)->, then B -> A could be any type, so any
+ * USED_IN_IRQ usage suffices.
+ * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only
+ * USED_IN_IRQ_*_READ usage suffices.
*/
-static inline int usage_match(struct lock_list *entry, void *mask)
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of A should be accumulated to detect
+ * safe->unsafe bugs.
+ *
+ * Note that usage_accumulate() is used in backwards search, so ->only_xr
+ * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true).
+ *
+ * As above, if only_xr is false, which means A -> B has -(E*)-> dependency
+ * path, any usage of A should be considered. Otherwise, we should only
+ * consider _READ usage.
+ */
+static inline bool usage_accumulate(struct lock_list *entry, void *mask)
{
- return entry->class->usage_mask & *(unsigned long *)mask;
+ if (!entry->only_xr)
+ *(unsigned long *)mask |= entry->class->usage_mask;
+ else /* Mask out _READ usage bits */
+ *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ);
+
+ return false;
+}
+
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of B conflicts with the usage bits of A,
+ * i.e. which usage bit of B may introduce safe->unsafe deadlocks.
+ *
+ * As above, if only_xr is false, which means A -> B has -(*N)-> dependency
+ * path, any usage of B should be considered. Otherwise, we should only
+ * consider _READ usage.
+ */
+static inline bool usage_match(struct lock_list *entry, void *mask)
+{
+ if (!entry->only_xr)
+ return !!(entry->class->usage_mask & *(unsigned long *)mask);
+ else /* Mask out _READ usage bits */
+ return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
}
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
*
- * Return 0 if such a node exists in the subgraph, and put that node
+ * Return BFS_MATCH if such a node exists in the subgraph, and put that node
* into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
+static enum bfs_result
find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_forwards_checks);
@@ -1908,18 +2251,12 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
/*
* Find a node in the backwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
- *
- * Return 0 if such a node exists in the subgraph, and put that node
- * into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
+static enum bfs_result
find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_backwards_checks);
@@ -1939,7 +2276,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
#endif
printk(KERN_CONT " {\n");
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ for (bit = 0; bit < LOCK_TRACE_STATES; bit++) {
if (class->usage_mask & (1 << bit)) {
int len = depth;
@@ -2179,17 +2516,39 @@ static unsigned long invert_dir_mask(unsigned long mask)
}
/*
- * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
- * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
- * And then mask out all bitnr0.
+ * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ
+ * usage may cause deadlock too, for example:
+ *
+ * P1 P2
+ * <irq disabled>
+ * write_lock(l1); <irq enabled>
+ * read_lock(l2);
+ * write_lock(l2);
+ * <in irq>
+ * read_lock(l1);
+ *
+ * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2
+ * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible
+ * deadlock.
+ *
+ * In fact, all of the following cases may cause deadlocks:
+ *
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ
+ *
+ * As a result, to calculate the "exclusive mask", first we invert the
+ * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with
+ * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all
+ * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*).
*/
static unsigned long exclusive_mask(unsigned long mask)
{
unsigned long excl = invert_dir_mask(mask);
- /* Strip read */
excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
- excl &= ~LOCKF_IRQ_READ;
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
return excl;
}
@@ -2206,6 +2565,7 @@ static unsigned long original_mask(unsigned long mask)
unsigned long excl = invert_dir_mask(mask);
/* Include read in existing usages */
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
return excl;
@@ -2220,14 +2580,24 @@ static int find_exclusive_match(unsigned long mask,
enum lock_usage_bit *bitp,
enum lock_usage_bit *excl_bitp)
{
- int bit, excl;
+ int bit, excl, excl_read;
for_each_set_bit(bit, &mask, LOCK_USED) {
+ /*
+ * exclusive_bit() strips the read bit, however,
+ * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need
+ * to search excl | LOCK_USAGE_READ_MASK as well.
+ */
excl = exclusive_bit(bit);
+ excl_read = excl | LOCK_USAGE_READ_MASK;
if (excl_mask & lock_flag(excl)) {
*bitp = bit;
*excl_bitp = excl;
return 0;
+ } else if (excl_mask & lock_flag(excl_read)) {
+ *bitp = bit;
+ *excl_bitp = excl_read;
+ return 0;
}
}
return -1;
@@ -2247,17 +2617,16 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
struct lock_list *target_entry1;
struct lock_list *target_entry;
struct lock_list this, that;
- int ret;
+ enum bfs_result ret;
/*
* Step 1: gather all hard/soft IRQs usages backward in an
* accumulated usage mask.
*/
- this.parent = NULL;
- this.class = hlock_class(prev);
+ bfs_init_rootb(&this, prev);
ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
@@ -2272,16 +2641,15 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
forward_mask = exclusive_mask(usage_mask);
- that.parent = NULL;
- that.class = hlock_class(next);
+ bfs_init_root(&that, next);
ret = find_usage_forwards(&that, forward_mask, &target_entry1);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
/*
* Step 3: we found a bad match! Now retrieve a lock from the backward
@@ -2291,11 +2659,11 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
backward_mask = original_mask(target_entry1->class->usage_mask);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (DEBUG_LOCKS_WARN_ON(ret == 1))
+ if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH))
return 1;
/*
@@ -2459,11 +2827,11 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance,
+ struct held_lock *next, u16 distance,
struct lock_trace **const trace)
{
struct lock_list *entry;
- int ret;
+ enum bfs_result ret;
if (!hlock_class(prev)->key || !hlock_class(next)->key) {
/*
@@ -2494,23 +2862,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* in the graph whose neighbours are to be checked.
*/
ret = check_noncircular(next, prev, trace);
- if (unlikely(ret <= 0))
+ if (unlikely(bfs_error(ret) || ret == BFS_RMATCH))
return 0;
if (!check_irq_usage(curr, prev, next))
return 0;
/*
- * For recursive read-locks we do all the dependency checks,
- * but we dont store read-triggered dependencies (only
- * write-triggered dependencies). This ensures that only the
- * write-side dependencies matter, and that if for example a
- * write-lock never takes any other locks, then the reads are
- * equivalent to a NOP.
- */
- if (next->read == 2 || prev->read == 2)
- return 1;
- /*
* Is the <prev> -> <next> dependency already present?
*
* (this may occur even though this is a new chain: consider
@@ -2522,7 +2880,35 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 1;
+ entry->dep |= calc_dep(prev, next);
+
+ /*
+ * Also, update the reverse dependency in @next's
+ * ->locks_before list.
+ *
+ * Here we reuse @entry as the cursor, which is fine
+ * because we won't go to the next iteration of the
+ * outer loop:
+ *
+ * For normal cases, we return in the inner loop.
+ *
+ * If we fail to return, we have inconsistency, i.e.
+ * <prev>::locks_after contains <next> while
+ * <next>::locks_before doesn't contain <prev>. In
+ * that case, we return after the inner and indicate
+ * something is wrong.
+ */
+ list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) {
+ if (entry->class == hlock_class(prev)) {
+ if (distance == 1)
+ entry->distance = 1;
+ entry->dep |= calc_depb(prev, next);
+ return 1;
+ }
+ }
+
+ /* <prev> is not found in <next>::locks_before */
+ return 0;
}
}
@@ -2531,8 +2917,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Is the <prev> -> <next> link redundant?
*/
ret = check_redundant(prev, next);
- if (ret != 1)
- return ret;
+ if (bfs_error(ret))
+ return 0;
+ else if (ret == BFS_RMATCH)
+ return 2;
#endif
if (!*trace) {
@@ -2547,14 +2935,18 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, *trace);
+ next->acquire_ip, distance,
+ calc_dep(prev, next),
+ *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, *trace);
+ next->acquire_ip, distance,
+ calc_depb(prev, next),
+ *trace);
if (!ret)
return 0;
@@ -2590,16 +2982,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
goto out_bug;
for (;;) {
- int distance = curr->lockdep_depth - depth + 1;
+ u16 distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
- /*
- * Only non-recursive-read entries get new dependencies
- * added:
- */
- if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next, distance,
- &trace);
+ if (hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace);
if (!ret)
return 0;
@@ -2875,7 +3262,10 @@ static inline void free_chain_hlocks(int base, int size)
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
{
- return lock_classes + chain_hlocks[chain->base + i];
+ u16 chain_hlock = chain_hlocks[chain->base + i];
+ unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
+
+ return lock_classes + class_idx - 1;
}
/*
@@ -2901,12 +3291,12 @@ static inline int get_first_held_lock(struct task_struct *curr,
/*
* Returns the next chain_key iteration
*/
-static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key)
{
- u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+ u64 new_chain_key = iterate_chain_key(chain_key, hlock_id);
- printk(" class_idx:%d -> chain_key:%016Lx",
- class_idx,
+ printk(" hlock_id:%d -> chain_key:%016Lx",
+ (unsigned int)hlock_id,
(unsigned long long)new_chain_key);
return new_chain_key;
}
@@ -2923,12 +3313,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
hlock_next->irq_context);
for (; i < depth; i++) {
hlock = curr->held_locks + i;
- chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+ chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key);
print_lock(hlock);
}
- print_chain_key_iteration(hlock_next->class_idx, chain_key);
+ print_chain_key_iteration(hlock_id(hlock_next), chain_key);
print_lock(hlock_next);
}
@@ -2936,14 +3326,14 @@ static void print_chain_keys_chain(struct lock_chain *chain)
{
int i;
u64 chain_key = INITIAL_CHAIN_KEY;
- int class_id;
+ u16 hlock_id;
printk("depth: %u\n", chain->depth);
for (i = 0; i < chain->depth; i++) {
- class_id = chain_hlocks[chain->base + i];
- chain_key = print_chain_key_iteration(class_id, chain_key);
+ hlock_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + class_id);
+ print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1);
printk("\n");
}
}
@@ -2992,7 +3382,7 @@ static int check_no_collision(struct task_struct *curr,
}
for (j = 0; j < chain->depth - 1; j++, i++) {
- id = curr->held_locks[i].class_idx;
+ id = hlock_id(&curr->held_locks[i]);
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
print_collision(curr, hlock, chain);
@@ -3041,7 +3431,6 @@ static inline int add_chain_cache(struct task_struct *curr,
struct held_lock *hlock,
u64 chain_key)
{
- struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
int i, j;
@@ -3084,11 +3473,11 @@ static inline int add_chain_cache(struct task_struct *curr,
chain->base = j;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx;
+ int lock_id = hlock_id(curr->held_locks + i);
chain_hlocks[chain->base + j] = lock_id;
}
- chain_hlocks[chain->base + j] = class - lock_classes;
+ chain_hlocks[chain->base + j] = hlock_id(hlock);
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains(chain->irq_context);
@@ -3275,7 +3664,7 @@ static void check_chain_key(struct task_struct *curr)
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
chain_key = INITIAL_CHAIN_KEY;
- chain_key = iterate_chain_key(chain_key, hlock->class_idx);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
prev_hlock = hlock;
}
if (chain_key != curr->curr_chain_key) {
@@ -3434,24 +3823,32 @@ print_irq_inversion_bug(struct task_struct *curr,
*/
static int
check_usage_forwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0) {
+ bfs_init_root(&root, this);
+ ret = find_usage_forwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
+
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(read_bit));
+ }
- print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
return 0;
}
@@ -3461,24 +3858,32 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
*/
static int
check_usage_backwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0) {
+ bfs_init_rootb(&root, this);
+ ret = find_usage_backwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
+
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(read_bit));
+ }
- print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
return 0;
}
@@ -3517,8 +3922,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-#define STRICT_READ_CHECKS 1
-
static int (*state_verbose_f[])(struct lock_class *class) = {
#define LOCKDEP_STATE(__STATE) \
__STATE##_verbose,
@@ -3544,16 +3947,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
- * mark USED_IN has to look forwards -- to ensure no dependency
- * has ENABLED state, which would allow recursion deadlocks.
- *
- * mark ENABLED has to look backwards -- to ensure no dependee
- * has USED_IN state, which, again, would allow recursion deadlocks.
- */
- check_usage_f usage = dir ?
- check_usage_backwards : check_usage_forwards;
-
- /*
* Validate that this particular lock does not have conflicting
* usage states.
*/
@@ -3561,23 +3954,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 0;
/*
- * Validate that the lock dependencies don't have conflicting usage
- * states.
+ * Check for read in write conflicts
*/
- if ((!read || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
+ if (!read && !valid_state(curr, this, new_bit,
+ excl_bit + LOCK_USAGE_READ_MASK))
return 0;
+
/*
- * Check for read in write conflicts
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
*/
- if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK))
+ if (dir) {
+ /*
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ if (!check_usage_backwards(curr, this, excl_bit))
return 0;
-
- if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK,
- state_name(new_bit + LOCK_USAGE_READ_MASK)))
+ } else {
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ */
+ if (!check_usage_forwards(curr, this, excl_bit))
return 0;
}
@@ -3657,7 +4057,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
if (unlikely(in_nmi()))
return;
- if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
+ if (unlikely(__this_cpu_read(lockdep_recursion)))
return;
if (unlikely(lockdep_hardirqs_enabled())) {
@@ -3693,7 +4093,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
current->hardirq_chain_key = current->curr_chain_key;
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__trace_hardirqs_on_caller();
lockdep_recursion_finish();
}
@@ -3726,7 +4126,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
goto skip_checks;
}
- if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
+ if (unlikely(__this_cpu_read(lockdep_recursion)))
return;
if (lockdep_hardirqs_enabled()) {
@@ -3779,7 +4179,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
if (in_nmi()) {
if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
return;
- } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
+ } else if (__this_cpu_read(lockdep_recursion))
return;
/*
@@ -3812,7 +4212,7 @@ void lockdep_softirqs_on(unsigned long ip)
{
struct irqtrace_events *trace = &current->irqtrace;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -3827,7 +4227,7 @@ void lockdep_softirqs_on(unsigned long ip)
return;
}
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
/*
* We'll do an OFF -> ON transition:
*/
@@ -3850,7 +4250,7 @@ void lockdep_softirqs_on(unsigned long ip)
*/
void lockdep_softirqs_off(unsigned long ip)
{
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -3969,13 +4369,18 @@ static int separate_irq_context(struct task_struct *curr,
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
- unsigned int new_mask = 1 << new_bit, ret = 1;
+ unsigned int new_mask, ret = 1;
if (new_bit >= LOCK_USAGE_STATES) {
DEBUG_LOCKS_WARN_ON(1);
return 0;
}
+ if (new_bit == LOCK_USED && this->read)
+ new_bit = LOCK_USED_READ;
+
+ new_mask = 1 << new_bit;
+
/*
* If already set then do not dirty the cacheline,
* nor do any checks:
@@ -3988,26 +4393,32 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Make sure we didn't race:
*/
- if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
- graph_unlock();
- return 1;
- }
+ if (unlikely(hlock_class(this)->usage_mask & new_mask))
+ goto unlock;
hlock_class(this)->usage_mask |= new_mask;
- if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
- return 0;
+ if (new_bit < LOCK_TRACE_STATES) {
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
+ return 0;
+ }
switch (new_bit) {
+ case 0 ... LOCK_USED-1:
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
+ return 0;
+ break;
+
case LOCK_USED:
debug_atomic_dec(nr_unused_locks);
break;
+
default:
- ret = mark_lock_irq(curr, this, new_bit);
- if (!ret)
- return 0;
+ break;
}
+unlock:
graph_unlock();
/*
@@ -4220,11 +4631,11 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
if (subclass) {
unsigned long flags;
- if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
register_lock_class(lock, subclass, 1);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -4411,7 +4822,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
- chain_key = iterate_chain_key(chain_key, class_idx);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
if (nest_lock && !__lock_is_held(nest_lock, -1)) {
print_lock_nested_lock_not_held(curr, hlock, ip);
@@ -4907,11 +5318,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_set_class(lock, name, key, subclass, ip))
check_chain_key(current);
@@ -4924,11 +5335,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_downgrade(lock, ip))
check_chain_key(current);
@@ -4942,12 +5353,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock
{
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class = look_up_lock_class(lock, subclass);
+ unsigned long mask = LOCKF_USED;
/* if it doesn't have a class (yet), it certainly hasn't been used yet */
if (!class)
return;
- if (!(class->usage_mask & LOCK_USED))
+ /*
+ * READ locks only conflict with USED, such that if we only ever use
+ * READ locks, there is no deadlock possible -- RCU.
+ */
+ if (!hlock->read)
+ mask |= LOCKF_USED_READ;
+
+ if (!(class->usage_mask & mask))
return;
hlock->class_idx = class - lock_classes;
@@ -4958,7 +5377,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock
static bool lockdep_nmi(void)
{
- if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
+ if (raw_cpu_read(lockdep_recursion))
return false;
if (!in_nmi())
@@ -4968,6 +5387,20 @@ static bool lockdep_nmi(void)
}
/*
+ * read_lock() is recursive if:
+ * 1. We force lockdep think this way in selftests or
+ * 2. The implementation is not queued read/write lock or
+ * 3. The locker is at an in_interrupt() context.
+ */
+bool read_lock_is_recursive(void)
+{
+ return force_read_lock_recursive ||
+ !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) ||
+ in_interrupt();
+}
+EXPORT_SYMBOL_GPL(read_lock_is_recursive);
+
+/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
*/
@@ -4979,7 +5412,10 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
- if (unlikely(current->lockdep_recursion)) {
+ if (!debug_locks)
+ return;
+
+ if (unlikely(!lockdep_enabled())) {
/* XXX allow trylock from NMI ?!? */
if (lockdep_nmi() && !trylock) {
struct held_lock hlock;
@@ -5002,7 +5438,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
lockdep_recursion_finish();
@@ -5016,13 +5452,13 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
trace_lock_release(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
if (__lock_release(lock, ip))
check_chain_key(current);
lockdep_recursion_finish();
@@ -5035,13 +5471,13 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
unsigned long flags;
int ret = 0;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return 1; /* avoid false negative lockdep_assert_held() */
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
ret = __lock_is_held(lock, read);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5056,13 +5492,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
struct pin_cookie cookie = NIL_COOKIE;
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return cookie;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
cookie = __lock_pin_lock(lock);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5075,13 +5511,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_repin_lock(lock, cookie);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5092,13 +5528,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_unpin_lock(lock, cookie);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5228,15 +5664,12 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
trace_lock_acquired(lock, ip);
- if (unlikely(!lock_stat || !debug_locks))
- return;
-
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_contended(lock, ip);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5249,15 +5682,12 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
trace_lock_contended(lock, ip);
- if (unlikely(!lock_stat || !debug_locks))
- return;
-
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_acquired(lock, ip);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5296,7 +5726,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
int i;
for (i = chain->base; i < chain->base + chain->depth; i++) {
- if (chain_hlocks[i] != class - lock_classes)
+ if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes)
continue;
/*
* Each lock class occurs at most once in a lock chain so once
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index baca699b94e9..de49f9e1c11b 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -19,9 +19,13 @@ enum lock_usage_bit {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
LOCK_USED,
- LOCK_USAGE_STATES
+ LOCK_USED_READ,
+ LOCK_USAGE_STATES,
};
+/* states after LOCK_USED_READ are not traced and printed */
+static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES);
+
#define LOCK_USAGE_READ_MASK 1
#define LOCK_USAGE_DIR_MASK 2
#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
@@ -40,6 +44,7 @@ enum {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
__LOCKF(USED)
+ __LOCKF(USED_READ)
};
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
@@ -119,7 +124,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
-#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8bbafe3e5203..70a32a576f3f 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
/*
* Due to having preemption disabled the decrement happens on
@@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
if (likely(!atomic_read_acquire(&sem->block)))
return true;
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
/* Prod writer to re-evaluate readers_active_check() */
rcuwait_wake_up(&sem->writer);
diff --git a/kernel/module.c b/kernel/module.c
index 1c5cff34d9f2..9d9f2400d94e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
@@ -3013,7 +3014,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
- err = security_kernel_load_data(LOADING_MODULE);
+ err = security_kernel_load_data(LOADING_MODULE, true);
if (err)
return err;
@@ -3023,11 +3024,17 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
return -ENOMEM;
if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
- vfree(info->hdr);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
- return 0;
+ err = security_kernel_post_load_data((char *)info->hdr, info->len,
+ LOADING_MODULE, "init_module");
+out:
+ if (err)
+ vfree(info->hdr);
+
+ return err;
}
static void free_copy(struct load_info *info)
@@ -3275,6 +3282,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(unsigned long),
&mod->num_kprobe_blacklist);
#endif
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+ mod->static_call_sites = section_objs(info, ".static_call_sites",
+ sizeof(*mod->static_call_sites),
+ &mod->num_static_call_sites);
+#endif
mod->extable = section_objs(info, "__ex_table",
sizeof(*mod->extable), &mod->num_exentries);
@@ -3792,9 +3804,13 @@ static int prepare_coming_module(struct module *mod)
if (err)
return err;
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
- return 0;
+ err = blocking_notifier_call_chain_robust(&module_notify_list,
+ MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
+ err = notifier_to_errno(err);
+ if (err)
+ klp_module_going(mod);
+
+ return err;
}
static int unknown_module_param_cb(char *param, char *val, const char *modname,
@@ -4034,8 +4050,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
struct load_info info = { };
- loff_t size;
- void *hdr;
+ void *hdr = NULL;
int err;
err = may_init_module();
@@ -4048,12 +4063,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
|MODULE_INIT_IGNORE_VERMAGIC))
return -EINVAL;
- err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX,
+ err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL,
READING_MODULE);
- if (err)
+ if (err < 0)
return err;
info.hdr = hdr;
- info.len = size;
+ info.len = err;
return load_module(&info, uargs, flags);
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 84c987dfbe03..1b019cbca594 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -94,6 +94,34 @@ static int notifier_call_chain(struct notifier_block **nl,
}
NOKPROBE_SYMBOL(notifier_call_chain);
+/**
+ * notifier_call_chain_robust - Inform the registered notifiers about an event
+ * and rollback on error.
+ * @nl: Pointer to head of the blocking notifier chain
+ * @val_up: Value passed unmodified to the notifier function
+ * @val_down: Value passed unmodified to the notifier function when recovering
+ * from an error on @val_up
+ * @v Pointer passed unmodified to the notifier function
+ *
+ * NOTE: It is important the @nl chain doesn't change between the two
+ * invocations of notifier_call_chain() such that we visit the
+ * exact same notifier callbacks; this rules out any RCU usage.
+ *
+ * Returns: the return value of the @val_up call.
+ */
+static int notifier_call_chain_robust(struct notifier_block **nl,
+ unsigned long val_up, unsigned long val_down,
+ void *v)
+{
+ int ret, nr = 0;
+
+ ret = notifier_call_chain(nl, val_up, v, -1, &nr);
+ if (ret & NOTIFY_STOP_MASK)
+ notifier_call_chain(nl, val_down, v, nr-1, NULL);
+
+ return ret;
+}
+
/*
* Atomic notifier chain routines. Registration and unregistration
* use a spinlock, and call_chain is synchronized by RCU (no locks).
@@ -144,13 +172,30 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh,
+ unsigned long val_up, unsigned long val_down, void *v)
+{
+ unsigned long flags;
+ int ret;
+
+ /*
+ * Musn't use RCU; because then the notifier list can
+ * change between the up and down traversal.
+ */
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+ spin_unlock_irqrestore(&nh->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust);
+
/**
- * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See the comment for notifier_call_chain.
- * @nr_calls: See the comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
@@ -163,24 +208,16 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret;
rcu_read_lock();
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
-NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
-int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
-{
- return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+ return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);
@@ -250,13 +287,30 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
+ unsigned long val_up, unsigned long val_down, void *v)
+{
+ int ret = NOTIFY_DONE;
+
+ /*
+ * We check the head outside the lock, but if this access is
+ * racy then it does not matter what the result of the test
+ * is, we re-check the list after having taken the lock anyway:
+ */
+ if (rcu_access_pointer(nh->head)) {
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+ up_read(&nh->rwsem);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);
+
/**
- * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -268,9 +322,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret = NOTIFY_DONE;
@@ -281,19 +334,11 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
*/
if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
- nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
up_read(&nh->rwsem);
}
return ret;
}
-EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
-
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v)
-{
- return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
-}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
@@ -335,13 +380,18 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
+ unsigned long val_up, unsigned long val_down, void *v)
+{
+ return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);
+
/**
- * __raw_notifier_call_chain - Call functions in a raw notifier chain
+ * raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
@@ -354,18 +404,10 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __raw_notifier_call_chain(struct raw_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
-{
- return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
-}
-EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
-
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
- return __raw_notifier_call_chain(nh, val, v, -1, NULL);
+ return notifier_call_chain(&nh->head, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -437,12 +479,10 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
- * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -454,25 +494,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
-EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
-
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v)
-{
- return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
-}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
diff --git a/kernel/padata.c b/kernel/padata.c
index 16cb894dc272..d4d3ba6e1728 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -215,12 +215,13 @@ int padata_do_parallel(struct padata_shell *ps,
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
- rcu_read_unlock_bh();
-
spin_lock(&padata_works_lock);
padata->seq_nr = ++pd->seq_nr;
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+
+ rcu_read_unlock_bh();
+
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
diff --git a/kernel/panic.c b/kernel/panic.c
index aef8872ba843..396142ee43fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (args)
vprintk(args->fmt, args->args);
+ print_modules();
+
+ if (regs)
+ show_regs(regs);
+
if (panic_on_warn) {
/*
* This thread may hit another WARN() in the panic path.
@@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
panic("panic_on_warn set ...\n");
}
- print_modules();
-
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+ dump_stack();
print_irqtrace_events(current);
diff --git a/kernel/params.c b/kernel/params.c
index 8e56f8b12d8f..3835fb82c64b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -233,14 +233,15 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
diff --git a/kernel/pid.c b/kernel/pid.c
index b2562a7ce525..74ddbff1a6ba 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
@@ -522,7 +523,8 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
/**
* pidfd_create() - Create a new pid file descriptor.
*
- * @pid: struct pid that the pidfd will reference
+ * @pid: struct pid that the pidfd will reference
+ * @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set.
*
@@ -532,12 +534,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-static int pidfd_create(struct pid *pid)
+static int pidfd_create(struct pid *pid, unsigned int flags)
{
int fd;
fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- O_RDWR | O_CLOEXEC);
+ flags | O_RDWR | O_CLOEXEC);
if (fd < 0)
put_pid(pid);
@@ -565,7 +567,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;
- if (flags)
+ if (flags & ~PIDFD_NONBLOCK)
return -EINVAL;
if (pid <= 0)
@@ -576,7 +578,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return -ESRCH;
if (pid_has_task(p, PIDTYPE_TGID))
- fd = pidfd_create(p);
+ fd = pidfd_create(p, flags);
else
fd = -EINVAL;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ac135bd600eb..9de21803a8ae 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* to pid_ns->child_reaper. Thus pidns->child_reaper needs to
* stay valid until they all go away.
*
- * The code relies on the the pid_ns->child_reaper ignoring
+ * The code relies on the pid_ns->child_reaper ignoring
* SIGCHILD to cause those EXIT_ZOMBIE processes to be
* autoreaped if reparented.
*
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e7aa57fb2fdc..2fc7d509a34f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -706,8 +706,8 @@ static int load_image_and_restore(void)
*/
int hibernate(void)
{
- int error, nr_calls = 0;
bool snapshot_test = false;
+ int error;
if (!hibernation_available()) {
pm_pr_dbg("Hibernation not available.\n");
@@ -723,11 +723,9 @@ int hibernate(void)
pr_info("hibernation entry\n");
pm_prepare_console();
- error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
- if (error) {
- nr_calls--;
- goto Exit;
- }
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
+ if (error)
+ goto Restore;
ksys_sync_helper();
@@ -785,7 +783,8 @@ int hibernate(void)
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
- __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+ Restore:
pm_restore_console();
hibernate_release();
Unlock:
@@ -804,7 +803,7 @@ int hibernate(void)
*/
int hibernate_quiet_exec(int (*func)(void *data), void *data)
{
- int error, nr_calls = 0;
+ int error;
lock_system_sleep();
@@ -815,11 +814,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
pm_prepare_console();
- error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
- if (error) {
- nr_calls--;
- goto exit;
- }
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
+ if (error)
+ goto restore;
error = freeze_processes();
if (error)
@@ -880,8 +877,9 @@ thaw:
thaw_processes();
exit:
- __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+restore:
pm_restore_console();
hibernate_release();
@@ -910,7 +908,7 @@ EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
*/
static int software_resume(void)
{
- int error, nr_calls = 0;
+ int error;
/*
* If the user said "noresume".. bail out early.
@@ -948,17 +946,6 @@ static int software_resume(void)
/* Check if the device is there */
swsusp_resume_device = name_to_dev_t(resume_file);
-
- /*
- * name_to_dev_t is ineffective to verify parition if resume_file is in
- * integer format. (e.g. major:minor)
- */
- if (isdigit(resume_file[0]) && resume_wait) {
- int partno;
- while (!get_gendisk(swsusp_resume_device, &partno))
- msleep(10);
- }
-
if (!swsusp_resume_device) {
/*
* Some device discovery might still be in progress; we need
@@ -997,11 +984,9 @@ static int software_resume(void)
pr_info("resume from hibernation\n");
pm_prepare_console();
- error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
- if (error) {
- nr_calls--;
- goto Close_Finish;
- }
+ error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
+ if (error)
+ goto Restore;
pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
@@ -1017,7 +1002,8 @@ static int software_resume(void)
error = load_image_and_restore();
thaw_processes();
Finish:
- __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
+ pm_notifier_call_chain(PM_POST_RESTORE);
+ Restore:
pm_restore_console();
pr_info("resume failed (%d)\n", error);
hibernate_release();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 40f86ec4ab30..0aefd6f57e0a 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -80,18 +80,18 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
+int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
int ret;
- ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
- nr_to_call, nr_calls);
+ ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL);
return notifier_to_errno(ret);
}
+
int pm_notifier_call_chain(unsigned long val)
{
- return __pm_notifier_call_chain(val, -1, NULL);
+ return blocking_notifier_call_chain(&pm_chain_head, val, NULL);
}
/* If set, devices may be suspended and resumed asynchronously. */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 32fc89ac96c3..24f12d534515 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -210,8 +210,7 @@ static inline void suspend_test_finish(const char *label) {}
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
-extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
- int *nr_calls);
+extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
extern int pm_notifier_call_chain(unsigned long val);
#endif
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d25749bce7cf..46b1804c1ddf 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -735,7 +735,7 @@ zone_found:
*/
/*
- * If the zone we wish to scan is the the current zone and the
+ * If the zone we wish to scan is the current zone and the
* pfn falls into the current node then we do not need to walk
* the tree.
*/
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8b1bb5ee7e5d..32391acc806b 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -342,18 +342,16 @@ static int suspend_test(int level)
*/
static int suspend_prepare(suspend_state_t state)
{
- int error, nr_calls = 0;
+ int error;
if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
- error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
- if (error) {
- nr_calls--;
- goto Finish;
- }
+ error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND);
+ if (error)
+ goto Restore;
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
@@ -363,8 +361,8 @@ static int suspend_prepare(suspend_state_t state)
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
- Finish:
- __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ Restore:
pm_restore_console();
return error;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 01e2858b5fe3..c73f2e295167 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -226,6 +226,7 @@ struct hib_bio_batch {
atomic_t count;
wait_queue_head_t wait;
blk_status_t error;
+ struct blk_plug plug;
};
static void hib_init_batch(struct hib_bio_batch *hb)
@@ -233,6 +234,12 @@ static void hib_init_batch(struct hib_bio_batch *hb)
atomic_set(&hb->count, 0);
init_waitqueue_head(&hb->wait);
hb->error = BLK_STS_OK;
+ blk_start_plug(&hb->plug);
+}
+
+static void hib_finish_batch(struct hib_bio_batch *hb)
+{
+ blk_finish_plug(&hb->plug);
}
static void hib_end_io(struct bio *bio)
@@ -294,6 +301,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
{
+ /*
+ * We are relying on the behavior of blk_plug that a thread with
+ * a plug will flush the plug list before sleeping.
+ */
wait_event(hb->wait, atomic_read(&hb->count) == 0);
return blk_status_to_errno(hb->error);
}
@@ -335,26 +346,23 @@ static int swsusp_swap_check(void)
{
int res;
- res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
- &hib_resume_bdev);
+ if (swsusp_resume_device)
+ res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
+ else
+ res = find_first_swap(&swsusp_resume_device);
if (res < 0)
return res;
-
root_swap = res;
- res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
- if (res)
- return res;
+
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE,
+ NULL);
+ if (IS_ERR(hib_resume_bdev))
+ return PTR_ERR(hib_resume_bdev);
res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
if (res < 0)
blkdev_put(hib_resume_bdev, FMODE_WRITE);
- /*
- * Update the resume device to the one actually used,
- * so the test_resume mode can use it in case it is
- * invoked from hibernate() to test the snapshot.
- */
- swsusp_resume_device = hib_resume_bdev->bd_dev;
return res;
}
@@ -561,6 +569,7 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_io(&hb);
+ hib_finish_batch(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -854,6 +863,7 @@ out_finish:
pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
+ hib_finish_batch(&hb);
if (crc) {
if (crc->thr)
kthread_stop(crc->thr);
@@ -1084,6 +1094,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_io(&hb);
+ hib_finish_batch(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -1447,6 +1458,7 @@ out_finish:
}
swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
+ hib_finish_batch(&hb);
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
if (crc) {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d5eedc2baa2a..740723bb3885 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -35,18 +35,18 @@ static struct snapshot_data {
bool ready;
bool platform_support;
bool free_bitmaps;
- struct inode *bd_inode;
+ dev_t dev;
} snapshot_state;
-int is_hibernate_resume_dev(const struct inode *bd_inode)
+int is_hibernate_resume_dev(dev_t dev)
{
- return hibernation_available() && snapshot_state.bd_inode == bd_inode;
+ return hibernation_available() && snapshot_state.dev == dev;
}
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- int error, nr_calls = 0;
+ int error;
if (!hibernation_available())
return -EPERM;
@@ -69,13 +69,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
/* Hibernating. The image device should be accessible. */
- data->swap = swsusp_resume_device ?
- swap_type_of(swsusp_resume_device, 0, NULL) : -1;
+ data->swap = swap_type_of(swsusp_resume_device, 0);
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
- if (error)
- __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
} else {
/*
* Resuming. We may need to wait for the image device to
@@ -85,15 +82,11 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = -1;
data->mode = O_WRONLY;
- error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+ error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
- } else
- nr_calls--;
-
- if (error)
- __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
+ }
}
if (error)
hibernate_release();
@@ -101,7 +94,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->frozen = false;
data->ready = false;
data->platform_support = false;
- data->bd_inode = NULL;
+ data->dev = 0;
Unlock:
unlock_system_sleep();
@@ -117,7 +110,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
swsusp_free();
data = filp->private_data;
- data->bd_inode = NULL;
+ data->dev = 0;
free_all_swap_pages(data->swap);
if (data->frozen) {
pm_restore_gfp_mask();
@@ -210,7 +203,6 @@ struct compat_resume_swap_area {
static int snapshot_set_swap_area(struct snapshot_data *data,
void __user *argp)
{
- struct block_device *bdev;
sector_t offset;
dev_t swdev;
@@ -237,16 +229,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data,
* User space encodes device types as two-byte values,
* so we need to recode them
*/
- if (!swdev) {
- data->swap = -1;
- return -EINVAL;
- }
- data->swap = swap_type_of(swdev, offset, &bdev);
+ data->swap = swap_type_of(swdev, offset);
if (data->swap < 0)
- return -ENODEV;
-
- data->bd_inode = bdev->bd_inode;
- bdput(bdev);
+ return swdev ? -ENODEV : -EINVAL;
+ data->dev = swdev;
return 0;
}
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4d052fc6bcde..eee3dc9b60a9 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -2,3 +2,4 @@
obj-y = printk.o
obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
+obj-$(CONFIG_PRINTK) += printk_ringbuffer.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 660f9a6bf73a..3a8fd491758c 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -14,9 +14,9 @@
extern raw_spinlock_t logbuf_lock;
-__printf(5, 0)
+__printf(4, 0)
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args);
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9b75f6bfc333..fe64a49344bf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
+#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"
@@ -294,30 +295,22 @@ enum con_msg_format_flags {
static int console_msg_format = MSG_FORMAT_DEFAULT;
/*
- * The printk log buffer consists of a chain of concatenated variable
- * length records. Every record starts with a record header, containing
- * the overall length of the record.
+ * The printk log buffer consists of a sequenced collection of records, each
+ * containing variable length message text. Every record also contains its
+ * own meta-data (@info).
*
- * The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these entries are maintained when messages are
- * stored.
+ * Every record meta-data carries the timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual kernel
+ * messages use LOG_KERN; userspace-injected messages always carry a matching
+ * syslog facility, by default LOG_USER. The origin of every message can be
+ * reliably determined that way.
*
- * If the heads indicate available messages, the length in the header
- * tells the start next message. A length == 0 for the next message
- * indicates a wrap-around to the beginning of the buffer.
+ * The human readable log message of a record is available in @text, the
+ * length of the message text in @text_len. The stored message is not
+ * terminated.
*
- * Every record carries the monotonic timestamp in microseconds, as well as
- * the standard userspace syslog level and syslog facility. The usual
- * kernel messages use LOG_KERN; userspace-injected messages always carry
- * a matching syslog facility, by default LOG_USER. The origin of every
- * message can be reliably determined that way.
- *
- * The human readable log message directly follows the message header. The
- * length of the message text is stored in the header, the stored message
- * is not terminated.
- *
- * Optionally, a message can carry a dictionary of properties (key/value pairs),
- * to provide userspace with a machine-readable message context.
+ * Optionally, a record can carry a dictionary of properties (key/value
+ * pairs), to provide userspace with a machine-readable message context.
*
* Examples for well-defined, commonly used property names are:
* DEVICE=b12:8 device identifier
@@ -327,25 +320,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* +sound:card0 subsystem:devname
* SUBSYSTEM=pci driver-core subsystem name
*
- * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
- * follows directly after a '=' character. Every property is terminated by
- * a '\0' character. The last property is not terminated.
- *
- * Example of a message structure:
- * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
- * 0008 34 00 record is 52 bytes long
- * 000a 0b 00 text is 11 bytes long
- * 000c 1f 00 dictionary is 23 bytes long
- * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
- * 0010 69 74 27 73 20 61 20 6c "it's a l"
- * 69 6e 65 "ine"
- * 001b 44 45 56 49 43 "DEVIC"
- * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
- * 52 49 56 45 52 3d 62 75 "RIVER=bu"
- * 67 "g"
- * 0032 00 00 00 padding to next message header
- *
- * The 'struct printk_log' buffer header must never be directly exported to
+ * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
+ * and values are terminated by a '\0' character.
+ *
+ * Example of record values:
+ * record.text_buf = "it's a line" (unterminated)
+ * record.info.seq = 56
+ * record.info.ts_nsec = 36863
+ * record.info.text_len = 11
+ * record.info.facility = 0 (LOG_KERN)
+ * record.info.flags = 0
+ * record.info.level = 3 (LOG_ERR)
+ * record.info.caller_id = 299 (task 299)
+ * record.info.dev_info.subsystem = "pci" (terminated)
+ * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated)
+ *
+ * The 'struct printk_info' buffer must never be directly exported to
* userspace, it is a kernel-private implementation detail that might
* need to be changed in the future, when the requirements change.
*
@@ -365,23 +355,6 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
-struct printk_log {
- u64 ts_nsec; /* timestamp in nanoseconds */
- u16 len; /* length of entire record */
- u16 text_len; /* length of text buffer */
- u16 dict_len; /* length of dictionary buffer */
- u8 facility; /* syslog facility */
- u8 flags:5; /* internal record flags */
- u8 level:3; /* syslog level */
-#ifdef CONFIG_PRINTK_CALLER
- u32 caller_id; /* thread id or processor id */
-#endif
-}
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-__packed __aligned(4)
-#endif
-;
-
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
* within the scheduler's rq lock. It must be released before calling
@@ -421,26 +394,16 @@ DEFINE_RAW_SPINLOCK(logbuf_lock);
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
-static u32 syslog_idx;
static size_t syslog_partial;
static bool syslog_time;
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
/* the next printk record to write to the console */
static u64 console_seq;
-static u32 console_idx;
static u64 exclusive_console_stop_seq;
+static unsigned long console_dropped;
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
-static u32 clear_idx;
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
@@ -453,7 +416,7 @@ static u32 clear_idx;
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#define LOG_ALIGN __alignof__(struct printk_log)
+#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
#define LOG_BUF_LEN_MAX (u32)(1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -461,6 +424,23 @@ static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
/*
+ * Define the average message size. This only affects the number of
+ * descriptors that will be available. Underestimating is better than
+ * overestimating (too many available descriptors is better than not enough).
+ */
+#define PRB_AVGBITS 5 /* 32 character average length */
+
+#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
+#error CONFIG_LOG_BUF_SHIFT value too small.
+#endif
+_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
+ PRB_AVGBITS, &__log_buf[0]);
+
+static struct printk_ringbuffer printk_rb_dynamic;
+
+static struct printk_ringbuffer *prb = &printk_rb_static;
+
+/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
* per_cpu_areas are initialised. This variable is set to true when
* it's safe to access per-CPU data.
@@ -484,108 +464,6 @@ u32 log_buf_len_get(void)
return log_buf_len;
}
-/* human readable text of the record */
-static char *log_text(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log);
-}
-
-/* optional key/value pair dictionary attached to the record */
-static char *log_dict(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log) + msg->text_len;
-}
-
-/* get record by index; idx must point to valid msg */
-static struct printk_log *log_from_idx(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer.
- */
- if (!msg->len)
- return (struct printk_log *)log_buf;
- return msg;
-}
-
-/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /* length == 0 indicates the end of the buffer; wrap */
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer as *this* one, and
- * return the one after that.
- */
- if (!msg->len) {
- msg = (struct printk_log *)log_buf;
- return msg->len;
- }
- return idx + msg->len;
-}
-
-/*
- * Check whether there is enough free space for the given message.
- *
- * The same values of first_idx and next_idx mean that the buffer
- * is either empty or full.
- *
- * If the buffer is empty, we must respect the position of the indexes.
- * They cannot be reset to the beginning of the buffer.
- */
-static int logbuf_has_space(u32 msg_size, bool empty)
-{
- u32 free;
-
- if (log_next_idx > log_first_idx || empty)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
-
- /*
- * We need space also for an empty header that signalizes wrapping
- * of the buffer.
- */
- return free >= msg_size + sizeof(struct printk_log);
-}
-
-static int log_make_free_space(u32 msg_size)
-{
- while (log_first_seq < log_next_seq &&
- !logbuf_has_space(msg_size, false)) {
- /* drop old messages until we have enough contiguous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
- }
-
- if (clear_seq < log_first_seq) {
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
- /* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
- return 0;
-
- return -ENOMEM;
-}
-
-/* compute the message size including the padding bytes */
-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
-{
- u32 size;
-
- size = sizeof(struct printk_log) + text_len + dict_len;
- *pad_len = (-size) & (LOG_ALIGN - 1);
- size += *pad_len;
-
- return size;
-}
-
/*
* Define how much of the log buffer we could take at maximum. The value
* must be greater than two. Note that only half of the buffer is available
@@ -594,84 +472,69 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";
-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
- u16 *dict_len, u32 *pad_len)
+static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
/*
* The message should not take the whole buffer. Otherwise, it might
* get removed too soon.
*/
u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+
if (*text_len > max_text_len)
*text_len = max_text_len;
- /* enable the warning message */
+
+ /* enable the warning message (if there is room) */
*trunc_msg_len = strlen(trunc_msg);
- /* disable the "dict" completely */
- *dict_len = 0;
- /* compute the size again, count also the warning message */
- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
+ if (*text_len >= *trunc_msg_len)
+ *text_len -= *trunc_msg_len;
+ else
+ *trunc_msg_len = 0;
}
/* insert record into the buffer, discard old ones, update heads */
static int log_store(u32 caller_id, int facility, int level,
enum log_flags flags, u64 ts_nsec,
- const char *dict, u16 dict_len,
+ const struct dev_printk_info *dev_info,
const char *text, u16 text_len)
{
- struct printk_log *msg;
- u32 size, pad_len;
+ struct prb_reserved_entry e;
+ struct printk_record r;
u16 trunc_msg_len = 0;
- /* number of '\0' padding bytes to next message */
- size = msg_used_size(text_len, dict_len, &pad_len);
+ prb_rec_init_wr(&r, text_len);
- if (log_make_free_space(size)) {
+ if (!prb_reserve(&e, prb, &r)) {
/* truncate the message if it is too long for empty buffer */
- size = truncate_msg(&text_len, &trunc_msg_len,
- &dict_len, &pad_len);
+ truncate_msg(&text_len, &trunc_msg_len);
+ prb_rec_init_wr(&r, text_len + trunc_msg_len);
/* survive when the log buffer is too small for trunc_msg */
- if (log_make_free_space(size))
+ if (!prb_reserve(&e, prb, &r))
return 0;
}
- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
- /*
- * This message + an additional empty header does not fit
- * at the end of the buffer. Add an empty header with len == 0
- * to signify a wrap around.
- */
- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
- log_next_idx = 0;
- }
-
/* fill message */
- msg = (struct printk_log *)(log_buf + log_next_idx);
- memcpy(log_text(msg), text, text_len);
- msg->text_len = text_len;
- if (trunc_msg_len) {
- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
- msg->text_len += trunc_msg_len;
- }
- memcpy(log_dict(msg), dict, dict_len);
- msg->dict_len = dict_len;
- msg->facility = facility;
- msg->level = level & 7;
- msg->flags = flags & 0x1f;
+ memcpy(&r.text_buf[0], text, text_len);
+ if (trunc_msg_len)
+ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
+ r.info->text_len = text_len + trunc_msg_len;
+ r.info->facility = facility;
+ r.info->level = level & 7;
+ r.info->flags = flags & 0x1f;
if (ts_nsec > 0)
- msg->ts_nsec = ts_nsec;
+ r.info->ts_nsec = ts_nsec;
else
- msg->ts_nsec = local_clock();
-#ifdef CONFIG_PRINTK_CALLER
- msg->caller_id = caller_id;
-#endif
- memset(log_dict(msg) + dict_len, 0, pad_len);
- msg->len = size;
+ r.info->ts_nsec = local_clock();
+ r.info->caller_id = caller_id;
+ if (dev_info)
+ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
/* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
+ if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE))
+ prb_commit(&e);
+ else
+ prb_final_commit(&e);
- return msg->text_len;
+ return (text_len + trunc_msg_len);
}
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
@@ -723,13 +586,13 @@ static void append_char(char **pp, char *e, char c)
*(*pp)++ = c;
}
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq)
+static ssize_t info_print_ext_header(char *buf, size_t size,
+ struct printk_info *info)
{
- u64 ts_usec = msg->ts_nsec;
+ u64 ts_usec = info->ts_nsec;
char caller[20];
#ifdef CONFIG_PRINTK_CALLER
- u32 id = msg->caller_id;
+ u32 id = info->caller_id;
snprintf(caller, sizeof(caller), ",caller=%c%u",
id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
@@ -740,13 +603,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
do_div(ts_usec, 1000);
return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
- (msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-', caller);
+ (info->facility << 3) | info->level, info->seq,
+ ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller);
}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len)
+static ssize_t msg_add_ext_text(char *buf, size_t size,
+ const char *text, size_t text_len,
+ unsigned char endc)
{
char *p = buf, *e = buf + size;
size_t i;
@@ -760,45 +623,56 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
else
append_char(&p, e, c);
}
- append_char(&p, e, '\n');
+ append_char(&p, e, endc);
- if (dict_len) {
- bool line = true;
+ return p - buf;
+}
- for (i = 0; i < dict_len; i++) {
- unsigned char c = dict[i];
+static ssize_t msg_add_dict_text(char *buf, size_t size,
+ const char *key, const char *val)
+{
+ size_t val_len = strlen(val);
+ ssize_t len;
- if (line) {
- append_char(&p, e, ' ');
- line = false;
- }
+ if (!val_len)
+ return 0;
- if (c == '\0') {
- append_char(&p, e, '\n');
- line = true;
- continue;
- }
+ len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */
+ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
+ len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
- if (c < ' ' || c >= 127 || c == '\\') {
- p += scnprintf(p, e - p, "\\x%02x", c);
- continue;
- }
+ return len;
+}
- append_char(&p, e, c);
- }
- append_char(&p, e, '\n');
- }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *text, size_t text_len,
+ struct dev_printk_info *dev_info)
+{
+ ssize_t len;
- return p - buf;
+ len = msg_add_ext_text(buf, size, text, text_len, '\n');
+
+ if (!dev_info)
+ goto out;
+
+ len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
+ dev_info->subsystem);
+ len += msg_add_dict_text(buf + len, size - len, "DEVICE",
+ dev_info->device);
+out:
+ return len;
}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
u64 seq;
- u32 idx;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
+
+ struct printk_info info;
+ char text_buf[CONSOLE_EXT_LOG_MAX];
+ struct printk_record record;
};
static __printf(3, 4) __cold
@@ -808,7 +682,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...)
int r;
va_start(args, fmt);
- r = vprintk_emit(facility, level, NULL, 0, fmt, args);
+ r = vprintk_emit(facility, level, NULL, fmt, args);
va_end(args);
return r;
@@ -881,7 +755,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_log *msg;
+ struct printk_record *r = &user->record;
size_t len;
ssize_t ret;
@@ -893,7 +767,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
return ret;
logbuf_lock_irq();
- while (user->seq == log_next_seq) {
+ if (!prb_read_valid(prb, user->seq, r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
logbuf_unlock_irq();
@@ -902,30 +776,26 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
logbuf_unlock_irq();
ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
+ prb_read_valid(prb, user->seq, r));
if (ret)
goto out;
logbuf_lock_irq();
}
- if (user->seq < log_first_seq) {
+ if (user->seq < prb_first_valid_seq(prb)) {
/* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->seq = prb_first_valid_seq(prb);
ret = -EPIPE;
logbuf_unlock_irq();
goto out;
}
- msg = log_from_idx(user->idx);
- len = msg_print_ext_header(user->buf, sizeof(user->buf),
- msg, user->seq);
+ len = info_print_ext_header(user->buf, sizeof(user->buf), r->info);
len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
+ &r->text_buf[0], r->info->text_len,
+ &r->info->dev_info);
- user->idx = log_next(user->idx);
- user->seq++;
+ user->seq = r->info->seq + 1;
logbuf_unlock_irq();
if (len > count) {
@@ -965,8 +835,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->seq = prb_first_valid_seq(prb);
break;
case SEEK_DATA:
/*
@@ -974,13 +843,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
user->seq = clear_seq;
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ user->seq = prb_next_seq(prb);
break;
default:
ret = -EINVAL;
@@ -1000,9 +867,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
logbuf_lock_irq();
- if (user->seq < log_next_seq) {
+ if (prb_read_valid(prb, user->seq, NULL)) {
/* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
+ if (user->seq < prb_first_valid_seq(prb))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
@@ -1037,9 +904,11 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
+ prb_rec_init_rd(&user->record, &user->info,
+ &user->text_buf[0], sizeof(user->text_buf));
+
logbuf_lock_irq();
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->seq = prb_first_valid_seq(prb);
logbuf_unlock_irq();
file->private_data = user;
@@ -1080,23 +949,58 @@ const struct file_operations kmsg_fops = {
*/
void log_buf_vmcoreinfo_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(clear_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
+ struct dev_printk_info *dev_info = NULL;
+
+ VMCOREINFO_SYMBOL(prb);
+ VMCOREINFO_SYMBOL(printk_rb_static);
+ VMCOREINFO_SYMBOL(clear_seq);
+
/*
- * Export struct printk_log size and field offsets. User space tools can
+ * Export struct size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
*/
- VMCOREINFO_STRUCT_SIZE(printk_log);
- VMCOREINFO_OFFSET(printk_log, ts_nsec);
- VMCOREINFO_OFFSET(printk_log, len);
- VMCOREINFO_OFFSET(printk_log, text_len);
- VMCOREINFO_OFFSET(printk_log, dict_len);
-#ifdef CONFIG_PRINTK_CALLER
- VMCOREINFO_OFFSET(printk_log, caller_id);
-#endif
+
+ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
+ VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, fail);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
+ VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
+ VMCOREINFO_OFFSET(prb_desc_ring, descs);
+ VMCOREINFO_OFFSET(prb_desc_ring, infos);
+ VMCOREINFO_OFFSET(prb_desc_ring, head_id);
+ VMCOREINFO_OFFSET(prb_desc_ring, tail_id);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc);
+ VMCOREINFO_OFFSET(prb_desc, state_var);
+ VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, next);
+
+ VMCOREINFO_STRUCT_SIZE(printk_info);
+ VMCOREINFO_OFFSET(printk_info, seq);
+ VMCOREINFO_OFFSET(printk_info, ts_nsec);
+ VMCOREINFO_OFFSET(printk_info, text_len);
+ VMCOREINFO_OFFSET(printk_info, caller_id);
+ VMCOREINFO_OFFSET(printk_info, dev_info);
+
+ VMCOREINFO_STRUCT_SIZE(dev_printk_info);
+ VMCOREINFO_OFFSET(dev_printk_info, subsystem);
+ VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
+ VMCOREINFO_OFFSET(dev_printk_info, device);
+ VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_ring);
+ VMCOREINFO_OFFSET(prb_data_ring, size_bits);
+ VMCOREINFO_OFFSET(prb_data_ring, data);
+ VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
+ VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);
+
+ VMCOREINFO_SIZE(atomic_long_t);
+ VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
}
#endif
@@ -1174,11 +1078,46 @@ static void __init set_percpu_data_ready(void)
__printk_percpu_data_ready = true;
}
+static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_reserved_entry e;
+ struct printk_record dest_r;
+
+ prb_rec_init_wr(&dest_r, r->info->text_len);
+
+ if (!prb_reserve(&e, rb, &dest_r))
+ return 0;
+
+ memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len);
+ dest_r.info->text_len = r->info->text_len;
+ dest_r.info->facility = r->info->facility;
+ dest_r.info->level = r->info->level;
+ dest_r.info->flags = r->info->flags;
+ dest_r.info->ts_nsec = r->info->ts_nsec;
+ dest_r.info->caller_id = r->info->caller_id;
+ memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info));
+
+ prb_final_commit(&e);
+
+ return prb_record_text_space(&e);
+}
+
+static char setup_text_buf[LOG_LINE_MAX] __initdata;
+
void __init setup_log_buf(int early)
{
+ struct printk_info *new_infos;
+ unsigned int new_descs_count;
+ struct prb_desc *new_descs;
+ struct printk_info info;
+ struct printk_record r;
+ size_t new_descs_size;
+ size_t new_infos_size;
unsigned long flags;
char *new_log_buf;
unsigned int free;
+ u64 seq;
/*
* Some archs call setup_log_buf() multiple times - first is very
@@ -1197,24 +1136,75 @@ void __init setup_log_buf(int early)
if (!new_log_buf_len)
return;
+ new_descs_count = new_log_buf_len >> PRB_AVGBITS;
+ if (new_descs_count == 0) {
+ pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
+ return;
+ }
+
new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
- pr_err("log_buf_len: %lu bytes not available\n",
- new_log_buf_len);
+ pr_err("log_buf_len: %lu text bytes not available\n",
+ new_log_buf_len);
return;
}
+ new_descs_size = new_descs_count * sizeof(struct prb_desc);
+ new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
+ if (unlikely(!new_descs)) {
+ pr_err("log_buf_len: %zu desc bytes not available\n",
+ new_descs_size);
+ goto err_free_log_buf;
+ }
+
+ new_infos_size = new_descs_count * sizeof(struct printk_info);
+ new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
+ if (unlikely(!new_infos)) {
+ pr_err("log_buf_len: %zu info bytes not available\n",
+ new_infos_size);
+ goto err_free_descs;
+ }
+
+ prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));
+
+ prb_init(&printk_rb_dynamic,
+ new_log_buf, ilog2(new_log_buf_len),
+ new_descs, ilog2(new_descs_count),
+ new_infos);
+
logbuf_lock_irqsave(flags);
+
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_next_idx;
- memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+
+ free = __LOG_BUF_LEN;
+ prb_for_each_record(0, &printk_rb_static, seq, &r)
+ free -= add_to_rb(&printk_rb_dynamic, &r);
+
+ /*
+ * This is early enough that everything is still running on the
+ * boot CPU and interrupts are disabled. So no new messages will
+ * appear during the transition to the dynamic buffer.
+ */
+ prb = &printk_rb_dynamic;
+
logbuf_unlock_irqrestore(flags);
+ if (seq != prb_next_seq(&printk_rb_static)) {
+ pr_err("dropped %llu messages\n",
+ prb_next_seq(&printk_rb_static) - seq);
+ }
+
pr_info("log_buf_len: %u bytes\n", log_buf_len);
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
+ return;
+
+err_free_descs:
+ memblock_free(__pa(new_descs), new_descs_size);
+err_free_log_buf:
+ memblock_free(__pa(new_log_buf), new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
@@ -1321,18 +1311,18 @@ static size_t print_caller(u32 id, char *buf)
#define print_caller(id, buf) 0
#endif
-static size_t print_prefix(const struct printk_log *msg, bool syslog,
- bool time, char *buf)
+static size_t info_print_prefix(const struct printk_info *info, bool syslog,
+ bool time, char *buf)
{
size_t len = 0;
if (syslog)
- len = print_syslog((msg->facility << 3) | msg->level, buf);
+ len = print_syslog((info->facility << 3) | info->level, buf);
if (time)
- len += print_time(msg->ts_nsec, buf + len);
+ len += print_time(info->ts_nsec, buf + len);
- len += print_caller(msg->caller_id, buf + len);
+ len += print_caller(info->caller_id, buf + len);
if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
buf[len++] = ' ';
@@ -1342,72 +1332,150 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size)
+/*
+ * Prepare the record for printing. The text is shifted within the given
+ * buffer to avoid a need for another one. The following operations are
+ * done:
+ *
+ * - Add prefix for each line.
+ * - Add the trailing newline that has been removed in vprintk_store().
+ * - Drop truncated lines that do not longer fit into the buffer.
+ *
+ * Return: The length of the updated/prepared text, including the added
+ * prefixes and the newline. The dropped line(s) are not counted.
+ */
+static size_t record_print_text(struct printk_record *r, bool syslog,
+ bool time)
{
- const char *text = log_text(msg);
- size_t text_size = msg->text_len;
- size_t len = 0;
+ size_t text_len = r->info->text_len;
+ size_t buf_size = r->text_buf_size;
+ char *text = r->text_buf;
char prefix[PREFIX_MAX];
- const size_t prefix_len = print_prefix(msg, syslog, time, prefix);
+ bool truncated = false;
+ size_t prefix_len;
+ size_t line_len;
+ size_t len = 0;
+ char *next;
+
+ /*
+ * If the message was truncated because the buffer was not large
+ * enough, treat the available text as if it were the full text.
+ */
+ if (text_len > buf_size)
+ text_len = buf_size;
- do {
- const char *next = memchr(text, '\n', text_size);
- size_t text_len;
+ prefix_len = info_print_prefix(r->info, syslog, time, prefix);
+ /*
+ * @text_len: bytes of unprocessed text
+ * @line_len: bytes of current line _without_ newline
+ * @text: pointer to beginning of current line
+ * @len: number of bytes prepared in r->text_buf
+ */
+ for (;;) {
+ next = memchr(text, '\n', text_len);
if (next) {
- text_len = next - text;
- next++;
- text_size -= next - text;
+ line_len = next - text;
} else {
- text_len = text_size;
+ /* Drop truncated line(s). */
+ if (truncated)
+ break;
+ line_len = text_len;
}
- if (buf) {
- if (prefix_len + text_len + 1 >= size - len)
+ /*
+ * Truncate the text if there is not enough space to add the
+ * prefix and a trailing newline.
+ */
+ if (len + prefix_len + text_len + 1 > buf_size) {
+ /* Drop even the current line if no space. */
+ if (len + prefix_len + line_len + 1 > buf_size)
break;
- memcpy(buf + len, prefix, prefix_len);
- len += prefix_len;
- memcpy(buf + len, text, text_len);
- len += text_len;
- buf[len++] = '\n';
- } else {
- /* SYSLOG_ACTION_* buffer size only calculation */
- len += prefix_len + text_len + 1;
+ text_len = buf_size - len - prefix_len - 1;
+ truncated = true;
}
- text = next;
- } while (text);
+ memmove(text + prefix_len, text, text_len);
+ memcpy(text, prefix, prefix_len);
+
+ len += prefix_len + line_len + 1;
+
+ if (text_len == line_len) {
+ /*
+ * Add the trailing newline removed in
+ * vprintk_store().
+ */
+ text[prefix_len + line_len] = '\n';
+ break;
+ }
+
+ /*
+ * Advance beyond the added prefix and the related line with
+ * its newline.
+ */
+ text += prefix_len + line_len + 1;
+
+ /*
+ * The remaining text has only decreased by the line with its
+ * newline.
+ *
+ * Note that @text_len can become zero. It happens when @text
+ * ended with a newline (either due to truncation or the
+ * original string ending with "\n\n"). The loop is correctly
+ * repeated and (if not truncated) an empty line with a prefix
+ * will be prepared.
+ */
+ text_len -= line_len + 1;
+ }
return len;
}
+static size_t get_record_print_text_size(struct printk_info *info,
+ unsigned int line_count,
+ bool syslog, bool time)
+{
+ char prefix[PREFIX_MAX];
+ size_t prefix_len;
+
+ prefix_len = info_print_prefix(info, syslog, time, prefix);
+
+ /*
+ * Each line will be preceded with a prefix. The intermediate
+ * newlines are already within the text, but a final trailing
+ * newline will be added.
+ */
+ return ((prefix_len * line_count) + info->text_len + 1);
+}
+
static int syslog_print(char __user *buf, int size)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
- struct printk_log *msg;
int len = 0;
text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+
while (size > 0) {
size_t n;
size_t skip;
logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
- }
- if (syslog_seq == log_next_seq) {
+ if (!prb_read_valid(prb, syslog_seq, &r)) {
logbuf_unlock_irq();
break;
}
+ if (r.info->seq != syslog_seq) {
+ /* message is gone, move to next valid one */
+ syslog_seq = r.info->seq;
+ syslog_partial = 0;
+ }
/*
* To keep reading/counting partial line consistent,
@@ -1417,13 +1485,10 @@ static int syslog_print(char __user *buf, int size)
syslog_time = printk_time;
skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, true, syslog_time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ n = record_print_text(&r, true, syslog_time);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
+ syslog_seq = r.info->seq + 1;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
@@ -1454,11 +1519,12 @@ static int syslog_print(char __user *buf, int size)
static int syslog_print_all(char __user *buf, int size, bool clear)
{
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
char *text;
int len = 0;
- u64 next_seq;
u64 seq;
- u32 idx;
bool time;
text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
@@ -1471,38 +1537,28 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- seq = clear_seq;
- idx = clear_idx;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ prb_for_each_info(clear_seq, prb, seq, &info, &line_count)
+ len += get_record_print_text_size(&info, line_count, true, time);
/* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ prb_for_each_info(clear_seq, prb, seq, &info, &line_count) {
+ if (len <= size)
+ break;
+ len -= get_record_print_text_size(&info, line_count, true, time);
}
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen = msg_print_text(msg, true, time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ prb_for_each_record(seq, prb, seq, &r) {
+ int textlen;
- idx = log_next(idx);
- seq++;
+ textlen = record_print_text(&r, true, time);
+
+ if (len + textlen > size) {
+ seq--;
+ break;
+ }
logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
@@ -1511,17 +1567,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
len += textlen;
logbuf_lock_irq();
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- }
+ if (len < 0)
+ break;
}
- if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- }
+ if (clear)
+ clear_seq = seq;
logbuf_unlock_irq();
kfree(text);
@@ -1531,8 +1582,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
logbuf_lock_irq();
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
+ clear_seq = prb_next_seq(prb);
logbuf_unlock_irq();
}
@@ -1559,7 +1609,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
if (!access_ok(buf, len))
return -EFAULT;
error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
+ prb_read_valid(prb, syslog_seq, NULL));
if (error)
return error;
error = syslog_print(buf, len);
@@ -1567,7 +1617,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
clear = true;
- /* FALL THRU */
+ fallthrough;
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
if (!buf || len < 0)
@@ -1608,10 +1658,9 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
+ if (syslog_seq < prb_first_valid_seq(prb)) {
/* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
+ syslog_seq = prb_first_valid_seq(prb);
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
@@ -1620,20 +1669,18 @@ int do_syslog(int type, char __user *buf, int len, int source)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_seq - syslog_seq;
+ error = prb_next_seq(prb) - syslog_seq;
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
bool time = syslog_partial ? syslog_time : printk_time;
-
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- error += msg_print_text(msg, true, time, NULL,
- 0);
+ struct printk_info info;
+ unsigned int line_count;
+ u64 seq;
+
+ prb_for_each_info(syslog_seq, prb, seq, &info,
+ &line_count) {
+ error += get_record_print_text_size(&info, line_count,
+ true, time);
time = printk_time;
- idx = log_next(idx);
- seq++;
}
error -= syslog_partial;
}
@@ -1804,10 +1851,22 @@ static int console_trylock_spinning(void)
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len)
{
+ static char dropped_text[64];
+ size_t dropped_len = 0;
struct console *con;
trace_console_rcuidle(text, len);
+ if (!console_drivers)
+ return;
+
+ if (console_dropped) {
+ dropped_len = snprintf(dropped_text, sizeof(dropped_text),
+ "** %lu printk messages dropped **\n",
+ console_dropped);
+ console_dropped = 0;
+ }
+
for_each_console(con) {
if (exclusive_console && con != exclusive_console)
continue;
@@ -1820,8 +1879,11 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
continue;
if (con->flags & CON_EXTENDED)
con->write(con, ext_text, ext_len);
- else
+ else {
+ if (dropped_len)
+ con->write(con, dropped_text, dropped_len);
con->write(con, text, len);
+ }
}
}
@@ -1845,97 +1907,38 @@ static inline u32 printk_caller_id(void)
0x80000000 + raw_smp_processor_id();
}
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
- char buf[LOG_LINE_MAX];
- size_t len; /* length == 0 means unused buffer */
- u32 caller_id; /* printk_caller_id() of first print */
- u64 ts_nsec; /* time of first print */
- u8 level; /* log level of first message */
- u8 facility; /* log facility of first message */
- enum log_flags flags; /* prefix, newline flags */
-} cont;
-
-static void cont_flush(void)
-{
- if (cont.len == 0)
- return;
-
- log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.len = 0;
-}
-
-static bool cont_add(u32 caller_id, int facility, int level,
- enum log_flags flags, const char *text, size_t len)
-{
- /* If the line gets too long, split it up in separate records. */
- if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
- return false;
- }
-
- if (!cont.len) {
- cont.facility = facility;
- cont.level = level;
- cont.caller_id = caller_id;
- cont.ts_nsec = local_clock();
- cont.flags = flags;
- }
-
- memcpy(cont.buf + cont.len, text, len);
- cont.len += len;
-
- // The original flags come from the first line,
- // but later continuations can add a newline.
- if (flags & LOG_NEWLINE) {
- cont.flags |= LOG_NEWLINE;
- cont_flush();
- }
-
- return true;
-}
-
-static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
+static size_t log_output(int facility, int level, enum log_flags lflags,
+ const struct dev_printk_info *dev_info,
+ char *text, size_t text_len)
{
const u32 caller_id = printk_caller_id();
- /*
- * If an earlier line was buffered, and we're a continuation
- * write from the same context, try to add it to the buffer.
- */
- if (cont.len) {
- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
- }
- /* Otherwise, make sure it's flushed */
- cont_flush();
- }
-
- /* Skip empty continuation lines that couldn't be added - they just flush */
- if (!text_len && (lflags & LOG_CONT))
- return 0;
-
- /* If it doesn't end in a newline, try to buffer the current line */
- if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
+ if (lflags & LOG_CONT) {
+ struct prb_reserved_entry e;
+ struct printk_record r;
+
+ prb_rec_init_wr(&r, text_len);
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ memcpy(&r.text_buf[r.info->text_len], text, text_len);
+ r.info->text_len += text_len;
+ if (lflags & LOG_NEWLINE) {
+ r.info->flags |= LOG_NEWLINE;
+ prb_final_commit(&e);
+ } else {
+ prb_commit(&e);
+ }
return text_len;
+ }
}
/* Store it in the record log */
return log_store(caller_id, facility, level, lflags, 0,
- dict, dictlen, text, text_len);
+ dev_info, text, text_len);
}
/* Must be called under logbuf_lock. */
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
static char textbuf[LOG_LINE_MAX];
@@ -1977,21 +1980,19 @@ int vprintk_store(int facility, int level,
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
- if (dict)
+ if (dev_info)
lflags |= LOG_NEWLINE;
- return log_output(facility, level, lflags,
- dict, dictlen, text, text_len);
+ return log_output(facility, level, lflags, dev_info, text, text_len);
}
asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
int printed_len;
- bool in_sched = false, pending_output;
+ bool in_sched = false;
unsigned long flags;
- u64 curr_log_seq;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
@@ -2007,13 +2008,11 @@ asmlinkage int vprintk_emit(int facility, int level,
/* This stops the holder of console_sem just where we want him */
logbuf_lock_irqsave(flags);
- curr_log_seq = log_next_seq;
- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args);
- pending_output = (curr_log_seq != log_next_seq);
+ printed_len = vprintk_store(facility, level, dev_info, fmt, args);
logbuf_unlock_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
- if (!in_sched && pending_output) {
+ if (!in_sched) {
/*
* Disable preemption to avoid being preempted while holding
* console_sem which would prevent anyone from printing to
@@ -2030,8 +2029,7 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
- if (pending_output)
- wake_up_klogd();
+ wake_up_klogd();
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2044,7 +2042,7 @@ EXPORT_SYMBOL(vprintk);
int vprintk_default(const char *fmt, va_list args)
{
- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
@@ -2088,30 +2086,31 @@ EXPORT_SYMBOL(printk);
#define PREFIX_MAX 0
#define printk_time false
+#define prb_read_valid(rb, seq, r) false
+#define prb_first_valid_seq(rb) 0
+
static u64 syslog_seq;
-static u32 syslog_idx;
static u64 console_seq;
-static u32 console_idx;
static u64 exclusive_console_stop_seq;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static char *log_text(const struct printk_log *msg) { return NULL; }
-static char *log_dict(const struct printk_log *msg) { return NULL; }
-static struct printk_log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg,
- u64 seq) { return 0; }
+static unsigned long console_dropped;
+
+static size_t record_print_text(const struct printk_record *r,
+ bool syslog, bool time)
+{
+ return 0;
+}
+static ssize_t info_print_ext_header(char *buf, size_t size,
+ struct printk_info *info)
+{
+ return 0;
+}
static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len) { return 0; }
+ char *text, size_t text_len,
+ struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(void) { return 0; }
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
#endif /* CONFIG_PRINTK */
@@ -2398,12 +2397,16 @@ void console_unlock(void)
static char text[LOG_LINE_MAX + PREFIX_MAX];
unsigned long flags;
bool do_cond_resched, retry;
+ struct printk_info info;
+ struct printk_record r;
if (console_suspended) {
up_console_sem();
return;
}
+ prb_rec_init_rd(&r, &info, text, sizeof(text));
+
/*
* Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
@@ -2416,7 +2419,7 @@ void console_unlock(void)
*
* console_trylock() is not able to detect the preemptive
* context reliably. Therefore the value must be stored before
- * and cleared after the the "again" goto label.
+ * and cleared after the "again" goto label.
*/
do_cond_resched = console_may_schedule;
again:
@@ -2434,35 +2437,26 @@ again:
}
for (;;) {
- struct printk_log *msg;
size_t ext_len = 0;
size_t len;
printk_safe_enter_irqsave(flags);
raw_spin_lock(&logbuf_lock);
- if (console_seq < log_first_seq) {
- len = snprintf(text, sizeof(text),
- "** %llu printk messages dropped **\n",
- log_first_seq - console_seq);
-
- /* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- } else {
- len = 0;
- }
skip:
- if (console_seq == log_next_seq)
+ if (!prb_read_valid(prb, console_seq, &r))
break;
- msg = log_from_idx(console_idx);
- if (suppress_message_printing(msg->level)) {
+ if (console_seq != r.info->seq) {
+ console_dropped += r.info->seq - console_seq;
+ console_seq = r.info->seq;
+ }
+
+ if (suppress_message_printing(r.info->level)) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it, and
* record that has level above the console loglevel.
*/
- console_idx = log_next(console_idx);
console_seq++;
goto skip;
}
@@ -2473,19 +2467,23 @@ skip:
exclusive_console = NULL;
}
- len += msg_print_text(msg,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time, text + len, sizeof(text) - len);
+ /*
+ * Handle extended console text first because later
+ * record_print_text() will modify the record buffer in-place.
+ */
if (nr_ext_console_drivers) {
- ext_len = msg_print_ext_header(ext_text,
+ ext_len = info_print_ext_header(ext_text,
sizeof(ext_text),
- msg, console_seq);
+ r.info);
ext_len += msg_print_ext_body(ext_text + ext_len,
sizeof(ext_text) - ext_len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
+ &r.text_buf[0],
+ r.info->text_len,
+ &r.info->dev_info);
}
- console_idx = log_next(console_idx);
+ len = record_print_text(&r,
+ console_msg_format & MSG_FORMAT_SYSLOG,
+ printk_time);
console_seq++;
raw_spin_unlock(&logbuf_lock);
@@ -2525,7 +2523,7 @@ skip:
* flush, no worries.
*/
raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
+ retry = prb_read_valid(prb, console_seq, NULL);
raw_spin_unlock(&logbuf_lock);
printk_safe_exit_irqrestore(flags);
@@ -2594,8 +2592,7 @@ void console_flush_on_panic(enum con_flush_mode mode)
unsigned long flags;
logbuf_lock_irqsave(flags);
- console_seq = log_first_seq;
- console_idx = log_first_idx;
+ console_seq = prb_first_valid_seq(prb);
logbuf_unlock_irqrestore(flags);
}
console_unlock();
@@ -2838,7 +2835,6 @@ void register_console(struct console *newcon)
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
console_seq = syslog_seq;
- console_idx = syslog_idx;
logbuf_unlock_irqrestore(flags);
}
console_unlock();
@@ -3062,7 +3058,7 @@ int vprintk_deferred(const char *fmt, va_list args)
{
int r;
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
defer_console_output();
return r;
@@ -3227,9 +3223,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
logbuf_lock_irqsave(flags);
dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ dumper->next_seq = prb_next_seq(prb);
logbuf_unlock_irqrestore(flags);
/* invoke dumper which will iterate over records */
@@ -3263,28 +3257,33 @@ void kmsg_dump(enum kmsg_dump_reason reason)
bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
char *line, size_t size, size_t *len)
{
- struct printk_log *msg;
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
size_t l = 0;
bool ret = false;
+ prb_rec_init_rd(&r, &info, line, size);
+
if (!dumper->active)
goto out;
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
- }
-
- /* last entry */
- if (dumper->cur_seq >= log_next_seq)
- goto out;
+ /* Read text or count text lines? */
+ if (line) {
+ if (!prb_read_valid(prb, dumper->cur_seq, &r))
+ goto out;
+ l = record_print_text(&r, syslog, printk_time);
+ } else {
+ if (!prb_read_valid_info(prb, dumper->cur_seq,
+ &info, &line_count)) {
+ goto out;
+ }
+ l = get_record_print_text_size(&info, line_count, syslog,
+ printk_time);
- msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, syslog, printk_time, line, size);
+ }
- dumper->cur_idx = log_next(dumper->cur_idx);
- dumper->cur_seq++;
+ dumper->cur_seq = r.info->seq + 1;
ret = true;
out:
if (len)
@@ -3332,7 +3331,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* @len: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
- * with as many of the the *youngest* kmsg records that fit into it.
+ * with as many of the *youngest* kmsg records that fit into it.
* If the buffer is large enough, all available kmsg records will be
* copied with a single call.
*
@@ -3345,23 +3344,25 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
char *buf, size_t size, size_t *len)
{
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
unsigned long flags;
u64 seq;
- u32 idx;
u64 next_seq;
- u32 next_idx;
size_t l = 0;
bool ret = false;
bool time = printk_time;
- if (!dumper->active)
+ prb_rec_init_rd(&r, &info, buf, size);
+
+ if (!dumper->active || !buf || !size)
goto out;
logbuf_lock_irqsave(flags);
- if (dumper->cur_seq < log_first_seq) {
+ if (dumper->cur_seq < prb_first_valid_seq(prb)) {
/* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ dumper->cur_seq = prb_first_valid_seq(prb);
}
/* last entry */
@@ -3372,41 +3373,41 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* calculate length of entire buffer */
seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- l += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ while (prb_read_valid_info(prb, seq, &info, &line_count)) {
+ if (r.info->seq >= dumper->next_seq)
+ break;
+ l += get_record_print_text_size(&info, line_count, true, time);
+ seq = r.info->seq + 1;
}
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (l >= size && seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- l -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ while (l >= size && prb_read_valid_info(prb, seq,
+ &info, &line_count)) {
+ if (r.info->seq >= dumper->next_seq)
+ break;
+ l -= get_record_print_text_size(&info, line_count, true, time);
+ seq = r.info->seq + 1;
}
/* last message in next interation */
next_seq = seq;
- next_idx = idx;
+ /* actually read text into the buffer now */
l = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ while (prb_read_valid(prb, seq, &r)) {
+ if (r.info->seq >= dumper->next_seq)
+ break;
+
+ l += record_print_text(&r, syslog, time);
+
+ /* adjust record to store to remaining buffer space */
+ prb_rec_init_rd(&r, &info, buf + l, size - l);
- l += msg_print_text(msg, syslog, time, buf + l, size - l);
- idx = log_next(idx);
- seq++;
+ seq = r.info->seq + 1;
}
dumper->next_seq = next_seq;
- dumper->next_idx = next_idx;
ret = true;
logbuf_unlock_irqrestore(flags);
out:
@@ -3429,9 +3430,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ dumper->next_seq = prb_next_seq(prb);
}
/**
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
new file mode 100644
index 000000000000..2493348a1631
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.c
@@ -0,0 +1,2083 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include "printk_ringbuffer.h"
+
+/**
+ * DOC: printk_ringbuffer overview
+ *
+ * Data Structure
+ * --------------
+ * The printk_ringbuffer is made up of 3 internal ringbuffers:
+ *
+ * desc_ring
+ * A ring of descriptors and their meta data (such as sequence number,
+ * timestamp, loglevel, etc.) as well as internal state information about
+ * the record and logical positions specifying where in the other
+ * ringbuffer the text strings are located.
+ *
+ * text_data_ring
+ * A ring of data blocks. A data block consists of an unsigned long
+ * integer (ID) that maps to a desc_ring index followed by the text
+ * string of the record.
+ *
+ * The internal state information of a descriptor is the key element to allow
+ * readers and writers to locklessly synchronize access to the data.
+ *
+ * Implementation
+ * --------------
+ *
+ * Descriptor Ring
+ * ~~~~~~~~~~~~~~~
+ * The descriptor ring is an array of descriptors. A descriptor contains
+ * essential meta data to track the data of a printk record using
+ * blk_lpos structs pointing to associated text data blocks (see
+ * "Data Rings" below). Each descriptor is assigned an ID that maps
+ * directly to index values of the descriptor array and has a state. The ID
+ * and the state are bitwise combined into a single descriptor field named
+ * @state_var, allowing ID and state to be synchronously and atomically
+ * updated.
+ *
+ * Descriptors have four states:
+ *
+ * reserved
+ * A writer is modifying the record.
+ *
+ * committed
+ * The record and all its data are written. A writer can reopen the
+ * descriptor (transitioning it back to reserved), but in the committed
+ * state the data is consistent.
+ *
+ * finalized
+ * The record and all its data are complete and available for reading. A
+ * writer cannot reopen the descriptor.
+ *
+ * reusable
+ * The record exists, but its text and/or meta data may no longer be
+ * available.
+ *
+ * Querying the @state_var of a record requires providing the ID of the
+ * descriptor to query. This can yield a possible fifth (pseudo) state:
+ *
+ * miss
+ * The descriptor being queried has an unexpected ID.
+ *
+ * The descriptor ring has a @tail_id that contains the ID of the oldest
+ * descriptor and @head_id that contains the ID of the newest descriptor.
+ *
+ * When a new descriptor should be created (and the ring is full), the tail
+ * descriptor is invalidated by first transitioning to the reusable state and
+ * then invalidating all tail data blocks up to and including the data blocks
+ * associated with the tail descriptor (for the text ring). Then
+ * @tail_id is advanced, followed by advancing @head_id. And finally the
+ * @state_var of the new descriptor is initialized to the new ID and reserved
+ * state.
+ *
+ * The @tail_id can only be advanced if the new @tail_id would be in the
+ * committed or reusable queried state. This makes it possible that a valid
+ * sequence number of the tail is always available.
+ *
+ * Descriptor Finalization
+ * ~~~~~~~~~~~~~~~~~~~~~~~
+ * When a writer calls the commit function prb_commit(), record data is
+ * fully stored and is consistent within the ringbuffer. However, a writer can
+ * reopen that record, claiming exclusive access (as with prb_reserve()), and
+ * modify that record. When finished, the writer must again commit the record.
+ *
+ * In order for a record to be made available to readers (and also become
+ * recyclable for writers), it must be finalized. A finalized record cannot be
+ * reopened and can never become "unfinalized". Record finalization can occur
+ * in three different scenarios:
+ *
+ * 1) A writer can simultaneously commit and finalize its record by calling
+ * prb_final_commit() instead of prb_commit().
+ *
+ * 2) When a new record is reserved and the previous record has been
+ * committed via prb_commit(), that previous record is automatically
+ * finalized.
+ *
+ * 3) When a record is committed via prb_commit() and a newer record
+ * already exists, the record being committed is automatically finalized.
+ *
+ * Data Ring
+ * ~~~~~~~~~
+ * The text data ring is a byte array composed of data blocks. Data blocks are
+ * referenced by blk_lpos structs that point to the logical position of the
+ * beginning of a data block and the beginning of the next adjacent data
+ * block. Logical positions are mapped directly to index values of the byte
+ * array ringbuffer.
+ *
+ * Each data block consists of an ID followed by the writer data. The ID is
+ * the identifier of a descriptor that is associated with the data block. A
+ * given data block is considered valid if all of the following conditions
+ * are met:
+ *
+ * 1) The descriptor associated with the data block is in the committed
+ * or finalized queried state.
+ *
+ * 2) The blk_lpos struct within the descriptor associated with the data
+ * block references back to the same data block.
+ *
+ * 3) The data block is within the head/tail logical position range.
+ *
+ * If the writer data of a data block would extend beyond the end of the
+ * byte array, only the ID of the data block is stored at the logical
+ * position and the full data block (ID and writer data) is stored at the
+ * beginning of the byte array. The referencing blk_lpos will point to the
+ * ID before the wrap and the next data block will be at the logical
+ * position adjacent the full data block after the wrap.
+ *
+ * Data rings have a @tail_lpos that points to the beginning of the oldest
+ * data block and a @head_lpos that points to the logical position of the
+ * next (not yet existing) data block.
+ *
+ * When a new data block should be created (and the ring is full), tail data
+ * blocks will first be invalidated by putting their associated descriptors
+ * into the reusable state and then pushing the @tail_lpos forward beyond
+ * them. Then the @head_lpos is pushed forward and is associated with a new
+ * descriptor. If a data block is not valid, the @tail_lpos cannot be
+ * advanced beyond it.
+ *
+ * Info Array
+ * ~~~~~~~~~~
+ * The general meta data of printk records are stored in printk_info structs,
+ * stored in an array with the same number of elements as the descriptor ring.
+ * Each info corresponds to the descr