aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c11
-rw-r--r--kernel/audit.c12
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/audit_fsnotify.c1
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditsc.c41
-rw-r--r--kernel/bounds.c7
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arraymap.c77
-rw-r--r--kernel/bpf/bloom_filter.c2
-rw-r--r--kernel/bpf/bpf_iter.c33
-rw-r--r--kernel/bpf/bpf_local_storage.c10
-rw-r--r--kernel/bpf/bpf_lsm.c91
-rw-r--r--kernel/bpf/bpf_struct_ops.c10
-rw-r--r--kernel/bpf/bpf_task_storage.c8
-rw-r--r--kernel/bpf/btf.c631
-rw-r--r--kernel/bpf/cgroup.c579
-rw-r--r--kernel/bpf/cgroup_iter.c282
-rw-r--r--kernel/bpf/core.c158
-rw-r--r--kernel/bpf/cpumap.c6
-rw-r--r--kernel/bpf/devmap.c12
-rw-r--r--kernel/bpf/dispatcher.c27
-rw-r--r--kernel/bpf/hashtab.c218
-rw-r--r--kernel/bpf/helpers.c144
-rw-r--r--kernel/bpf/local_storage.c7
-rw-r--r--kernel/bpf/lpm_trie.c6
-rw-r--r--kernel/bpf/memalloc.c635
-rw-r--r--kernel/bpf/offload.c6
-rw-r--r--kernel/bpf/percpu_freelist.c60
-rw-r--r--kernel/bpf/preload/iterators/Makefile10
-rw-r--r--kernel/bpf/queue_stack_maps.c2
-rw-r--r--kernel/bpf/reuseport_array.c9
-rw-r--r--kernel/bpf/ringbuf.c253
-rw-r--r--kernel/bpf/stackmap.c4
-rw-r--r--kernel/bpf/syscall.c136
-rw-r--r--kernel/bpf/task_iter.c234
-rw-r--r--kernel/bpf/trampoline.c497
-rw-r--r--kernel/bpf/verifier.c1086
-rw-r--r--kernel/cfi.c342
-rw-r--r--kernel/cgroup/cgroup-internal.h4
-rw-r--r--kernel/cgroup/cgroup-v1.c21
-rw-r--r--kernel/cgroup/cgroup.c579
-rw-r--r--kernel/cgroup/cpuset.c820
-rw-r--r--kernel/cgroup/legacy_freezer.c23
-rw-r--r--kernel/cgroup/pids.c37
-rw-r--r--kernel/cgroup/rstat.c92
-rw-r--r--kernel/configs/android-base.config1
-rw-r--r--kernel/configs/rust.config1
-rw-r--r--kernel/configs/x86_debug.config3
-rw-r--r--kernel/configs/xen.config1
-rw-r--r--kernel/context_tracking.c617
-rw-r--r--kernel/cpu_pm.c8
-rw-r--r--kernel/crash_core.c29
-rw-r--r--kernel/debug/debug_core.c12
-rw-r--r--kernel/delayacct.c13
-rw-r--r--kernel/dma/coherent.c10
-rw-r--r--kernel/dma/debug.c8
-rw-r--r--kernel/dma/direct.c48
-rw-r--r--kernel/dma/direct.h8
-rw-r--r--kernel/dma/mapping.c58
-rw-r--r--kernel/dma/swiotlb.c304
-rw-r--r--kernel/entry/common.c21
-rw-r--r--kernel/entry/kvm.c6
-rw-r--r--kernel/events/Makefile1
-rw-r--r--kernel/events/core.c335
-rw-r--r--kernel/events/hw_breakpoint.c648
-rw-r--r--kernel/events/hw_breakpoint_test.c333
-rw-r--r--kernel/events/ring_buffer.c7
-rw-r--r--kernel/events/uprobes.c39
-rw-r--r--kernel/exit.c35
-rw-r--r--kernel/extable.c4
-rw-r--r--kernel/fail_function.c26
-rw-r--r--kernel/fork.c115
-rw-r--r--kernel/freezer.c133
-rw-r--r--kernel/futex/waitwake.c8
-rw-r--r--kernel/gcov/gcc_4_7.c18
-rwxr-xr-xkernel/gen_kheaders.sh6
-rw-r--r--kernel/groups.c13
-rw-r--r--kernel/hung_task.c29
-rw-r--r--kernel/irq/Kconfig2
-rw-r--r--kernel/irq/chip.c16
-rw-r--r--kernel/irq/debugfs.c2
-rw-r--r--kernel/irq/generic-chip.c2
-rw-r--r--kernel/irq/ipi.c16
-rw-r--r--kernel/irq/irqdesc.c26
-rw-r--r--kernel/irq/irqdomain.c14
-rw-r--r--kernel/irq/manage.c10
-rw-r--r--kernel/irq/pm.c2
-rw-r--r--kernel/jump_label.c41
-rw-r--r--kernel/kallsyms.c157
-rw-r--r--kernel/kallsyms_internal.h30
-rw-r--r--kernel/kcov.c7
-rw-r--r--kernel/kcsan/.kunitconfig24
-rw-r--r--kernel/kcsan/selftest.c4
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kexec_core.c63
-rw-r--r--kernel/kexec_file.c108
-rw-r--r--kernel/kexec_internal.h15
-rw-r--r--kernel/kprobes.c19
-rw-r--r--kernel/ksysfs.c7
-rw-r--r--kernel/kthread.c18
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/livepatch/core.c24
-rw-r--r--kernel/livepatch/transition.c18
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lockdep.c17
-rw-r--r--kernel/locking/percpu-rwsem.c6
-rw-r--r--kernel/locking/qrwlock.c4
-rw-r--r--kernel/locking/qspinlock.c2
-rw-r--r--kernel/locking/qspinlock_paravirt.h4
-rw-r--r--kernel/locking/rwsem.c44
-rw-r--r--kernel/locking/semaphore.c12
-rw-r--r--kernel/locking/spinlock.c56
-rw-r--r--kernel/locking/test-ww_mutex.c4
-rw-r--r--kernel/module/Kconfig293
-rw-r--r--kernel/module/decompress.c10
-rw-r--r--kernel/module/internal.h19
-rw-r--r--kernel/module/kallsyms.c76
-rw-r--r--kernel/module/main.c125
-rw-r--r--kernel/module/procfs.c2
-rw-r--r--kernel/module/tracking.c71
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/power/Kconfig20
-rw-r--r--kernel/power/energy_model.c24
-rw-r--r--kernel/power/hibernate.c37
-rw-r--r--kernel/power/main.c18
-rw-r--r--kernel/power/process.c10
-rw-r--r--kernel/power/qos.c4
-rw-r--r--kernel/power/suspend.c15
-rw-r--r--kernel/power/swap.c29
-rw-r--r--kernel/power/user.c37
-rw-r--r--kernel/printk/printk.c619
-rw-r--r--kernel/profile.c47
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/rcu/Kconfig31
-rw-r--r--kernel/rcu/Kconfig.debug5
-rw-r--r--kernel/rcu/rcu.h19
-rw-r--r--kernel/rcu/rcuscale.c1
-rw-r--r--kernel/rcu/rcutorture.c483
-rw-r--r--kernel/rcu/refscale.c18
-rw-r--r--kernel/rcu/srcutiny.c14
-rw-r--r--kernel/rcu/srcutree.c98
-rw-r--r--kernel/rcu/tasks.h544
-rw-r--r--kernel/rcu/tiny.c52
-rw-r--r--kernel/rcu/tree.c970
-rw-r--r--kernel/rcu/tree.h21
-rw-r--r--kernel/rcu/tree_exp.h168
-rw-r--r--kernel/rcu/tree_nocb.h276
-rw-r--r--kernel/rcu/tree_plugin.h108
-rw-r--r--kernel/rcu/tree_stall.h62
-rw-r--r--kernel/rcu/update.c15
-rw-r--r--kernel/reboot.c118
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c185
-rw-r--r--kernel/rseq.c23
-rw-r--r--kernel/sched/autogroup.c3
-rw-r--r--kernel/sched/completion.c12
-rw-r--r--kernel/sched/core.c453
-rw-r--r--kernel/sched/core_sched.c19
-rw-r--r--kernel/sched/cpudeadline.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c33
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/cputime.c15
-rw-r--r--kernel/sched/deadline.c130
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c1214
-rw-r--r--kernel/sched/features.h3
-rw-r--r--kernel/sched/idle.c10
-rw-r--r--kernel/sched/pelt.h40
-rw-r--r--kernel/sched/psi.c309
-rw-r--r--kernel/sched/rt.c37
-rw-r--r--kernel/sched/sched.h174
-rw-r--r--kernel/sched/stats.h6
-rw-r--r--kernel/sched/stop_task.c11
-rw-r--r--kernel/sched/topology.c23
-rw-r--r--kernel/sched/wait_bit.c2
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/smp.c13
-rw-r--r--kernel/smpboot.c15
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c5
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl-test.c43
-rw-r--r--kernel/sysctl.c148
-rw-r--r--kernel/task_work.c16
-rw-r--r--kernel/taskstats.c1
-rw-r--r--kernel/time/Kconfig37
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/hrtimer.c5
-rw-r--r--kernel/time/posix-stubs.c3
-rw-r--r--kernel/time/posix-timers.c19
-rw-r--r--kernel/time/tick-sched.c3
-rw-r--r--kernel/time/time.c4
-rw-r--r--kernel/time/timekeeping.c7
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c157
-rw-r--r--kernel/trace/bpf_trace.c286
-rw-r--r--kernel/trace/ftrace.c432
-rw-r--r--kernel/trace/kprobe_event_gen_test.c49
-rw-r--r--kernel/trace/rethook.c9
-rw-r--r--kernel/trace/ring_buffer.c93
-rw-r--r--kernel/trace/rv/Kconfig78
-rw-r--r--kernel/trace/rv/Makefile8
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c88
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h46
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c87
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h46
-rw-r--r--kernel/trace/rv/reactor_panic.c43
-rw-r--r--kernel/trace/rv/reactor_printk.c42
-rw-r--r--kernel/trace/rv/rv.c799
-rw-r--r--kernel/trace/rv/rv.h68
-rw-r--r--kernel/trace/rv/rv_reactors.c510
-rw-r--r--kernel/trace/trace.c134
-rw-r--r--kernel/trace/trace.h22
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_benchmark.h8
-rw-r--r--kernel/trace/trace_dynevent.c2
-rw-r--r--kernel/trace/trace_eprobe.c289
-rw-r--r--kernel/trace/trace_event_perf.c7
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_events_filter.c239
-rw-r--r--kernel/trace/trace_events_hist.c253
-rw-r--r--kernel/trace/trace_events_synth.c23
-rw-r--r--kernel/trace/trace_events_trigger.c3
-rw-r--r--kernel/trace/trace_events_user.c570
-rw-r--r--kernel/trace/trace_kprobe.c87
-rw-r--r--kernel/trace/trace_osnoise.c3
-rw-r--r--kernel/trace/trace_preemptirq.c4
-rw-r--r--kernel/trace/trace_probe.c33
-rw-r--r--kernel/trace/trace_probe.h6
-rw-r--r--kernel/trace/trace_probe_kernel.h115
-rw-r--r--kernel/trace/trace_uprobe.c20
-rw-r--r--kernel/trace/tracing_map.c5
-rw-r--r--kernel/tracepoint.c19
-rw-r--r--kernel/ucount.c34
-rw-r--r--kernel/umh.c18
-rw-r--r--kernel/user_namespace.c15
-rw-r--r--kernel/utsname_sysctl.c10
-rw-r--r--kernel/watch_queue.c105
-rw-r--r--kernel/watchdog.c25
-rw-r--r--kernel/watchdog_hld.c4
-rw-r--r--kernel/workqueue.c38
245 files changed, 16674 insertions, 6735 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 318789c728d3..d754e0be1176 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
# Don't instrument error handlers
diff --git a/kernel/acct.c b/kernel/acct.c
index 13706356ec54..62200d799b9b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead)
unsigned long vsize = 0;
if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- mmap_read_lock(current->mm);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/audit.c b/kernel/audit.c
index 7690c29d4ee4..9bc0b0301198 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -321,7 +321,6 @@ static inline int audit_rate_check(void)
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
- unsigned long elapsed;
int retval = 0;
if (!audit_rate_limit) return 1;
@@ -330,9 +329,8 @@ static inline int audit_rate_check(void)
if (++messages < audit_rate_limit) {
retval = 1;
} else {
- now = jiffies;
- elapsed = now - last_check;
- if (elapsed > HZ) {
+ now = jiffies;
+ if (time_after(now, last_check + HZ)) {
last_check = now;
messages = 0;
retval = 1;
@@ -366,7 +364,7 @@ void audit_log_lost(const char *message)
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
- if (now - last_msg > HZ) {
+ if (time_after(now, last_msg + HZ)) {
print = 1;
last_msg = now;
}
@@ -1100,7 +1098,7 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
audit_log_common_recv_msg(NULL, ab, msg_type);
}
-int is_audit_feature_set(int i)
+static int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
@@ -1390,7 +1388,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
str);
} else {
audit_log_format(ab, " data=");
- if (data_len > 0 && str[data_len - 1] == '\0')
+ if (str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 58b66543b4d5..c57b008b9914 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -133,7 +133,7 @@ struct audit_context {
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
/* Save things to print about task_struct */
- pid_t pid, ppid;
+ pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
@@ -245,8 +245,6 @@ struct audit_netlink_list {
int audit_send_list_thread(void *_dest);
-extern int selinux_audit_rule_update(void);
-
extern struct mutex audit_filter_mutex;
extern int audit_del_rule(struct audit_entry *entry);
extern void audit_free_rule_rcu(struct rcu_head *head);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 6432a37ac1c9..c565fbf66ac8 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
if (ret < 0) {
+ audit_mark->path = NULL;
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 4b0957aa2cd4..65075f1e4ac8 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct path *path)
+static struct audit_parent *audit_init_parent(const struct path *path)
{
struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f3a2abd6d1a1..9f8c05228d6d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -965,7 +965,7 @@ static void audit_reset_context(struct audit_context *ctx)
if (!ctx)
return;
- /* if ctx is non-null, reset the "ctx->state" regardless */
+ /* if ctx is non-null, reset the "ctx->context" regardless */
ctx->context = AUDIT_CTX_UNUSED;
if (ctx->dummy)
return;
@@ -1002,7 +1002,7 @@ static void audit_reset_context(struct audit_context *ctx)
kfree(ctx->sockaddr);
ctx->sockaddr = NULL;
ctx->sockaddr_len = 0;
- ctx->pid = ctx->ppid = 0;
+ ctx->ppid = 0;
ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
ctx->personality = 0;
@@ -1014,10 +1014,9 @@ static void audit_reset_context(struct audit_context *ctx)
ctx->target_comm[0] = '\0';
unroll_tree_refs(ctx, NULL, 0);
WARN_ON(!list_empty(&ctx->killed_trees));
- ctx->type = 0;
audit_free_module(ctx);
ctx->fds[0] = -1;
- audit_proctitle_free(ctx);
+ ctx->type = 0; /* reset last for audit_free_*() */
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -1073,35 +1072,11 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
-/**
- * audit_alloc_kernel - allocate an audit_context for a kernel task
- * @tsk: the kernel task
- *
- * Similar to the audit_alloc() function, but intended for kernel private
- * threads. Returns zero on success, negative values on failure.
- */
-int audit_alloc_kernel(struct task_struct *tsk)
-{
- /*
- * At the moment we are just going to call into audit_alloc() to
- * simplify the code, but there two things to keep in mind with this
- * approach:
- *
- * 1. Filtering internal kernel tasks is a bit laughable in almost all
- * cases, but there is at least one case where there is a benefit:
- * the '-a task,never' case allows the admin to effectively disable
- * task auditing at runtime.
- *
- * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
- * on these internal kernel tasks, but they probably don't hurt either.
- */
- return audit_alloc(tsk);
-}
-
static inline void audit_free_context(struct audit_context *context)
{
/* resetting is extra work, but it is likely just noise */
audit_reset_context(context);
+ audit_proctitle_free(context);
free_tree_refs(context);
kfree(context->filterkey);
kfree(context);
@@ -1858,7 +1833,7 @@ void __audit_free(struct task_struct *tsk)
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
- * random task_struct that doesn't doesn't have any meaningful data we
+ * random task_struct that doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
if (tsk == current && !context->dummy) {
@@ -1965,6 +1940,7 @@ void __audit_uring_exit(int success, long code)
goto out;
}
+ audit_return_fixup(ctx, success, code);
if (ctx->context == AUDIT_CTX_SYSCALL) {
/*
* NOTE: See the note in __audit_uring_entry() about the case
@@ -2006,7 +1982,6 @@ void __audit_uring_exit(int success, long code)
audit_filter_inodes(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
goto out;
- audit_return_fixup(ctx, success, code);
audit_log_exit();
out:
@@ -2090,13 +2065,13 @@ void __audit_syscall_exit(int success, long return_code)
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
+ audit_return_fixup(context, success, return_code);
/* run through both filters to ensure we set the filterkey properly */
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
- if (context->current_state < AUDIT_STATE_RECORD)
+ if (context->current_state != AUDIT_STATE_RECORD)
goto out;
- audit_return_fixup(context, success, return_code);
audit_log_exit();
out:
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..b529182e8b04 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,13 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+ DEFINE(__LRU_REFS_WIDTH, 0);
+#endif
/* End of constants */
return 0;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 057ba8e01e70..341c94f208f4 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
@@ -24,6 +24,9 @@ endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o
+endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index fe40d3b9458f..832b2659e96e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -70,10 +70,8 @@ int array_map_alloc_check(union bpf_attr *attr)
attr->map_flags & BPF_F_PRESERVE_ELEMS)
return -EINVAL;
- if (attr->value_size > KMALLOC_MAX_SIZE)
- /* if value_size is bigger, the user space won't be able to
- * access the elements.
- */
+ /* avoid overflow on round_up(map->value_size) */
+ if (attr->value_size > INT_MAX)
return -E2BIG;
return 0;
@@ -156,6 +154,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return &array->map;
}
+static void *array_map_elem_ptr(struct bpf_array* array, u32 index)
+{
+ return array->value + (u64)array->elem_size * index;
+}
+
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -165,7 +168,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * (index & array->index_mask);
+ return array->value + (u64)array->elem_size * (index & array->index_mask);
}
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
@@ -203,7 +206,7 @@ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
@@ -272,11 +275,12 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
rcu_read_unlock();
@@ -335,11 +339,12 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
- value, map->value_size);
+ val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ copy_map_value(map, val, value);
+ check_and_free_fields(array, val);
} else {
val = array->value +
- array->elem_size * (index & array->index_mask);
+ (u64)array->elem_size * (index & array->index_mask);
if (map_flags & BPF_F_LOCK)
copy_map_value_locked(map, val, value, false);
else
@@ -376,11 +381,12 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+ check_and_free_fields(array, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -408,8 +414,7 @@ static void array_map_free_timers(struct bpf_map *map)
return;
for (i = 0; i < array->map.max_entries; i++)
- bpf_timer_cancel_and_free(array->value + array->elem_size * i +
- map->timer_off);
+ bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -419,8 +424,20 @@ static void array_map_free(struct bpf_map *map)
int i;
if (map_value_has_kptrs(map)) {
- for (i = 0; i < array->map.max_entries; i++)
- bpf_map_free_kptrs(map, array->value + array->elem_size * i);
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ void __percpu *pptr = array->pptrs[i & array->index_mask];
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ }
+ } else {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
+ }
bpf_map_free_kptr_off_tab(map);
}
@@ -556,7 +573,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return array_map_elem_ptr(array, index);
}
static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -575,7 +592,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return array_map_elem_ptr(array, index);
}
static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
@@ -583,6 +600,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_seq_array_map_info *info = seq->private;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
@@ -603,11 +621,11 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
ctx.value = v;
} else {
pptr = v;
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += size;
}
ctx.value = info->percpu_value_buf;
@@ -633,11 +651,12 @@ static int bpf_iter_init_array_map(void *priv_data,
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
struct bpf_map *map = aux->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
void *value_buf;
u32 buf_size;
if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ buf_size = array->elem_size * num_possible_cpus();
value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
if (!value_buf)
return -ENOMEM;
@@ -645,6 +664,11 @@ static int bpf_iter_init_array_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ /* bpf_iter_attach_map() acquires a map uref, and the uref may be
+ * released before or in the middle of iterating map elements, so
+ * acquire an extra map uref for iterator.
+ */
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
return 0;
}
@@ -653,6 +677,7 @@ static void bpf_iter_fini_array_map(void *priv_data)
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
@@ -690,7 +715,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
else
- val = array->value + array->elem_size * i;
+ val = array_map_elem_ptr(array, i);
num_elems++;
key = i;
ret = callback_fn((u64)(long)map, (u64)(long)&key,
@@ -1322,7 +1347,7 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index b9ea539a5561..48ee750849f2 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
attr->value_size / sizeof(u32);
if (!(attr->map_flags & BPF_F_ZERO_SEED))
- bloom->hash_seed = get_random_int();
+ bloom->hash_seed = get_random_u32();
return &bloom->map;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index d5d96ceca105..5dc307bdeaeb 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -68,23 +68,27 @@ static void bpf_iter_done_stop(struct seq_file *seq)
iter_priv->done_stop = true;
}
+static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
+{
+ return tinfo->reg_info->feature & BPF_ITER_RESCHED;
+}
+
static bool bpf_iter_support_resched(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
- return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED;
+ return bpf_iter_target_support_resched(iter_priv->tinfo);
}
/* maximum visited objects before bailing out */
#define MAX_ITER_OBJECTS 1000000
/* bpf_seq_read, a customized and simpler version for bpf iterator.
- * no_llseek is assumed for this file.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
- * . assuming no_llseek
+ * . assuming NULL ->llseek()
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
@@ -198,6 +202,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
}
stop:
offs = seq->count;
+ if (IS_ERR(p)) {
+ seq->op->stop(seq, NULL);
+ err = PTR_ERR(p);
+ goto done;
+ }
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
@@ -538,6 +547,10 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
if (!tinfo)
return -ENOENT;
+ /* Only allow sleepable program for resched-able iterator */
+ if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+ return -EINVAL;
+
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
@@ -681,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
int ret;
if (prog->aux->sleepable) {
rcu_read_lock_trace();
migrate_disable();
might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock_trace();
} else {
rcu_read_lock();
migrate_disable();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
migrate_enable();
rcu_read_unlock();
}
@@ -723,9 +741,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg4_type = ARG_ANYTHING,
};
-/* maximum number of loops */
-#define MAX_LOOPS BIT(23)
-
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64, flags)
{
@@ -733,9 +748,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64 ret;
u32 i;
+ /* Note: these safety checks are also verified when bpf_loop
+ * is inlined, be careful to modify this code in sync. See
+ * function verifier.c:inline_bpf_loop.
+ */
if (flags)
return -EINVAL;
- if (nr_loops > MAX_LOOPS)
+ if (nr_loops > BPF_MAX_LOOPS)
return -E2BIG;
for (i = 0; i < nr_loops; i++) {
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 8ce40fd869f6..802fc15b0d73 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -555,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem, map_node))) {
if (busy_counter) {
migrate_disable();
- __this_cpu_inc(*busy_counter);
+ this_cpu_inc(*busy_counter);
}
bpf_selem_unlink(selem, false);
if (busy_counter) {
- __this_cpu_dec(*busy_counter);
+ this_cpu_dec(*busy_counter);
migrate_enable();
}
cond_resched_rcu();
@@ -582,7 +582,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
synchronize_rcu();
kvfree(smap->buckets);
- kfree(smap);
+ bpf_map_area_free(smap);
}
int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
@@ -610,7 +610,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
unsigned int i;
u32 nbuckets;
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
@@ -623,7 +623,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!smap->buckets) {
- kfree(smap);
+ bpf_map_area_free(smap);
return ERR_PTR(-ENOMEM);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index c1351df9f7ee..d6c9b3705f24 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -16,6 +16,7 @@
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@@ -35,6 +36,65 @@ BTF_SET_START(bpf_lsm_hooks)
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
+BTF_SET_END(bpf_lsm_current_hooks)
+
+/* List of LSM hooks that trigger while the socket is properly locked.
+ */
+BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
+BTF_ID(func, bpf_lsm_sock_graft)
+BTF_ID(func, bpf_lsm_inet_csk_clone)
+BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
+BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
+
+/* List of LSM hooks that trigger while the socket is _not_ locked,
+ * but it's ok to call bpf_{g,s}etsockopt because the socket is still
+ * in the early init phase.
+ */
+BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
+BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
+
+#ifdef CONFIG_CGROUP_BPF
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+ bpf_func_t *bpf_func)
+{
+ const struct btf_param *args __maybe_unused;
+
+ if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+ btf_id_set_contains(&bpf_lsm_current_hooks,
+ prog->aux->attach_btf_id)) {
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+ return;
+ }
+
+#ifdef CONFIG_NET
+ args = btf_params(prog->aux->attach_func_proto);
+
+ if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+ *bpf_func = __cgroup_bpf_run_lsm_socket;
+ else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+ *bpf_func = __cgroup_bpf_run_lsm_sock;
+ else
+#endif
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+}
+#endif
+
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
@@ -135,6 +195,14 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
switch (func_id) {
case BPF_FUNC_inode_storage_get:
return &bpf_inode_storage_get_proto;
@@ -158,6 +226,28 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
case BPF_FUNC_get_attach_cookie:
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+#ifdef CONFIG_NET
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_setsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_getsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_getsockopt_proto;
+ return NULL;
+#endif
default:
return tracing_prog_func_proto(func_id, prog);
}
@@ -250,6 +340,7 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d9a3c9207240..84b2d9dba79a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -341,6 +341,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+ /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
+ * and it must be used alone.
+ */
flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
model, flags, tlinks, NULL);
@@ -503,10 +506,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- /* Error during st_ops->reg(). It is very unlikely since
- * the above init_member() should have caught it earlier
- * before reg(). The only possibility is if there was a race
- * in registering the struct_ops (under the same name) to
+ /* Error during st_ops->reg(). Can happen if this struct_ops needs to be
+ * verified as a whole, after all init_member() calls. Can also happen if
+ * there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
set_memory_nx((long)st_map->image, 1);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index e9014dc62682..6f290623347e 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
migrate_disable();
- __this_cpu_inc(bpf_task_storage_busy);
+ this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
- __this_cpu_dec(bpf_task_storage_busy);
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
migrate_disable();
- if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- __this_cpu_dec(bpf_task_storage_busy);
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
return false;
}
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7bccaa4646e5..eba603cec2c5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -208,12 +208,12 @@ enum btf_kfunc_hook {
};
enum {
- BTF_KFUNC_SET_MAX_CNT = 32,
+ BTF_KFUNC_SET_MAX_CNT = 256,
BTF_DTOR_KFUNC_MAX_CNT = 256,
};
struct btf_kfunc_set_tab {
- struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+ struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
};
struct btf_id_dtor_kfunc_tab {
@@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
[BTF_KIND_TYPE_TAG] = "TYPE_TAG",
+ [BTF_KIND_ENUM64] = "ENUM64",
};
const char *btf_type_str(const struct btf_type *t)
@@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
return true;
}
@@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
return (const struct btf_decl_tag *)(t + 1);
}
+static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t)
+{
+ return (const struct btf_enum64 *)(t + 1);
+}
+
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -811,6 +818,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
return NULL;
return btf->types[type_id];
}
+EXPORT_SYMBOL_GPL(btf_type_by_id);
/*
* Regular int is not a bit field and it must be either
@@ -1019,6 +1027,7 @@ static const char *btf_show_name(struct btf_show *show)
parens = "{";
break;
case BTF_KIND_ENUM:
+ case BTF_KIND_ENUM64:
prefix = "enum";
break;
default:
@@ -1108,7 +1117,8 @@ __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
*/
#define btf_show_type_value(show, fmt, value) \
do { \
- if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \
+ if ((value) != (__typeof__(value))0 || \
+ (show->flags & BTF_SHOW_ZERO) || \
show->state.depth == 0) { \
btf_show(show, "%s%s" fmt "%s%s", \
btf_show_indent(show), \
@@ -1387,7 +1397,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
- u8 kind = BTF_INFO_KIND(t->info);
struct btf *btf = env->btf;
va_list args;
@@ -1403,7 +1412,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
- btf_kind_str[kind],
+ btf_type_str(t),
__btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
@@ -1607,7 +1616,7 @@ static void btf_free_id(struct btf *btf)
static void btf_free_kfunc_set_tab(struct btf *btf)
{
struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
- int hook, type;
+ int hook;
if (!tab)
return;
@@ -1616,10 +1625,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)
*/
if (btf_is_module(btf))
goto free_tab;
- for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
- for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
- kfree(tab->sets[hook][type]);
- }
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
+ kfree(tab->sets[hook]);
free_tab:
kfree(tab);
btf->kfunc_set_tab = NULL;
@@ -1834,6 +1841,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
size = type->size;
goto resolved;
@@ -3120,7 +3128,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
if (v->next_member) {
const struct btf_type *last_member_type;
const struct btf_member *last_member;
- u16 last_member_type_id;
+ u32 last_member_type_id;
last_member = btf_type_member(v->t) + v->next_member - 1;
last_member_type_id = last_member->type;
@@ -3670,6 +3678,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
{
const struct btf_enum *enums = btf_type_enum(t);
struct btf *btf = env->btf;
+ const char *fmt_str;
u16 i, nr_enums;
u32 meta_needed;
@@ -3683,11 +3692,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
if (t->size > 8 || !is_power_of_2(t->size)) {
btf_verifier_log_type(env, t, "Unexpected size");
return -EINVAL;
@@ -3718,7 +3722,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
if (env->log.level == BPF_LOG_KERNEL)
continue;
- btf_verifier_log(env, "\t%s val=%d\n",
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n";
+ btf_verifier_log(env, fmt_str,
__btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
@@ -3759,7 +3764,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
return;
}
- btf_show_type_value(show, "%d", v);
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%d", v);
+ else
+ btf_show_type_value(show, "%u", v);
btf_show_end_type(show);
}
@@ -3772,6 +3780,109 @@ static struct btf_kind_operations enum_ops = {
.show = btf_enum_show,
};
+static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ struct btf *btf = env->btf;
+ const char *fmt_str;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
+ return -EINVAL;
+ }
+
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
+
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n";
+ btf_verifier_log(env, fmt_str,
+ __btf_name_by_offset(btf, enums[i].name_off),
+ btf_enum64_value(enums + i));
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ void *safe_data;
+ s64 v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(u64 *)safe_data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v != btf_enum64_value(enums + i))
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
+ }
+
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%lld", v);
+ else
+ btf_show_type_value(show, "%llu", v);
+ btf_show_end_type(show);
+}
+
+static struct btf_kind_operations enum64_ops = {
+ .check_meta = btf_enum64_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
+ .log_details = btf_enum_log,
+ .show = btf_enum64_show,
+};
+
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -4438,6 +4549,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_FLOAT] = &float_ops,
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
[BTF_KIND_TYPE_TAG] = &modifier_ops,
+ [BTF_KIND_ENUM64] = &enum64_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4742,7 +4854,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
struct btf *btf;
- int err;
btf = env->btf;
btf_data_size = btf->data_size;
@@ -4799,11 +4910,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return -EINVAL;
}
- err = btf_check_sec_info(env, btf_data_size);
- if (err)
- return err;
-
- return 0;
+ return btf_check_sec_info(env, btf_data_size);
}
static int btf_check_type_tags(struct btf_verifier_env *env,
@@ -4815,6 +4922,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
n = btf_nr_types(btf);
for (i = start_id; i < n; i++) {
const struct btf_type *t;
+ int chain_limit = 32;
u32 cur_id = i;
t = btf_type_by_id(btf, i);
@@ -4827,6 +4935,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
in_tags = btf_type_is_type_tag(t);
while (btf_type_is_modifier(t)) {
+ if (!chain_limit--) {
+ btf_verifier_log(env, "Max chain length or cycle detected");
+ return -ELOOP;
+ }
if (btf_type_is_type_tag(t)) {
if (!in_tags) {
btf_verifier_log(env, "Type tags don't precede modifiers");
@@ -5211,6 +5323,34 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
return btf_type_is_int(t);
}
+static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 offset = 0, nr_args;
+ int i;
+
+ if (!func_proto)
+ return off / 8;
+
+ nr_args = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return i;
+ }
+
+ t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return nr_args;
+
+ return nr_args + 1;
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -5230,7 +5370,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = off / 8;
+ arg = get_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -5250,6 +5390,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
+ case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
@@ -5280,7 +5421,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_small_int(t)) {
bpf_log(log,
"ret type %s not allowed for fmod_ret\n",
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
break;
@@ -5299,7 +5440,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -5307,7 +5448,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
tname, arg,
__btf_name_by_offset(btf, t->name_off),
- btf_kind_str[BTF_INFO_KIND(t->info)]);
+ btf_type_str(t));
return false;
}
@@ -5391,11 +5532,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
- tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, arg, btf_type_str(t));
return false;
}
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
- tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+ tname, arg, info->btf_id, btf_type_str(t),
__btf_name_by_offset(btf, t->name_off));
return true;
}
@@ -5746,26 +5887,25 @@ again:
}
static int __get_type_size(struct btf *btf, u32 btf_id,
- const struct btf_type **bad_type)
+ const struct btf_type **ret_type)
{
const struct btf_type *t;
+ *ret_type = btf_type_by_id(btf, 0);
if (!btf_id)
/* void */
return 0;
t = btf_type_by_id(btf, btf_id);
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!t) {
- *bad_type = btf_type_by_id(btf, 0);
+ if (!t)
return -EINVAL;
- }
+ *ret_type = t;
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
return t->size;
- *bad_type = t;
return -EINVAL;
}
@@ -5784,8 +5924,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
/* BTF function prototype doesn't match the verifier types.
* Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
m->arg_size[i] = 8;
+ m->arg_flags[i] = 0;
+ }
m->ret_size = 8;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
@@ -5799,10 +5941,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0) {
+ if (ret < 0 || __btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
- tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, btf_type_str(t));
return -EINVAL;
}
m->ret_size = ret;
@@ -5815,10 +5957,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t);
- if (ret < 0) {
+
+ /* No support of struct argument size greater than 16 bytes */
+ if (ret < 0 || ret > 16) {
bpf_log(log,
"The function %s arg%d type %s is unsupported.\n",
- tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+ tname, i, btf_type_str(t));
return -EINVAL;
}
if (ret == 0) {
@@ -5828,6 +5972,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
m->arg_size[i] = ret;
+ m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0;
}
m->nr_args = nargs;
return 0;
@@ -5911,7 +6056,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
* to context only. And only global functions can be replaced.
* Hence type check only those types.
*/
- if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
continue;
if (!btf_type_is_ptr(t1)) {
bpf_log(log,
@@ -6049,15 +6194,44 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
return true;
}
+static bool btf_is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const struct btf_type *t;
+ const char *param_name;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
- bool ptr_to_mem_ok)
+ bool ptr_to_mem_ok,
+ struct bpf_kfunc_arg_meta *kfunc_meta,
+ bool processing_call)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ bool rel = false, kptr_get = false, trusted_args = false;
+ bool sleepable = false;
struct bpf_verifier_log *log = &env->log;
u32 i, nargs, ref_id, ref_obj_id = 0;
bool is_kfunc = btf_is_kernel(btf);
- bool rel = false, kptr_get = false;
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
@@ -6087,12 +6261,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (is_kfunc) {
+ if (is_kfunc && kfunc_meta) {
/* Only kfunc can be release func */
- rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_RELEASE, func_id);
- kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id);
+ rel = kfunc_meta->flags & KF_RELEASE;
+ kptr_get = kfunc_meta->flags & KF_KPTR_GET;
+ trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
+ sleepable = kfunc_meta->flags & KF_SLEEPABLE;
}
/* check that BTF function arguments match actual types that the
@@ -6102,9 +6276,42 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1;
struct bpf_reg_state *reg = &regs[regno];
+ bool obj_ptr = false;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
if (btf_type_is_scalar(t)) {
+ if (is_kfunc && kfunc_meta) {
+ bool is_buf_size = false;
+
+ /* check for any const scalar parameter of name "rdonly_buf_size"
+ * or "rdwr_buf_size"
+ */
+ if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
+ "rdonly_buf_size")) {
+ kfunc_meta->r0_rdonly = true;
+ is_buf_size = true;
+ } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
+ "rdwr_buf_size"))
+ is_buf_size = true;
+
+ if (is_buf_size) {
+ if (kfunc_meta->r0_size) {
+ bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ bpf_log(log, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ kfunc_meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ }
+
if (reg->type == SCALAR_VALUE)
continue;
bpf_log(log, "R%d is not a scalar\n", regno);
@@ -6117,15 +6324,42 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* These register types have special constraints wrt ref_obj_id
+ * and offset checks. The rest of trusted args don't.
+ */
+ obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID ||
+ reg2btf_ids[base_type(reg->type)];
+
+ /* Check if argument must be a referenced pointer, args + i has
+ * been verified to be a pointer (after skipping modifiers).
+ * PTR_TO_CTX is ok without having non-zero ref_obj_id.
+ */
+ if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) {
+ bpf_log(log, "R%d must be referenced\n", regno);
+ return -EINVAL;
+ }
+
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- if (rel && reg->ref_obj_id)
+ /* Trusted args have the same offset checks as release arguments */
+ if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id))
arg_type |= OBJ_RELEASE;
ret = check_func_arg_reg_off(env, reg, regno, arg_type);
if (ret < 0)
return ret;
+ if (is_kfunc && reg->ref_obj_id) {
+ /* Ensure only one argument is referenced PTR_TO_BTF_ID */
+ if (ref_obj_id) {
+ bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id, ref_obj_id);
+ return -EFAULT;
+ }
+ ref_regno = regno;
+ ref_obj_id = reg->ref_obj_id;
+ }
+
/* kptr_get is only true for kfunc */
if (i == 0 && kptr_get) {
struct bpf_map_value_off_desc *off_desc;
@@ -6171,7 +6405,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
/* rest of the arguments can be anything, like normal kfunc */
- } else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
+ } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
@@ -6198,16 +6432,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
- /* Ensure only one argument is referenced PTR_TO_BTF_ID */
- if (reg->ref_obj_id) {
- if (ref_obj_id) {
- bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id, ref_obj_id);
- return -EFAULT;
- }
- ref_regno = regno;
- ref_obj_id = reg->ref_obj_id;
- }
} else {
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
@@ -6218,7 +6442,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname = btf_name_by_offset(reg_btf,
reg_ref_t->name_off);
if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id, rel && reg->ref_obj_id)) {
+ reg->off, btf, ref_id,
+ trusted_args || (rel && reg->ref_obj_id))) {
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
func_name, i,
btf_type_str(ref_t), ref_tname,
@@ -6226,21 +6451,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname);
return -EINVAL;
}
- } else if (ptr_to_mem_ok) {
+ } else if (ptr_to_mem_ok && processing_call) {
const struct btf_type *resolve_ret;
u32 type_size;
if (is_kfunc) {
bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+ bool arg_dynptr = btf_type_is_struct(ref_t) &&
+ !strcmp(ref_tname,
+ stringify_struct(bpf_dynptr_kern));
/* Permit pointer to mem, but only when argument
* type is pointer to scalar, or struct composed
* (recursively) of scalars.
* When arg_mem_size is true, the pointer can be
* void *.
+ * Also permit initialized local dynamic pointers.
*/
if (!btf_type_is_scalar(ref_t) &&
!__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+ !arg_dynptr &&
(arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
bpf_log(log,
"arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
@@ -6248,6 +6478,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (arg_dynptr) {
+ if (reg->type != PTR_TO_STACK) {
+ bpf_log(log, "arg#%d pointer type %s %s not to stack\n",
+ i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ bpf_log(log,
+ "arg#%d pointer type %s %s must be valid and initialized\n",
+ i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_type_expected(env, reg,
+ ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) {
+ bpf_log(log,
+ "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n",
+ i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ continue;
+ }
+
/* Check for mem, len pair */
if (arg_mem_size) {
if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
@@ -6290,11 +6548,21 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
func_name);
return -EINVAL;
}
+
+ if (sleepable && !env->prog->aux->sleepable) {
+ bpf_log(log, "kernel function %s is sleepable but the program is not\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ if (kfunc_meta && ref_obj_id)
+ kfunc_meta->ref_obj_id = ref_obj_id;
+
/* returns argument register number > 0 in case of reference release kfunc */
return rel ? ref_regno : 0;
}
-/* Compare BTF of a function with given bpf_reg_state.
+/* Compare BTF of a function declaration with given bpf_reg_state.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - there is a type mismatch or BTF is not available.
@@ -6321,7 +6589,50 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false);
+
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ *
+ * NOTE: the code is duplicated from btf_check_subprog_arg_match()
+ * because btf_check_func_arg_match() is still doing both. Once that
+ * function is split in 2, we can call from here btf_check_subprog_arg_match()
+ * first, and then treat the calling part in a new code path.
+ */
+int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6334,9 +6645,10 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs)
+ struct bpf_reg_state *regs,
+ struct bpf_kfunc_arg_meta *meta)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, true);
+ return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -6408,7 +6720,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
@@ -6423,7 +6735,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
reg->type = SCALAR_VALUE;
continue;
}
@@ -6450,7 +6762,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
continue;
}
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+ i, btf_type_str(t), tname);
return -EINVAL;
}
return 0;
@@ -6513,7 +6825,7 @@ static void btf_snprintf_show(struct btf_show *show, const char *fmt,
if (len < 0) {
ssnprintf->len_left = 0;
ssnprintf->len = len;
- } else if (len > ssnprintf->len_left) {
+ } else if (len >= ssnprintf->len_left) {
/* no space, drive on to get length we would have written */
ssnprintf->len_left = 0;
ssnprintf->len += len;
@@ -6733,6 +7045,11 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
}
+static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
+{
+ return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
+}
+
enum {
BTF_MODULE_F_LIVE = (1 << 0),
};
@@ -6981,16 +7298,16 @@ BTF_TRACING_TYPE_xxx
/* Kernel Function (kfunc) BTF ID set registration API */
-static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
- enum btf_kfunc_type type,
- struct btf_id_set *add_set, bool vmlinux_set)
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ struct btf_id_set8 *add_set)
{
+ bool vmlinux_set = !btf_is_module(btf);
struct btf_kfunc_set_tab *tab;
- struct btf_id_set *set;
+ struct btf_id_set8 *set;
u32 set_cnt;
int ret;
- if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+ if (hook >= BTF_KFUNC_HOOK_MAX) {
ret = -EINVAL;
goto end;
}
@@ -7006,7 +7323,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
btf->kfunc_set_tab = tab;
}
- set = tab->sets[hook][type];
+ set = tab->sets[hook];
/* Warn when register_btf_kfunc_id_set is called twice for the same hook
* for module sets.
*/
@@ -7020,7 +7337,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
* pointer and return.
*/
if (!vmlinux_set) {
- tab->sets[hook][type] = add_set;
+ tab->sets[hook] = add_set;
return 0;
}
@@ -7029,7 +7346,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
* and concatenate all individual sets being registered. While each set
* is individually sorted, they may become unsorted when concatenated,
* hence re-sorting the final set again is required to make binary
- * searching the set using btf_id_set_contains function work.
+ * searching the set using btf_id_set8_contains function work.
*/
set_cnt = set ? set->cnt : 0;
@@ -7044,8 +7361,8 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
}
/* Grow set */
- set = krealloc(tab->sets[hook][type],
- offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+ set = krealloc(tab->sets[hook],
+ offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
GFP_KERNEL | __GFP_NOWARN);
if (!set) {
ret = -ENOMEM;
@@ -7053,15 +7370,15 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
}
/* For newly allocated set, initialize set->cnt to 0 */
- if (!tab->sets[hook][type])
+ if (!tab->sets[hook])
set->cnt = 0;
- tab->sets[hook][type] = set;
+ tab->sets[hook] = set;
/* Concatenate the two sets */
- memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
set->cnt += add_set->cnt;
- sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+ sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
return 0;
end:
@@ -7069,38 +7386,25 @@ end:
return ret;
}
-static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
- const struct btf_kfunc_id_set *kset)
-{
- bool vmlinux_set = !btf_is_module(btf);
- int type, ret = 0;
-
- for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
- if (!kset->sets[type])
- continue;
-
- ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
- if (ret)
- break;
- }
- return ret;
-}
-
-static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
enum btf_kfunc_hook hook,
- enum btf_kfunc_type type,
u32 kfunc_btf_id)
{
- struct btf_id_set *set;
+ struct btf_id_set8 *set;
+ u32 *id;
- if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
- return false;
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return NULL;
if (!btf->kfunc_set_tab)
- return false;
- set = btf->kfunc_set_tab->sets[hook][type];
+ return NULL;
+ set = btf->kfunc_set_tab->sets[hook];
if (!set)
- return false;
- return btf_id_set_contains(set, kfunc_btf_id);
+ return NULL;
+ id = btf_id_set8_contains(set, kfunc_btf_id);
+ if (!id)
+ return NULL;
+ /* The flags for BTF ID are located next to it */
+ return id + 1;
}
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
@@ -7113,6 +7417,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
@@ -7128,14 +7433,14 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
* keeping the reference for the duration of the call provides the necessary
* protection for looking up a well-formed btf->kfunc_set_tab.
*/
-bool btf_kfunc_id_set_contains(const struct btf *btf,
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
enum bpf_prog_type prog_type,
- enum btf_kfunc_type type, u32 kfunc_btf_id)
+ u32 kfunc_btf_id)
{
enum btf_kfunc_hook hook;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
}
/* This function must be invoked only from initcalls/module init functions */
@@ -7162,7 +7467,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
return PTR_ERR(btf);
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- ret = btf_populate_kfunc_set(btf, hook, kset);
+ ret = btf_populate_kfunc_set(btf, hook, kset->set);
btf_put(btf);
return ret;
}
@@ -7302,95 +7607,15 @@ EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
#define MAX_TYPES_ARE_COMPAT_DEPTH 2
-static
-int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
- const struct btf *targ_btf, __u32 targ_id,
- int level)
-{
- const struct btf_type *local_type, *targ_type;
- int depth = 32; /* max recursion depth */
-
- /* caller made sure that names match (ignoring flavor suffix) */
- local_type = btf_type_by_id(local_btf, local_id);
- targ_type = btf_type_by_id(targ_btf, targ_id);
- if (btf_kind(local_type) != btf_kind(targ_type))
- return 0;
-
-recur:
- depth--;
- if (depth < 0)
- return -EINVAL;
-
- local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id);
- targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id);
- if (!local_type || !targ_type)
- return -EINVAL;
-
- if (btf_kind(local_type) != btf_kind(targ_type))
- return 0;
-
- switch (btf_kind(local_type)) {
- case BTF_KIND_UNKN:
- case BTF_KIND_STRUCT:
- case BTF_KIND_UNION:
- case BTF_KIND_ENUM:
- case BTF_KIND_FWD:
- return 1;
- case BTF_KIND_INT:
- /* just reject deprecated bitfield-like integers; all other
- * integers are by default compatible between each other
- */
- return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0;
- case BTF_KIND_PTR:
- local_id = local_type->type;
- targ_id = targ_type->type;
- goto recur;
- case BTF_KIND_ARRAY:
- local_id = btf_array(local_type)->type;
- targ_id = btf_array(targ_type)->type;
- goto recur;
- case BTF_KIND_FUNC_PROTO: {
- struct btf_param *local_p = btf_params(local_type);
- struct btf_param *targ_p = btf_params(targ_type);
- __u16 local_vlen = btf_vlen(local_type);
- __u16 targ_vlen = btf_vlen(targ_type);
- int i, err;
-
- if (local_vlen != targ_vlen)
- return 0;
-
- for (i = 0; i < local_vlen; i++, local_p++, targ_p++) {
- if (level <= 0)
- return -EINVAL;
-
- btf_type_skip_modifiers(local_btf, local_p->type, &local_id);
- btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id);
- err = __bpf_core_types_are_compat(local_btf, local_id,
- targ_btf, targ_id,
- level - 1);
- if (err <= 0)
- return err;
- }
-
- /* tail recurse for return type check */
- btf_type_skip_modifiers(local_btf, local_type->type, &local_id);
- btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id);
- goto recur;
- }
- default:
- return 0;
- }
-}
-
/* Check local and target types for compatibility. This check is used for
* type-based CO-RE relocations and follow slightly different rules than
* field-based relocations. This function assumes that root types were already
* checked for name match. Beyond that initial root-level name check, names
* are completely ignored. Compatibility rules are as follows:
- * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but
* kind should match for local and target types (i.e., STRUCT is not
* compatible with UNION);
- * - for ENUMs, the size is ignored;
+ * - for ENUMs/ENUM64s, the size is ignored;
* - for INT, size and signedness are ignored;
* - for ARRAY, dimensionality is ignored, element types are checked for
* compatibility recursively;
@@ -7404,11 +7629,19 @@ recur:
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
const struct btf *targ_btf, __u32 targ_id)
{
- return __bpf_core_types_are_compat(local_btf, local_id,
- targ_btf, targ_id,
+ return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id,
MAX_TYPES_ARE_COMPAT_DEPTH);
}
+#define MAX_TYPES_MATCH_DEPTH 2
+
+int bpf_core_types_match(const struct btf *local_btf, u32 local_id,
+ const struct btf *targ_btf, u32 targ_id)
+{
+ return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false,
+ MAX_TYPES_MATCH_DEPTH);
+}
+
static bool bpf_core_is_flavor_sep(const char *s)
{
/* check X___Y name pattern, where X and Y are not underscores */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index afb414b26d01..bf2fdb33fb31 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,6 +14,8 @@
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>
@@ -61,6 +63,132 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
return run_ctx.retval;
}
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct sock *sk;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sk = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct socket *sock;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sock = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
+ cgrp = task_dfl_cgroup(current);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+#ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+ u32 attach_btf_id;
+ int refcnt;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ int i;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
+ return CGROUP_LSM_START + i;
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == 0)
+ return CGROUP_LSM_START + i;
+
+ return -E2BIG;
+
+}
+
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
+ cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+
+ cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ cgroup_lsm_atype[i].refcnt++;
+}
+
+void bpf_cgroup_atype_put(int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ mutex_lock(&cgroup_mutex);
+ if (--cgroup_lsm_atype[i].refcnt <= 0)
+ cgroup_lsm_atype[i].attach_btf_id = 0;
+ WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
+ mutex_unlock(&cgroup_mutex);
+}
+#else
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_LSM */
+
void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
@@ -157,15 +285,22 @@ static void cgroup_bpf_release(struct work_struct *work)
mutex_lock(&cgroup_mutex);
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
- struct list_head *progs = &cgrp->bpf.progs[atype];
- struct bpf_prog_list *pl, *pltmp;
+ struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct bpf_prog_list *pl;
+ struct hlist_node *pltmp;
- list_for_each_entry_safe(pl, pltmp, progs, node) {
- list_del(&pl->node);
- if (pl->prog)
+ hlist_for_each_entry_safe(pl, pltmp, progs, node) {
+ hlist_del(&pl->node);
+ if (pl->prog) {
+ if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->prog);
bpf_prog_put(pl->prog);
- if (pl->link)
+ }
+ if (pl->link) {
+ if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
bpf_cgroup_link_auto_detach(pl->link);
+ }
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
@@ -217,12 +352,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct list_head *head)
+static u32 prog_list_length(struct hlist_head *head)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
cnt++;
@@ -291,7 +426,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- list_for_each_entry(pl, &p->bpf.progs[atype], node) {
+ hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
@@ -342,7 +477,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cgroup_bpf_get(p);
for (i = 0; i < NR; i++)
- INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+ INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
INIT_LIST_HEAD(&cgrp->bpf.storages);
@@ -418,7 +553,7 @@ cleanup:
#define BPF_CGROUP_MAX_PROGS 64
-static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
struct bpf_prog *replace_prog,
@@ -428,12 +563,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* single-attach case */
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
return NULL;
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
@@ -444,7 +579,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* direct prog multi-attach w/ replacement case */
if (replace_prog) {
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == replace_prog)
/* a match found */
return pl;
@@ -478,9 +613,10 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_prog *new_prog = prog ? : link->link.prog;
enum cgroup_bpf_attach_type atype;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -494,7 +630,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
/* replace_prog implies BPF_F_REPLACE, and vice versa */
return -EINVAL;
- atype = to_cgroup_bpf_attach_type(type);
+ atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -503,7 +639,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
if (!hierarchy_allows_attach(cgrp, atype))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
+ if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -525,12 +661,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
if (pl) {
old_prog = pl->prog;
} else {
+ struct hlist_node *last = NULL;
+
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
- list_add_tail(&pl->node, progs);
+ if (hlist_empty(progs))
+ hlist_add_head(&pl->node, progs);
+ else
+ hlist_for_each(last, progs) {
+ if (last->next)
+ continue;
+ hlist_add_behind(&pl->node, last);
+ break;
+ }
}
pl->prog = prog;
@@ -538,17 +684,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
bpf_cgroup_storages_assign(pl->storage, storage);
cgrp->bpf.flags[atype] = saved_flags;
+ if (type == BPF_LSM_CGROUP) {
+ err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
+ if (err)
+ goto cleanup;
+ }
+
err = update_effective_progs(cgrp, atype);
if (err)
- goto cleanup;
+ goto cleanup_trampoline;
- if (old_prog)
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
- else
+ } else {
static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ }
bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
+cleanup_trampoline:
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(new_prog);
+
cleanup:
if (old_prog) {
pl->prog = old_prog;
@@ -556,7 +715,7 @@ cleanup:
}
bpf_cgroup_storages_free(new_storage);
if (!old_prog) {
- list_del(&pl->node);
+ hlist_del(&pl->node);
kfree(pl);
}
return err;
@@ -587,7 +746,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
struct cgroup_subsys_state *css;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
- struct list_head *head;
+ struct hlist_head *head;
struct cgroup *cg;
int pos;
@@ -603,7 +762,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
continue;
head = &cg->bpf.progs[atype];
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
if (pl->link == link)
@@ -637,10 +796,10 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
bool found = false;
- atype = to_cgroup_bpf_attach_type(link->type);
+ atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -649,7 +808,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
if (link->link.prog->type != new_prog->type)
return -EINVAL;
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->link == link) {
found = true;
break;
@@ -688,7 +847,7 @@ out_unlock:
return ret;
}
-static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
bool allow_multi)
@@ -696,14 +855,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog_list *pl;
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* report error when trying to detach and nothing is attached */
return ERR_PTR(-ENOENT);
/* to maintain backward compatibility NONE and OVERRIDE cgroups
* allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
if (!prog && !link)
@@ -713,7 +872,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == prog && pl->link == link)
return pl;
}
@@ -721,6 +880,62 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
}
/**
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
+ * cgrp->bpf.inactive table we can recover by
+ * recomputing the array in place.
+ *
+ * @cgrp: The cgroup which descendants to travers
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @atype: Type of detach operation
+ */
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct hlist_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ /* recompute effective prog array in place */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link or prog in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->prog == prog && pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+
+ /* no link or prog match, skip the cgroup of this layer */
+ continue;
+found:
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+
+ /* Remove the program from the array */
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ "Failed to purge a prog from array at index %d", pos);
+ }
+}
+
+/**
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
@@ -737,11 +952,16 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
+ u32 attach_btf_id = 0;
u32 flags;
- int err;
- atype = to_cgroup_bpf_attach_type(type);
+ if (prog)
+ attach_btf_id = prog->aux->attach_btf_id;
+ if (link)
+ attach_btf_id = link->link.prog->aux->attach_btf_id;
+
+ atype = bpf_cgroup_atype_find(type, attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -761,26 +981,27 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
pl->link = NULL;
- err = update_effective_progs(cgrp, atype);
- if (err)
- goto cleanup;
+ if (update_effective_progs(cgrp, atype)) {
+ /* if update effective array failed replace the prog with a dummy prog*/
+ pl->prog = old_prog;
+ pl->link = link;
+ purge_effective_progs(cgrp, old_prog, link, atype);
+ }
/* now can actually delete it from this cgroup list */
- list_del(&pl->node);
+ hlist_del(&pl->node);
+
kfree(pl);
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* last program was detached, reset flags to zero */
cgrp->bpf.flags[atype] = 0;
- if (old_prog)
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
+ }
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
return 0;
-
-cleanup:
- /* restore back prog or link */
- pl->prog = old_prog;
- pl->link = link;
- return err;
}
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
@@ -798,57 +1019,98 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
+ __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
+ enum cgroup_bpf_attach_type from_atype, to_atype;
enum cgroup_bpf_attach_type atype;
struct bpf_prog_array *effective;
- struct list_head *progs;
- struct bpf_prog *prog;
int cnt, ret = 0, i;
+ int total_cnt = 0;
u32 flags;
- atype = to_cgroup_bpf_attach_type(type);
- if (atype < 0)
+ if (effective_query && prog_attach_flags)
return -EINVAL;
- progs = &cgrp->bpf.progs[atype];
- flags = cgrp->bpf.flags[atype];
+ if (type == BPF_LSM_CGROUP) {
+ if (!effective_query && attr->query.prog_cnt &&
+ prog_ids && !prog_attach_flags)
+ return -EINVAL;
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
+ from_atype = CGROUP_LSM_START;
+ to_atype = CGROUP_LSM_END;
+ flags = 0;
+ } else {
+ from_atype = to_cgroup_bpf_attach_type(type);
+ if (from_atype < 0)
+ return -EINVAL;
+ to_atype = from_atype;
+ flags = cgrp->bpf.flags[from_atype];
+ }
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(effective);
- else
- cnt = prog_list_length(progs);
+ for (atype = from_atype; atype <= to_atype; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective);
+ } else {
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+ }
+ }
+ /* always output uattr->query.attach_flags as 0 during effective query */
+ flags = effective_query ? 0 : flags;
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
- if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
return -EFAULT;
- if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
/* return early if user requested only program count + flags */
return 0;
- if (attr->query.prog_cnt < cnt) {
- cnt = attr->query.prog_cnt;
+
+ if (attr->query.prog_cnt < total_cnt) {
+ total_cnt = attr->query.prog_cnt;
ret = -ENOSPC;
}
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
- } else {
- struct bpf_prog_list *pl;
- u32 id;
-
- i = 0;
- list_for_each_entry(pl, progs, node) {
- prog = prog_list_prog(pl);
- id = prog->aux->id;
- if (copy_to_user(prog_ids + i, &id, sizeof(id)))
- return -EFAULT;
- if (++i == cnt)
- break;
+ for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ } else {
+ struct hlist_head *progs;
+ struct bpf_prog_list *pl;
+ struct bpf_prog *prog;
+ u32 id;
+
+ progs = &cgrp->bpf.progs[atype];
+ cnt = min_t(int, prog_list_length(progs), total_cnt);
+ i = 0;
+ hlist_for_each_entry(pl, progs, node) {
+ prog = prog_list_prog(pl);
+ id = prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
+
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i,
+ &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
}
+
+ prog_ids += cnt;
+ total_cnt -= cnt;
}
return ret;
}
@@ -937,6 +1199,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
cg_link->type));
+ if (cg_link->type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
cg = cg_link->cgroup;
cg_link->cgroup = NULL;
@@ -1273,6 +1537,37 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
return ret;
}
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_0(bpf_get_retval)
{
struct bpf_cg_run_ctx *ctx =
@@ -1281,7 +1576,7 @@ BPF_CALL_0(bpf_get_retval)
return ctx->retval;
}
-static const struct bpf_func_proto bpf_get_retval_proto = {
+const struct bpf_func_proto bpf_get_retval_proto = {
.func = bpf_get_retval,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1296,7 +1591,7 @@ BPF_CALL_1(bpf_set_retval, int, retval)
return 0;
}
-static const struct bpf_func_proto bpf_set_retval_proto = {
+const struct bpf_func_proto bpf_set_retval_proto = {
.func = bpf_set_retval,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1304,32 +1599,26 @@ static const struct bpf_func_proto bpf_set_retval_proto = {
};
static const struct bpf_func_proto *
-cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_retval:
- return &bpf_get_retval_proto;
- case BPF_FUNC_set_retval:
- return &bpf_set_retval_proto;
default:
return bpf_base_func_proto(func_id);
}
}
-static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return cgroup_base_func_proto(func_id, prog);
-}
-
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -1842,11 +2131,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_strtol:
- return &bpf_strtol_proto;
- case BPF_FUNC_strtoul:
- return &bpf_strtoul_proto;
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
case BPF_FUNC_sysctl_get_current_value:
@@ -1857,8 +2152,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_set_new_value_proto;
case BPF_FUNC_ktime_get_coarse_ns:
return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -1979,6 +2276,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -2000,8 +2307,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return cgroup_base_func_proto(func_id, prog);
+ return bpf_base_func_proto(func_id);
}
}
@@ -2166,3 +2475,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
+
+/* Common helpers for cgroup hooks with valid process context. */
+const struct bpf_func_proto *
+cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..9fcf09f2ef00
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ mutex_unlock(&cgroup_mutex);
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ p->start_css = &cgrp->self;
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_v1v2_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5f6f3f829b36..25a54e04560e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -68,11 +68,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
{
u8 *ptr = NULL;
- if (k >= SKF_NET_OFF)
+ if (k >= SKF_NET_OFF) {
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
+ } else if (k >= SKF_LL_OFF) {
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return NULL;
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
+ }
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
@@ -107,6 +109,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+#ifdef CONFIG_CGROUP_BPF
+ aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
+#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
mutex_init(&fp->aux->used_maps_mutex);
@@ -176,7 +181,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
* here is relative to the prog itself instead of the main prog.
* This array has one entry for each xlated bpf insn.
*
- * jited_off is the byte off to the last byte of the jited insn.
+ * jited_off is the byte off to the end of the jited insn.
*
* Hence, with
* insn_start:
@@ -647,12 +652,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
return fp->jited && !bpf_prog_was_classic(fp);
}
-static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
-{
- return list_empty(&fp->aux->ksym.lnode) ||
- fp->aux->ksym.lnode.prev == LIST_POISON2;
-}
-
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
@@ -826,17 +825,13 @@ struct bpf_prog_pack {
unsigned long bitmap[];
};
-#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
-
-static size_t bpf_prog_pack_size = -1;
-static size_t bpf_prog_pack_mask = -1;
-
-static int bpf_prog_chunk_count(void)
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
{
- WARN_ON_ONCE(bpf_prog_pack_size == -1);
- return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE;
+ memset(area, 0, size);
}
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
static DEFINE_MUTEX(pack_mutex);
static LIST_HEAD(pack_list);
@@ -844,59 +839,37 @@ static LIST_HEAD(pack_list);
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
*/
#ifdef PMD_SIZE
-#define BPF_HPAGE_SIZE PMD_SIZE
-#define BPF_HPAGE_MASK PMD_MASK
+#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
#else
-#define BPF_HPAGE_SIZE PAGE_SIZE
-#define BPF_HPAGE_MASK PAGE_MASK
+#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif
-static size_t select_bpf_prog_pack_size(void)
-{
- size_t size;
- void *ptr;
-
- size = BPF_HPAGE_SIZE * num_online_nodes();
- ptr = module_alloc(size);
-
- /* Test whether we can get huge pages. If not just use PAGE_SIZE
- * packs.
- */
- if (!ptr || !is_vm_area_hugepages(ptr)) {
- size = PAGE_SIZE;
- bpf_prog_pack_mask = PAGE_MASK;
- } else {
- bpf_prog_pack_mask = BPF_HPAGE_MASK;
- }
-
- vfree(ptr);
- return size;
-}
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_prog_pack *pack;
- pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())),
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
GFP_KERNEL);
if (!pack)
return NULL;
- pack->ptr = module_alloc(bpf_prog_pack_size);
+ pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
if (!pack->ptr) {
kfree(pack);
return NULL;
}
- bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size);
- bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
+ bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
- set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
return pack;
}
-static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
struct bpf_prog_pack *pack;
@@ -904,10 +877,7 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn
void *ptr = NULL;
mutex_lock(&pack_mutex);
- if (bpf_prog_pack_size == -1)
- bpf_prog_pack_size = select_bpf_prog_pack_size();
-
- if (size > bpf_prog_pack_size) {
+ if (size > BPF_PROG_PACK_SIZE) {
size = round_up(size, PAGE_SIZE);
ptr = module_alloc(size);
if (ptr) {
@@ -919,9 +889,9 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn
goto out;
}
list_for_each_entry(pack, &pack_list, list) {
- pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
nbits, 0);
- if (pos < bpf_prog_chunk_count())
+ if (pos < BPF_PROG_CHUNK_COUNT)
goto found_free_area;
}
@@ -940,23 +910,20 @@ out:
return ptr;
}
-static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(struct bpf_binary_header *hdr)
{
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
unsigned long pos;
- void *pack_ptr;
mutex_lock(&pack_mutex);
- if (hdr->size > bpf_prog_pack_size) {
+ if (hdr->size > BPF_PROG_PACK_SIZE) {
module_memfree(hdr);
goto out;
}
- pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask);
-
list_for_each_entry(tmp, &pack_list, list) {
- if (tmp->ptr == pack_ptr) {
+ if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
pack = tmp;
break;
}
@@ -966,14 +933,14 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
goto out;
nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
- pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+ pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
bitmap_clear(pack->bitmap, pos, nbits);
- if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
- bpf_prog_chunk_count(), 0) == 0) {
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
list_del(&pack->list);
module_memfree(pack->ptr);
kfree(pack);
@@ -1009,7 +976,7 @@ pure_initcall(bpf_jit_charge_init);
int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
+ if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
if (!bpf_capable()) {
atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
@@ -1065,7 +1032,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = prandom_u32_max(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -1127,7 +1094,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
- start = (get_random_int() % hole) & ~(alignment - 1);
+ start = prandom_u32_max(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
*rw_image = &(*rw_header)->image[start];
@@ -1150,7 +1117,6 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
bpf_prog_pack_free(ro_header);
return PTR_ERR(ptr);
}
- prog->aux->use_bpf_prog_pack = true;
return 0;
}
@@ -1174,17 +1140,23 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
bpf_jit_uncharge_modmem(size);
}
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ return (void *)addr;
+}
+
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
unsigned long real_start = (unsigned long)fp->bpf_func;
unsigned long addr;
- if (fp->aux->use_bpf_prog_pack)
- addr = real_start & BPF_PROG_CHUNK_MASK;
- else
- addr = real_start & PAGE_MASK;
-
+ addr = real_start & PAGE_MASK;
return (void *)addr;
}
@@ -1197,11 +1169,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- if (fp->aux->use_bpf_prog_pack)
- bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
- else
- bpf_jit_binary_free(hdr);
-
+ bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -1248,7 +1216,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
bool emit_zext)
{
struct bpf_insn *to = to_buff;
- u32 imm_rnd = get_random_int();
+ u32 imm_rnd = get_random_u32();
s16 off;
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
@@ -2039,7 +2007,7 @@ out:
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_EXT_REG]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
@@ -2279,6 +2247,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
kfree_rcu(progs, rcu);
}
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+ struct bpf_prog_array *progs;
+
+ progs = container_of(rcu, struct bpf_prog_array, rcu);
+ kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
+ return;
+ call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
@@ -2555,6 +2538,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#ifdef CONFIG_BPF_SYSCALL
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
#endif
+#ifdef CONFIG_CGROUP_BPF
+ if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
+ bpf_cgroup_atype_put(aux->cgroup_atype);
+#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
@@ -2641,6 +2628,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
@@ -2651,6 +2639,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
@@ -2714,6 +2704,12 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool __weak bpf_jit_supports_subprog_tailcalls(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_kfunc_call(void)
{
return false;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index f4860ac756cd..b5ba34ddd4b6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -97,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
- cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
+ cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
@@ -118,7 +118,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
free_cmap:
- kfree(cmap);
+ bpf_map_area_free(cmap);
return ERR_PTR(err);
}
@@ -623,7 +623,7 @@ static void cpu_map_free(struct bpf_map *map)
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
bpf_map_area_free(cmap->cpu_map);
- kfree(cmap);
+ bpf_map_area_free(cmap);
}
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index c2867068e5bd..f9a87dcc5535 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -163,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
- dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
+ dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
err = dev_map_init_map(dtab, attr);
if (err) {
- kfree(dtab);
+ bpf_map_area_free(dtab);
return ERR_PTR(err);
}
@@ -240,7 +240,7 @@ static void dev_map_free(struct bpf_map *map)
bpf_map_area_free(dtab->netdev_map);
}
- kfree(dtab);
+ bpf_map_area_free(dtab);
}
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -477,7 +477,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdpf->len);
+ err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
if (unlikely(err))
return err;
@@ -536,7 +536,7 @@ static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
!obj->dev->netdev_ops->ndo_xdp_xmit)
return false;
- if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
+ if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
return false;
return true;
@@ -845,7 +845,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab_netdev *dev;
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
- GFP_ATOMIC | __GFP_NOWARN,
+ GFP_NOWAIT | __GFP_NOWARN,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 2444bd15cc2d..fa64b80b8bca 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -85,12 +85,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
return false;
}
-int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
return -ENOTSUPP;
}
-static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
{
s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
int i;
@@ -99,12 +99,12 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
if (d->progs[i].prog)
*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
}
- return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+ return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
}
static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
- void *old, *new;
+ void *old, *new, *tmp;
u32 noff;
int err;
@@ -117,8 +117,14 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
}
new = d->num_progs ? d->image + noff : NULL;
+ tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
- if (bpf_dispatcher_prepare(d, new))
+ /* Prepare the dispatcher in d->rw_image. Then use
+ * bpf_arch_text_copy to update d->image, which is RO+X.
+ */
+ if (bpf_dispatcher_prepare(d, new, tmp))
+ return;
+ if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
return;
}
@@ -140,9 +146,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_jit_alloc_exec_page();
+ d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
if (!d->image)
goto out;
+ d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!d->rw_image) {
+ u32 size = PAGE_SIZE;
+
+ bpf_arch_text_copy(d->image, &size, sizeof(size));
+ bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ d->image = NULL;
+ goto out;
+ }
bpf_image_ksym_add(d->image, &d->ksym);
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 17fb69c0e0dc..f39ee3e05589 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,6 +14,7 @@
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -61,30 +62,22 @@
*
* As regular device interrupt handlers and soft interrupts are forced into
* thread context, the existing code which does
- * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
* just works.
*
* In theory the BPF locks could be converted to regular spinlocks as well,
* but the bucket locks and percpu_freelist locks can be taken from
* arbitrary contexts (perf, kprobes, tracepoints) which are required to be
- * atomic contexts even on RT. These mechanisms require preallocated maps,
- * so there is no need to invoke memory allocations within the lock held
- * sections.
- *
- * BPF maps which need dynamic allocation are only used from (forced)
- * thread context on RT and can therefore use regular spinlocks which in
- * turn allows to invoke memory allocations from the lock held section.
- *
- * On a non RT kernel this distinction is neither possible nor required.
- * spinlock maps to raw_spinlock and the extra code is optimized out by the
- * compiler.
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
*/
struct bucket {
struct hlist_nulls_head head;
- union {
- raw_spinlock_t raw_lock;
- spinlock_t lock;
- };
+ raw_spinlock_t raw_lock;
};
#define HASHTAB_MAP_LOCK_COUNT 8
@@ -92,6 +85,8 @@ struct bucket {
struct bpf_htab {
struct bpf_map map;
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
struct bucket *buckets;
void *elems;
union {
@@ -99,7 +94,12 @@ struct bpf_htab {
struct bpf_lru lru;
};
struct htab_elem *__percpu *extra_elems;
- atomic_t count; /* number of elements in this hashtable */
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
@@ -114,14 +114,14 @@ struct htab_elem {
struct {
void *padding;
union {
- struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
union {
- struct rcu_head rcu;
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -133,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab)
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
-static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
-{
- return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
-}
-
static void htab_init_buckets(struct bpf_htab *htab)
{
unsigned int i;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- if (htab_use_raw_lock(htab)) {
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- lockdep_set_class(&htab->buckets[i].raw_lock,
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ lockdep_set_class(&htab->buckets[i].raw_lock,
&htab->lockdep_key);
- } else {
- spin_lock_init(&htab->buckets[i].lock);
- lockdep_set_class(&htab->buckets[i].lock,
- &htab->lockdep_key);
- }
cond_resched();
}
}
@@ -165,17 +154,14 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
hash = hash & HASHTAB_MAP_LOCK_MASK;
- migrate_disable();
+ preempt_disable();
if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
return -EBUSY;
}
- if (htab_use_raw_lock(htab))
- raw_spin_lock_irqsave(&b->raw_lock, flags);
- else
- spin_lock_irqsave(&b->lock, flags);
+ raw_spin_lock_irqsave(&b->raw_lock, flags);
*pflags = flags;
return 0;
@@ -186,12 +172,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
unsigned long flags)
{
hash = hash & HASHTAB_MAP_LOCK_MASK;
- if (htab_use_raw_lock(htab))
- raw_spin_unlock_irqrestore(&b->raw_lock, flags);
- else
- spin_unlock_irqrestore(&b->lock, flags);
+ raw_spin_unlock_irqrestore(&b->raw_lock, flags);
__this_cpu_dec(*(htab->map_locked[hash]));
- migrate_enable();
+ preempt_enable();
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -311,12 +294,8 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
- u32 key_size = htab->map.key_size;
-
l = container_of(node, struct htab_elem, lru_node);
- memcpy(l->key, key, key_size);
- check_and_init_map_value(&htab->map,
- l->key + round_up(key_size, 8));
+ memcpy(l->key, key, htab->map.key_size);
return l;
}
@@ -432,8 +411,6 @@ static int htab_map_alloc_check(union bpf_attr *attr)
bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
int numa_node = bpf_map_attr_numa_node(attr);
- BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
- offsetof(struct htab_elem, hash_node.pprev));
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
@@ -495,7 +472,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
struct bpf_htab *htab;
int err, i;
- htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -550,10 +527,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
- htab->hashrnd = get_random_int();
+ htab->hashrnd = get_random_u32();
htab_init_buckets(htab);
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
+ }
+
if (prealloc) {
err = prealloc_init(htab);
if (err)
@@ -567,6 +567,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_prealloc;
}
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
}
return &htab->map;
@@ -574,12 +584,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_prealloc:
prealloc_destroy(htab);
free_map_locked:
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
@@ -851,17 +865,9 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
- free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
check_and_free_fields(htab, l);
- kfree(l);
-}
-
-static void htab_elem_free_rcu(struct rcu_head *head)
-{
- struct htab_elem *l = container_of(head, struct htab_elem, rcu);
- struct bpf_htab *htab = l->htab;
-
- htab_elem_free(htab, l);
+ bpf_mem_cache_free(&htab->ma, l);
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -875,6 +881,31 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
}
}
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
htab_put_fd_value(htab, l);
@@ -883,9 +914,8 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
- atomic_dec(&htab->count);
- l->htab = htab;
- call_rcu(&l->rcu, htab_elem_free_rcu);
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
}
}
@@ -910,13 +940,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
{
- /* When using prealloc and not setting the initial value on all cpus,
- * zero-fill element values for other cpus (just as what happens when
- * not using prealloc). Otherwise, bpf program has no way to ensure
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
* known initial values for cpus other than current one
* (onallcpus=false always when coming from bpf prog).
*/
- if (htab_is_prealloc(htab) && !onallcpus) {
+ if (!onallcpus) {
u32 size = round_up(htab->map.value_size, 8);
int current_cpu = raw_smp_processor_id();
int cpu;
@@ -967,19 +996,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = container_of(l, struct htab_elem, fnode);
}
} else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries)
- if (!old_elem) {
+ if (is_map_full(htab))
+ if (!old_elem)
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
- l_new = ERR_PTR(-E2BIG);
- goto dec_count;
- }
- l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
- GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
@@ -990,18 +1016,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
memcpy(l_new->key, key, key_size);
if (percpu) {
- size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
- GFP_ATOMIC | __GFP_NOWARN);
+ pptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
if (!pptr) {
- kfree(l_new);
+ bpf_mem_cache_free(&htab->ma, l_new);
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ l_new->ptr_to_pptr = pptr;
+ pptr = *(void **)pptr;
}
pcpu_init_value(htab, pptr, value, onallcpus);
@@ -1020,7 +1046,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new->hash = hash;
return l_new;
dec_count:
- atomic_dec(&htab->count);
+ dec_elem_count(htab);
return l_new;
}
@@ -1420,6 +1446,10 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread, so disable migration here,
+ * since bpf_mem_cache_free() relies on that.
+ */
+ migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1430,6 +1460,7 @@ static void delete_all_elements(struct bpf_htab *htab)
htab_elem_free(htab, l);
}
}
+ migrate_enable();
}
static void htab_free_malloced_timers(struct bpf_htab *htab)
@@ -1479,10 +1510,10 @@ static void htab_map_free(struct bpf_map *map)
* There is no need to synchronize_rcu() here to protect map elements.
*/
- /* some of free_htab_elem() callbacks for elements of this map may
- * not have executed. Wait for them.
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is reponsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
*/
- rcu_barrier();
if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
} else {
@@ -1493,10 +1524,14 @@ static void htab_map_free(struct bpf_map *map)
bpf_map_free_kptr_off_tab(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
free_percpu(htab->map_locked[i]);
lockdep_unregister_key(&htab->lockdep_key);
- kfree(htab);
+ bpf_map_area_free(htab);
}
static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1695,8 +1730,11 @@ again_nocopy:
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
ret = htab_lock_bucket(htab, b, batch, &flags);
- if (ret)
- goto next_batch;
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
}
bucket_cnt = 0;
@@ -2064,6 +2102,7 @@ static int bpf_iter_init_hash_map(void *priv_data,
seq_info->percpu_value_buf = value_buf;
}
+ bpf_map_inc_with_uref(map);
seq_info->map = map;
seq_info->htab = container_of(map, struct bpf_htab, map);
return 0;
@@ -2073,6 +2112,7 @@ static void bpf_iter_fini_hash_map(void *priv_data)
{
struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ bpf_map_put_with_uref(seq_info->map);
kfree(seq_info->percpu_value_buf);
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 225806a02efb..a6b04faed282 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -15,6 +15,7 @@
#include <linux/ctype.h>
#include <linux/jiffies.h>
#include <linux/pid_namespace.h>
+#include <linux/poison.h>
#include <linux/proc_ns.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
@@ -198,6 +199,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
.ret_type = RET_INTEGER,
};
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+ /* NMI safe access to clock tai */
+ return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+ .func = bpf_ktime_get_tai_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
@@ -415,40 +428,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
-
-#ifdef CONFIG_CGROUP_BPF
-
-BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
-{
- /* flags argument is not used now,
- * but provides an ability to extend the API.
- * verifier checks that its value is correct.
- */
- enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
- struct bpf_cgroup_storage *storage;
- struct bpf_cg_run_ctx *ctx;
- void *ptr;
-
- /* get current cgroup storage from BPF run context */
- ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
- storage = ctx->prog_item->cgroup_storage[stype];
-
- if (stype == BPF_CGROUP_STORAGE_SHARED)
- ptr = &READ_ONCE(storage->buf)->data[0];
- else
- ptr = this_cpu_ptr(storage->percpu_buf);
-
- return (unsigned long)ptr;
-}
-
-const struct bpf_func_proto bpf_get_local_storage_proto = {
- .func = bpf_get_local_storage,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-#endif
+#endif /* CONFIG_CGROUPS */
#define BPF_STRTOX_BASE_MASK 0x1F
@@ -577,14 +557,13 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
-#endif
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
return strncmp(s1, s2, s1_sz);
}
-const struct bpf_func_proto bpf_strncmp_proto = {
+static const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1398,11 +1377,10 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
}
/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
- * helper is determined dynamically by the verifier.
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
*/
-#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-
-const struct bpf_func_proto bpf_kptr_xchg_proto = {
+static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.func = bpf_kptr_xchg,
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
@@ -1430,7 +1408,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
-static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
}
@@ -1468,6 +1446,8 @@ BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_
{
int err;
+ BTF_TYPE_EMIT(struct bpf_dynptr);
+
err = bpf_dynptr_check_size(size);
if (err)
goto error;
@@ -1487,7 +1467,7 @@ error:
return err;
}
-const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.func = bpf_dynptr_from_mem,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1497,11 +1477,12 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
};
-BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset)
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+ u32, offset, u64, flags)
{
int err;
- if (!src->data)
+ if (!src->data || flags)
return -EINVAL;
err = bpf_dynptr_check_off_len(src, offset, len);
@@ -1513,7 +1494,7 @@ BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
return 0;
}
-const struct bpf_func_proto bpf_dynptr_read_proto = {
+static const struct bpf_func_proto bpf_dynptr_read_proto = {
.func = bpf_dynptr_read,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1521,13 +1502,15 @@ const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_DYNPTR,
.arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len)
+BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+ u32, len, u64, flags)
{
int err;
- if (!dst->data || bpf_dynptr_is_rdonly(dst))
+ if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
return -EINVAL;
err = bpf_dynptr_check_off_len(dst, offset, len);
@@ -1539,7 +1522,7 @@ BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *,
return 0;
}
-const struct bpf_func_proto bpf_dynptr_write_proto = {
+static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1547,6 +1530,7 @@ const struct bpf_func_proto bpf_dynptr_write_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
@@ -1566,7 +1550,7 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
return (unsigned long)(ptr->data + ptr->offset + offset);
}
-const struct bpf_func_proto bpf_dynptr_data_proto = {
+static const struct bpf_func_proto bpf_dynptr_data_proto = {
.func = bpf_dynptr_data,
.gpl_only = false,
.ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
@@ -1613,6 +1597,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ktime_get_tai_ns:
+ return &bpf_ktime_get_tai_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1623,26 +1609,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
- case BPF_FUNC_ringbuf_reserve_dynptr:
- return &bpf_ringbuf_reserve_dynptr_proto;
- case BPF_FUNC_ringbuf_submit_dynptr:
- return &bpf_ringbuf_submit_dynptr_proto;
- case BPF_FUNC_ringbuf_discard_dynptr:
- return &bpf_ringbuf_discard_dynptr_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_loop:
- return &bpf_loop_proto;
case BPF_FUNC_strncmp:
return &bpf_strncmp_proto;
- case BPF_FUNC_dynptr_from_mem:
- return &bpf_dynptr_from_mem_proto;
- case BPF_FUNC_dynptr_read:
- return &bpf_dynptr_read_proto;
- case BPF_FUNC_dynptr_write:
- return &bpf_dynptr_write_proto;
- case BPF_FUNC_dynptr_data:
- return &bpf_dynptr_data_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
default:
break;
}
@@ -1671,6 +1643,26 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_timer_cancel_proto;
case BPF_FUNC_kptr_xchg:
return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_user_ringbuf_drain:
+ return &bpf_user_ringbuf_drain_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
default:
break;
}
@@ -1707,3 +1699,21 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
}
}
+
+BTF_SET8_START(tracing_btf_ids)
+#ifdef CONFIG_KEXEC_CORE
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_SET8_END(tracing_btf_ids)
+
+static const struct btf_kfunc_id_set tracing_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tracing_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set);
+}
+
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 8654fc97f5fe..098cf336fae6 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -165,7 +165,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
- __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
+ __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN,
map->numa_node);
if (!new)
return -ENOMEM;
@@ -313,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
- map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
- __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node);
+ map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
if (!map)
return ERR_PTR(-ENOMEM);
@@ -346,7 +345,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
- kfree(map);
+ bpf_map_area_free(map);
}
static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index f0d05a3cc4b9..d833496e9e42 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -285,7 +285,7 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN,
+ node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
trie->map.numa_node);
if (!node)
return NULL;
@@ -558,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
attr->value_size > LPM_VAL_SIZE_MAX)
return ERR_PTR(-EINVAL);
- trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
if (!trie)
return ERR_PTR(-ENOMEM);
@@ -609,7 +609,7 @@ static void trie_free(struct bpf_map *map)
}
out:
- kfree(trie);
+ bpf_map_area_free(trie);
}
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..5f83be1d2018
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,635 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > 4096)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 1;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+
+ struct rcu_head rcu;
+ struct llist_head free_by_rcu;
+ struct llist_head waiting_for_gp;
+ atomic_t call_rcu_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node)
+{
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
+
+ if (c->percpu_size) {
+ void **obj = kmalloc_node(c->percpu_size, flags, node);
+ void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+#endif
+
+#ifdef CONFIG_MEMCG
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ unsigned long flags;
+ void *obj;
+ int i;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (i = 0; i < cnt; i++) {
+ obj = __alloc(c, node);
+ if (!obj)
+ break;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(struct bpf_mem_cache *c, void *obj)
+{
+ if (c->percpu_size) {
+ free_percpu(((void **)obj)[1]);
+ kfree(obj);
+ return;
+ }
+
+ kfree(obj);
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
+ struct llist_node *pos, *t;
+
+ llist_for_each_safe(pos, t, llnode)
+ free_one(c, pos);
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+
+ call_rcu(&c->rcu, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu list.
+ */
+ __llist_add(llnode, &c->free_by_rcu);
+}
+
+static void do_call_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ return;
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ /* There is no concurrent __llist_add(waiting_for_gp) access.
+ * It doesn't race with llist_del_all either.
+ * But there could be two concurrent llist_del_all(waiting_for_gp):
+ * from __free_rcu() and from drain_mem_cache().
+ */
+ __llist_add(llnode, &c->waiting_for_gp);
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu() to wait for normal progs to finish
+ * and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ do {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(flags);
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(flags);
+ if (llnode)
+ enque_to_free(c, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(c, llnode);
+ do_call_rcu(c);
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ */
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+
+ /* To avoid consuming memory assume that 1st run of bpf
+ * prog won't be doing more than 4 map_update_elem from
+ * irq disabled region
+ */
+ alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+ struct bpf_mem_caches *cc, __percpu *pcc;
+ struct bpf_mem_cache *c, __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (percpu)
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ else
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ /* size == 0 && percpu is an invalid combination */
+ if (WARN_ON_ONCE(percpu))
+ return -EINVAL;
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ objcg = get_obj_cgroup_from_current();
+#endif
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ prefill_mem_cache(c, cpu);
+ }
+ }
+ ma->caches = pcc;
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+ */
+ llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
+ free_one(c, llnode);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ free_one(c, llnode);
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp lists was drained, but __free_rcu might
+ * still execute. Wait for it now before we freeing percpu caches.
+ */
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ copy->cache = ma->cache;
+ ma->cache = NULL;
+ copy->caches = ma->caches;
+ ma->caches = NULL;
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_unbound_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ /* objcg is the same across cpus */
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ if (c->objcg)
+ obj_cgroup_put(c->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ int idx;
+
+ if (!ptr)
+ return;
+
+ idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ if (idx < 0)
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index bd09290e3648..13e4efc971e6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -372,7 +372,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
attr->map_type != BPF_MAP_TYPE_HASH)
return ERR_PTR(-EINVAL);
- offmap = kzalloc(sizeof(*offmap), GFP_USER);
+ offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE);
if (!offmap)
return ERR_PTR(-ENOMEM);
@@ -404,7 +404,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
err_unlock:
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
return ERR_PTR(err);
}
@@ -428,7 +428,7 @@ void bpf_map_offload_map_free(struct bpf_map *map)
up_write(&bpf_devs_lock);
rtnl_unlock();
- kfree(offmap);
+ bpf_map_area_free(offmap);
}
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 3d897de89061..b6e7f5c5b9ab 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
node->next = head->first;
- head->first = node;
+ WRITE_ONCE(head->first, node);
}
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
{
int cpu, orig_cpu;
- orig_cpu = cpu = raw_smp_processor_id();
+ orig_cpu = raw_smp_processor_id();
while (1) {
- struct pcpu_freelist_head *head;
+ for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
+ struct pcpu_freelist_head *head;
- head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
- return;
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_spin_trylock(&head->lock)) {
+ pcpu_freelist_push_node(head, node);
+ raw_spin_unlock(&head->lock);
+ return;
+ }
}
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
/* cannot lock any per cpu lock, try extralist */
- if (cpu == orig_cpu &&
- pcpu_freelist_try_push_extra(s, node))
+ if (pcpu_freelist_try_push_extra(s, node))
return;
}
}
@@ -125,31 +123,29 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ continue;
raw_spin_lock(&head->lock);
node = head->first;
if (node) {
- head->first = node->next;
+ WRITE_ONCE(head->first, node->next);
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* per cpu lists are all empty, try extralist */
+ if (!READ_ONCE(s->extralist.first))
+ return NULL;
raw_spin_lock(&s->extralist.lock);
node = s->extralist.first;
if (node)
- s->extralist.first = node->next;
+ WRITE_ONCE(s->extralist.first, node->next);
raw_spin_unlock(&s->extralist.lock);
return node;
}
@@ -159,33 +155,29 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- int orig_cpu, cpu;
+ int cpu;
- orig_cpu = cpu = raw_smp_processor_id();
- while (1) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ continue;
if (raw_spin_trylock(&head->lock)) {
node = head->first;
if (node) {
- head->first = node->next;
+ WRITE_ONCE(head->first, node->next);
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
}
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = 0;
- if (cpu == orig_cpu)
- break;
}
/* cannot pop from per cpu lists, try extralist */
- if (!raw_spin_trylock(&s->extralist.lock))
+ if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
return NULL;
node = s->extralist.first;
if (node)
- s->extralist.first = node->next;
+ WRITE_ONCE(s->extralist.first, node->next);
raw_spin_unlock(&s->extralist.lock);
return node;
}
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index bfe24f8c5a20..6762b1260f2f 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -9,7 +9,7 @@ LLVM_STRIP ?= llvm-strip
TOOLS_PATH := $(abspath ../../../../tools)
BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
BPFTOOL_OUTPUT := $(abs_out)/bpftool
-DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
+DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
BPFTOOL ?= $(DEFAULT_BPFTOOL)
LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
@@ -61,9 +61,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU
OUTPUT=$(abspath $(dir $@))/ prefix= \
DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
-$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
- $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \
- OUTPUT=$(BPFTOOL_OUTPUT)/ \
- LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \
- LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \
- prefix= DESTDIR=$(abs_out)/ install-bin
+$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index a1c0794ae49d..8a5e060de63b 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -78,8 +78,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
if (!qs)
return ERR_PTR(-ENOMEM);
- memset(qs, 0, sizeof(*qs));
-
bpf_map_init_from_attr(&qs->map, attr);
qs->size = size;
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index e2618fb5870e..82c61612f382 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -21,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map)
/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
- uintptr_t sk_user_data;
+ struct sock __rcu **socks;
write_lock_bh(&sk->sk_callback_lock);
- sk_user_data = (uintptr_t)sk->sk_user_data;
- if (sk_user_data & SK_USER_DATA_BPF) {
- struct sock __rcu **socks;
-
- socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+ if (socks) {
WRITE_ONCE(sk->sk_user_data, NULL);
/*
* Do not move this NULL assignment outside of
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index ded4faeca192..9e832acf4692 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -38,10 +38,43 @@ struct bpf_ringbuf {
struct page **pages;
int nr_pages;
spinlock_t spinlock ____cacheline_aligned_in_smp;
- /* Consumer and producer counters are put into separate pages to allow
- * mapping consumer page as r/w, but restrict producer page to r/o.
- * This protects producer position from being modified by user-space
- * application and ruining in-kernel position tracking.
+ /* For user-space producer ring buffers, an atomic_t busy bit is used
+ * to synchronize access to the ring buffers in the kernel, rather than
+ * the spinlock that is used for kernel-producer ring buffers. This is
+ * done because the ring buffer must hold a lock across a BPF program's
+ * callback:
+ *
+ * __bpf_user_ringbuf_peek() // lock acquired
+ * -> program callback_fn()
+ * -> __bpf_user_ringbuf_sample_release() // lock released
+ *
+ * It is unsafe and incorrect to hold an IRQ spinlock across what could
+ * be a long execution window, so we instead simply disallow concurrent
+ * access to the ring buffer by kernel consumers, and return -EBUSY from
+ * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+ */
+ atomic_t busy ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -116,7 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
err_free_pages:
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
return NULL;
}
@@ -136,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
return NULL;
spin_lock_init(&rb->spinlock);
+ atomic_set(&rb->busy, 0);
init_waitqueue_head(&rb->waitq);
init_irq_work(&rb->work, bpf_ringbuf_notify);
@@ -164,7 +198,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
#endif
- rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
+ rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
@@ -172,7 +206,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
if (!rb_map->rb) {
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
return ERR_PTR(-ENOMEM);
}
@@ -190,7 +224,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
vunmap(rb);
for (i = 0; i < nr_pages; i++)
__free_page(pages[i]);
- kvfree(pages);
+ bpf_map_area_free(pages);
}
static void ringbuf_map_free(struct bpf_map *map)
@@ -199,7 +233,7 @@ static void ringbuf_map_free(struct bpf_map *map)
rb_map = container_of(map, struct bpf_ringbuf_map, map);
bpf_ringbuf_free(rb_map->rb);
- kfree(rb_map);
+ bpf_map_area_free(rb_map);
}
static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
@@ -224,7 +258,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
@@ -242,6 +276,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
vma->vm_pgoff + RINGBUF_PGOFF);
}
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ } else {
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
unsigned long cons_pos, prod_pos;
@@ -251,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
return prod_pos - cons_pos;
}
-static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
- struct poll_table_struct *pts)
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+ return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
{
struct bpf_ringbuf_map *rb_map;
@@ -264,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
return 0;
}
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+ return EPOLLOUT | EPOLLWRNORM;
+ return 0;
+}
+
BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = ringbuf_map_alloc,
.map_free = ringbuf_map_free,
- .map_mmap = ringbuf_map_mmap,
- .map_poll = ringbuf_map_poll,
+ .map_mmap = ringbuf_map_mmap_kern,
+ .map_poll = ringbuf_map_poll_kern,
.map_lookup_elem = ringbuf_map_lookup_elem,
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
@@ -278,6 +350,20 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_btf_id = &ringbuf_map_btf_ids[0],
};
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_poll = ringbuf_map_poll_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
+};
+
/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
* calculate offset from record metadata to ring buffer in pages, rounded
* down. This page offset is stored as part of record metadata and allows to
@@ -312,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
- if (len > rb->mask + 1)
+ if (len > ringbuf_total_data_sz(rb))
return NULL;
cons_pos = smp_load_acquire(&rb->consumer_pos);
@@ -459,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
case BPF_RB_AVAIL_DATA:
return ringbuf_avail_data_sz(rb);
case BPF_RB_RING_SIZE:
- return rb->mask + 1;
+ return ringbuf_total_data_sz(rb);
case BPF_RB_CONS_POS:
return smp_load_acquire(&rb->consumer_pos);
case BPF_RB_PROD_POS:
@@ -553,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+ int err;
+ u32 hdr_len, sample_len, total_len, flags, *hdr;
+ u64 cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ if (prod_pos % 8)
+ return -EINVAL;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ if (cons_pos >= prod_pos)
+ return -ENODATA;
+
+ hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ hdr_len = smp_load_acquire(hdr);
+ flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+ sample_len = hdr_len & ~flags;
+ total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* The sample must fit within the region advertised by the producer position. */
+ if (total_len > prod_pos - cons_pos)
+ return -EINVAL;
+
+ /* The sample must fit within the data region of the ring buffer. */
+ if (total_len > ringbuf_total_data_sz(rb))
+ return -E2BIG;
+
+ /* The sample must fit into a struct bpf_dynptr. */
+ err = bpf_dynptr_check_size(sample_len);
+ if (err)
+ return -E2BIG;
+
+ if (flags & BPF_RINGBUF_DISCARD_BIT) {
+ /* If the discard bit is set, the sample should be skipped.
+ *
+ * Update the consumer pos, and return -EAGAIN so the caller
+ * knows to skip this sample and try to read the next one.
+ */
+ smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+ return -EAGAIN;
+ }
+
+ if (flags & BPF_RINGBUF_BUSY_BIT)
+ return -ENODATA;
+
+ *sample = (void *)((uintptr_t)rb->data +
+ (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+ *size = sample_len;
+ return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+ u64 consumer_pos;
+ u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* Using smp_load_acquire() is unnecessary here, as the busy-bit
+ * prevents another task from writing to consumer_pos after it was read
+ * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+ */
+ consumer_pos = rb->consumer_pos;
+ /* Synchronizes with smp_load_acquire() in user-space producer. */
+ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+ void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+ long samples, discarded_samples = 0, ret = 0;
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+ int busy = 0;
+
+ if (unlikely(flags & ~wakeup_flags))
+ return -EINVAL;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ /* If another consumer is already consuming a sample, wait for them to finish. */
+ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+ return -EBUSY;
+
+ for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+ int err;
+ u32 size;
+ void *sample;
+ struct bpf_dynptr_kern dynptr;
+
+ err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+ if (err) {
+ if (err == -ENODATA) {
+ break;
+ } else if (err == -EAGAIN) {
+ discarded_samples++;
+ continue;
+ } else {
+ ret = err;
+ goto schedule_work_return;
+ }
+ }
+
+ bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+ ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+ __bpf_user_ringbuf_sample_release(rb, size, flags);
+ }
+ ret = samples - discarded_samples;
+
+schedule_work_return:
+ /* Prevent the clearing of the busy-bit from being reordered before the
+ * storing of any rb consumer or producer positions.
+ */
+ smp_mb__before_atomic();
+ atomic_set(&rb->busy, 0);
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+ irq_work_queue(&rb->work);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+ .func = bpf_user_ringbuf_drain,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 1adbe67cdb95..aecea7451b61 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
int ret;
/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return bpf_get_stackid((unsigned long)(ctx->regs),
(unsigned long) map, flags, 0, 0);
@@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
int err = -EINVAL;
__u64 nr_kernel;
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2b69306d3c6e..7b373a5e861f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -419,35 +419,53 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
- map->memcg = get_mem_cgroup_from_mm(current->mm);
+ /* Currently if a map is created by a process belonging to the root
+ * memory cgroup, get_obj_cgroup_from_current() will return NULL.
+ * So we have to check map->objcg for being NULL each time it's
+ * being used.
+ */
+ map->objcg = get_obj_cgroup_from_current();
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
- mem_cgroup_put(map->memcg);
+ if (map->objcg)
+ obj_cgroup_put(map->objcg);
+}
+
+static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+{
+ if (map->objcg)
+ return get_mem_cgroup_from_objcg(map->objcg);
+
+ return root_mem_cgroup;
}
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -455,12 +473,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -578,7 +598,7 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
if (off_desc->type == BPF_KPTR_UNREF) {
u64 *p = (u64 *)btf_id_ptr;
- WRITE_ONCE(p, 0);
+ WRITE_ONCE(*p, 0);
continue;
}
old_ptr = xchg(btf_id_ptr, 0);
@@ -618,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
bpf_map_free_id(map, do_idr_lock);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
}
}
@@ -1026,7 +1049,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY) {
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
@@ -1393,19 +1417,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
value_size = bpf_map_value_size(map);
-
- err = -ENOMEM;
- value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
-
- err = -EFAULT;
- if (copy_from_bpfptr(value, uvalue, value_size) != 0)
- goto free_value;
+ }
err = bpf_map_update_value(map, f, key, value, attr->flags);
-free_value:
kvfree(value);
free_key:
kvfree(key);
@@ -1417,9 +1436,9 @@ err_put:
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_user_ptr(attr->key);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
@@ -1439,7 +1458,7 @@ static int map_delete_elem(union bpf_attr *attr)
goto err_put;
}
- key = __bpf_copy_key(ukey, map->key_size);
+ key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
@@ -2074,6 +2093,17 @@ struct bpf_prog_kstats {
u64 misses;
};
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
+
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
@@ -3416,6 +3446,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_SK_LOOKUP;
case BPF_XDP:
return BPF_PROG_TYPE_XDP;
+ case BPF_LSM_CGROUP:
+ return BPF_PROG_TYPE_LSM;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -3469,6 +3501,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_LSM:
+ if (ptype == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type != BPF_LSM_CGROUP)
+ return -EINVAL;
+
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
@@ -3506,13 +3543,14 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_LSM:
return cgroup_bpf_prog_detach(attr, ptype);
default:
return -EINVAL;
}
}
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3548,6 +3586,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
+ case BPF_LSM_CGROUP:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
@@ -3857,6 +3896,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct btf *attach_btf = bpf_prog_get_target_btf(prog);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
struct bpf_prog_kstats stats;
@@ -4058,6 +4098,9 @@ static int bpf_prog_get_info_by_fd(struct file *file,
if (prog->aux->btf)
info.btf_id = btf_obj_id(prog->aux->btf);
+ info.attach_btf_id = prog->aux->attach_btf_id;
+ if (attach_btf)
+ info.attach_btf_obj_id = btf_obj_id(attach_btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
@@ -4090,14 +4133,15 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.nr_jited_line_info = 0;
if (info.nr_jited_line_info && ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
+ unsigned long line_addr;
__u64 __user *user_linfo;
u32 i;
user_linfo = u64_to_user_ptr(info.jited_line_info);
ulen = min_t(u32, info.nr_jited_line_info, ulen);
for (i = 0; i < ulen; i++) {
- if (put_user((__u64)(long)prog->aux->jited_linfo[i],
- &user_linfo[i]))
+ line_addr = (unsigned long)prog->aux->jited_linfo[i];
+ if (put_user((__u64)line_addr, &user_linfo[i]))
return -EFAULT;
}
} else {
@@ -4361,7 +4405,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (attr->task_fd_query.flags != 0)
return -EINVAL;
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
if (!task)
return -ENOENT;
@@ -4539,6 +4585,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_raw_tp_link_attach(prog, NULL);
else if (prog->expected_attach_type == BPF_TRACE_ITER)
ret = bpf_iter_link_attach(attr, uattr, prog);
+ else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ ret = cgroup_bpf_link_attach(attr, prog);
else
ret = bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
@@ -4905,7 +4953,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
@@ -5035,18 +5083,37 @@ static bool syscall_prog_is_valid_access(int off, int size,
BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
- struct bpf_prog * __maybe_unused prog;
- struct bpf_tramp_run_ctx __maybe_unused run_ctx;
-
switch (cmd) {
case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
case BPF_LINK_CREATE:
case BPF_RAW_TRACEPOINT_OPEN:
break;
+ default:
+ return -EINVAL;
+ }
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+}
+
+
+/* To shut up -Wmissing-prototypes.
+ * This function is used by the kernel light skeleton
+ * to load bpf programs when modules are loaded or during kernel boot.
+ * See tools/lib/bpf/skel_internal.h
+ */
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+ struct bpf_prog * __maybe_unused prog;
+ struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+
+ switch (cmd) {
#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
case BPF_PROG_TEST_RUN:
if (attr->test.data_in || attr->test.data_out ||
@@ -5077,11 +5144,10 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
return 0;
#endif
default:
- return -EINVAL;
+ return ____bpf_sys_bpf(cmd, attr, size);
}
- return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
-EXPORT_SYMBOL(bpf_sys_bpf);
+EXPORT_SYMBOL(kern_sys_bpf);
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
@@ -5130,7 +5196,7 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag
return *res ? 0 : -ENOENT;
}
-const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.func = bpf_kallsyms_lookup_name,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -5145,7 +5211,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
- return &bpf_sys_bpf_proto;
+ return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..c2a2182ce570 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -10,8 +10,17 @@
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"
+static const char * const iter_task_type_names[] = {
+ "ALL",
+ "TID",
+ "PID",
+};
+
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
};
struct bpf_iter_seq_task_info {
@@ -22,18 +31,115 @@ struct bpf_iter_seq_task_info {
u32 tid;
};
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task, *next_task;
+ struct pid *pid;
+ u32 saved_tid;
+
+ if (!*tid) {
+ /* The first time, the iterator calls this function. */
+ pid = find_pid_ns(common->pid, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return NULL;
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* If the control returns to user space and comes back to the
+ * kernel again, *tid and common->pid_visiting should be the
+ * same for task_seq_start() to pick up the correct task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
+
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ if (!pid)
+ return NULL;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return NULL;
+
+retry:
+ if (!pid_alive(task)) {
+ put_task_struct(task);
+ return NULL;
+ }
+
+ next_task = next_thread(task);
+ put_task_struct(task);
+ if (!next_task)
+ return NULL;
+
+ saved_tid = *tid;
+ *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+ if (!*tid || *tid == common->pid) {
+ /* Run out of tasks of a process. The tasks of a
+ * thread_group are linked as circular linked list.
+ */
+ *tid = saved_tid;
+ return NULL;
+ }
+
+ get_task_struct(next_task);
+ common->pid_visiting = *tid;
+
+ if (skip_if_dup_files && task->files == task->group_leader->files) {
+ task = next_task;
+ goto retry;
+ }
+
+ return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
u32 *tid,
bool skip_if_dup_files)
{
struct task_struct *task = NULL;
struct pid *pid;
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
rcu_read_lock();
retry:
- pid = find_ge_pid(*tid, ns);
+ pid = find_ge_pid(*tid, common->ns);
if (pid) {
- *tid = pid_nr_ns(pid, ns);
+ *tid = pid_nr_ns(pid, common->ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
@@ -56,7 +162,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
struct bpf_iter_seq_task_info *info = seq->private;
struct task_struct *task;
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -73,7 +179,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
++info->tid;
put_task_struct((struct task_struct *)v);
- task = task_seq_get_next(info->common.ns, &info->tid, false);
+ task = task_seq_get_next(&info->common, &info->tid, false);
if (!task)
return NULL;
@@ -117,6 +223,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
put_task_struct((struct task_struct *)v);
}
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ unsigned int flags;
+ struct pid *pid;
+ pid_t tgid;
+
+ if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+ return -EINVAL;
+
+ aux->task.type = BPF_TASK_ITER_ALL;
+ if (linfo->task.tid != 0) {
+ aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.pid = linfo->task.tid;
+ }
+ if (linfo->task.pid != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.pid = linfo->task.pid;
+ }
+ if (linfo->task.pid_fd != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+
+ pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+ aux->task.pid = tgid;
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
static const struct seq_operations task_seq_ops = {
.start = task_seq_start,
.next = task_seq_next,
@@ -137,8 +278,7 @@ struct bpf_iter_seq_task_file_info {
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
- struct pid_namespace *ns = info->common.ns;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
@@ -151,21 +291,18 @@ again:
curr_task = info->task;
curr_fd = info->fd;
} else {
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
info->task = NULL;
- info->tid = curr_tid;
return NULL;
}
- /* set info->task and info->tid */
+ /* set info->task */
info->task = curr_task;
- if (curr_tid == info->tid) {
+ if (saved_tid == info->tid)
curr_fd = info->fd;
- } else {
- info->tid = curr_tid;
+ else
curr_fd = 0;
- }
}
rcu_read_lock();
@@ -186,9 +323,15 @@ again:
/* the current task is done, go to the next task */
rcu_read_unlock();
put_task_struct(curr_task);
+
+ if (info->common.type == BPF_TASK_ITER_TID) {
+ info->task = NULL;
+ return NULL;
+ }
+
info->task = NULL;
info->fd = 0;
- curr_tid = ++(info->tid);
+ saved_tid = ++(info->tid);
goto again;
}
@@ -269,6 +412,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
struct bpf_iter_seq_task_common *common = priv_data;
common->ns = get_pid_ns(task_active_pid_ns(current));
+ common->type = aux->task.type;
+ common->pid = aux->task.pid;
+
return 0;
}
@@ -299,19 +445,18 @@ struct bpf_iter_seq_task_vma_info {
};
enum bpf_task_vma_iter_find_op {
- task_vma_iter_first_vma, /* use mm->mmap */
- task_vma_iter_next_vma, /* use curr_vma->vm_next */
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
task_vma_iter_find_vma, /* use find_vma() to find next vma */
};
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
- struct pid_namespace *ns = info->common.ns;
enum bpf_task_vma_iter_find_op op;
struct vm_area_struct *curr_vma;
struct task_struct *curr_task;
- u32 curr_tid = info->tid;
+ u32 saved_tid = info->tid;
/* If this function returns a non-NULL vma, it holds a reference to
* the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +516,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
}
} else {
again:
- curr_task = task_seq_get_next(ns, &curr_tid, true);
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
if (!curr_task) {
- info->tid = curr_tid + 1;
+ info->tid++;
goto finish;
}
- if (curr_tid != info->tid) {
- info->tid = curr_tid;
+ if (saved_tid != info->tid) {
/* new task, process the first vma */
op = task_vma_iter_first_vma;
} else {
@@ -400,10 +544,10 @@ again:
switch (op) {
case task_vma_iter_first_vma:
- curr_vma = curr_task->mm->mmap;
+ curr_vma = find_vma(curr_task->mm, 0);
break;
case task_vma_iter_next_vma:
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
case task_vma_iter_find_vma:
/* We dropped mmap_lock so it is necessary to use find_vma
@@ -417,7 +561,7 @@ again:
if (curr_vma &&
curr_vma->vm_start == info->prev_vm_start &&
curr_vma->vm_end == info->prev_vm_end)
- curr_vma = curr_vma->vm_next;
+ curr_vma = find_vma(curr_task->mm, curr_vma->vm_end);
break;
}
if (!curr_vma) {
@@ -430,9 +574,12 @@ again:
return curr_vma;
next_task:
+ if (info->common.type == BPF_TASK_ITER_TID)
+ goto finish;
+
put_task_struct(curr_task);
info->task = NULL;
- curr_tid++;
+ info->tid++;
goto again;
finish:
@@ -531,8 +678,33 @@ static const struct bpf_iter_seq_info task_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
};
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+ switch (aux->task.type) {
+ case BPF_TASK_ITER_TID:
+ info->iter.task.tid = aux->task.pid;
+ break;
+ case BPF_TASK_ITER_TGID:
+ info->iter.task.pid = aux->task.pid;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+ seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+ if (aux->task.type == BPF_TASK_ITER_TID)
+ seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+ else if (aux->task.type == BPF_TASK_ITER_TGID)
+ seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
static struct bpf_iter_reg task_reg_info = {
.target = "task",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 1,
.ctx_arg_info = {
@@ -540,6 +712,8 @@ static struct bpf_iter_reg task_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -551,6 +725,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
static struct bpf_iter_reg task_file_reg_info = {
.target = "task_file",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -560,6 +735,8 @@ static struct bpf_iter_reg task_file_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_file_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -571,6 +748,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
static struct bpf_iter_reg task_vma_reg_info = {
.target = "task_vma",
+ .attach_target = bpf_iter_attach_task,
.feature = BPF_ITER_RESCHED,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
@@ -580,6 +758,8 @@ static struct bpf_iter_reg task_vma_reg_info = {
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &task_vma_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
};
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 93c7675f0c9e..bf0906e1e2b9 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -11,6 +11,9 @@
#include <linux/rcupdate_wait.h>
#include <linux/module.h>
#include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
+#include <linux/delay.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -27,6 +30,81 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+{
+ struct bpf_trampoline *tr = ops->private;
+ int ret = 0;
+
+ if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+ /* This is called inside register_ftrace_direct_multi(), so
+ * tr->mutex is already locked.
+ */
+ lockdep_assert_held_once(&tr->mutex);
+
+ /* Instead of updating the trampoline here, we propagate
+ * -EAGAIN to register_ftrace_direct_multi(). Then we can
+ * retry register_ftrace_direct_multi() after updating the
+ * trampoline.
+ */
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+ if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+ return -EBUSY;
+
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+ return -EAGAIN;
+ }
+
+ return 0;
+ }
+
+ /* The normal locking order is
+ * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+ *
+ * The following two commands are called from
+ *
+ * prepare_direct_functions_for_ipmodify
+ * cleanup_direct_functions_after_ipmodify
+ *
+ * In both cases, direct_mutex is already locked. Use
+ * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+ * (something else is making changes to this same trampoline).
+ */
+ if (!mutex_trylock(&tr->mutex)) {
+ /* sleep 1 ms to make sure whatever holding tr->mutex makes
+ * some progress.
+ */
+ msleep(1);
+ return -EAGAIN;
+ }
+
+ switch (cmd) {
+ case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+ tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tr->mutex);
+ return ret;
+}
+#endif
+
bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
enum bpf_attach_type eatype = prog->expected_attach_type;
@@ -38,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void *bpf_jit_alloc_exec_page(void)
-{
- void *image;
-
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return NULL;
-
- set_vm_flush_reset_perms(image);
- /* Keep image as writeable. The alternative is to keep flipping ro/rw
- * every time new program is attached or detached.
- */
- set_memory_x((long)image, 1);
- return image;
-}
-
void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
@@ -87,6 +149,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops) {
+ kfree(tr);
+ tr = NULL;
+ goto out;
+ }
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
@@ -126,7 +198,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
@@ -135,15 +207,20 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
return ret;
}
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+ bool lock_direct_mutex)
{
void *ip = tr->func.addr;
int ret;
- if (tr->func.ftrace_managed)
- ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ if (lock_direct_mutex)
+ ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+ else
+ ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ }
return ret;
}
@@ -155,16 +232,21 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
int ret;
faddr = ftrace_location((unsigned long)ip);
- if (faddr)
+ if (faddr) {
+ if (!tr->fops)
+ return -ENOTSUPP;
tr->func.ftrace_managed = true;
+ }
if (bpf_trampoline_module_get(tr))
return -ENOENT;
- if (tr->func.ftrace_managed)
- ret = register_ftrace_direct((long)ip, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ }
if (ret)
bpf_trampoline_module_put(tr);
@@ -306,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
goto out_free_im;
err = -ENOMEM;
- im->image = image = bpf_jit_alloc_exec_page();
+ im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!image)
goto out_uncharge;
+ set_vm_flush_reset_perms(image);
err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
if (err)
@@ -330,11 +413,11 @@ out:
return ERR_PTR(err);
}
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
{
struct bpf_tramp_image *im;
struct bpf_tramp_links *tlinks;
- u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+ u32 orig_flags = tr->flags;
bool ip_arg = false;
int err, total;
@@ -356,34 +439,70 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
goto out;
}
+ /* clear all bits except SHARE_IPMODIFY */
+ tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+
if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
- tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links)
- flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+ /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
+ * should not be set together.
+ */
+ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ } else {
+ tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+ }
if (ip_arg)
- flags |= BPF_TRAMP_F_IP_ARG;
+ tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+ if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
+ (tr->flags & BPF_TRAMP_F_CALL_ORIG))
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+#endif
err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
- &tr->func.model, flags, tlinks,
+ &tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
goto out;
+ set_memory_ro((long)im->image, 1);
+ set_memory_x((long)im->image, 1);
+
WARN_ON(tr->cur_image && tr->selector == 0);
WARN_ON(!tr->cur_image && tr->selector);
if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, tr->cur_image->image, im->image);
+ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
else
/* first time registering */
err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ if (err == -EAGAIN) {
+ /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+ * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+ * trampoline again, and retry register.
+ */
+ /* reset fops->func and fops->trampoline for re-register */
+ tr->fops->func = NULL;
+ tr->fops->trampoline = 0;
+ goto again;
+ }
+#endif
if (err)
goto out;
+
if (tr->cur_image)
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = im;
tr->selector++;
out:
+ /* If any error happens, restore previous flags */
+ if (err)
+ tr->flags = orig_flags;
kfree(tlinks);
return err;
}
@@ -410,7 +529,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
@@ -418,81 +537,256 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
int cnt = 0, i;
kind = bpf_attach_type_to_tramp(link->link.prog);
- mutex_lock(&tr->mutex);
- if (tr->extension_prog) {
+ if (tr->extension_prog)
/* cannot attach fentry/fexit if extension prog is attached.
* cannot overwrite extension prog either.
*/
- err = -EBUSY;
- goto out;
- }
+ return -EBUSY;
for (i = 0; i < BPF_TRAMP_MAX; i++)
cnt += tr->progs_cnt[i];
if (kind == BPF_TRAMP_REPLACE) {
/* Cannot attach extension if fentry/fexit are in use. */
- if (cnt) {
- err = -EBUSY;
- goto out;
- }
+ if (cnt)
+ return -EBUSY;
tr->extension_prog = link->link.prog;
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
- link->link.prog->bpf_func);
- goto out;
- }
- if (cnt >= BPF_MAX_TRAMP_LINKS) {
- err = -E2BIG;
- goto out;
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ link->link.prog->bpf_func);
}
- if (!hlist_unhashed(&link->tramp_hlist)) {
+ if (cnt >= BPF_MAX_TRAMP_LINKS)
+ return -E2BIG;
+ if (!hlist_unhashed(&link->tramp_hlist))
/* prog already linked */
- err = -EBUSY;
- goto out;
- }
+ return -EBUSY;
hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
if (link_exiting->link.prog != link->link.prog)
continue;
/* prog already linked */
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
tr->progs_cnt[kind]++;
- err = bpf_trampoline_update(tr);
+ err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
if (err) {
hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
}
-out:
+ return err;
+}
+
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_link_prog(link, tr);
mutex_unlock(&tr->mutex);
return err;
}
-/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
int err;
kind = bpf_attach_type_to_tramp(link->link.prog);
- mutex_lock(&tr->mutex);
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
- goto out;
+ return err;
}
hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
- err = bpf_trampoline_update(tr);
-out:
+ return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_unlink_prog(link, tr);
mutex_unlock(&tr->mutex);
return err;
}
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
+static void bpf_shim_tramp_link_release(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */
+ if (!shim_link->trampoline)
+ return;
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+ bpf_trampoline_put(shim_link->trampoline);
+}
+
+static void bpf_shim_tramp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ kfree(shim_link);
+}
+
+static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
+ .release = bpf_shim_tramp_link_release,
+ .dealloc = bpf_shim_tramp_link_dealloc,
+};
+
+static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
+ bpf_func_t bpf_func,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_prog *p;
+
+ shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
+ if (!shim_link)
+ return NULL;
+
+ p = bpf_prog_alloc(1, 0);
+ if (!p) {
+ kfree(shim_link);
+ return NULL;
+ }
+
+ p->jited = false;
+ p->bpf_func = bpf_func;
+
+ p->aux->cgroup_atype = cgroup_atype;
+ p->aux->attach_func_proto = prog->aux->attach_func_proto;
+ p->aux->attach_btf_id = prog->aux->attach_btf_id;
+ p->aux->attach_btf = prog->aux->attach_btf;
+ btf_get(p->aux->attach_btf);
+ p->type = BPF_PROG_TYPE_LSM;
+ p->expected_attach_type = BPF_LSM_MAC;
+ bpf_prog_inc(p);
+ bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
+ &bpf_shim_tramp_link_lops, p);
+ bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
+
+ return shim_link;
+}
+
+static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
+ bpf_func_t bpf_func)
+{
+ struct bpf_tramp_link *link;
+ int kind;
+
+ for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ struct bpf_prog *p = link->link.prog;
+
+ if (p->bpf_func == bpf_func)
+ return container_of(link, struct bpf_shim_tramp_link, link);
+ }
+ }
+
+ return NULL;
+}
+
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+ int err;
+
+ err = bpf_check_attach_target(NULL, prog, NULL,
+ prog->aux->attach_btf_id,
+ &tgt_info);
+ if (err)
+ return err;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ mutex_lock(&tr->mutex);
+
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ if (shim_link) {
+ /* Reusing existing shim attached by the other program. */
+ bpf_link_inc(&shim_link->link.link);
+
+ mutex_unlock(&tr->mutex);
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+ return 0;
+ }
+
+ /* Allocate and install new shim. */
+
+ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype);
+ if (!shim_link) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+ if (err)
+ goto err;
+
+ shim_link->trampoline = tr;
+ /* note, we're still holding tr refcnt from above */
+
+ mutex_unlock(&tr->mutex);
+
+ return 0;
+err:
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ /* have to release tr while _not_ holding its mutex */
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+
+ return err;
+}
+
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_lookup(key);
+ if (WARN_ON_ONCE(!tr))
+ return;
+
+ mutex_lock(&tr->mutex);
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */
+}
+#endif
+
struct bpf_trampoline *bpf_trampoline_get(u64 key,
struct bpf_attach_target_info *tgt_info)
{
@@ -535,6 +829,10 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* multiple rcu callbacks.
*/
hlist_del(&tr->hlist);
+ if (tr->fops) {
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+ }
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
@@ -553,17 +851,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
return start;
}
-static void notrace inc_misses_counter(struct bpf_prog *prog)
-{
- struct bpf_prog_stats *stats;
- unsigned int flags;
-
- stats = this_cpu_ptr(prog->stats);
- flags = u64_stats_update_begin_irqsave(&stats->syncp);
- u64_stats_inc(&stats->misses);
- u64_stats_update_end_irqrestore(&stats->syncp, flags);
-}
-
/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
@@ -585,8 +872,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
return bpf_prog_start_time();
@@ -620,7 +907,32 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ rcu_read_unlock();
+}
+
+u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ /* Runtime stats are exported via actual BPF_LSM_CGROUP
+ * programs, not the shims.
+ */
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return NO_START_TIME;
+}
+
+void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
migrate_enable();
rcu_read_unlock();
}
@@ -631,8 +943,8 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
migrate_disable();
might_fault();
- if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
- inc_misses_counter(prog);
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
return 0;
}
@@ -647,11 +959,34 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- __this_cpu_dec(*(prog->active));
+ this_cpu_dec(*(prog->active));
migrate_enable();
rcu_read_unlock_trace();
}
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock();
+}
+
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
percpu_ref_get(&tr->pcref);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aedac2ac02b9..014ee0953dbd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
+#include <linux/poison.h>
#include "disasm.h"
@@ -370,6 +371,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
+EXPORT_SYMBOL_GPL(bpf_log);
static const char *ltrim(const char *s)
{
@@ -427,6 +429,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
static bool type_is_pkt_pointer(enum bpf_reg_type type)
{
+ type = base_type(type);
return type == PTR_TO_PACKET ||
type == PTR_TO_PACKET_META;
}
@@ -456,10 +459,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
- return base_type(type) == PTR_TO_SOCKET ||
- base_type(type) == PTR_TO_TCP_SOCK ||
- base_type(type) == PTR_TO_MEM ||
- base_type(type) == PTR_TO_BTF_ID;
+ type = base_type(type);
+ return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MEM || type == PTR_TO_BTF_ID;
}
static bool type_is_rdonly_mem(u32 type)
@@ -467,25 +469,11 @@ static bool type_is_rdonly_mem(u32 type)
return type & MEM_RDONLY;
}
-static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
-{
- return type == ARG_PTR_TO_SOCK_COMMON;
-}
-
static bool type_may_be_null(u32 type)
{
return type & PTR_MAYBE_NULL;
}
-static bool may_be_acquire_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp ||
- func_id == BPF_FUNC_skc_lookup_tcp ||
- func_id == BPF_FUNC_map_lookup_elem ||
- func_id == BPF_FUNC_ringbuf_reserve;
-}
-
static bool is_acquire_function(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -518,6 +506,26 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_skc_to_tcp_request_sock;
}
+static bool is_dynptr_ref_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_dynptr_data;
+}
+
+static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ int ref_obj_uses = 0;
+
+ if (is_ptr_cast_function(func_id))
+ ref_obj_uses++;
+ if (is_acquire_function(func_id, map))
+ ref_obj_uses++;
+ if (is_dynptr_ref_function(func_id))
+ ref_obj_uses++;
+
+ return ref_obj_uses > 1;
+}
+
static bool is_cmpxchg_insn(const struct bpf_insn *insn)
{
return BPF_CLASS(insn->code) == BPF_STX &&
@@ -555,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
+ [PTR_TO_DYNPTR] = "dynptr_ptr",
};
if (type & PTR_MAYBE_NULL) {
@@ -773,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type)
+bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
int spi = get_spi(reg->off);
@@ -790,11 +799,24 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re
return false;
}
+ return true;
+}
+
+bool is_dynptr_type_expected(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type dynptr_type;
+ int spi = get_spi(reg->off);
+
/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
if (arg_type == ARG_PTR_TO_DYNPTR)
return true;
- return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type);
+ dynptr_type = arg_to_dynptr_type(arg_type);
+
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
}
/* The reg state of a pointer or a bounded scalar was saved when
@@ -1086,6 +1108,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
id = ++env->id_gen;
state->refs[new_ofs].id = id;
state->refs[new_ofs].insn_idx = insn_idx;
+ state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0;
return id;
}
@@ -1098,6 +1121,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id)
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
+ /* Cannot release caller references in callbacks */
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ return -EINVAL;
if (last_idx && i != last_idx)
memcpy(&state->refs[i], &state->refs[last_idx],
sizeof(*state->refs));
@@ -1562,6 +1588,21 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static void reg_bounds_sync(struct bpf_reg_state *reg)
+{
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(reg);
+}
+
static bool __reg32_bound_s64(s32 a)
{
return a >= 0 && a <= S32_MAX;
@@ -1603,16 +1644,8 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
* so they do not impact tnum bounds calculation.
*/
__mark_reg64_unbounded(reg);
- __update_reg_bounds(reg);
}
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
+ reg_bounds_sync(reg);
}
static bool __reg64_bound_s32(s64 a)
@@ -1628,7 +1661,6 @@ static bool __reg64_bound_u32(u64 a)
static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
{
__mark_reg32_unbounded(reg);
-
if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
reg->s32_min_value = (s32)reg->smin_value;
reg->s32_max_value = (s32)reg->smax_value;
@@ -1637,14 +1669,7 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
reg->u32_min_value = (u32)reg->umin_value;
reg->u32_max_value = (u32)reg->umax_value;
}
-
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __reg_deduce_bounds(reg);
- __reg_bound_offset(reg);
- __update_reg_bounds(reg);
+ reg_bounds_sync(reg);
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -1740,6 +1765,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
+ state->callback_ret_range = tnum_range(0, 0);
init_reg_state(env, state);
mark_verifier_state_scratched(env);
}
@@ -2899,7 +2925,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
}
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
return __mark_chain_precision(env, regno, -1);
}
@@ -5224,6 +5250,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
env,
regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_CTX:
+ /* in case the function doesn't know how to access the context,
+ * (because we are in a program of type SYSCALL for example), we
+ * can not statically check its size.
+ * Dynamically check it now.
+ */
+ if (!env->ops->convert_ctx_access) {
+ enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
+ int offset = access_size - 1;
+
+ /* Allow zero-byte read from PTR_TO_CTX */
+ if (access_size == 0)
+ return zero_size_allowed ? 0 : -EACCES;
+
+ return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+ atype, -1, false);
+ }
+
+ fallthrough;
default: /* scalar_value or invalid ptr */
/* Allow zero-byte read from NULL, regardless of pointer type */
if (zero_size_allowed && access_size == 0 &&
@@ -5534,17 +5579,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
-static bool arg_type_is_alloc_size(enum bpf_arg_type type)
-{
- return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
-}
-
-static bool arg_type_is_int_ptr(enum bpf_arg_type type)
-{
- return type == ARG_PTR_TO_INT ||
- type == ARG_PTR_TO_LONG;
-}
-
static bool arg_type_is_release(enum bpf_arg_type type)
{
return type & OBJ_RELEASE;
@@ -5668,6 +5702,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK }
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types dynptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL,
+ }
+};
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -5694,7 +5734,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
[ARG_PTR_TO_KPTR] = &kptr_types,
- [ARG_PTR_TO_DYNPTR] = &stack_ptr_types,
+ [ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5763,13 +5803,22 @@ found:
if (meta->func_id == BPF_FUNC_kptr_xchg) {
if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
return -EACCES;
- } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- btf_vmlinux, *arg_btf_id,
- strict_type_match)) {
- verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
- return -EACCES;
+ } else {
+ if (arg_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, kernel_type_name(reg->btf, reg->btf_id),
+ kernel_type_name(btf_vmlinux, *arg_btf_id));
+ return -EACCES;
+ }
}
}
@@ -5848,6 +5897,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
+ u32 *arg_btf_id = NULL;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -5884,7 +5934,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
*/
goto skip_type_check;
- err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta);
+ /* arg_btf_id and arg_size are in a union. */
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID)
+ arg_btf_id = fn->arg_btf_id[arg];
+
+ err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
if (err)
return err;
@@ -5925,7 +5979,8 @@ skip_type_check:
meta->ref_obj_id = reg->ref_obj_id;
}
- if (arg_type == ARG_CONST_MAP_PTR) {
+ switch (base_type(arg_type)) {
+ case ARG_CONST_MAP_PTR:
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
if (meta->map_ptr) {
/* Use map_uid (which is unique id of inner map) to reject:
@@ -5950,7 +6005,8 @@ skip_type_check:
}
meta->map_ptr = reg->map_ptr;
meta->map_uid = reg->map_uid;
- } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ break;
+ case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -5967,7 +6023,8 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
+ break;
+ case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
return 0;
@@ -5983,14 +6040,16 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->value_size, false,
meta);
- } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
+ break;
+ case ARG_PTR_TO_PERCPU_BTF_ID:
if (!reg->btf_id) {
verbose(env, "Helper has invalid btf_id in R%d\n", regno);
return -EACCES;
}
meta->ret_btf = reg->btf;
meta->ret_btf_id = reg->btf_id;
- } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ break;
+ case ARG_PTR_TO_SPIN_LOCK:
if (meta->func_id == BPF_FUNC_spin_lock) {
if (process_spin_lock(env, regno, true))
return -EACCES;
@@ -6001,21 +6060,39 @@ skip_type_check:
verbose(env, "verifier internal error\n");
return -EFAULT;
}
- } else if (arg_type == ARG_PTR_TO_TIMER) {
+ break;
+ case ARG_PTR_TO_TIMER:
if (process_timer_func(env, regno, meta))
return -EACCES;
- } else if (arg_type == ARG_PTR_TO_FUNC) {
+ break;
+ case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno;
- } else if (base_type(arg_type) == ARG_PTR_TO_MEM) {
+ break;
+ case ARG_PTR_TO_MEM:
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
*/
meta->raw_mode = arg_type & MEM_UNINIT;
- } else if (arg_type_is_mem_size(arg_type)) {
- bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
-
- err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
- } else if (arg_type_is_dynptr(arg_type)) {
+ if (arg_type & MEM_FIXED_SIZE) {
+ err = check_helper_mem_access(env, regno,
+ fn->arg_size[arg], false,
+ meta);
+ }
+ break;
+ case ARG_CONST_SIZE:
+ err = check_mem_size_reg(env, reg, regno, false, meta);
+ break;
+ case ARG_CONST_SIZE_OR_ZERO:
+ err = check_mem_size_reg(env, reg, regno, true, meta);
+ break;
+ case ARG_PTR_TO_DYNPTR:
+ /* We only need to check for initialized / uninitialized helper
+ * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the
+ * assumption is that if it is, that a helper function
+ * initialized the dynptr on behalf of the BPF program.
+ */
+ if (base_type(reg->type) == PTR_TO_DYNPTR)
+ break;
if (arg_type & MEM_UNINIT) {
if (!is_dynptr_reg_valid_uninit(env, reg)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
@@ -6031,39 +6108,55 @@ skip_type_check:
}
meta->uninit_dynptr_regno = regno;
- } else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) {
+ } else if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ arg + 1);
+ return -EINVAL;
+ } else if (!is_dynptr_type_expected(env, reg, arg_type)) {
const char *err_extra = "";
switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
case DYNPTR_TYPE_LOCAL:
- err_extra = "local ";
+ err_extra = "local";
break;
case DYNPTR_TYPE_RINGBUF:
- err_extra = "ringbuf ";
+ err_extra = "ringbuf";
break;
default:
+ err_extra = "<unknown>";
break;
}
-
- verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
err_extra, arg + 1);
return -EINVAL;
}
- } else if (arg_type_is_alloc_size(arg_type)) {
+ break;
+ case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
regno);
return -EACCES;
}
meta->mem_size = reg->var_off.value;
- } else if (arg_type_is_int_ptr(arg_type)) {
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_INT:
+ case ARG_PTR_TO_LONG:
+ {
int size = int_ptr_type_to_size(arg_type);
err = check_helper_mem_access(env, regno, size, false, meta);
if (err)
return err;
err = check_ptr_alignment(env, reg, 0, size, true);
- } else if (arg_type == ARG_PTR_TO_CONST_STR) {
+ break;
+ }
+ case ARG_PTR_TO_CONST_STR:
+ {
struct bpf_map *map = reg->map_ptr;
int map_off;
u64 map_addr;
@@ -6102,9 +6195,12 @@ skip_type_check:
verbose(env, "string is not zero-terminated\n");
return -EINVAL;
}
- } else if (arg_type == ARG_PTR_TO_KPTR) {
+ break;
+ }
+ case ARG_PTR_TO_KPTR:
if (process_kptr_func(env, regno, meta))
return -EACCES;
+ break;
}
return err;
@@ -6144,7 +6240,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
{
- return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
+ return env->prog->jit_requested &&
+ bpf_jit_supports_subprog_tailcalls();
}
static int check_map_func_compatibility(struct bpf_verifier_env *env,
@@ -6176,6 +6273,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_ringbuf_discard_dynptr)
goto error;
break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ if (func_id != BPF_FUNC_user_ringbuf_drain)
+ goto error;
+ break;
case BPF_MAP_TYPE_STACK_TRACE:
if (func_id != BPF_FUNC_get_stackid)
goto error;
@@ -6295,6 +6396,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_RINGBUF)
goto error;
break;
+ case BPF_FUNC_user_ringbuf_drain:
+ if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -6400,11 +6505,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
return count <= 1;
}
-static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
- enum bpf_arg_type arg_next)
+static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
{
- return (base_type(arg_curr) == ARG_PTR_TO_MEM) !=
- arg_type_is_mem_size(arg_next);
+ bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
+ bool has_size = fn->arg_size[arg] != 0;
+ bool is_next_size = false;
+
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type))
+ is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
+
+ if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
+ return is_next_size;
+
+ return has_size == is_next_size || is_next_size == is_fixed;
}
static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -6415,43 +6528,16 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
* helper function specification.
*/
if (arg_type_is_mem_size(fn->arg1_type) ||
- base_type(fn->arg5_type) == ARG_PTR_TO_MEM ||
- check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
- check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
- check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
- check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+ check_args_pair_invalid(fn, 0) ||
+ check_args_pair_invalid(fn, 1) ||
+ check_args_pair_invalid(fn, 2) ||
+ check_args_pair_invalid(fn, 3) ||
+ check_args_pair_invalid(fn, 4))
return false;
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
-{
- int count = 0;
-
- if (arg_type_may_be_refcounted(fn->arg1_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg2_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg3_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg4_type))
- count++;
- if (arg_type_may_be_refcounted(fn->arg5_type))
- count++;
-
- /* A reference acquiring function cannot acquire
- * another refcounted ptr.
- */
- if (may_be_acquire_function(func_id) && count)
- return false;
-
- /* We only support one arg being unreferenced at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- return count <= 1;
-}
-
static bool check_btf_id_ok(const struct bpf_func_proto *fn)
{
int i;
@@ -6460,50 +6546,35 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
return false;
- if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+ if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
+ /* arg_btf_id and arg_size are in a union. */
+ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
+ !(fn->arg_type[i] & MEM_FIXED_SIZE)))
return false;
}
return true;
}
-static int check_func_proto(const struct bpf_func_proto *fn, int func_id,
- struct bpf_call_arg_meta *meta)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_btf_id_ok(fn) &&
- check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
+ check_btf_id_ok(fn) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
*/
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
- struct bpf_func_state *state)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (reg_is_pkt_pointer_any(&regs[i]))
- mark_reg_unknown(env, regs, i);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
if (reg_is_pkt_pointer_any(reg))
__mark_reg_unknown(env, reg);
- }
-}
-
-static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
-{
- struct bpf_verifier_state *vstate = env->cur_state;
- int i;
-
- for (i = 0; i <= vstate->curframe; i++)
- __clear_all_pkt_pointers(env, vstate->frame[i]);
+ }));
}
enum {
@@ -6532,41 +6603,24 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
-static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state,
- int ref_obj_id)
-{
- struct bpf_reg_state *regs = state->regs, *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].ref_obj_id == ref_obj_id)
- mark_reg_unknown(env, regs, i);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(env, reg);
- }
-}
-
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
*/
static int release_reference(struct bpf_verifier_env *env,
int ref_obj_id)
{
- struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
int err;
- int i;
err = release_reference_state(cur_func(env), ref_obj_id);
if (err)
return err;
- for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], ref_obj_id);
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ __mark_reg_unknown(env, reg);
+ }));
return 0;
}
@@ -6614,7 +6668,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
func_info_aux = env->prog->aux->func_info_aux;
if (func_info_aux)
is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_subprog_arg_match(env, subprog, caller->regs);
+ err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
if (is_global) {
@@ -6788,6 +6842,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return err;
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6809,6 +6864,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6838,6 +6894,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
return 0;
}
@@ -6865,6 +6922,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
+ callee->callback_ret_range = tnum_range(0, 1);
+ return 0;
+}
+
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+ * callback_ctx, u64 flags);
+ * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
return 0;
}
@@ -6892,7 +6973,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller = state->frame[state->curframe];
if (callee->in_callback_fn) {
/* enforce R0 return value range [0, 1]. */
- struct tnum range = tnum_range(0, 1);
+ struct tnum range = callee->callback_ret_range;
if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n");
@@ -6907,10 +6988,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
+ /* callback_fn frame should have released its own additions to parent's
+ * reference state at this point, or check_reference_leak would
+ * complain, hence it must be the same as the caller. There is no need
+ * to copy it back.
+ */
+ if (!callee->in_callback_fn) {
+ /* Transfer references to the caller */
+ err = copy_reference_state(caller, callee);
+ if (err)
+ return err;
+ }
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
@@ -6943,9 +7031,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
ret_reg->s32_max_value = meta->msize_max_value;
ret_reg->smin_value = -MAX_ERRNO;
ret_reg->s32_min_value = -MAX_ERRNO;
- __reg_deduce_bounds(ret_reg);
- __reg_bound_offset(ret_reg);
- __update_reg_bounds(ret_reg);
+ reg_bounds_sync(ret_reg);
}
static int
@@ -7001,8 +7087,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
struct bpf_reg_state *regs = cur_regs(env), *reg;
struct bpf_map *map = meta->map_ptr;
- struct tnum range;
- u64 val;
+ u64 val, max;
int err;
if (func_id != BPF_FUNC_tail_call)
@@ -7012,10 +7097,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EINVAL;
}
- range = tnum_range(0, map->max_entries - 1);
reg = &regs[BPF_REG_3];
+ val = reg->var_off.value;
+ max = map->max_entries;
- if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+ if (!(register_is_const(reg) && val < max)) {
bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
return 0;
}
@@ -7023,8 +7109,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
err = mark_chain_precision(env, BPF_REG_3);
if (err)
return err;
-
- val = reg->var_off.value;
if (bpf_map_key_unseen(aux))
bpf_map_key_store(aux, val);
else if (!bpf_map_key_poisoned(aux) &&
@@ -7036,13 +7120,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env)
{
struct bpf_func_state *state = cur_func(env);
+ bool refs_lingering = false;
int i;
+ if (state->frameno && !state->in_callback_fn)
+ return 0;
+
for (i = 0; i < state->acquired_refs; i++) {
+ if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
}
- return state->acquired_refs ? -EINVAL : 0;
+ return refs_lingering ? -EINVAL : 0;
}
static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
@@ -7103,9 +7194,45 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ bool reg_is_null = register_is_null(reg);
+
+ if (reg_is_null)
+ mark_chain_precision(env, BPF_REG_4);
+
+ return reg_is_null;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+ struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+ if (!state->initialized) {
+ state->initialized = 1;
+ state->fit_for_inline = loop_flag_is_zero(env);
+ state->callback_subprogno = subprogno;
+ return;
+ }
+
+ if (!state->fit_for_inline)
+ return;
+
+ state->fit_for_inline = (loop_flag_is_zero(env) &&
+ state->callback_subprogno == subprogno);
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
const struct bpf_func_proto *fn = NULL;
enum bpf_return_type ret_type;
enum bpf_type_flag ret_flag;
@@ -7153,7 +7280,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn, func_id, &meta);
+ err = check_func_proto(fn, func_id);
if (err) {
verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
@@ -7255,6 +7382,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
err = check_bpf_snprintf_call(env, regs);
break;
case BPF_FUNC_loop:
+ update_loop_inline_state(env, meta.subprogno);
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_loop_callback_state);
break;
@@ -7264,6 +7392,46 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
reg_type_str(env, regs[BPF_REG_1].type));
return -EACCES;
}
+ break;
+ case BPF_FUNC_set_retval:
+ if (prog_type == BPF_PROG_TYPE_LSM &&
+ env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+ return -EINVAL;
+ }
+ }
+ break;
+ case BPF_FUNC_dynptr_data:
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
+
+ if (base_type(reg->type) != PTR_TO_DYNPTR)
+ /* Find the id of the dynptr we're
+ * tracking the reference of
+ */
+ meta.ref_obj_id = stack_slot_get_id(env, reg);
+ break;
+ }
+ }
+ if (i == MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n");
+ return -EFAULT;
+ }
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_user_ringbuf_callback_state);
+ break;
}
if (err)
@@ -7280,13 +7448,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* update return register (already marked as written above) */
ret_type = fn->ret_type;
- ret_flag = type_flag(fn->ret_type);
- if (ret_type == RET_INTEGER) {
+ ret_flag = type_flag(ret_type);
+
+ switch (base_type(ret_type)) {
+ case RET_INTEGER:
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (ret_type == RET_VOID) {
+ break;
+ case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT;
- } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
+ break;
+ case RET_PTR_TO_MAP_VALUE:
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -7305,20 +7477,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
map_value_has_spin_lock(meta.map_ptr)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
+ break;
+ case RET_PTR_TO_SOCKET:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
+ break;
+ case RET_PTR_TO_SOCK_COMMON:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
+ break;
+ case RET_PTR_TO_TCP_SOCK:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
- } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
+ break;
+ case RET_PTR_TO_ALLOC_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
+ break;
+ case RET_PTR_TO_MEM_OR_BTF_ID:
+ {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -7350,7 +7528,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
+ break;
+ }
+ case RET_PTR_TO_BTF_ID:
+ {
struct btf *ret_btf;
int ret_btf_id;
@@ -7360,6 +7541,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
ret_btf = meta.kptr_off_desc->kptr.btf;
ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
} else {
+ if (fn->ret_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
+ func_id_name(func_id));
+ return -EINVAL;
+ }
ret_btf = btf_vmlinux;
ret_btf_id = *fn->ret_btf_id;
}
@@ -7371,7 +7558,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- } else {
+ break;
+ }
+ default:
verbose(env, "unknown return type %u of func %s#%d\n",
base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
@@ -7380,7 +7569,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (is_ptr_cast_function(func_id)) {
+ if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
+ func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
@@ -7392,21 +7587,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].id = id;
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = id;
- } else if (func_id == BPF_FUNC_dynptr_data) {
- int dynptr_id = 0, i;
-
- /* Find the id of the dynptr we're acquiring a reference to */
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- if (arg_type_is_dynptr(fn->arg_type[i])) {
- if (dynptr_id) {
- verbose(env, "verifier internal error: multiple dynptr args in func\n");
- return -EFAULT;
- }
- dynptr_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]);
- }
- }
- /* For release_reference() */
- regs[BPF_REG_0].ref_obj_id = dynptr_id;
}
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
@@ -7478,11 +7658,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_kfunc_arg_meta meta = { 0 };
const char *func_name, *ptr_type_name;
u32 i, nargs, func_id, ptr_type_id;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
struct btf *desc_btf;
+ u32 *kfunc_flags;
bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
@@ -7498,18 +7680,23 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name = btf_name_by_offset(desc_btf, func->name_off);
func_proto = btf_type_by_id(desc_btf, func->type);
- if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_CHECK, func_id)) {
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
+ if (!kfunc_flags) {
verbose(env, "calling kernel function %s is not allowed\n",
func_name);
return -EACCES;
}
+ if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n");
+ return -EACCES;
+ }
+
+ acq = *kfunc_flags & KF_ACQUIRE;
- acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_ACQUIRE, func_id);
+ meta.flags = *kfunc_flags;
/* Check the arguments */
- err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
+ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
@@ -7530,7 +7717,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
- if (acq && !btf_type_is_ptr(t)) {
+ if (acq && !btf_type_is_struct_ptr(desc_btf, t)) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
}
@@ -7542,19 +7729,34 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
&ptr_type_id);
if (!btf_type_is_struct(ptr_type)) {
- ptr_type_name = btf_name_by_offset(desc_btf,
- ptr_type->name_off);
- verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
- func_name, btf_type_str(ptr_type),
- ptr_type_name);
- return -EINVAL;
+ if (!meta.r0_size) {
+ ptr_type_name = btf_name_by_offset(desc_btf,
+ ptr_type->name_off);
+ verbose(env,
+ "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name,
+ btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.r0_size;
+
+ if (meta.r0_rdonly)
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+
+ /* Ensures we don't access the memory after a release_reference() */
+ if (meta.ref_obj_id)
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
}
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].type = PTR_TO_BTF_ID;
- regs[BPF_REG_0].btf_id = ptr_type_id;
- if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+ if (*kfunc_flags & KF_RET_NULL) {
regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
regs[BPF_REG_0].id = ++env->id_gen;
@@ -7661,11 +7863,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
- return &env->insn_aux_data[env->insn_idx];
-}
-
enum {
REASON_BOUNDS = -1,
REASON_TYPE = -2,
@@ -8202,11 +8399,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
return -EINVAL;
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
-
+ reg_bounds_sync(dst_reg);
if (sanitize_check_bounds(env, insn, dst_reg) < 0)
return -EACCES;
if (sanitize_needed(opcode)) {
@@ -8944,10 +9137,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
/* ALU32 ops are zero extended into 64bit register */
if (alu32)
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
return 0;
}
@@ -9043,7 +9233,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (opcode == BPF_END || opcode == BPF_NEG) {
if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != 0 ||
+ if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
verbose(env, "BPF_NEG uses reserved fields\n");
@@ -9136,10 +9326,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->dst_reg);
}
zext_32_to_64(dst_reg);
-
- __update_reg_bounds(dst_reg);
- __reg_deduce_bounds(dst_reg);
- __reg_bound_offset(dst_reg);
+ reg_bounds_sync(dst_reg);
}
} else {
/* case: R = imm
@@ -9211,34 +9398,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static void __find_good_pkt_pointers(struct bpf_func_state *state,
- struct bpf_reg_state *dst_reg,
- enum bpf_reg_type type, int new_range)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++) {
- reg = &state->regs[i];
- if (reg->type == type && reg->id == dst_reg->id)
- /* keep the maximum range already checked */
- reg->range = max(reg->range, new_range);
- }
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
-}
-
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
- int new_range, i;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int new_range;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -9303,9 +9470,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i <= vstate->curframe; i++)
- __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
- new_range);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }));
}
static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
@@ -9577,26 +9746,33 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
return;
switch (opcode) {
+ /* JEQ/JNE comparison doesn't change the register equivalence.
+ *
+ * r1 = r2;
+ * if (r1 == 42) goto label;
+ * ...
+ * label: // here both r1 and r2 are known to be 42.
+ *
+ * Hence when marking register as known preserve it's ID.
+ */
case BPF_JEQ:
+ if (is_jmp32) {
+ __mark_reg32_known(true_reg, val32);
+ true_32off = tnum_subreg(true_reg->var_off);
+ } else {
+ ___mark_reg_known(true_reg, val);
+ true_64off = true_reg->var_off;
+ }
+ break;
case BPF_JNE:
- {
- struct bpf_reg_state *reg =
- opcode == BPF_JEQ ? true_reg : false_reg;
-
- /* JEQ/JNE comparison doesn't change the register equivalence.
- * r1 = r2;
- * if (r1 == 42) goto label;
- * ...
- * label: // here both r1 and r2 are known to be 42.
- *
- * Hence when marking register as known preserve it's ID.
- */
- if (is_jmp32)
- __mark_reg32_known(reg, val32);
- else
- ___mark_reg_known(reg, val);
+ if (is_jmp32) {
+ __mark_reg32_known(false_reg, val32);
+ false_32off = tnum_subreg(false_reg->var_off);
+ } else {
+ ___mark_reg_known(false_reg, val);
+ false_64off = false_reg->var_off;
+ }
break;
- }
case BPF_JSET:
if (is_jmp32) {
false_32off = tnum_and(false_32off, tnum_const(~val32));
@@ -9735,21 +9911,8 @@ static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
dst_reg->smax_value);
src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
dst_reg->var_off);
- /* We might have learned new bounds from the var_off. */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
- /* We might have learned something about the sign bit. */
- __reg_deduce_bounds(src_reg);
- __reg_deduce_bounds(dst_reg);
- /* We might have learned some bits from the bounds. */
- __reg_bound_offset(src_reg);
- __reg_bound_offset(dst_reg);
- /* Intersecting with the old var_off might have improved our bounds
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
- * then new var_off is (0; 0x7f...fc) which improves our umax.
- */
- __update_reg_bounds(src_reg);
- __update_reg_bounds(dst_reg);
+ reg_bounds_sync(src_reg);
+ reg_bounds_sync(dst_reg);
}
static void reg_combine_min_max(struct bpf_reg_state *true_src,
@@ -9800,7 +9963,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (!reg_may_point_to_spin_lock(reg)) {
/* For not-NULL ptr, reg->ref_obj_id will be reset
- * in release_reg_references().
+ * in release_reference().
*
* reg->id is still used by spin_lock ptr. Other
* than spin_lock ptr type, reg->id can be reset.
@@ -9810,22 +9973,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
-static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
- bool is_null)
-{
- struct bpf_reg_state *reg;
- int i;
-
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
-
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
-}
-
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -9833,10 +9980,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs, *reg;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
@@ -9845,8 +9991,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
*/
WARN_ON_ONCE(release_reference_state(state, id));
- for (i = 0; i <= vstate->curframe; i++)
- __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }));
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -9959,23 +10106,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
{
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- int i, j;
-
- for (i = 0; i <= vstate->curframe; i++) {
- state = vstate->frame[i];
- for (j = 0; j < MAX_BPF_REG; j++) {
- reg = &state->regs[j];
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
- bpf_for_each_spilled_reg(j, state, reg) {
- if (!reg)
- continue;
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
- *reg = *known_reg;
- }
- }
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ *reg = *known_reg;
+ }));
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -10379,11 +10514,21 @@ static int check_return_code(struct bpf_verifier_env *env)
const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog &&
- (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
- prog_type == BPF_PROG_TYPE_LSM) &&
- !prog->aux->attach_func_proto->type)
- return 0;
+ if (!is_subprog) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ /* See below, can be 0 or 0-1 depending on hook. */
+ break;
+ fallthrough;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ }
/* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
@@ -10474,6 +10619,22 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_SK_LOOKUP:
range = tnum_range(SK_DROP, SK_PASS);
break;
+
+ case BPF_PROG_TYPE_LSM:
+ if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
+ /* Regular BPF_PROG_TYPE_LSM programs can return
+ * any value.
+ */
+ return 0;
+ }
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ range = tnum_range(1, 1);
+ }
+ break;
+
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -10490,6 +10651,10 @@ static int check_return_code(struct bpf_verifier_env *env)
if (!tnum_in(range, reg->var_off)) {
verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
+ if (prog->expected_attach_type == BPF_LSM_CGROUP &&
+ prog_type == BPF_PROG_TYPE_LSM &&
+ !prog->aux->attach_func_proto->type)
+ verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
return -EINVAL;
}
@@ -10901,7 +11066,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
scalar_return =
- btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
+ btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
goto err_free;
@@ -12246,6 +12411,16 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback
+ * function, for which reference_state must
+ * match caller reference state when it exits.
+ */
+ err = check_reference_leak(env);
+ if (err)
+ return err;
+
if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
@@ -12255,10 +12430,6 @@ static int do_check(struct bpf_verifier_env *env)
continue;
}
- err = check_reference_leak(env);
- if (err)
- return err;
-
err = check_return_code(env);
if (err)
return err;
@@ -12471,14 +12642,6 @@ err_put:
return err;
}
-static int check_map_prealloc(struct bpf_map *map)
-{
- return (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
- map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
- !(map->map_flags & BPF_F_NO_PREALLOC);
-}
-
static bool is_tracing_prog_type(enum bpf_prog_type type)
{
switch (type) {
@@ -12486,56 +12649,19 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
return true;
default:
return false;
}
}
-static bool is_preallocated_map(struct bpf_map *map)
-{
- if (!check_map_prealloc(map))
- return false;
- if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
- return false;
- return true;
-}
-
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- /*
- * Validate that trace type programs use preallocated hash maps.
- *
- * For programs attached to PERF events this is mandatory as the
- * perf NMI can hit any arbitrary code sequence.
- *
- * All other trace types using preallocated hash maps are unsafe as
- * well because tracepoint or kprobes can be inside locked regions
- * of the memory allocator or at a place where a recursion into the
- * memory allocator would see inconsistent state.
- *
- * On RT enabled kernels run-time allocation of all trace type
- * programs is strictly prohibited due to lock type constraints. On
- * !RT kernels it is allowed for backwards compatibility reasons for
- * now, but warnings are emitted so developers are made aware of
- * the unsafety and can fix their programs before this is enforced.
- */
- if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
- if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
- verbose(env, "perf_event programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- verbose(env, "trace type programs can only use preallocated hash map\n");
- return -EINVAL;
- }
- WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
- verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
- }
if (map_value_has_spin_lock(map)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
@@ -12582,13 +12708,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
- if (!is_preallocated_map(map)) {
- verbose(env,
- "Sleepable programs can only use preallocated maps\n");
- return -EINVAL;
- }
- break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
@@ -13229,7 +13350,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
aux[adj_idx].ptr_type == PTR_TO_CTX)
continue;
- imm_rnd = get_random_int();
+ imm_rnd = get_random_u32();
rnd_hi32_patch[0] = insn;
rnd_hi32_patch[1].imm = imm_rnd;
rnd_hi32_patch[3].dst_reg = load_reg;
@@ -13382,9 +13503,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
- } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
- verbose(env, "Writes through BTF pointers are not allowed\n");
- return -EINVAL;
}
continue;
default:
@@ -13544,6 +13662,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
@@ -13556,9 +13675,6 @@ static int jit_subprogs(struct bpf_verifier_env *env)
poke->aux = func[i]->aux;
}
- /* Use bpf_prog_F_tag to indicate functions in stack traces.
- * Long term would need debug info to populate names
- */
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
@@ -14294,6 +14410,142 @@ patch_call_imm:
return 0;
}
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
+ struct bpf_insn insn_buf[] = {
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
+ BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 16),
+ /* spill R6, R7, R8 to use these as loop vars */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
+ /* initialize loop vars */
+ BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
+ BPF_MOV32_IMM(reg_loop_cnt, 0),
+ BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
+ BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
+ BPF_CALL_REL(0),
+ /* increment loop counter */
+ BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
+ /* jump to loop header if callback returned 0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
+ /* restore original values of R6, R7, R8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
+ };
+
+ *cnt = ARRAY_SIZE(insn_buf);
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+ if (!new_prog)
+ return new_prog;
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
+ }
+
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+ return 0;
+}
+
static void free_states(struct bpf_verifier_env *env)
{
struct bpf_verifier_state_list *sl, *sln;
@@ -14713,6 +14965,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
fallthrough;
case BPF_MODIFY_RETURN:
case BPF_LSM_MAC:
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
if (!btf_type_is_func(t)) {
@@ -14829,8 +15082,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM) {
- verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+ prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n");
return -EINVAL;
}
@@ -15031,6 +15284,9 @@ skip_full_check:
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
+ if (ret == 0)
+ ret = optimize_bpf_loop(env);
+
if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 9594cfd1cf2c..08caad776717 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -1,329 +1,101 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ * Clang Control Flow Integrity (CFI) error handling.
*
- * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2022 Google LLC
*/
-#include <linux/hardirq.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/printk.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate.h>
-#include <linux/vmalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/set_memory.h>
+#include <linux/cfi.h>
-/* Compiler-defined handler names */
-#ifdef CONFIG_CFI_PERMISSIVE
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail
-#else
-#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
-#endif
-
-static inline void handle_cfi_failure(void *ptr)
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+ unsigned long *target, u32 type)
{
- if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
- WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+ if (target)
+ pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+ (void *)addr, (void *)*target, type);
else
- panic("CFI failure (target: %pS)\n", ptr);
-}
-
-#ifdef CONFIG_MODULES
-#ifdef CONFIG_CFI_CLANG_SHADOW
-/*
- * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
- * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
- */
-typedef u16 shadow_t;
-#define SHADOW_INVALID ((shadow_t)~0UL)
-
-struct cfi_shadow {
- /* Page index for the beginning of the shadow */
- unsigned long base;
- /* An array of __cfi_check locations (as indices to the shadow) */
- shadow_t shadow[1];
-} __packed;
-
-/*
- * The shadow covers ~128M from the beginning of the module region. If
- * the region is larger, we fall back to __module_address for the rest.
- */
-#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
-
-/* The in-memory size of struct cfi_shadow, always at least one page */
-#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
-#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
-#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
-
-/* The actual size of the shadow array, minus metadata */
-#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
-#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
-
-static DEFINE_MUTEX(shadow_update_lock);
-static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
-
-/* Returns the index in the shadow for the given address */
-static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
-{
- unsigned long index;
- unsigned long page = ptr >> PAGE_SHIFT;
-
- if (unlikely(page < s->base))
- return -1; /* Outside of module area */
-
- index = page - s->base;
-
- if (index >= SHADOW_ARR_SLOTS)
- return -1; /* Cannot be addressed with shadow */
-
- return (int)index;
-}
-
-/* Returns the page address for an index in the shadow */
-static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- return (s->base + index) << PAGE_SHIFT;
-}
+ pr_err("CFI failure at %pS (no target information)\n",
+ (void *)addr);
-/* Returns the __cfi_check function address for the given shadow location */
-static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
- int index)
-{
- if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
- return 0;
-
- if (unlikely(s->shadow[index] == SHADOW_INVALID))
- return 0;
-
- /* __cfi_check is always page aligned */
- return (s->base + s->shadow[index]) << PAGE_SHIFT;
-}
-
-static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
- struct cfi_shadow *next)
-{
- int i, index, check;
-
- /* Mark everything invalid */
- memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
-
- if (!prev)
- return; /* No previous shadow */
-
- /* If the base address didn't change, an update is not needed */
- if (prev->base == next->base) {
- memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
- return;
+ if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+ __warn(NULL, 0, (void *)addr, 0, regs, NULL);
+ return BUG_TRAP_TYPE_WARN;
}
- /* Convert the previous shadow to the new address range */
- for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
- if (prev->shadow[i] == SHADOW_INVALID)
- continue;
-
- index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
- if (index < 0)
- continue;
-
- check = ptr_to_shadow(next,
- shadow_to_check_fn(prev, prev->shadow[i]));
- if (check < 0)
- continue;
-
- next->shadow[index] = (shadow_t)check;
- }
+ return BUG_TRAP_TYPE_BUG;
}
-static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
{
- int check_index;
- unsigned long check = (unsigned long)mod->cfi_check;
- unsigned long ptr;
-
- if (unlikely(!PAGE_ALIGNED(check))) {
- pr_warn("cfi: not using shadow for module %s\n", mod->name);
- return;
- }
-
- check_index = ptr_to_shadow(s, check);
- if (check_index < 0)
- return; /* Module not addressable with shadow */
-
- /* For each page, store the check function index in the shadow */
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
-
- if (index >= 0) {
- /* Each page must only contain one module */
- WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
- s->shadow[index] = (shadow_t)check_index;
- }
- }
+ return (unsigned long)((long)p + (long)*p);
}
-static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
- unsigned long min_addr, unsigned long max_addr)
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
{
- unsigned long ptr;
-
- for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
- int index = ptr_to_shadow(s, ptr);
+ s32 *p;
- if (index >= 0)
- s->shadow[index] = SHADOW_INVALID;
+ for (p = start; p < end; ++p) {
+ if (trap_address(p) == addr)
+ return true;
}
-}
-typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
- unsigned long min_addr, unsigned long max_addr);
-
-static void update_shadow(struct module *mod, unsigned long base_addr,
- update_shadow_fn fn)
-{
- struct cfi_shadow *prev;
- struct cfi_shadow *next;
- unsigned long min_addr, max_addr;
-
- next = vmalloc(SHADOW_SIZE);
-
- mutex_lock(&shadow_update_lock);
- prev = rcu_dereference_protected(cfi_shadow,
- mutex_is_locked(&shadow_update_lock));
-
- if (next) {
- next->base = base_addr >> PAGE_SHIFT;
- prepare_next_shadow(prev, next);
-
- min_addr = (unsigned long)mod->core_layout.base;
- max_addr = min_addr + mod->core_layout.text_size;
- fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
-
- set_memory_ro((unsigned long)next, SHADOW_PAGES);
- }
-
- rcu_assign_pointer(cfi_shadow, next);
- mutex_unlock(&shadow_update_lock);
- synchronize_rcu();
-
- if (prev) {
- set_memory_rw((unsigned long)prev, SHADOW_PAGES);
- vfree(prev);
- }
-}
-
-void cfi_module_add(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, add_module_to_shadow);
-}
-
-void cfi_module_remove(struct module *mod, unsigned long base_addr)
-{
- update_shadow(mod, base_addr, remove_module_from_shadow);
+ return false;
}
-static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
- unsigned long ptr)
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ struct module *mod)
{
- int index;
+ char *secstrings;
+ unsigned int i;
- if (unlikely(!s))
- return NULL; /* No shadow available */
+ mod->kcfi_traps = NULL;
+ mod->kcfi_traps_end = NULL;
- index = ptr_to_shadow(s, ptr);
- if (index < 0)
- return NULL; /* Cannot be addressed with shadow */
+ secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- return (cfi_check_fn)shadow_to_check_fn(s, index);
-}
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn;
-
- rcu_read_lock_sched_notrace();
- fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched_notrace();
-
- return fn;
-}
-
-#else /* !CONFIG_CFI_CLANG_SHADOW */
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+ continue;
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
- return NULL;
+ mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+ mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+ break;
+ }
}
-#endif /* CONFIG_CFI_CLANG_SHADOW */
-
-static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+static bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = NULL;
struct module *mod;
+ bool found = false;
rcu_read_lock_sched_notrace();
- mod = __module_address(ptr);
- if (mod)
- fn = mod->cfi_check;
- rcu_read_unlock_sched_notrace();
- return fn;
-}
-
-static inline cfi_check_fn find_check_fn(unsigned long ptr)
-{
- cfi_check_fn fn = NULL;
-
- if (is_kernel_text(ptr))
- return __cfi_check;
-
- /*
- * Indirect call checks can happen when RCU is not watching. Both
- * the shadow and __module_address use RCU, so we need to wake it
- * up if necessary.
- */
- RCU_NONIDLE({
- if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
- fn = find_shadow_check_fn(ptr);
+ mod = __module_address(addr);
+ if (mod)
+ found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
- if (!fn)
- fn = find_module_check_fn(ptr);
- });
+ rcu_read_unlock_sched_notrace();
- return fn;
+ return found;
}
-
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
{
- cfi_check_fn fn = find_check_fn((unsigned long)ptr);
-
- if (likely(fn))
- fn(id, ptr, diag);
- else /* Don't allow unchecked modules */
- handle_cfi_failure(ptr);
+ return false;
}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+#endif /* CONFIG_MODULES */
-#else /* !CONFIG_MODULES */
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+bool is_cfi_trap(unsigned long addr)
{
- handle_cfi_failure(ptr); /* No modules */
-}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+ if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+ return true;
-#endif /* CONFIG_MODULES */
-
-void cfi_failure_handler(void *data, void *ptr, void *vtable)
-{
- handle_cfi_failure(ptr);
+ return is_module_cfi_trap(addr);
}
-EXPORT_SYMBOL(cfi_failure_handler);
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5da09c74228d..fd4020835ec6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
@@ -233,6 +232,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
@@ -249,6 +249,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
+void cgroup_attach_lock(bool lock_threadgroup);
+void cgroup_attach_unlock(bool lock_threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index afc6c0e9c966..52bb5a74a23b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -59,7 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(true);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -71,7 +71,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(true);
mutex_unlock(&cgroup_mutex);
return retval;
@@ -875,6 +875,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
+ if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -898,6 +900,8 @@ enum cgroup1_param {
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
+ Opt_favordynmods,
+ Opt_nofavordynmods,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@@ -909,6 +913,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
+ fsparam_flag ("favordynmods", Opt_favordynmods),
+ fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
{}
};
@@ -960,6 +966,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_nofavordynmods:
+ ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
@@ -1211,8 +1223,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
- if (ret)
+ if (!ret)
+ cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+ else
cgroup_free_root(root);
+
return ret;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1779ccddb734..2319946715e0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -217,6 +217,7 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
/* cgroup optional features */
enum cgroup_opt_features {
@@ -279,8 +280,6 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
@@ -765,7 +764,8 @@ struct css_set init_css_set = {
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
@@ -1240,7 +1240,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1307,6 +1308,20 @@ struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
return root_cgrp->root;
}
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ if (favor && !favoring) {
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+}
+
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@@ -1367,6 +1382,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_root_count--;
}
+ cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
@@ -1377,73 +1393,87 @@ static void cgroup_destroy_root(struct cgroup_root *root)
}
/*
- * look up cgroup associated with current task's cgroup namespace on the
- * specified hierarchy
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
{
- struct cgroup *res = NULL;
- struct css_set *cset;
-
- lockdep_assert_held(&css_set_lock);
+ struct cgroup *res_cgroup = NULL;
- rcu_read_lock();
-
- cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
- res = &root->cgrp;
+ res_cgroup = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
+ res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
- res = c;
+ res_cgroup = c;
break;
}
}
}
- rcu_read_unlock();
- BUG_ON(!res);
- return res;
+ BUG_ON(!res_cgroup);
+ return res_cgroup;
}
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
- struct cgroup_root *root)
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
+ struct css_set *cset;
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
+ rcu_read_lock();
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ res = __cset_cgroup_from_root(cset, root);
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
+ rcu_read_unlock();
- BUG_ON(!res);
return res;
}
/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_lock);
+
+ return __cset_cgroup_from_root(cset, root);
+}
+
+/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
@@ -1678,12 +1708,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1706,14 +1740,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
return 0;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -1809,6 +1851,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
if (ss->css_rstat_flush) {
list_del_rcu(&css->rstat_css_node);
+ synchronize_rcu();
list_add_rcu(&css->rstat_css_node,
&dcgrp->rstat_css_list);
}
@@ -1864,6 +1907,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
nr__cgroup2_params
@@ -1871,6 +1915,7 @@ enum cgroup2_param {
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
@@ -1890,6 +1935,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
@@ -1908,6 +1956,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
@@ -1924,6 +1975,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
@@ -1974,7 +2027,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- root->flags = ctx->flags;
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@@ -2027,7 +2081,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
}
root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -2150,7 +2204,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2196,6 +2250,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
+
+#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+#endif
return 0;
}
@@ -2334,7 +2392,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
+ ret = strscpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
@@ -2344,6 +2402,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+void cgroup_attach_lock(bool lock_threadgroup)
+{
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2570,10 +2669,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
- /* mixables don't care */
- if (cgroup_is_mixable(dst_cgrp))
- return 0;
-
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@@ -2597,21 +2692,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
+ list_del_init(&cset->mg_dst_preload_node);
put_css_set_locked(cset);
}
@@ -2651,7 +2752,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&src_cset->mg_src_preload_node))
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
@@ -2664,7 +2765,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2689,7 +2790,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
@@ -2708,7 +2809,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2716,8 +2817,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2813,8 +2914,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ bool *threadgroup_locked)
{
struct task_struct *tsk;
pid_t pid;
@@ -2831,12 +2931,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2867,17 +2963,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
goto out_unlock_rcu;
out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
- }
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2885,8 +2978,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2941,29 +3034,47 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ cgroup_attach_lock(has_tasks);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
@@ -2975,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -3212,11 +3323,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -3609,27 +3716,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
@@ -3649,8 +3756,8 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return -EBUSY;
}
- psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
- new = psi_trigger_create(psi, buf, nbytes, res);
+ psi = cgroup_psi(cgrp);
+ new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
@@ -3666,21 +3773,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3700,6 +3872,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
bool cgroup_psi_enabled(void)
{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@@ -4052,8 +4227,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4118,21 +4291,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
@@ -4146,26 +4323,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
@@ -4187,6 +4364,12 @@ int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
@@ -4292,6 +4475,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4923,13 +5126,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4955,7 +5158,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5051,10 +5254,14 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5062,7 +5269,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5070,12 +5277,27 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_PRESSURE,
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -5352,8 +5574,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5405,7 +5626,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5838,16 +6059,11 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
- /*
- * The latency of the synchronize_rcu() is too high for cgroups,
- * avoid it at the cost of forcing all readers into the slow path.
- */
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
@@ -5964,16 +6180,22 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
/*
* cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
- * On success return the cgrp, on failure return NULL
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
*/
struct cgroup *cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp = NULL;
+ struct cgroup *cgrp, *root_cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
- goto out;
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
rcu_read_lock();
@@ -5982,9 +6204,17 @@ struct cgroup *cgroup_get_from_id(u64 id)
cgrp = NULL;
rcu_read_unlock();
-
kernfs_put(kn);
-out:
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -6014,7 +6244,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -6080,16 +6310,37 @@ void cgroup_fork(struct task_struct *child)
INIT_LIST_HEAD(&child->cg_list);
}
-static struct cgroup *cgroup_get_from_file(struct file *f)
+/**
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
+ *
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
+ */
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
struct cgroup_subsys_state *css;
- struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
if (IS_ERR(css))
return ERR_CAST(css);
- cgrp = css->cgroup;
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
@@ -6561,8 +6812,10 @@ struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
if (!kn)
goto out;
@@ -6587,15 +6840,15 @@ out:
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
struct file *f;
@@ -6604,10 +6857,29 @@ struct cgroup *cgroup_get_from_fd(int fd)
if (!f)
return ERR_PTR(-EBADF);
- cgrp = cgroup_get_from_file(f);
+ cgrp = cgroup_v1v2_get_from_file(f);
fput(f);
return cgrp;
}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+ return cgrp;
+}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
static u64 power_of_ten(int power)
@@ -6720,9 +6992,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
- if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
- continue;
-
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
@@ -6742,8 +7011,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
@@ -6759,6 +7031,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
{
return snprintf(buf, PAGE_SIZE,
"nsdelegate\n"
+ "favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n");
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 71a418858a5e..b474289c15b8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -33,6 +33,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
+#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
@@ -85,6 +86,30 @@ struct fmeter {
spinlock_t lock; /* guards read or write of above */
};
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+};
+
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+};
+
struct cpuset {
struct cgroup_subsys_state css;
@@ -168,6 +193,9 @@ struct cpuset {
int use_parent_ecpus;
int child_ecpus_count;
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
};
@@ -175,20 +203,22 @@ struct cpuset {
/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
+ * 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
-#define PRS_ERROR -1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+ return prs_state < 0;
+}
/*
* Temporary cpumasks for working with partitions that are passed among
@@ -268,25 +298,43 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
+{
+ if (is_partition_valid(cs))
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
/*
* Send notification event of whenever partition_root_state changes.
*/
-static inline void notify_partition_change(struct cpuset *cs,
- int old_prs, int new_prs)
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
- if (old_prs != new_prs)
- cgroup_file_notify(&cs->partition_file);
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
+
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
}
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
+ .partition_root_state = PRS_ROOT,
};
/**
@@ -404,6 +452,41 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * It is assumed that @cs is a valid partition root. @excluded_child should
+ * be non-NULL when this cpuset is going to become a partition itself.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+
+ if (cs->css.cgroup->nr_populated_csets)
+ return true;
+ if (!excluded_child && !cs->nr_subparts_cpus)
+ return cgroup_is_populated(cs->css.cgroup);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (child == excluded_child)
+ continue;
+ if (is_partition_valid(child))
+ continue;
+ if (cgroup_is_populated(child->css.cgroup)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
/*
* Return in pmask the portion of a task's cpusets's cpus_allowed that
* are online and are capable of running the task. If none are found,
@@ -659,22 +742,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
-
- /*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
*/
@@ -698,6 +765,22 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
trial->cpus_allowed))
goto out;
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ goto out;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
rcu_read_unlock();
@@ -875,7 +958,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ if (!is_partition_valid(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -1081,7 +1164,7 @@ static void rebuild_sched_domains_locked(void)
if (top_cpuset.nr_subparts_cpus) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_root(cs)) {
+ if (!is_partition_valid(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
@@ -1127,10 +1210,18 @@ static void update_tasks_cpumask(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
+ while ((task = css_task_iter_next(&it))) {
+ /*
+ * Percpu kthreads in top_cpuset are ignored
+ */
+ if (top_cs && (task->flags & PF_KTHREAD) &&
+ kthread_is_per_cpu(task))
+ continue;
set_cpus_allowed_ptr(task, cs->effective_cpus);
+ }
css_task_iter_end(&it);
}
@@ -1165,15 +1256,18 @@ enum subparts_cmd {
partcmd_enable, /* Enable partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's subparts_cpus */
+ partcmd_invalidate, /* Make partition invalid */
};
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cpuset: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
+ * Return: 0 or a partition root state error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1184,38 +1278,36 @@ enum subparts_cmd {
* For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
+ * into parent's effective_cpus. 0 will always be returned.
*
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
*
- * The partcmd_enable and partcmd_disable commands are used by
- * update_prstate(). The partcmd_update command is used by
- * update_cpumasks_hier() with newmask NULL and update_cpumask() with
- * newmask set.
- *
- * The checking is more strict when enabling partition root than the
- * other two commands.
+ * For partcmd_invalidate, the current partition will be made invalid.
*
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change().
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
+ *
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_error will be updated
+ * directly.
*/
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
- struct cpuset *parent = parent_cs(cpuset);
+ struct cpuset *parent = parent_cs(cs);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
int old_prs, new_prs;
- bool part_error = false; /* Partition error? */
+ int part_error = PERR_NONE; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1224,126 +1316,165 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
-
- /*
- * Enabling/disabling partition root is not allowed if there are
- * online children.
- */
- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
- return -EBUSY;
-
- /*
- * Enabling partition root is not allowed if not all the CPUs
- * can be granted from parent's effective_cpus or at least one
- * CPU will be left after that.
- */
- if ((cmd == partcmd_enable) &&
- (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
- return -EINVAL;
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if ((newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cs->cpus_allowed)))
+ return PERR_CPUSEMPTY;
/*
- * A cpumask update cannot make parent's effective_cpus become empty.
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
*/
adding = deleting = false;
- old_prs = new_prs = cpuset->partition_root_state;
+ old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_enable) {
- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ /*
+ * Enabling partition root is not allowed if cpus_allowed
+ * doesn't overlap parent's cpus_allowed.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+ return PERR_INVCPUS;
+
+ /*
+ * A parent can be left with no CPU as long as there is no
+ * task directly associated with the parent partition.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ partition_is_populated(parent, cs))
+ return PERR_NOCPUS;
+
+ cpumask_copy(tmp->addmask, cs->cpus_allowed);
adding = true;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ /*
+ * Need to remove cpus from parent's subparts_cpus for valid
+ * partition root.
+ */
+ deleting = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
+ } else if (cmd == partcmd_invalidate) {
+ if (is_prs_invalid(old_prs))
+ return 0;
+
+ /*
+ * Make the current partition invalid. It is assumed that
+ * invalidation is caused by violating cpu exclusivity rule.
+ */
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
parent->subparts_cpus);
+ if (old_prs > 0) {
+ new_prs = -old_prs;
+ part_error = PERR_NOTEXCL;
+ }
} else if (newmask) {
/*
* partcmd_update with newmask:
*
+ * Compute add/delete mask to/from subparts_cpus
+ *
* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->effective_cpus
+ * addmask = newmask & parent->cpus_allowed
* & ~parent->subparts_cpus
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->subparts_cpus);
- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
- * Return error if the new effective_cpus could become empty.
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
*/
if (adding &&
- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
- if (!deleting)
- return -EINVAL;
- /*
- * As some of the CPUs in subparts_cpus might have
- * been offlined, we need to compute the real delmask
- * to confirm that.
- */
- if (!cpumask_and(tmp->addmask, tmp->delmask,
- cpu_active_mask))
- return -EINVAL;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
+ partition_is_populated(parent, cs)) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
} else {
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effective_cpus
+ * delmask = cpus_allowed & parent->subparts_cpus
+ * addmask = cpus_allowed & parent->cpus_allowed
+ * & ~parent->subparts_cpus
*
- * Note that parent's subparts_cpus may have been
- * pre-shrunk in case there is a change in the cpu list.
- * So no deletion is needed.
+ * This gets invoked either due to a hotplug event or from
+ * update_cpumasks_hier(). This can cause the state of a
+ * partition root to transition from valid to invalid or vice
+ * versa. So we still need to compute the addmask and delmask.
+
+ * A partition error happens when:
+ * 1) Cpuset is valid partition, but parent does not distribute
+ * out any CPUs.
+ * 2) Parent has tasks and all its effective CPUs will have
+ * to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = cpumask_equal(tmp->addmask,
- parent->effective_cpus);
+ cpumask_and(tmp->addmask, cs->cpus_allowed,
+ parent->cpus_allowed);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+
+ if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+ (adding &&
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ partition_is_populated(parent, cs))) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ }
+
+ if (part_error && is_partition_valid(cs) &&
+ parent->nr_subparts_cpus)
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
if (cmd == partcmd_update) {
- int prev_prs = cpuset->partition_root_state;
-
/*
- * Check for possible transition between PRS_ENABLED
- * and PRS_ERROR.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
if (part_error)
- new_prs = PRS_ERROR;
+ new_prs = -old_prs;
break;
- case PRS_ERROR:
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
if (!part_error)
- new_prs = PRS_ENABLED;
+ new_prs = -old_prs;
break;
}
- /*
- * Set part_error if previously in invalid state.
- */
- part_error = (prev_prs == PRS_ERROR);
- }
-
- if (!part_error && (new_prs == PRS_ERROR))
- return 0; /* Nothing need to be done */
-
- if (new_prs == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */
- adding = false;
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
}
if (!adding && !deleting && (new_prs == old_prs))
return 0;
/*
+ * Transitioning between invalid to valid or vice versa may require
+ * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+ */
+ if (old_prs != new_prs) {
+ if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
+ (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
+ return PERR_NOTEXCL;
+ if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+
+ /*
* Change the parent's subparts_cpus.
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
@@ -1369,18 +1500,32 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
if (old_prs != new_prs)
- cpuset->partition_root_state = new_prs;
+ cs->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
- notify_partition_change(cpuset, old_prs, new_prs);
- return cmd == partcmd_update;
+ if (adding || deleting)
+ update_tasks_cpumask(parent);
+
+ /*
+ * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
+ * rebuild_sched_domains_locked() may be called.
+ */
+ if (old_prs != new_prs) {
+ if (old_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ else if (new_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ }
+ notify_partition_change(cs, old_prs);
+ return 0;
}
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
+ * @force: don't skip any descendant cpusets if set
*
* When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
@@ -1389,7 +1534,8 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
*
* Called with cpuset_rwsem held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -1399,14 +1545,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
+ bool update_parent = false;
compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some CPUs.
+ * parent, which is guaranteed to have some CPUs unless
+ * it is a partition root that has explicitly distributed
+ * out all its CPUs.
*/
if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ if (is_partition_valid(cp) &&
+ cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
+ goto update_parent_subparts;
+
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->use_parent_ecpus) {
cp->use_parent_ecpus = true;
@@ -1420,14 +1573,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
/*
* Skip the whole subtree if the cpumask remains the same
- * and has no partition root state.
+ * and has no partition root state and force flag not set.
*/
- if (!cp->partition_root_state &&
+ if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+update_parent_subparts:
/*
* update_parent_subparts_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
@@ -1437,36 +1591,22 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
old_prs = new_prs = cp->partition_root_state;
if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
- case PRS_DISABLED:
- /*
- * If parent is not a partition root or an
- * invalid partition root, clear its state
- * and its CS_CPU_EXCLUSIVE flag.
- */
- WARN_ON_ONCE(cp->partition_root_state
- != PRS_ERROR);
- new_prs = PRS_DISABLED;
-
- /*
- * clear_bit() is an atomic operation and
- * readers aren't interested in the state
- * of CS_CPU_EXCLUSIVE anyway. So we can
- * just update the flag without holding
- * the callback_lock.
- */
- clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ update_parent = true;
break;
- case PRS_ENABLED:
- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
- update_tasks_cpumask(parent);
- break;
-
- case PRS_ERROR:
+ default:
/*
- * When parent is invalid, it has to be too.
+ * When parent is not a partition root or is
+ * invalid, child partition roots become
+ * invalid too.
*/
- new_prs = PRS_ERROR;
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
+ WRITE_ONCE(cp->prs_err,
+ is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART);
break;
}
}
@@ -1475,42 +1615,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
continue;
rcu_read_unlock();
+ if (update_parent) {
+ update_parent_subparts_cpumask(cp, partcmd_update, NULL,
+ tmp);
+ /*
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
+ */
+ new_prs = cp->partition_root_state;
+ }
+
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
+ if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
+ /*
+ * Put all active subparts_cpus back to effective_cpus.
+ */
+ cpumask_or(tmp->new_cpus, tmp->new_cpus,
+ cp->subparts_cpus);
+ cpumask_and(tmp->new_cpus, tmp->new_cpus,
+ cpu_active_mask);
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
- } else if (cp->nr_subparts_cpus) {
+ }
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus) {
/*
* Make sure that effective_cpus & subparts_cpus
* are mutually exclusive.
- *
- * In the unlikely event that effective_cpus
- * becomes empty. we clear cp->nr_subparts_cpus and
- * let its child partition roots to compete for
- * CPUs again.
*/
cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
cp->subparts_cpus);
- if (cpumask_empty(cp->effective_cpus)) {
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- cpumask_clear(cp->subparts_cpus);
- cp->nr_subparts_cpus = 0;
- } else if (!cpumask_subset(cp->subparts_cpus,
- tmp->new_cpus)) {
- cpumask_andnot(cp->subparts_cpus,
- cp->subparts_cpus, tmp->new_cpus);
- cp->nr_subparts_cpus
- = cpumask_weight(cp->subparts_cpus);
- }
}
- if (new_prs != old_prs)
- cp->partition_root_state = new_prs;
-
+ cp->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
- notify_partition_change(cp, old_prs, new_prs);
+
+ notify_partition_change(cp, old_prs);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1526,7 +1668,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_root(cp)))
+ is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -1570,7 +1712,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
- update_cpumasks_hier(sibling, tmp);
+ update_cpumasks_hier(sibling, tmp, false);
rcu_read_lock();
css_put(&sibling->css);
}
@@ -1588,6 +1730,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
+ bool invalidate = false;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1615,10 +1758,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
-
#ifdef CONFIG_CPUMASK_OFFSTACK
/*
* Use the cpumasks in trialcs for tmpmasks when they are pointers
@@ -1629,28 +1768,70 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
tmp.new_cpus = trialcs->cpus_allowed;
#endif
+ retval = validate_change(cs, trialcs);
+
+ if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ struct cpuset *cp, *parent;
+ struct cgroup_subsys_state *css;
+
+ /*
+ * The -EINVAL error code indicates that partition sibling
+ * CPU exclusivity rule has been violated. We still allow
+ * the cpumask change to proceed while invalidating the
+ * partition. However, any conflicting sibling partitions
+ * have to be marked as invalid too.
+ */
+ invalidate = true;
+ rcu_read_lock();
+ parent = parent_cs(cs);
+ cpuset_for_each_child(cp, css, parent)
+ if (is_partition_valid(cp) &&
+ cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
+ rcu_read_unlock();
+ update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ retval = 0;
+ }
+ if (retval < 0)
+ return retval;
+
if (cs->partition_root_state) {
- /* Cpumask of a partition root cannot be empty */
- if (cpumask_empty(trialcs->cpus_allowed))
- return -EINVAL;
- if (update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp) < 0)
- return -EINVAL;
+ if (invalidate)
+ update_parent_subparts_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
+ else
+ update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp);
}
+ compute_effective_cpumask(trialcs->effective_cpus, trialcs,
+ parent_cs(cs));
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
/*
- * Make sure that subparts_cpus is a subset of cpus_allowed.
+ * Make sure that subparts_cpus, if not empty, is a subset of
+ * cpus_allowed. Clear subparts_cpus if partition not valid or
+ * empty effective cpus with tasks.
*/
if (cs->nr_subparts_cpus) {
- cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ if (!is_partition_valid(cs) ||
+ (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
+ partition_is_populated(cs, NULL))) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ } else {
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
}
spin_unlock_irq(&callback_lock);
- update_cpumasks_hier(cs, &tmp);
+ /* effective_cpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, false);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
@@ -2026,16 +2207,18 @@ out:
return err;
}
-/*
+/**
* update_prstate - update partition_root_state
- * cs: the cpuset to update
- * new_prs: new partition root state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
*
* Call with cpuset_rwsem held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err, old_prs = cs->partition_root_state;
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
+ bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
@@ -2043,28 +2226,33 @@ static int update_prstate(struct cpuset *cs, int new_prs)
return 0;
/*
- * Cannot force a partial or invalid partition root to a full
- * partition root.
+ * For a previously invalid partition root, leave it at being
+ * invalid if new_prs is not "member".
*/
- if (new_prs && (old_prs == PRS_ERROR))
- return -EINVAL;
+ if (new_prs && is_prs_invalid(old_prs)) {
+ cs->partition_root_state = -new_prs;
+ return 0;
+ }
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
- err = -EINVAL;
if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be NULL.
+ * cannot be empty.
*/
- if (cpumask_empty(cs->cpus_allowed))
+ if (cpumask_empty(cs->cpus_allowed)) {
+ err = PERR_CPUSEMPTY;
goto out;
+ }
err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err)
+ if (err) {
+ err = PERR_NOTEXCL;
goto out;
+ }
err = update_parent_subparts_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
@@ -2072,47 +2260,77 @@ static int update_prstate(struct cpuset *cs, int new_prs)
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
goto out;
}
+
+ if (new_prs == PRS_ISOLATED) {
+ /*
+ * Disable the load balance flag should not return an
+ * error unless the system is running out of memory.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ sched_domain_rebuilt = true;
+ }
+ } else if (old_prs && new_prs) {
+ /*
+ * A change in load balance state only, no change in cpumasks.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
+ sched_domain_rebuilt = true;
+ goto out; /* Sched domain is rebuilt in update_flag() */
} else {
/*
- * Turning off partition root will clear the
- * CS_CPU_EXCLUSIVE bit.
+ * Switching back to member is always allowed even if it
+ * disables child partitions.
*/
- if (old_prs == PRS_ERROR) {
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- err = 0;
- goto out;
- }
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
+ &tmpmask);
- err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmpmask);
- if (err)
- goto out;
+ /*
+ * If there are child partitions, they will all become invalid.
+ */
+ if (unlikely(cs->nr_subparts_cpus)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(cs->effective_cpus, cs, parent);
+ spin_unlock_irq(&callback_lock);
+ }
/* Turning off CS_CPU_EXCLUSIVE will not return error */
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+
+ if (!is_sched_load_balance(cs)) {
+ /* Make sure load balance is on */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ sched_domain_rebuilt = true;
+ }
}
- /*
- * Update cpumask of parent's tasks except when it is the top
- * cpuset as some system daemons cannot be mapped to other CPUs.
- */
- if (parent != &top_cpuset)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent);
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
- rebuild_sched_domains_locked();
+ if (!sched_domain_rebuilt)
+ rebuild_sched_domains_locked();
out:
- if (!err) {
- spin_lock_irq(&callback_lock);
- cs->partition_root_state = new_prs;
- spin_unlock_irq(&callback_lock);
- notify_partition_change(cs, old_prs, new_prs);
- }
+ /*
+ * Make partition invalid if an error happen
+ */
+ if (err)
+ new_prs = -new_prs;
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
+ /*
+ * Update child cpusets, if present.
+ * Force update if switching back to member.
+ */
+ if (!list_empty(&cs->css.children))
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
+ notify_partition_change(cs, old_prs);
free_cpumasks(NULL, &tmpmask);
- return err;
+ return 0;
}
/*
@@ -2238,8 +2456,14 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
+ /*
+ * Task cannot be moved to a cpuset with empty effective cpus.
+ */
+ if (cpumask_empty(cs->effective_cpus))
+ goto out_unlock;
+
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->cpus_allowed);
+ ret = task_can_attach(task, cs->effective_cpus);
if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
@@ -2289,7 +2513,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- cpus_read_lock();
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2343,7 +2567,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
- cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
@@ -2599,16 +2822,29 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
+ const char *err, *type = NULL;
switch (cs->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
seq_puts(seq, "root\n");
break;
- case PRS_DISABLED:
+ case PRS_ISOLATED:
+ seq_puts(seq, "isolated\n");
+ break;
+ case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
- case PRS_ERROR:
- seq_puts(seq, "root invalid\n");
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
+ err = perr_strings[READ_ONCE(cs->prs_err)];
+ if (err)
+ seq_printf(seq, "%s invalid (%s)\n", type, err);
+ else
+ seq_printf(seq, "%s invalid\n", type);
break;
}
return 0;
@@ -2627,9 +2863,11 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
* Convert "root" to ENABLED, and convert "member" to DISABLED.
*/
if (!strcmp(buf, "root"))
- val = PRS_ENABLED;
+ val = PRS_ROOT;
else if (!strcmp(buf, "member"))
- val = PRS_DISABLED;
+ val = PRS_MEMBER;
+ else if (!strcmp(buf, "isolated"))
+ val = PRS_ISOLATED;
else
return -EINVAL;
@@ -2928,7 +3166,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
- if (is_partition_root(cs))
+ if (is_partition_valid(cs))
update_prstate(cs, 0);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
@@ -3104,7 +3342,8 @@ hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
- if (cpumask_empty(new_cpus))
+ /* A partition root is allowed to have empty effective cpus */
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
@@ -3173,11 +3412,31 @@ retry:
/*
* In the unlikely event that a partition root has empty
- * effective_cpus or its parent becomes erroneous, we have to
- * transition it to the erroneous state.
+ * effective_cpus with tasks, we will have to invalidate child
+ * partitions, if present, by setting nr_subparts_cpus to 0 to
+ * reclaim their cpus.
*/
- if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
- (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
+ cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * Force the partition to become invalid if either one of
+ * the following conditions hold:
+ * 1) empty effective cpus but not valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
+ */
+ if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
+ (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
+ int old_prs, parent_prs;
+
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
if (cs->nr_subparts_cpus) {
spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
@@ -3186,39 +3445,32 @@ retry:
compute_effective_cpumask(&new_cpus, cs, parent);
}
- /*
- * If the effective_cpus is empty because the child
- * partitions take away all the CPUs, we can keep
- * the current partition and let the child partitions
- * fight for available CPUs.
- */
- if ((parent->partition_root_state == PRS_ERROR) ||
- cpumask_empty(&new_cpus)) {
- int old_prs;
-
- update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, tmp);
- old_prs = cs->partition_root_state;
- if (old_prs != PRS_ERROR) {
- spin_lock_irq(&callback_lock);
- cs->partition_root_state = PRS_ERROR;
- spin_unlock_irq(&callback_lock);
- notify_partition_change(cs, old_prs, PRS_ERROR);
- }
+ old_prs = cs->partition_root_state;
+ parent_prs = parent->partition_root_state;
+ if (is_partition_valid(cs)) {
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(cs);
+ spin_unlock_irq(&callback_lock);
+ if (is_prs_invalid(parent_prs))
+ WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
+ else if (!parent_prs)
+ WRITE_ONCE(cs->prs_err, PERR_NOTPART);
+ else
+ WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
+ notify_partition_change(cs, old_prs);
}
cpuset_force_rebuild();
}
/*
- * On the other hand, an erroneous partition root may be transitioned
- * back to a regular one or a partition root with no CPU allocated
- * from the parent may change to erroneous.
+ * On the other hand, an invalid partition root may be transitioned
+ * back to a regular one.
*/
- if (is_partition_root(parent) &&
- ((cs->partition_root_state == PRS_ERROR) ||
- !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
- cpuset_force_rebuild();
+ else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
+ if (is_partition_valid(cs))
+ cpuset_force_rebuild();
+ }
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d173..1b6b21851e9d 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
}
mutex_unlock(&freezer_mutex);
@@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
freezer->state = 0;
@@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
__thaw_task(task);
} else {
freeze_task(task);
+
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec(&freezer_active);
unfreeze_cgroup(freezer);
}
}
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 511af87f685e..7695e60bcb40 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -47,6 +47,7 @@ struct pids_cgroup {
*/
atomic64_t counter;
atomic64_t limit;
+ int64_t watermark;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -85,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
kfree(css_pids(css));
}
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+ /*
+ * This is racy, but we don't need perfectly accurate tallying of
+ * the watermark, and this lets us avoid extra atomic overhead.
+ */
+ if (nr_pids > READ_ONCE(p->watermark))
+ WRITE_ONCE(p->watermark, nr_pids);
+}
+
/**
* pids_cancel - uncharge the local pid count
* @pids: the pid cgroup state
@@ -128,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num)
{
struct pids_cgroup *p;
- for (p = pids; parent_pids(p); p = parent_pids(p))
- atomic64_add(num, &p->counter);
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ pids_update_watermark(p, new);
+ }
}
/**
@@ -156,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
*/
if (new > limit)
goto revert;
+
+ /*
+ * Not technically accurate if we go over limit somewhere up
+ * the hierarchy, but that's tolerable for the watermark.
+ */
+ pids_update_watermark(p, new);
}
return 0;
@@ -311,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return READ_ONCE(pids->watermark);
+}
+
static int pids_events_show(struct seq_file *sf, void *v)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
@@ -332,6 +360,11 @@ static struct cftype pids_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
.name = "events",
.seq_show = pids_events_show,
.file_offset = offsetof(struct pids_cgroup, events_file),
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 24b5c2ab5598..793ecff29038 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
return pos;
}
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -310,6 +340,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -318,6 +351,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -398,6 +434,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
default:
break;
}
@@ -411,8 +452,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
+ struct task_cputime *cputime = &bstat->cputime;
int i;
cputime->stime = 0;
@@ -438,6 +480,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
}
}
@@ -445,27 +491,61 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct task_cputime cputime;
+ struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time;
+#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
cgroup_rstat_flush_release();
} else {
- root_cgroup_cputime(&cputime);
- usage = cputime.sum_exec_runtime;
- utime = cputime.utime;
- stime = cputime.stime;
+ root_cgroup_cputime(&bstat);
+ usage = bstat.cputime.sum_exec_runtime;
+ utime = bstat.cputime.utime;
+ stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = bstat.forceidle_sum;
+#endif
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+ do_div(forceidle_time, NSEC_PER_USEC);
+#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index eb701b2ac72f..44b0f0146a3f 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -7,7 +7,6 @@
# CONFIG_OABI_COMPAT is not set
# CONFIG_SYSVIPC is not set
# CONFIG_USELIB is not set
-CONFIG_ANDROID=y
CONFIG_ANDROID_BINDER_IPC=y
CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
CONFIG_ANDROID_LOW_MEMORY_KILLER=y
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..38a7c5362c9c
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1 @@
+CONFIG_RUST=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
index dcd86f32f4ed..6fac5b405334 100644
--- a/kernel/configs/x86_debug.config
+++ b/kernel/configs/x86_debug.config
@@ -7,12 +7,11 @@ CONFIG_DEBUG_SLAB=y
CONFIG_DEBUG_KMEMLEAK=y
CONFIG_DEBUG_PAGEALLOC=y
CONFIG_SLUB_DEBUG_ON=y
-CONFIG_KMEMCHECK=y
CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
CONFIG_GCOV_KERNEL=y
CONFIG_LOCKDEP=y
CONFIG_PROVE_LOCKING=y
CONFIG_SCHEDSTATS=y
-CONFIG_VMLINUX_VALIDATION=y
+CONFIG_NOINSTR_VALIDATION=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index ff756221f112..436f806aa1ed 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -34,7 +34,6 @@ CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
CONFIG_XEN_SCSI_FRONTEND=m
# others
CONFIG_XEN_BALLOON=y
-CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_DEV_EVTCHN=m
CONFIG_XEN_BLKDEV_FRONTEND=m
CONFIG_XEN_NETDEV_FRONTEND=m
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 36a98c48aedc..77978e372377 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,18 +1,20 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Context tracking: Probe on high level context boundaries such as kernel
- * and userspace. This includes syscalls and exceptions entry/exit.
+ * Context tracking: Probe on high level context boundaries such as kernel,
+ * userspace, guest or idle.
*
* This is used by RCU to remove its dependency on the timer tick while a CPU
- * runs in userspace.
+ * runs in idle, userspace or guest mode.
*
- * Started by Frederic Weisbecker:
+ * User/guest tracking started by Frederic Weisbecker:
*
- * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
*
* Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
* Steven Rostedt, Peter Zijlstra for suggestions and improvements.
*
+ * RCU extended quiescent state bits imported from kernel/rcu/tree.c
+ * where the relevant authorship may be found.
*/
#include <linux/context_tracking.h>
@@ -21,6 +23,411 @@
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
+#include <trace/events/rcu.h>
+
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+ .dynticks_nesting = 1,
+ .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
+#endif
+ .state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
+};
+EXPORT_SYMBOL_GPL(context_tracking);
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+#define TPS(x) tracepoint_string(x)
+
+/* Record the current task on dyntick-idle entry. */
+static __always_inline void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static __always_inline void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
+static __always_inline void rcu_dynticks_task_trace_enter(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = true;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
+static __always_inline void rcu_dynticks_task_trace_exit(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = false;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
+ */
+static noinstr void ct_kernel_exit_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
+ seq = ct_state_inc(offset);
+ // RCU is no longer watching. Better be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
+ */
+static noinstr void ct_kernel_enter_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ seq = ct_state_inc(offset);
+ // RCU is now watching. Better not be in an extended quiescent state!
+ rcu_dynticks_task_trace_exit(); // After ->dynticks update!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+static void noinstr ct_kernel_exit(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ ct_dynticks_nesting() == 0);
+ if (ct_dynticks_nesting() != 1) {
+ // RCU will still be watching, so just do accounting and leave.
+ ct->dynticks_nesting--;
+ return;
+ }
+
+ instrumentation_begin();
+ lockdep_assert_irqs_disabled();
+ trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ // RCU is watching here ...
+ ct_kernel_exit_state(offset);
+ // ... but is no longer watching here.
+ rcu_dynticks_task_enter();
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
+ */
+static void noinstr ct_kernel_enter(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ long oldval;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ oldval = ct_dynticks_nesting();
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
+ if (oldval) {
+ // RCU was already watching, so just do accounting and leave.
+ ct->dynticks_nesting++;
+ return;
+ }
+ rcu_dynticks_task_exit();
+ // RCU is not watching here ...
+ ct_kernel_enter_state(offset);
+ // ... but is watching here.
+ instrumentation_begin();
+
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(ct->dynticks_nesting, 1);
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+ instrumentation_end();
+}
+
+/**
+ * ct_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to ct_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_exit(void)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ instrumentation_begin();
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
+ WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (ct_dynticks_nmi_nesting() != 1) {
+ trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
+ ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
+ ct_dynticks_nmi_nesting() - 2);
+ instrumentation_end();
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
+ WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+ instrumentation_end();
+
+ // RCU is watching here ...
+ ct_kernel_exit_state(RCU_DYNTICKS_IDX);
+ // ... but is no longer watching here.
+
+ if (!in_nmi())
+ rcu_dynticks_task_enter();
+}
+
+/**
+ * ct_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
+ * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
+ *
+ * If you add or remove a call to ct_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_enter(void)
+{
+ long incby = 2;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ /* Complain about underflow. */
+ WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (rcu_dynticks_curr_cpu_in_eqs()) {
+
+ if (!in_nmi())
+ rcu_dynticks_task_exit();
+
+ // RCU is not watching here ...
+ ct_kernel_enter_state(RCU_DYNTICKS_IDX);
+ // ... but is watching here.
+
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+ instrument_atomic_read(&ct->state, sizeof(ct->state));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ incby = 1;
+ } else if (!in_nmi()) {
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ } else {
+ instrumentation_begin();
+ }
+
+ trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
+ ct_dynticks_nmi_nesting(),
+ ct_dynticks_nmi_nesting() + incby, ct_dynticks());
+ instrumentation_end();
+ WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
+ ct_dynticks_nmi_nesting() + incby);
+ barrier();
+}
+
+/**
+ * ct_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * If you add or remove a call to ct_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_enter(void)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
+}
+EXPORT_SYMBOL_GPL(ct_idle_enter);
+
+/**
+ * ct_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * If you add or remove a call to ct_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_exit(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(ct_idle_exit);
+
+/**
+ * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_enter(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_enter();
+}
+
+/**
+ * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_exit(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_exit();
+}
+
+/*
+ * Wrapper for ct_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_enter();
+ local_irq_restore(flags);
+}
+
+/*
+ * Wrapper for ct_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_exit();
+ local_irq_restore(flags);
+}
+#else
+static __always_inline void ct_kernel_exit(bool user, int offset) { }
+static __always_inline void ct_kernel_enter(bool user, int offset) { }
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
+#ifdef CONFIG_CONTEXT_TRACKING_USER
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -28,9 +435,6 @@
DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
-DEFINE_PER_CPU(struct context_tracking, context_tracking);
-EXPORT_SYMBOL_GPL(context_tracking);
-
static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -51,29 +455,32 @@ static __always_inline void context_tracking_recursion_exit(void)
}
/**
- * context_tracking_enter - Inform the context tracking that the CPU is going
- * enter user or guest space mode.
+ * __ct_user_enter - Inform the context tracking that the CPU is going
+ * to enter user or guest space mode.
*
* This function must be called right before we switch from the kernel
* to user or guest space, when it's guaranteed the remaining kernel
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void noinstr __context_tracking_enter(enum ctx_state state)
+void noinstr __ct_user_enter(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ lockdep_assert_irqs_disabled();
+
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
if (!context_tracking_recursion_enter())
return;
- if ( __this_cpu_read(context_tracking.state) != state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() != state) {
+ if (ct->active) {
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
- * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
if (state == CONTEXT_USER) {
@@ -82,35 +489,77 @@ void noinstr __context_tracking_enter(enum ctx_state state)
vtime_user_enter(current);
instrumentation_end();
}
- rcu_user_enter();
+ /*
+ * Other than generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
+
+ /*
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_eqs_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+ ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ atomic_add(state, &ct->state);
+ }
}
- /*
- * Even if context tracking is disabled on this CPU, because it's outside
- * the full dynticks mask for example, we still have to keep track of the
- * context transitions and states to prevent inconsistency on those of
- * other CPUs.
- * If a task triggers an exception in userspace, sleep on the exception
- * handler and then migrate to another CPU, that new CPU must know where
- * the exception returns by the time we call exception_exit().
- * This information can only be provided by the previous CPU when it called
- * exception_enter().
- * OTOH we can spare the calls to vtime and RCU when context_tracking.active
- * is false because we know that CPU is not tickless.
- */
- __this_cpu_write(context_tracking.state, state);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_enter);
+EXPORT_SYMBOL_GPL(__ct_user_enter);
-void context_tracking_enter(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_restore() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_enter() through user_enter_irqoff()
+ * or context_tracking_guest_enter(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_enter(enum ctx_state state)
{
unsigned long flags;
/*
* Some contexts may involve an exception occuring in an irq,
* leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
* This would mess up the dyntick_nesting count though. And rcu_irq_*()
* helpers are enough to protect RCU uses inside the exception. So
* just return immediately if we detect we are in an IRQ.
@@ -119,21 +568,32 @@ void context_tracking_enter(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_enter(state);
+ __ct_user_enter(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_enter);
-EXPORT_SYMBOL_GPL(context_tracking_enter);
+NOKPROBE_SYMBOL(ct_user_enter);
+EXPORT_SYMBOL_GPL(ct_user_enter);
-void context_tracking_user_enter(void)
+/**
+ * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls
+ * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call user_enter_irqoff(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void user_enter_callable(void)
{
user_enter();
}
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(user_enter_callable);
/**
- * context_tracking_exit - Inform the context tracking that the CPU is
- * exiting user or guest mode and entering the kernel.
+ * __ct_user_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
*
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
@@ -143,32 +603,64 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void noinstr __context_tracking_exit(enum ctx_state state)
+void noinstr __ct_user_exit(enum ctx_state state)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
if (!context_tracking_recursion_enter())
return;
- if (__this_cpu_read(context_tracking.state) == state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() == state) {
+ if (ct->active) {
/*
- * We are going to run code that may use RCU. Inform
- * RCU core about that (ie: we may need the tick again).
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
*/
- rcu_user_exit();
+ ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
if (state == CONTEXT_USER) {
instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
instrumentation_end();
}
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ atomic_set(&ct->state, CONTEXT_KERNEL);
+
+ } else {
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ atomic_set(&ct->state, CONTEXT_KERNEL);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ atomic_sub(state, &ct->state);
+ }
}
- __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-EXPORT_SYMBOL_GPL(__context_tracking_exit);
+EXPORT_SYMBOL_GPL(__ct_user_exit);
-void context_tracking_exit(enum ctx_state state)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_save() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_exit() through user_exit_irqoff()
+ * or context_tracking_guest_exit(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_exit(enum ctx_state state)
{
unsigned long flags;
@@ -176,19 +668,30 @@ void context_tracking_exit(enum ctx_state state)
return;
local_irq_save(flags);
- __context_tracking_exit(state);
+ __ct_user_exit(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_exit);
-EXPORT_SYMBOL_GPL(context_tracking_exit);
+NOKPROBE_SYMBOL(ct_user_exit);
+EXPORT_SYMBOL_GPL(ct_user_exit);
-void context_tracking_user_exit(void)
+/**
+ * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
+ * involving illegal RCU uses through tracing and lockdep. This is unlikely
+ * to be fixed as this function is obsolete. The preferred way is to call
+ * user_exit_irqoff(). It should be the arch entry code responsibility to
+ * call into context tracking with IRQs disabled.
+ */
+void user_exit_callable(void)
{
user_exit();
}
-NOKPROBE_SYMBOL(context_tracking_user_exit);
+NOKPROBE_SYMBOL(user_exit_callable);
-void __init context_tracking_cpu_set(int cpu)
+void __init ct_cpu_track_user(int cpu)
{
static __initdata bool initialized = false;
@@ -212,12 +715,14 @@ void __init context_tracking_cpu_set(int cpu)
initialized = true;
}
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
}
#endif
+
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 246efc74e3f3..ba4ba71facf9 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -35,11 +35,11 @@ static int cpu_pm_notify(enum cpu_pm_event event)
* disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know
* this.
*/
- rcu_irq_enter_irqson();
+ ct_irq_enter_irqson();
rcu_read_lock();
ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
rcu_read_unlock();
- rcu_irq_exit_irqson();
+ ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
@@ -49,11 +49,11 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev
unsigned long flags;
int ret;
- rcu_irq_enter_irqson();
+ ct_irq_enter_irqson();
raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
- rcu_irq_exit_irqson();
+ ct_irq_exit_irqson();
return notifier_to_errno(ret);
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 71122e01623c..a0eb4d5cf557 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -9,12 +9,15 @@
#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/sizes.h>
#include <asm/page.h>
#include <asm/sections.h>
#include <crypto/sha1.h>
+#include "kallsyms_internal.h"
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -43,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline,
unsigned long long *crash_base)
{
char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
/* for each entry of the comma-separated list */
do {
@@ -87,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
cur = tmp;
- if (size >= system_ram) {
+ if (size >= total_mem) {
pr_warn("crashkernel: invalid size\n");
return -EINVAL;
}
/* match ? */
- if (system_ram >= start && system_ram < end) {
+ if (total_mem >= start && total_mem < end) {
*crash_size = size;
break;
}
@@ -480,6 +492,19 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
+#ifdef CONFIG_KALLSYMS
+ VMCOREINFO_SYMBOL(kallsyms_names);
+ VMCOREINFO_SYMBOL(kallsyms_num_syms);
+ VMCOREINFO_SYMBOL(kallsyms_token_table);
+ VMCOREINFO_SYMBOL(kallsyms_token_index);
+#ifdef CONFIG_KALLSYMS_BASE_RELATIVE
+ VMCOREINFO_SYMBOL(kallsyms_offsets);
+ VMCOREINFO_SYMBOL(kallsyms_relative_base);
+#else
+ VMCOREINFO_SYMBOL(kallsyms_addresses);
+#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */
+#endif /* CONFIG_KALLSYMS */
+
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7beceb447211..d5e9ccde3ab8 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -50,7 +50,6 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/rcupdate.h>
#include <linux/irq.h>
#include <linux/security.h>
@@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache.vmas[i])
- continue;
- flush_cache_range(current->vmacache.vmas[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
-
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 164ed9ef77a3..e39cb696cfbd 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -214,13 +214,22 @@ void __delayacct_freepages_end(void)
&current->delays->freepages_count);
}
-void __delayacct_thrashing_start(void)
+void __delayacct_thrashing_start(bool *in_thrashing)
{
+ *in_thrashing = !!current->in_thrashing;
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 1;
current->delays->thrashing_start = local_clock();
}
-void __delayacct_thrashing_end(void)
+void __delayacct_thrashing_end(bool *in_thrashing)
{
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 0;
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 375fb3c9538d..c21abc77c53e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -74,7 +74,7 @@ out_unmap_membase:
return ERR_PTR(-ENOMEM);
}
-static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
{
if (!mem)
return;
@@ -126,10 +126,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
ret = dma_assign_coherent_memory(dev, mem);
if (ret)
- dma_release_coherent_memory(mem);
+ _dma_release_coherent_memory(mem);
return ret;
}
+void dma_release_coherent_memory(struct device *dev)
+{
+ if (dev)
+ _dma_release_coherent_memory(dev->dma_mem);
+}
+
static void *__dma_alloc_from_coherent(struct device *dev,
struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index ac740630c79c..18c93c2276ca 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -350,11 +350,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
unsigned long *flags)
{
- unsigned int max_range = dma_get_max_seg_size(ref->dev);
struct dma_debug_entry *entry, index = *ref;
- unsigned int range = 0;
+ int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1);
- while (range <= max_range) {
+ for (int i = 0; i < limit; i++) {
entry = __hash_bucket_find(*bucket, ref, containing_match);
if (entry)
@@ -364,7 +363,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
* Nothing found, go back a hash bucket
*/
put_hash_bucket(*bucket, *flags);
- range += (1 << HASH_FN_SHIFT);
index.dev_addr -= (1 << HASH_FN_SHIFT);
*bucket = get_hash_bucket(&index, flags);
}
@@ -564,7 +562,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
rc = active_cacheline_insert(entry);
if (rc == -ENOMEM) {
- pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
+ pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
} else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
err_printk(entry->dev, entry,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index e978f36e6be8..63859a101ed8 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -357,7 +357,7 @@ void dma_direct_free(struct device *dev, size_t size,
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
- if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
+ if (dma_set_encrypted(dev, cpu_addr, size))
return;
}
@@ -392,7 +392,6 @@ void dma_direct_free_pages(struct device *dev, size_t size,
struct page *page, dma_addr_t dma_addr,
enum dma_data_direction dir)
{
- unsigned int page_order = get_order(size);
void *vaddr = page_address(page);
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
@@ -400,7 +399,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
- if (dma_set_encrypted(dev, vaddr, 1 << page_order))
+ if (dma_set_encrypted(dev, vaddr, size))
return;
__dma_direct_free_pages(dev, page, size);
}
@@ -454,29 +453,60 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_sync_dma_for_cpu_all();
}
+/*
+ * Unmaps segments, except for ones marked as pci_p2pdma which do not
+ * require any further action as they contain a bus address.
+ */
void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *sg;
int i;
- for_each_sg(sgl, sg, nents, i)
- dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
- attrs);
+ for_each_sg(sgl, sg, nents, i) {
+ if (sg_is_dma_bus_address(sg))
+ sg_dma_unmark_bus_address(sg);
+ else
+ dma_direct_unmap_page(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+ }
}
#endif
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
- int i;
+ struct pci_p2pdma_map_state p2pdma_state = {};
+ enum pci_p2pdma_map_type map;
struct scatterlist *sg;
+ int i, ret;
for_each_sg(sgl, sg, nents, i) {
+ if (is_pci_p2pdma_page(sg_page(sg))) {
+ map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg);
+ switch (map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ continue;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI
+ * host bridge must be mapped with CPU physical
+ * address and not PCI bus addresses. This is
+ * done with dma_direct_map_page() below.
+ */
+ break;
+ default:
+ ret = -EREMOTEIO;
+ goto out_unmap;
+ }
+ }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR)
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
goto out_unmap;
+ }
sg_dma_len(sg) = sg->length;
}
@@ -484,7 +514,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return -EIO;
+ return ret;
}
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index a78c0ba70645..e38ffc5e6bdd 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -8,6 +8,7 @@
#define _KERNEL_DMA_DIRECT_H
#include <linux/dma-direct.h>
+#include <linux/memremap.h>
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
@@ -87,10 +88,15 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
- if (is_swiotlb_force_bounce(dev))
+ if (is_swiotlb_force_bounce(dev)) {
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
+ }
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+ if (is_pci_p2pdma_page(page))
+ return DMA_MAPPING_ERROR;
if (is_swiotlb_active(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index db7244291b74..33437d620644 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -10,6 +10,7 @@
#include <linux/dma-map-ops.h>
#include <linux/export.h>
#include <linux/gfp.h>
+#include <linux/kmsan.h>
#include <linux/of_device.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
+ kmsan_handle_dma(page, offset, size, dir);
debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
@@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- if (ents > 0)
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
- else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
- ents != -EIO))
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
return -EIO;
+ }
return ents;
}
@@ -249,12 +253,15 @@ EXPORT_SYMBOL(dma_map_sg_attrs);
* Returns 0 on success or a negative error code on error. The following
* error codes are supported with the given meaning:
*
- * -EINVAL An invalid argument, unaligned access or other error
- * in usage. Will not succeed if retried.
- * -ENOMEM Insufficient resources (like memory or IOVA space) to
- * complete the mapping. Should succeed if retried later.
- * -EIO Legacy error code with an unknown meaning. eg. this is
- * returned if a lower level call returned DMA_MAPPING_ERROR.
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned
+ * DMA_MAPPING_ERROR.
+ * -EREMOTEIO The DMA device cannot access P2PDMA memory specified
+ * in the sg_table. This will not succeed if retried.
*/
int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
enum dma_data_direction dir, unsigned long attrs)
@@ -704,7 +711,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
}
EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
-int dma_supported(struct device *dev, u64 mask)
+static int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -718,7 +725,24 @@ int dma_supported(struct device *dev, u64 mask)
return 1;
return ops->dma_supported(dev, mask);
}
-EXPORT_SYMBOL(dma_supported);
+
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ /* if ops is not set, dma direct will be used which supports P2PDMA */
+ if (!ops)
+ return true;
+
+ /*
+ * Note: dma_ops_bypass is not checked here because P2PDMA should
+ * not be used with dma mapping ops that do not have support even
+ * if the specific device is bypassing them.
+ */
+
+ return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
void arch_dma_set_mask(struct device *dev, u64 mask);
@@ -773,6 +797,18 @@ size_t dma_max_mapping_size(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+size_t dma_opt_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (ops && ops->opt_mapping_size)
+ size = ops->opt_mapping_size();
+
+ return min(dma_max_mapping_size(dev), size);
+}
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
+
bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index dfa1de89dc94..339a990554e7 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,6 +62,12 @@
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+struct io_tlb_slot {
+ phys_addr_t orig_addr;
+ size_t alloc_size;
+ unsigned int list;
+};
+
static bool swiotlb_force_bounce;
static bool swiotlb_force_disable;
@@ -70,6 +76,62 @@ struct io_tlb_mem io_tlb_default_mem;
phys_addr_t swiotlb_unencrypted_base;
static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used: The number of used IO TLB block.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock: The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+ unsigned long used;
+ unsigned int index;
+ spinlock_t lock;
+};
+
+/*
+ * Round up number of slabs to the next power of 2. The last area is going
+ * be smaller than the rest if default_nslabs is not power of two.
+ * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE,
+ * otherwise a segment may span two or more areas. It conflicts with free
+ * contiguous slots tracking: free slots are treated contiguous no matter
+ * whether they cross an area boundary.
+ *
+ * Return true if default_nslabs is rounded up.
+ */
+static bool round_up_default_nslabs(void)
+{
+ if (!default_nareas)
+ return false;
+
+ if (default_nslabs < IO_TLB_SEGSIZE * default_nareas)
+ default_nslabs = IO_TLB_SEGSIZE * default_nareas;
+ else if (is_power_of_2(default_nslabs))
+ return false;
+ default_nslabs = roundup_pow_of_two(default_nslabs);
+ return true;
+}
+
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+ /* use a single area when non is specified */
+ if (!nareas)
+ nareas = 1;
+ else if (!is_power_of_2(nareas))
+ nareas = roundup_pow_of_two(nareas);
+
+ default_nareas = nareas;
+
+ pr_info("area num %d.\n", nareas);
+ if (round_up_default_nslabs())
+ pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+ (default_nslabs << IO_TLB_SHIFT) >> 20);
+}
static int __init
setup_io_tlb_npages(char *str)
@@ -81,6 +143,10 @@ setup_io_tlb_npages(char *str)
}
if (*str == ',')
++str;
+ if (isdigit(*str))
+ swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
+ if (*str == ',')
+ ++str;
if (!strcmp(str, "force"))
swiotlb_force_bounce = true;
else if (!strcmp(str, "noforce"))
@@ -112,8 +178,11 @@ void __init swiotlb_adjust_size(unsigned long size)
*/
if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
return;
+
size = ALIGN(size, IO_TLB_SIZE);
default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ if (round_up_default_nslabs())
+ size = default_nslabs << IO_TLB_SHIFT;
pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
}
@@ -192,7 +261,8 @@ void __init swiotlb_update_mem_attributes(void)
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
- unsigned long nslabs, bool late_alloc)
+ unsigned long nslabs, unsigned int flags,
+ bool late_alloc, unsigned int nareas)
{
void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -200,13 +270,18 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->nslabs = nslabs;
mem->start = start;
mem->end = mem->start + bytes;
- mem->index = 0;
mem->late_alloc = late_alloc;
+ mem->nareas = nareas;
+ mem->area_nslabs = nslabs / mem->nareas;
- if (swiotlb_force_bounce)
- mem->force_bounce = true;
+ mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+
+ for (i = 0; i < mem->nareas; i++) {
+ spin_lock_init(&mem->areas[i].lock);
+ mem->areas[i].index = 0;
+ mem->areas[i].used = 0;
+ }
- spin_lock_init(&mem->lock);
for (i = 0; i < mem->nslabs; i++) {
mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
@@ -233,7 +308,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
int (*remap)(void *tlb, unsigned long nslabs))
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
- unsigned long nslabs = default_nslabs;
+ unsigned long nslabs;
size_t alloc_size;
size_t bytes;
void *tlb;
@@ -244,6 +319,14 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
return;
/*
+ * default_nslabs maybe changed when adjust area number.
+ * So allocate bounce buffer after adjusting area number.
+ */
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ nslabs = default_nslabs;
+ /*
* By default allocate the bounce buffer memory from low memory, but
* allow to pick a location everywhere for hypervisors with guest
* memory encryption.
@@ -263,20 +346,30 @@ retry:
memblock_free(tlb, PAGE_ALIGN(bytes));
nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
- if (nslabs < IO_TLB_MIN_SLABS)
- panic("%s: Failed to remap %zu bytes\n",
- __func__, bytes);
- goto retry;
+ if (nslabs >= IO_TLB_MIN_SLABS)
+ goto retry;
+
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return;
}
alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem->slots)
- panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
- __func__, alloc_size, PAGE_SIZE);
+ if (!mem->slots) {
+ pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+ return;
+ }
+
+ mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
+ default_nareas), SMP_CACHE_BYTES);
+ if (!mem->areas) {
+ pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
+ return;
+ }
- swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
- mem->force_bounce = flags & SWIOTLB_FORCE;
+ swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
+ default_nareas);
if (flags & SWIOTLB_VERBOSE)
swiotlb_print_info();
@@ -284,7 +377,7 @@ retry:
void __init swiotlb_init(bool addressing_limit, unsigned int flags)
{
- return swiotlb_init_remap(addressing_limit, flags, NULL);
+ swiotlb_init_remap(addressing_limit, flags, NULL);
}
/*
@@ -298,7 +391,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned char *vstart = NULL;
- unsigned int order;
+ unsigned int order, area_order;
bool retried = false;
int rc = 0;
@@ -339,19 +432,34 @@ retry:
(PAGE_SIZE << order) >> 20);
}
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ default_nareas));
+ mem->areas = (struct io_tlb_area *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+ if (!mem->areas)
+ goto error_area;
+
mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(array_size(sizeof(*mem->slots), nslabs)));
- if (!mem->slots) {
- free_pages((unsigned long)vstart, order);
- return -ENOMEM;
- }
+ if (!mem->slots)
+ goto error_slots;
set_memory_decrypted((unsigned long)vstart,
(nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true);
+ swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
+ default_nareas);
swiotlb_print_info();
return 0;
+
+error_slots:
+ free_pages((unsigned long)mem->areas, area_order);
+error_area:
+ free_pages((unsigned long)vstart, order);
+ return -ENOMEM;
}
void __init swiotlb_exit(void)
@@ -359,6 +467,7 @@ void __init swiotlb_exit(void)
struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
+ unsigned int area_order;
if (swiotlb_force_bounce)
return;
@@ -373,9 +482,14 @@ void __init swiotlb_exit(void)
set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
if (mem->late_alloc) {
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ mem->nareas));
+ free_pages((unsigned long)mem->areas, area_order);
free_pages(tbl_vaddr, get_order(tbl_size));
free_pages((unsigned long)mem->slots, get_order(slots_size));
} else {
+ memblock_free_late(__pa(mem->areas),
+ array_size(sizeof(*mem->areas), mem->nareas));
memblock_free_late(mem->start, tbl_size);
memblock_free_late(__pa(mem->slots), slots_size);
}
@@ -436,9 +550,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
if (PageHighMem(pfn_to_page(pfn))) {
- /* The buffer does not have a mapping. Map it in and copy */
unsigned int offset = orig_addr & ~PAGE_MASK;
- char *buffer;
+ struct page *page;
unsigned int sz = 0;
unsigned long flags;
@@ -446,12 +559,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
sz = min_t(size_t, PAGE_SIZE - offset, size);
local_irq_save(flags);
- buffer = kmap_atomic(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
if (dir == DMA_TO_DEVICE)
- memcpy(vaddr, buffer + offset, sz);
+ memcpy_from_page(vaddr, page, offset, sz);
else
- memcpy(buffer + offset, vaddr, sz);
- kunmap_atomic(buffer);
+ memcpy_to_page(page, offset, vaddr, sz);
local_irq_restore(flags);
size -= sz;
@@ -466,7 +578,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
}
-#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
+static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
+{
+ return start + (idx << IO_TLB_SHIFT);
+}
/*
* Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
@@ -478,9 +593,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
return nr_slots(boundary_mask + 1);
}
-static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
{
- if (index >= mem->nslabs)
+ if (index >= mem->area_nslabs)
return 0;
return index;
}
@@ -489,10 +604,12 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
* Find a suitable number of IO TLB entries size that will fit this request and
* allocate a buffer from that IO TLB pool.
*/
-static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size, unsigned int alloc_align_mask)
+static int swiotlb_do_find_slots(struct device *dev, int area_index,
+ phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_area *area = mem->areas + area_index;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -503,8 +620,11 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
unsigned int index, wrap, count = 0, i;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned long flags;
+ unsigned int slot_base;
+ unsigned int slot_index;
BUG_ON(!nslots);
+ BUG_ON(area_index >= mem->nareas);
/*
* For mappings with an alignment requirement don't bother looping to
@@ -516,16 +636,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
- spin_lock_irqsave(&mem->lock, flags);
- if (unlikely(nslots > mem->nslabs - mem->used))
+ spin_lock_irqsave(&area->lock, flags);
+ if (unlikely(nslots > mem->area_nslabs - area->used))
goto not_found;
- index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
+ slot_base = area_index * mem->area_nslabs;
+ index = wrap = wrap_area_index(mem, ALIGN(area->index, stride));
+
do {
+ slot_index = slot_base + index;
+
if (orig_addr &&
- (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
- (orig_addr & iotlb_align_mask)) {
- index = wrap_index(mem, index + 1);
+ (slot_addr(tbl_dma_addr, slot_index) &
+ iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+ index = wrap_area_index(mem, index + 1);
continue;
}
@@ -534,26 +658,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
* contiguous buffers, we allocate the buffers from that slot
* and mark the entries as '0' indicating unavailable.
*/
- if (!iommu_is_span_boundary(index, nslots,
+ if (!iommu_is_span_boundary(slot_index, nslots,
nr_slots(tbl_dma_addr),
max_slots)) {
- if (mem->slots[index].list >= nslots)
+ if (mem->slots[slot_index].list >= nslots)
goto found;
}
- index = wrap_index(mem, index + stride);
+ index = wrap_area_index(mem, index + stride);
} while (index != wrap);
not_found:
- spin_unlock_irqrestore(&mem->lock, flags);
+ spin_unlock_irqrestore(&area->lock, flags);
return -1;
found:
- for (i = index; i < index + nslots; i++) {
+ for (i = slot_index; i < slot_index + nslots; i++) {
mem->slots[i].list = 0;
- mem->slots[i].alloc_size =
- alloc_size - (offset + ((i - index) << IO_TLB_SHIFT));
+ mem->slots[i].alloc_size = alloc_size - (offset +
+ ((i - slot_index) << IO_TLB_SHIFT));
}
- for (i = index - 1;
+ for (i = slot_index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
mem->slots[i].list; i--)
mem->slots[i].list = ++count;
@@ -561,14 +685,42 @@ found:
/*
* Update the indices to avoid searching in the next round.
*/
- if (index + nslots < mem->nslabs)
- mem->index = index + nslots;
+ if (index + nslots < mem->area_nslabs)
+ area->index = index + nslots;
else
- mem->index = 0;
- mem->used += nslots;
+ area->index = 0;
+ area->used += nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
+ return slot_index;
+}
- spin_unlock_irqrestore(&mem->lock, flags);
- return index;
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ int start = raw_smp_processor_id() & (mem->nareas - 1);
+ int i = start, index;
+
+ do {
+ index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size,
+ alloc_align_mask);
+ if (index >= 0)
+ return index;
+ if (++i >= mem->nareas)
+ i = 0;
+ } while (i != start);
+
+ return -1;
+}
+
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+ int i;
+ unsigned long used = 0;
+
+ for (i = 0; i < mem->nareas; i++)
+ used += mem->areas[i].used;
+ return used;
}
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
@@ -582,8 +734,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
int index;
phys_addr_t tlb_addr;
- if (!mem)
- panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ if (!mem || !mem->nslabs) {
+ dev_warn_ratelimited(dev,
+ "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
@@ -600,7 +755,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, mem->nslabs, mem->used);
+ alloc_size, mem->nslabs, mem_used(mem));
return (phys_addr_t)DMA_MAPPING_ERROR;
}
@@ -615,7 +770,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
/*
* When dir == DMA_FROM_DEVICE we could omit the copy from the orig
* to the tlb buffer, if we knew for sure the device will
- * overwirte the entire current content. But we don't. Thus
+ * overwrite the entire current content. But we don't. Thus
* unconditional bounce may prevent leaking swiotlb content (i.e.
* kernel memory) to user-space.
*/
@@ -630,6 +785,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ int aindex = index / mem->area_nslabs;
+ struct io_tlb_area *area = &mem->areas[aindex];
int count, i;
/*
@@ -638,7 +795,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
* While returning the entries to the free list, we merge the entries
* with slots below and above the pool being returned.
*/
- spin_lock_irqsave(&mem->lock, flags);
+ BUG_ON(aindex >= mem->nareas);
+
+ spin_lock_irqsave(&area->lock, flags);
if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
count = mem->slots[index + nslots].list;
else
@@ -662,8 +821,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
i--)
mem->slots[i].list = ++count;
- mem->used -= nslots;
- spin_unlock_irqrestore(&mem->lock, flags);
+ area->used -= nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
}
/*
@@ -758,6 +917,13 @@ bool is_swiotlb_active(struct device *dev)
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);
+static int io_tlb_used_get(void *data, u64 *val)
+{
+ *val = mem_used(&io_tlb_default_mem);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
+
static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
const char *dirname)
{
@@ -766,7 +932,8 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
return;
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
- debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+ debugfs_create_file("io_tlb_used", 0400, mem->debugfs, NULL,
+ &fops_io_tlb_used);
}
static int __init __maybe_unused swiotlb_create_default_debugfs(void)
@@ -817,6 +984,9 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
struct io_tlb_mem *mem = rmem->priv;
unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+ /* Set Per-device io tlb area to one */
+ unsigned int nareas = 1;
+
/*
* Since multiple devices can share the same pool, the private data,
* io_tlb_mem struct, will be initialized by the first device attached
@@ -833,10 +1003,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
return -ENOMEM;
}
+ mem->areas = kcalloc(nareas, sizeof(*mem->areas),
+ GFP_KERNEL);
+ if (!mem->areas) {
+ kfree(mem->slots);
+ kfree(mem);
+ return -ENOMEM;
+ }
+
set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
rmem->size >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
- mem->force_bounce = true;
+ swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
+ false, nareas);
mem->for_alloc = true;
rmem->priv = mem;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 032f164abe7c..846add8394c4 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,6 +5,7 @@
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
+#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
user_exit_irqoff();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
}
@@ -321,7 +323,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
}
/*
- * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * If this entry hit the idle task invoke ct_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
@@ -332,12 +334,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
- * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+ * interrupt to invoke ct_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
- * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
@@ -350,8 +352,9 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
- rcu_irq_enter();
+ ct_irq_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
@@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
@@ -418,7 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
instrumentation_end();
- rcu_irq_exit();
+ ct_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
@@ -436,7 +440,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
* was not watching on entry.
*/
if (state.exit_rcu)
- rcu_irq_exit();
+ ct_irq_exit();
}
}
@@ -449,9 +453,10 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
- rcu_nmi_enter();
+ ct_nmi_enter();
instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
@@ -469,7 +474,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
}
instrumentation_end();
- rcu_nmi_exit();
+ ct_nmi_exit();
lockdep_hardirq_exit();
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 9d09f489b60e..2e0f75bcb7fd 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
int ret;
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
- clear_notify_signal();
- if (task_work_pending(current))
- task_work_run();
- }
-
- if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 8591c180b52b..91a62f566743 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,4 +2,5 @@
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o
obj-$(CONFIG_UPROBES) += uprobes.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 80782cddb1da..01933db7629c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -54,6 +54,7 @@
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
+#include <linux/task_work.h>
#include "internal.h"
@@ -1468,6 +1469,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
+ lockdep_assert_held(&ctx->lock);
+
if (adv)
ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
@@ -1819,6 +1822,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
+ if (event->attr.read_format & PERF_FORMAT_LOST)
+ entry += sizeof(u64);
+
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
@@ -2221,16 +2227,22 @@ static inline int __pmu_filter_match(struct perf_event *event)
static inline int pmu_filter_match(struct perf_event *event)
{
struct perf_event *sibling;
+ unsigned long flags;
+ int ret = 1;
if (!__pmu_filter_match(event))
return 0;
+ local_irq_save(flags);
for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling))
- return 0;
+ if (!__pmu_filter_match(sibling)) {
+ ret = 0;
+ break;
+ }
}
+ local_irq_restore(flags);
- return 1;
+ return ret;
}
static inline int
@@ -2265,11 +2277,26 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (READ_ONCE(event->pending_disable) >= 0) {
- WRITE_ONCE(event->pending_disable, -1);
+ if (event->pending_disable) {
+ event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
+
+ if (event->pending_sigtrap) {
+ bool dec = true;
+
+ event->pending_sigtrap = 0;
+ if (state != PERF_EVENT_STATE_OFF &&
+ !event->pending_work) {
+ event->pending_work = 1;
+ dec = false;
+ task_work_add(current, &event->pending_task, TWA_RESUME);
+ }
+ if (dec)
+ local_dec(&event->ctx->nr_pending);
+ }
+
perf_event_set_state(event, state);
if (!is_software_event(event))
@@ -2421,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in perf_event_exit_event().
*
- * When called from perf_pending_event it's OK because event->ctx
+ * When called from perf_pending_irq it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
@@ -2460,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- WRITE_ONCE(event->pending_disable, smp_processor_id());
- /* can fail, see perf_pending_event_disable() */
- irq_work_queue(&event->pending);
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending_irq);
}
#define MAX_INTERRUPTS (~0ULL)
@@ -3417,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+ perf_pmu_disable(pmu);
+
+ /* PMIs are disabled; ctx->nr_pending is stable. */
+ if (local_read(&ctx->nr_pending) ||
+ local_read(&next_ctx->nr_pending)) {
+ /*
+ * Must not swap out ctx when there's pending
+ * events that rely on the ctx->task relation.
+ */
+ raw_spin_unlock(&next_ctx->lock);
+ rcu_read_unlock();
+ goto inside_switch;
+ }
+
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- perf_pmu_disable(pmu);
-
if (cpuctx->sched_cb_usage && pmu->sched_task)
pmu->sched_task(ctx, false);
@@ -3462,6 +3500,7 @@ unlock:
raw_spin_lock(&ctx->lock);
perf_pmu_disable(pmu);
+inside_switch:
if (cpuctx->sched_cb_usage && pmu->sched_task)
pmu->sched_task(ctx, false);
task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
@@ -4454,7 +4493,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
*value = local64_read(&event->count);
if (enabled || running) {
- u64 __enabled, __running, __now;;
+ u64 __enabled, __running, __now;
calc_timer_values(event, &__now, &__enabled, &__running);
if (enabled)
@@ -4928,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
static void _free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending);
+ irq_work_sync(&event->pending_irq);
unaccount_event(event);
@@ -5260,11 +5299,15 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -5321,7 +5364,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5331,6 +5374,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -6253,10 +6298,10 @@ again:
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Raced against perf_mmap_close(); remove the
+ * event and try again.
*/
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
goto again;
}
@@ -6422,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
return;
/*
- * perf_pending_event() can race with the task exiting.
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
*/
if (current->flags & PF_EXITING)
return;
@@ -6431,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
event->attr.type, event->attr.sig_data);
}
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
{
- int cpu = READ_ONCE(event->pending_disable);
+ int cpu = READ_ONCE(event->oncpu);
+ /*
+ * If the event isn't running; we done. event_sched_out() will have
+ * taken care of things.
+ */
if (cpu < 0)
return;
+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
if (cpu == smp_processor_id()) {
- WRITE_ONCE(event->pending_disable, -1);
-
- if (event->attr.sigtrap) {
+ if (event->pending_sigtrap) {
+ event->pending_sigtrap = 0;
perf_sigtrap(event);
- atomic_set_release(&event->event_limit, 1); /* rearm event */
- return;
+ local_dec(&event->ctx->nr_pending);
+ }
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
}
-
- perf_event_disable_local(event);
return;
}
@@ -6467,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event)
* irq_work_queue(); // FAILS
*
* irq_work_run()
- * perf_pending_event()
+ * perf_pending_irq()
*
* But the event runs on CPU-B and wants disabling there.
*/
- irq_work_queue_on(&event->pending, cpu);
+ irq_work_queue_on(&event->pending_irq, cpu);
}
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry, struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
int rctx;
- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();
- perf_pending_event_disable(event);
-
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
+ __perf_pending_irq(event);
+
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
+static void perf_pending_task(struct callback_head *head)
+{
+ struct perf_event *event = container_of(head, struct perf_event, pending_task);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+
+ if (event->pending_work) {
+ event->pending_work = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_pending);
+ }
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+ preempt_enable_notrace();
+}
+
#ifdef CONFIG_GUEST_PERF_EVENTS
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
@@ -6785,11 +6868,10 @@ out_put:
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
- struct perf_event *event)
+ struct perf_event *event,
+ u64 sample_type)
{
- u64 sample_type = event->attr.sample_type;
-
- data->type = sample_type;
+ data->type = event->attr.sample_type;
header->size += event->id_header_size;
if (sample_type & PERF_SAMPLE_TID) {
@@ -6818,7 +6900,7 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_event *event)
{
if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event);
+ __perf_event_header__init_id(header, data, event, event->attr.sample_type);
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -6858,7 +6940,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6872,6 +6954,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6882,9 +6966,16 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ unsigned long flags;
+ u64 values[6];
int n = 0;
+ /*
+ * Disabling interrupts avoids all counter scheduling
+ * (context switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -6900,6 +6991,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6913,9 +7006,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
+
+ local_irq_restore(flags);
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
@@ -6952,11 +7049,6 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
-static inline bool perf_sample_save_hw_index(struct perf_event *event)
-{
- return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
-}
-
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -7038,14 +7130,14 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- if (data->br_stack) {
+ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
size_t size;
size = data->br_stack->nr
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
@@ -7288,6 +7380,7 @@ void perf_prepare_sample(struct perf_event_header *header,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
+ u64 filtered_sample_type;
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size;
@@ -7295,7 +7388,12 @@ void perf_prepare_sample(struct perf_event_header *header,
header->misc = 0;
header->misc |= perf_misc_flags(regs);
- __perf_event_header__init_id(header, data, event);
+ /*
+ * Clear the sample flags that have already been done by the
+ * PMU driver.
+ */
+ filtered_sample_type = sample_type & ~data->sample_flags;
+ __perf_event_header__init_id(header, data, event, filtered_sample_type);
if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
data->ip = perf_instruction_pointer(regs);
@@ -7303,7 +7401,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
data->callchain = perf_callchain(event, regs);
size += data->callchain->nr;
@@ -7315,7 +7413,7 @@ void perf_prepare_sample(struct perf_event_header *header,
struct perf_raw_record *raw = data->raw;
int size;
- if (raw) {
+ if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) {
struct perf_raw_frag *frag = &raw->frag;
u32 sum = 0;
@@ -7331,6 +7429,7 @@ void perf_prepare_sample(struct perf_event_header *header,
frag->pad = raw->size - sum;
} else {
size = sizeof(u64);
+ data->raw = NULL;
}
header->size += size;
@@ -7338,8 +7437,8 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
int size = sizeof(u64); /* nr */
- if (data->br_stack) {
- if (perf_sample_save_hw_index(event))
+ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
+ if (branch_sample_hw_index(event))
size += sizeof(u64);
size += data->br_stack->nr
@@ -7388,6 +7487,20 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ data->weight.full = 0;
+
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ data->data_src.val = PERF_MEM_NA;
+
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ data->txn = 0;
+
+ if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
+ if (filtered_sample_type & PERF_SAMPLE_ADDR)
+ data->addr = 0;
+ }
+
if (sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7403,7 +7516,8 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
+ filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
#ifdef CONFIG_CGROUP_PERF
@@ -9164,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event)
*/
static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
int ret = 0;
@@ -9188,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_addr = data->addr;
-
perf_event_disable_inatomic(event);
}
+ if (event->attr.sigtrap) {
+ /*
+ * Should not be able to return to user space without processing
+ * pending_sigtrap (kernel events can overflow multiple times).
+ */
+ WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+ if (!event->pending_sigtrap) {
+ event->pending_sigtrap = 1;
+ local_inc(&event->ctx->nr_pending);
+ }
+ event->pending_addr = data->addr;
+ irq_work_queue(&event->pending_irq);
+ }
+
READ_ONCE(event->overflow_handler)(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
- irq_work_queue(&event->pending);
+ irq_work_queue(&event->pending_irq);
}
return ret;
}
int perf_event_overflow(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
return __perf_event_overflow(event, 1, data, regs);
}
@@ -9974,8 +10100,16 @@ static void bpf_overflow_handler(struct perf_event *event,
goto out;
rcu_read_lock();
prog = READ_ONCE(event->prog);
- if (prog)
+ if (prog) {
+ if (prog->call_get_stack &&
+ (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
+ !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) {
+ data->callchain = perf_callchain(event, regs);
+ data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
+
ret = bpf_prog_run(prog, &ctx);
+ }
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -10001,7 +10135,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event,
if (event->attr.precise_ip &&
prog->call_get_stack &&
- (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
event->attr.exclude_callchain_kernel ||
event->attr.exclude_callchain_user)) {
/*
@@ -10068,26 +10202,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
u64 bpf_cookie)
{
- bool is_kprobe, is_tracepoint, is_syscall_tp;
+ bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
- is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
+ is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
- if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
+ if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
- if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
return -EINVAL;
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
+ /* only uprobe programs are allowed to be sleepable */
+ return -EINVAL;
+
/* Kprobe override only works for kprobes, not uprobes. */
- if (prog->kprobe_override &&
- !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+ if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
if (is_tracepoint || is_syscall_tp) {
@@ -10210,8 +10348,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!vma->vm_file)
continue;
@@ -10914,7 +11053,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -10925,7 +11064,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -10936,7 +11075,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -11509,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
- event->pending_disable = -1;
- init_irq_work(&event->pending, perf_pending_event);
+ init_irq_work(&event->pending_irq, perf_pending_irq);
+ init_task_work(&event->pending_task, perf_pending_task);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -11532,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (parent_event)
event->event_caps = parent_event->event_caps;
- if (event->attr.sigtrap)
- atomic_set(&event->event_limit, 1);
-
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*
@@ -11690,11 +11826,9 @@ err_pmu:
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (event->ns)
- put_pid_ns(event->ns);
if (event->hw.target)
put_task_struct(event->hw.target);
- kmem_cache_free(perf_event_cache, event);
+ call_rcu(&event->rcu_head, free_event_rcu);
return ERR_PTR(err);
}
@@ -11825,14 +11959,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -11870,8 +12015,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
@@ -11881,6 +12033,12 @@ set:
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -11888,20 +12046,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index f32320ac02fd..c3797701339c 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -17,61 +17,276 @@
* This file contains the arch-independent routines.
*/
+#include <linux/hw_breakpoint.h>
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/irqflags.h>
-#include <linux/kallsyms.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/bug.h>
-#include <linux/hw_breakpoint.h>
/*
- * Constraints data
+ * Datastructure to track the total uses of N slots across tasks or CPUs;
+ * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots.
+ */
+struct bp_slots_histogram {
+#ifdef hw_breakpoint_slots
+ atomic_t count[hw_breakpoint_slots(0)];
+#else
+ atomic_t *count;
+#endif
+};
+
+/*
+ * Per-CPU constraints data.
*/
struct bp_cpuinfo {
- /* Number of pinned cpu breakpoints in a cpu */
- unsigned int cpu_pinned;
- /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
- unsigned int *tsk_pinned;
- /* Number of non-pinned cpu/task breakpoints in a cpu */
- unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+ /* Number of pinned CPU breakpoints in a CPU. */
+ unsigned int cpu_pinned;
+ /* Histogram of pinned task breakpoints in a CPU. */
+ struct bp_slots_histogram tsk_pinned;
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
+/* Number of pinned CPU breakpoints globally. */
+static struct bp_slots_histogram cpu_pinned[TYPE_MAX];
+/* Number of pinned CPU-independent task breakpoints. */
+static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX];
+
/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};
-static int constraints_initialized;
+static bool constraints_initialized __ro_after_init;
-/* Gather the number of total pinned and un-pinned bp in a cpuset */
-struct bp_busy_slots {
- unsigned int pinned;
- unsigned int flexible;
-};
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ * (due to bp_slots_histogram::count being atomic, no update are lost).
+ *
+ * 2. Holding a write-lock is required for computations that require a
+ * stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ * 3. In all other cases, non-atomic accesses require the appropriately held
+ * lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+/*
+ * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
+ * rhltable synchronizes concurrent insertions/deletions, independent tasks may
+ * insert/delete concurrently; therefore, a mutex per task is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
+ * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
+ * that hw_breakpoint may contend with per-task perf event list management. The
+ * assumption is that perf usecases involving hw_breakpoints are very unlikely
+ * to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+ struct task_struct *tsk = bp->hw.target;
-__weak int hw_breakpoint_weight(struct perf_event *bp)
+ return tsk ? &tsk->perf_event_mutex : NULL;
+}
+
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx) {
+ /*
+ * Fully analogous to the perf_try_init_event() nesting
+ * argument in the comment near perf_event_ctx_lock_nested();
+ * this child->perf_event_mutex cannot ever deadlock against
+ * the parent->perf_event_mutex usage from
+ * perf_event_task_{en,dis}able().
+ *
+ * Specifically, inherited events will never occur on
+ * ->perf_event_list.
+ */
+ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING);
+ percpu_down_read(&bp_cpuinfo_sem);
+ } else {
+ percpu_down_write(&bp_cpuinfo_sem);
+ }
+
+ return tsk_mtx;
+}
+
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
+{
+ if (tsk_mtx) {
+ percpu_up_read(&bp_cpuinfo_sem);
+ mutex_unlock(tsk_mtx);
+ } else {
+ percpu_up_write(&bp_cpuinfo_sem);
+ }
+}
+
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+ (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+ percpu_is_read_locked(&bp_cpuinfo_sem));
+}
+
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx)
+ lockdep_assert_held(tsk_mtx);
+ lockdep_assert_held(&bp_cpuinfo_sem);
+}
+
+#ifdef hw_breakpoint_slots
+/*
+ * Number of breakpoint slots is constant, and the same for all types.
+ */
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+ return __nr_bp_slots[type];
+}
+
+static __init bool
+bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL);
+ return hist->count;
+}
+
+static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist)
+{
+ kfree(hist->count);
+}
+
+static __init int init_breakpoint_slots(void)
+{
+ int i, cpu, err_cpu;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ if (!bp_slots_histogram_alloc(&info->tsk_pinned, i))
+ goto err;
+ }
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ if (!bp_slots_histogram_alloc(&cpu_pinned[i], i))
+ goto err;
+ if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i))
+ goto err;
+ }
+
+ return 0;
+err:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ bp_slots_histogram_free(&cpu_pinned[i]);
+ bp_slots_histogram_free(&tsk_pinned_all[i]);
+ }
+
+ return -ENOMEM;
+}
+#endif
+
+static inline void
+bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val)
+{
+ const int old_idx = old - 1;
+ const int new_idx = old_idx + val;
+
+ if (old_idx >= 0)
+ WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0);
+ if (new_idx >= 0)
+ WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0);
+}
+
+static int
+bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count = atomic_read(&hist->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist->count[i]);
+ if (count > 0)
+ return i + 1;
+ WARN(count < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+static int
+bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2,
+ enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count1 = atomic_read(&hist1->count[i]);
+ const int count2 = atomic_read(&hist2->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]);
+ ASSERT_EXCLUSIVE_WRITER(hist2->count[i]);
+ if (count1 + count2 > 0)
+ return i + 1;
+ WARN(count1 < 0, "inconsistent breakpoint slots histogram");
+ WARN(count2 < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+#ifndef hw_breakpoint_weight
+static inline int hw_breakpoint_weight(struct perf_event *bp)
{
return 1;
}
+#endif
static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
@@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
}
/*
- * Report the maximum number of pinned breakpoints a task
- * have in this cpu
+ * Return the maximum number of pinned breakpoints a task has in this CPU.
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int i;
+ struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned;
- for (i = nr_slots[type] - 1; i >= 0; i--) {
- if (tsk_pinned[i] > 0)
- return i + 1;
- }
-
- return 0;
+ /*
+ * At this point we want to have acquired the bp_cpuinfo_sem as a
+ * writer to ensure that there are no concurrent writers in
+ * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
+ */
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type);
}
/*
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
+ *
+ * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent,
+ * returns a negative value.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;
- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter->attr.bp_type) == type &&
- (iter->cpu < 0 || cpu == iter->cpu))
- count += hw_breakpoint_weight(iter);
+ /*
+ * We need a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
+
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) != type)
+ continue;
+
+ if (iter->cpu >= 0) {
+ if (cpu == -1) {
+ count = -1;
+ goto out;
+ } else if (cpu != iter->cpu)
+ continue;
+ }
+
+ count += hw_breakpoint_weight(iter);
}
+out:
+ rcu_read_unlock();
return count;
}
@@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
}
/*
- * Report the number of pinned/un-pinned breakpoints we have in
- * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ * Returns the max pinned breakpoint slots in a given
+ * CPU (cpu > -1) or across all of them (cpu = -1).
*/
-static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
- enum bp_type_idx type)
+static int
+max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int pinned_slots = 0;
int cpu;
+ if (bp->hw.target && bp->cpu < 0) {
+ int max_pinned = task_bp_pinned(-1, bp, type);
+
+ if (max_pinned >= 0) {
+ /*
+ * Fast path: task_bp_pinned() is CPU-independent and
+ * returns the same value for any CPU.
+ */
+ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type);
+ return max_pinned;
+ }
+ }
+
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
@@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
else
nr += task_bp_pinned(cpu, bp, type);
- if (nr > slots->pinned)
- slots->pinned = nr;
-
- nr = info->flexible;
- if (nr > slots->flexible)
- slots->flexible = nr;
+ pinned_slots = max(nr, pinned_slots);
}
-}
-/*
- * For now, continue to consider flexible as pinned, until we can
- * ensure no flexible event can ever be scheduled before a pinned event
- * in a same cpu.
- */
-static void
-fetch_this_slot(struct bp_busy_slots *slots, int weight)
-{
- slots->pinned += weight;
-}
-
-/*
- * Add a pinned breakpoint for the given task in our constraint table
- */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
- enum bp_type_idx type, int weight)
-{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int old_idx, new_idx;
-
- old_idx = task_bp_pinned(cpu, bp, type) - 1;
- new_idx = old_idx + weight;
-
- if (old_idx >= 0)
- tsk_pinned[old_idx]--;
- if (new_idx >= 0)
- tsk_pinned[new_idx]++;
+ return pinned_slots;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
- int weight)
+static int
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight)
{
- const struct cpumask *cpumask = cpumask_of_bp(bp);
- int cpu;
+ int cpu, next_tsk_pinned;
if (!enable)
weight = -weight;
- /* Pinned counter cpu profiling */
if (!bp->hw.target) {
- get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ /*
+ * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the
+ * global histogram.
+ */
+ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type);
+
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight);
+ info->cpu_pinned += weight;
+ return 0;
+ }
+
+ /*
+ * If bp->hw.target, tsk_pinned is only modified, but not used
+ * otherwise. We can permit concurrent updates as long as there are no
+ * other uses: having acquired bp_cpuinfo_sem as a reader allows
+ * concurrent updates here. Uses of tsk_pinned will require acquiring
+ * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+ */
+ lockdep_assert_held_read(&bp_cpuinfo_sem);
+
+ /*
+ * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global
+ * histogram. We need to take care of 4 cases:
+ *
+ * 1. This breakpoint targets all CPUs (cpu < 0), and there may only
+ * exist other task breakpoints targeting all CPUs. In this case we
+ * can simply update the global slots histogram.
+ *
+ * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may
+ * only exist other task breakpoints targeting all CPUs.
+ *
+ * a. On enable: remove the existing breakpoints from the global
+ * slots histogram and use the per-CPU histogram.
+ *
+ * b. On disable: re-insert the existing breakpoints into the global
+ * slots histogram and remove from per-CPU histogram.
+ *
+ * 3. Some other existing task breakpoints target specific CPUs. Only
+ * update the per-CPU slots histogram.
+ */
+
+ if (!enable) {
+ /*
+ * Remove before updating histograms so we can determine if this
+ * was the last task breakpoint for a specific CPU.
+ */
+ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ if (ret)
+ return ret;
+ }
+ /*
+ * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint.
+ */
+ next_tsk_pinned = task_bp_pinned(-1, bp, type);
+
+ if (next_tsk_pinned >= 0) {
+ if (bp->cpu < 0) { /* Case 1: fast path */
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight);
+ } else if (enable) { /* Case 2.a: slow path */
+ /* Add existing to per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ 0, next_tsk_pinned);
+ }
+ /* Add this first CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned,
+ -next_tsk_pinned);
+ } else { /* Case 2.b: slow path */
+ /* Remove this last CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned + hw_breakpoint_weight(bp), weight);
+ /* Remove all from per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, -next_tsk_pinned);
+ }
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned);
+ }
+ } else { /* Case 3: slow path */
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+
+ for_each_cpu(cpu, cpumask) {
+ next_tsk_pinned = task_bp_pinned(cpu, bp, type);
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ }
}
- /* Pinned counter task profiling */
- for_each_cpu(cpu, cpumask)
- toggle_bp_task_slot(bp, cpu, type, weight);
+ /*
+ * Readers want a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
- else
- list_del(&bp->hw.bp_list);
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ return 0;
}
__weak int arch_reserve_bp_slot(struct perf_event *bp)
@@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
}
/*
- * Constraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter.
+ *
+ * Note: Flexible breakpoints are currently unimplemented, but outlined in the
+ * below algorithm for completeness. The implementation treats flexible as
+ * pinned due to no guarantee that we currently always schedule flexible events
+ * before a pinned event in a same CPU.
*
* == Non-pinned counter == (Considered as pinned for now)
*
@@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*/
static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
- struct bp_busy_slots slots = {0};
enum bp_type_idx type;
+ int max_pinned_slots;
int weight;
int ret;
@@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- fetch_bp_busy_slots(&slots, bp, type);
- /*
- * Simulate the addition of this breakpoint to the constraints
- * and see the result.
- */
- fetch_this_slot(&slots, weight);
-
- /* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ /* Check if this new breakpoint can be satisfied across all CPUs. */
+ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
+ if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
ret = arch_reserve_bp_slot(bp);
if (ret)
return ret;
- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}
int reserve_bp_slot(struct perf_event *bp)
{
- int ret;
-
- mutex_lock(&nr_bp_mutex);
-
- ret = __reserve_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}
void release_bp_slot(struct perf_event *bp)
{
- mutex_lock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
}
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
@@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
{
- int ret;
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_lock(&nr_bp_mutex);
- ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ int ret;
+
+ if (bp_constraints_is_locked(bp))
return -1;
- return __reserve_bp_slot(bp, bp->attr.bp_type);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
+
+ return ret;
}
int dbg_release_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ if (bp_constraints_is_locked(bp))
return -1;
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
__release_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
return 0;
}
@@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+/**
+ * hw_breakpoint_is_used - check if breakpoints are currently used
+ *
+ * Returns: true if breakpoints are used, false otherwise.
+ */
+bool hw_breakpoint_is_used(void)
+{
+ int cpu;
+
+ if (!constraints_initialized)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+
+ if (info->cpu_pinned)
+ return true;
+
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ if (atomic_read(&info->tsk_pinned.count[slot]))
+ return true;
+ }
+ }
+ }
+
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ /*
+ * Warn, because if there are CPU pinned counters,
+ * should never get here; bp_cpuinfo::cpu_pinned should
+ * be consistent with the global cpu_pinned histogram.
+ */
+ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot])))
+ return true;
+
+ if (atomic_read(&tsk_pinned_all[type].count[slot]))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
@@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- int cpu, err_cpu;
- int i;
-
- for (i = 0; i < TYPE_MAX; i++)
- nr_slots[i] = hw_breakpoint_slots(i);
+ int ret;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < TYPE_MAX; i++) {
- struct bp_cpuinfo *info = get_bp_info(cpu, i);
+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ return ret;
- info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
- GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
- }
- }
+ ret = init_breakpoint_slots();
+ if (ret)
+ return ret;
- constraints_initialized = 1;
+ constraints_initialized = true;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
- err_alloc:
- for_each_possible_cpu(err_cpu) {
- for (i = 0; i < TYPE_MAX; i++)
- kfree(get_bp_info(err_cpu, i)->tsk_pinned);
- if (err_cpu == cpu)
- break;
- }
-
- return -ENOMEM;
}
-
-
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
new file mode 100644
index 000000000000..5ced822df788
--- /dev/null
+++ b/kernel/events/hw_breakpoint_test.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test for hw_breakpoint constraints accounting logic.
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/cpumask.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/kthread.h>
+#include <linux/perf_event.h>
+#include <asm/hw_breakpoint.h>
+
+#define TEST_REQUIRES_BP_SLOTS(test, slots) \
+ do { \
+ if ((slots) > get_test_bp_slots()) { \
+ kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \
+ get_test_bp_slots()); \
+ } \
+ } while (0)
+
+#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr))
+
+#define MAX_TEST_BREAKPOINTS 512
+
+static char break_vars[MAX_TEST_BREAKPOINTS];
+static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS];
+static struct task_struct *__other_task;
+
+static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx)
+{
+ struct perf_event_attr attr = {};
+
+ if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS))
+ return NULL;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)&break_vars[idx];
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+}
+
+static void unregister_test_bp(struct perf_event **bp)
+{
+ if (WARN_ON(IS_ERR(*bp)))
+ return;
+ if (WARN_ON(!*bp))
+ return;
+ unregister_hw_breakpoint(*bp);
+ *bp = NULL;
+}
+
+static int get_test_bp_slots(void)
+{
+ static int slots;
+
+ if (!slots)
+ slots = hw_breakpoint_slots(TYPE_DATA);
+
+ return slots;
+}
+
+static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk)
+{
+ struct perf_event *bp = register_test_bp(cpu, tsk, *id);
+
+ KUNIT_ASSERT_NOT_NULL(test, bp);
+ KUNIT_ASSERT_FALSE(test, IS_ERR(bp));
+ KUNIT_ASSERT_NULL(test, test_bps[*id]);
+ test_bps[(*id)++] = bp;
+}
+
+/*
+ * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free.
+ *
+ * Returns true if this can be called again, continuing at @id.
+ */
+static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip)
+{
+ for (int i = 0; i < get_test_bp_slots() - skip; ++i)
+ fill_one_bp_slot(test, id, cpu, tsk);
+
+ return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS;
+}
+
+static int dummy_kthread(void *arg)
+{
+ return 0;
+}
+
+static struct task_struct *get_other_task(struct kunit *test)
+{
+ struct task_struct *tsk;
+
+ if (__other_task)
+ return __other_task;
+
+ tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task");
+ KUNIT_ASSERT_FALSE(test, IS_ERR(tsk));
+ __other_task = tsk;
+ return __other_task;
+}
+
+static int get_test_cpu(int num)
+{
+ int cpu;
+
+ WARN_ON(num < 0);
+
+ for_each_online_cpu(cpu) {
+ if (num-- <= 0)
+ break;
+ }
+
+ return cpu;
+}
+
+/* ===== Test cases ===== */
+
+static void test_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_many_cpus(struct kunit *test)
+{
+ int idx = 0;
+ int cpu;
+
+ /* Test that CPUs are independent. */
+ for_each_online_cpu(cpu) {
+ bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx));
+ if (!do_continue)
+ break;
+ }
+}
+
+static void test_one_task_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, -1, current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one and adding back CPU-target should work. */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_two_tasks_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ /* Test that tasks are independent. */
+ fill_bp_slots(test, &idx, -1, current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one from first task and adding back CPU-target should not work. */
+ unregister_test_bp(&test_bps[0]);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_one_task_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /*
+ * Remove one and adding back CPU-target should work; this case is
+ * special vs. above because the task's constraints are CPU-dependent.
+ */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_one_task_mixed(struct kunit *test)
+{
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_bp_slots(test, &idx, -1, current, 1);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* Transition from CPU-dependent pinned count to CPU-independent. */
+ unregister_test_bp(&test_bps[0]);
+ unregister_test_bp(&test_bps[1]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_two_tasks_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Can still create breakpoints on some other CPU. */
+ fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0);
+}
+
+static void test_two_tasks_on_one_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Cannot create breakpoints on some other CPU either. */
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static void test_task_on_all_and_one_cpu(struct kunit *test)
+{
+ int tsk_on_cpu_idx, cpu_idx;
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_bp_slots(test, &idx, -1, current, 2);
+ /* Transitioning from only all CPU breakpoints to mixed. */
+ tsk_on_cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* We should still be able to use up another CPU's slots. */
+ cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+
+ /* Transitioning back to task target on all CPUs. */
+ unregister_test_bp(&test_bps[tsk_on_cpu_idx]);
+ /* Still have a CPU target breakpoint in get_test_cpu(1). */
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ /* Remove it and try again. */
+ unregister_test_bp(&test_bps[cpu_idx]);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static struct kunit_case hw_breakpoint_test_cases[] = {
+ KUNIT_CASE(test_one_cpu),
+ KUNIT_CASE(test_many_cpus),
+ KUNIT_CASE(test_one_task_on_all_cpus),
+ KUNIT_CASE(test_two_tasks_on_all_cpus),
+ KUNIT_CASE(test_one_task_on_one_cpu),
+ KUNIT_CASE(test_one_task_mixed),
+ KUNIT_CASE(test_two_tasks_on_one_cpu),
+ KUNIT_CASE(test_two_tasks_on_one_all_cpus),
+ KUNIT_CASE(test_task_on_all_and_one_cpu),
+ {},
+};
+
+static int test_init(struct kunit *test)
+{
+ /* Most test cases want 2 distinct CPUs. */
+ if (num_online_cpus() < 2)
+ return -EINVAL;
+
+ /* Want the system to not use breakpoints elsewhere. */
+ if (hw_breakpoint_is_used())
+ return -EBUSY;
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) {
+ if (test_bps[i])
+ unregister_test_bp(&test_bps[i]);
+ }
+
+ if (__other_task) {
+ kthread_stop(__other_task);
+ __other_task = NULL;
+ }
+
+ /* Verify that internal state agrees that no breakpoints are in use. */
+ KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used());
+}
+
+static struct kunit_suite hw_breakpoint_test_suite = {
+ .name = "hw_breakpoint",
+ .test_cases = hw_breakpoint_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+
+kunit_test_suites(&hw_breakpoint_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb35b926024c..273a0fe7910a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
- irq_work_queue(&handle->event->pending);
+ irq_work_queue(&handle->event->pending_irq);
}
/*
@@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
- if (rb->nr_pages)
+ if (rb->nr_pages) {
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
}
@@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle,
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2eaa327f8158..d9e357b7e17c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -19,7 +19,7 @@
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
@@ -154,8 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
+ struct folio *old_folio = page_folio(old_page);
+ struct folio *new_folio;
struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
@@ -163,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
- GFP_KERNEL);
+ new_folio = page_folio(new_page);
+ err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
- /* For try_to_free_swap() below */
- lock_page(old_page);
+ /* For folio_free_swap() below */
+ folio_lock(old_folio);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
@@ -179,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
- get_page(new_page);
+ folio_get(new_folio);
page_add_new_anon_rmap(new_page, vma, addr);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
- if (!PageAnon(old_page)) {
+ if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -198,15 +200,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, vma, false);
- if (!page_mapped(old_page))
- try_to_free_swap(old_page);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
- put_page(old_page);
+ folio_put(old_folio);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
- unlock_page(old_page);
+ folio_unlock(old_folio);
return err;
}
@@ -349,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe,
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *tmp;
- for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+ for_each_vma(vmi, tmp)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
@@ -552,7 +555,7 @@ put_old:
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
- collapse_pte_mapped_thp(mm, vaddr);
+ collapse_pte_mapped_thp(mm, vaddr, false);
return ret;
}
@@ -1231,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset,
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1983,9 +1987,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
diff --git a/kernel/exit.c b/kernel/exit.c
index f072959fcab7..35e0a31a0315 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -60,6 +60,7 @@
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
+#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
@@ -183,6 +184,10 @@ void put_task_struct_rcu_user(struct task_struct *task)
call_rcu(&task->rcu, delayed_put_task_struct);
}
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
+
void release_task(struct task_struct *p)
{
struct task_struct *leader;
@@ -374,10 +379,10 @@ static void coredump_task_exit(struct task_struct *tsk)
complete(&core_state->startup);
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
if (!self.task) /* see coredump_finish() */
break;
- freezable_schedule();
+ schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -466,6 +471,7 @@ assign_new_owner:
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
@@ -733,14 +739,33 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+static void synchronize_group_exit(struct task_struct *tsk, long code)
+{
+ struct sighand_struct *sighand = tsk->sighand;
+ struct signal_struct *signal = tsk->signal;
+
+ spin_lock_irq(&sighand->siglock);
+ signal->quick_threads--;
+ if ((signal->quick_threads == 0) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT)) {
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = code;
+ signal->group_stop_count = 0;
+ }
+ spin_unlock_irq(&sighand->siglock);
+}
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
+ synchronize_group_exit(tsk, code);
+
WARN_ON(tsk->plug);
kcov_task_exit(tsk);
+ kmsan_task_exit(tsk);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
@@ -766,7 +791,7 @@ void __noreturn do_exit(long code)
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
- exit_itimers(tsk->signal);
+ exit_itimers(tsk);
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
@@ -905,7 +930,7 @@ do_group_exit(int exit_code)
exit_code = sig->group_exit_code;
else if (sig->group_exec_task)
exit_code = 0;
- else if (!thread_group_empty(current)) {
+ else {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
@@ -1051,7 +1076,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* p->signal fields because the whole thread group is dead
* and nobody can change them.
*
- * psig->stats_lock also protects us from our sub-theads
+ * psig->stats_lock also protects us from our sub-threads
* which can reap other children at the same time. Until
* we change k_getrusage()-like users to rely on this lock
* we have to take ->siglock as well.
diff --git a/kernel/extable.c b/kernel/extable.c
index bda5e9761541..71f482581cab 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -114,7 +114,7 @@ int kernel_text_address(unsigned long addr)
/* Treat this like an NMI as it can happen anywhere */
if (no_rcu)
- rcu_nmi_enter();
+ ct_nmi_enter();
if (is_module_text_address(addr))
goto out;
@@ -127,7 +127,7 @@ int kernel_text_address(unsigned long addr)
ret = 0;
out:
if (no_rcu)
- rcu_nmi_exit();
+ ct_nmi_exit();
return ret;
}
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 60dc825ecc2b..a7ccd2930c5f 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
/* cut off if it is too long */
if (count > KSYM_NAME_LEN)
count = KSYM_NAME_LEN;
- buf = kmalloc(count + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- if (copy_from_user(buf, buffer, count)) {
- ret = -EFAULT;
- goto out_free;
- }
- buf[count] = '\0';
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
sym = strstrip(buf);
mutex_lock(&fei_lock);
@@ -298,17 +294,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
}
ret = register_kprobe(&attr->kp);
- if (!ret)
- fei_debugfs_add_attr(attr);
- if (ret < 0)
- fei_attr_remove(attr);
- else {
- list_add_tail(&attr->list, &fei_attr_list);
- ret = count;
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
}
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
out:
mutex_unlock(&fei_lock);
-out_free:
kfree(buf);
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..08969f5aa38d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,13 +37,13 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
+#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -97,7 +97,6 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
-#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*/
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
- new->vm_next = new->vm_prev = NULL;
dup_anon_vma_name(orig, new);
}
return new;
@@ -580,11 +578,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
+ struct vm_area_struct *mpnt, *tmp;
int retval;
- unsigned long charge;
+ unsigned long charge = 0;
LIST_HEAD(uf);
+ MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
uprobe_start_dup_mmap();
if (mmap_write_lock_killable(oldmm)) {
@@ -606,16 +605,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
khugepaged_fork(mm, oldmm);
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ retval = mas_expected_entries(&mas, oldmm->map_count);
+ if (retval)
+ goto out;
+
+ mas_for_each(&old_mas, mpnt, ULONG_MAX) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -629,7 +628,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
*/
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto out;
+ goto loop_out;
}
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
@@ -675,24 +674,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
}
/*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
+ * Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
-
- /*
- * Link in the new vma and copy the page table entries.
- */
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
+ hugetlb_dup_vma_private(tmp);
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+ /* Link the vma into the MT */
+ mas.index = tmp->vm_start;
+ mas.last = tmp->vm_end - 1;
+ mas_store(&mas, tmp);
+ if (mas_is_err(&mas))
+ goto fail_nomem_mas_store;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -702,10 +694,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->vm_ops->open(tmp);
if (retval)
- goto out;
+ goto loop_out;
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+ mas_destroy(&mas);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -714,6 +708,9 @@ out:
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
+
+fail_nomem_mas_store:
+ unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -721,7 +718,7 @@ fail_nomem_policy:
fail_nomem:
retval = -ENOMEM;
vm_unacct_memory(charge);
- goto out;
+ goto loop_out;
}
static inline int mm_alloc_pgd(struct mm_struct