path: root/kernel/sched/wait.c
diff options
authorDavid Woodhouse <dwmw@amazon.co.uk>2020-10-27 14:39:43 +0000
committerPaolo Bonzini <pbonzini@redhat.com>2020-11-15 09:49:09 -0500
commitc4d51a52c67a1e3a0fa3006e5ec21cdc07649cd6 (patch)
tree8197d1c61f4fe778e4cc86e1c7715bdcd9116d17 /kernel/sched/wait.c
parentbf0cd88ce363a2de3684baaa48d3f194acdc516c (diff)
sched/wait: Add add_wait_queue_priority()
This allows an exclusive wait_queue_entry to be added at the head of the queue, instead of the tail as normal. Thus, it gets to consume events first without allowing non-exclusive waiters to be woken at all. The (first) intended use is for KVM IRQFD, which currently has inconsistent behaviour depending on whether posted interrupts are available or not. If they are, KVM will bypass the eventfd completely and deliver interrupts directly to the appropriate vCPU. If not, events are delivered through the eventfd and userspace will receive them when polling on the eventfd. By using add_wait_queue_priority(), KVM will be able to consistently consume events within the kernel without accidentally exposing them to userspace when they're supposed to be bypassed. This, in turn, means that userspace doesn't have to jump through hoops to avoid listening on the erroneously noisy eventfd and injecting duplicate interrupts. Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> Message-Id: <20201027143944.648769-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Diffstat (limited to 'kernel/sched/wait.c')
1 files changed, 16 insertions, 1 deletions
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 01f5d3020589..183cc6ae68a6 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue
+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+ unsigned long flags;
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
unsigned long flags;
@@ -57,7 +68,11 @@ EXPORT_SYMBOL(remove_wait_queue);
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
+ * number) then we wake that number of exclusive tasks, and potentially all
+ * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
+ * the list and any non-exclusive tasks will be woken first. A priority task
+ * may be at the head of the list, and can consume the event without any other
+ * tasks being woken.
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns

Privacy Policy