From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 1 Dec 2021 17:41:09 +0100
Subject: [PATCH] softirq: Use a dedicated thread for timer wakeups.

A timer/hrtimer softirq is raised in-IRQ context. With threaded
interrupts enabled or on PREEMPT_RT this leads to waking the ksoftirqd
for the processing of the softirq.
Once the ksoftirqd is marked as pending (or is running) it will collect
all raised softirqs. This in turn means that a softirq which would have
been processed at the end of the threaded interrupt, which runs at an
elevated priority, is now moved to ksoftirqd which runs at SCHED_OTHER
priority and competes with every regular task for CPU resources.
This introduces long delays on heavy loaded systems and is not desired
especially if the system is not overloaded by the softirqs.

Split the TIMER_SOFTIRQ and HRTIMER_SOFTIRQ processing into a dedicated
timers thread and let it run at the lowest SCHED_FIFO priority.
RT tasks are are woken up from hardirq context so only timer_list timers
and hrtimers for "regular" tasks are processed here. The higher priority
ensures that wakeups are performed before scheduling SCHED_OTHER tasks.

Using a dedicated variable to store the pending softirq bits values
ensure that the timer are not accidentally picked up by ksoftirqd and
other threaded interrupts.
It shouldn't be picked up by ksoftirqd since it runs at lower priority.
However if the timer bits are ORed while a threaded interrupt is
running, then the timer softirq would be performed at higher priority.
The new timer thread will block on the softirq lock before it starts
softirq work. This "race window" isn't closed because while timer thread
is performing the softirq it can get PI-boosted via the softirq lock by
a random force-threaded thread.
The timer thread can pick up pending softirqs from ksoftirqd but only
if the softirq load is high. It is not be desired that the picked up
softirqs are processed at SCHED_FIFO priority under high softirq load
but this can already happen by a PI-boost by a force-threaded interrupt.

Reported-by: kernel test robot <lkp@intel.com> [ static timer_threads ]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/interrupt.h |   16 +++++++++
 kernel/softirq.c          |   76 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/time/hrtimer.c     |    4 +-
 kernel/time/timer.c       |    2 -
 4 files changed, 95 insertions(+), 3 deletions(-)

--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -623,6 +623,22 @@ extern void __raise_softirq_irqoff(unsig
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
+#ifdef CONFIG_PREEMPT_RT
+extern void raise_timer_softirq(void);
+extern void raise_hrtimer_softirq(void);
+
+#else
+static inline void raise_timer_softirq(void)
+{
+	raise_softirq(TIMER_SOFTIRQ);
+}
+
+static inline void raise_hrtimer_softirq(void)
+{
+	raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+#endif
+
 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
 
 static inline struct task_struct *this_cpu_ksoftirqd(void)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -637,6 +637,22 @@ static inline void tick_irq_exit(void)
 #endif
 }
 
+static DEFINE_PER_CPU(struct task_struct *, timersd);
+static DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
+
+static unsigned int local_pending_timers(void)
+{
+        return __this_cpu_read(pending_timer_softirq);
+}
+
+static void wake_timersd(void)
+{
+        struct task_struct *tsk = __this_cpu_read(timersd);
+
+        if (tsk)
+                wake_up_process(tsk);
+}
+
 static inline void __irq_exit_rcu(void)
 {
 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -648,6 +664,8 @@ static inline void __irq_exit_rcu(void)
 	preempt_count_sub(HARDIRQ_OFFSET);
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !in_interrupt() && local_pending_timers())
+		wake_timersd();
 
 	tick_irq_exit();
 }
@@ -976,11 +994,69 @@ static struct smp_hotplug_thread softirq
 	.thread_comm		= "ksoftirqd/%u",
 };
 
+static void timersd_setup(unsigned int cpu)
+{
+        sched_set_fifo_low(current);
+}
+
+static int timersd_should_run(unsigned int cpu)
+{
+        return local_pending_timers();
+}
+
+static void run_timersd(unsigned int cpu)
+{
+	unsigned int timer_si;
+
+	ksoftirqd_run_begin();
+
+	timer_si = local_pending_timers();
+	__this_cpu_write(pending_timer_softirq, 0);
+	or_softirq_pending(timer_si);
+
+	__do_softirq();
+
+	ksoftirqd_run_end();
+}
+
+#ifdef CONFIG_PREEMPT_RT
+static void raise_ktimers_thread(unsigned int nr)
+{
+	trace_softirq_raise(nr);
+	__this_cpu_or(pending_timer_softirq, 1 << nr);
+}
+
+void raise_hrtimer_softirq(void)
+{
+	raise_ktimers_thread(HRTIMER_SOFTIRQ);
+}
+
+void raise_timer_softirq(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	raise_ktimers_thread(TIMER_SOFTIRQ);
+	wake_timersd();
+	local_irq_restore(flags);
+}
+#endif
+
+static struct smp_hotplug_thread timer_threads = {
+        .store                  = &timersd,
+        .setup                  = timersd_setup,
+        .thread_should_run      = timersd_should_run,
+        .thread_fn              = run_timersd,
+        .thread_comm            = "ktimers/%u",
+};
+
 static __init int spawn_ksoftirqd(void)
 {
 	cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
 				  takeover_tasklets);
 	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		BUG_ON(smpboot_register_percpu_thread(&timer_threads));
 
 	return 0;
 }
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_even
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_hrtimer_softirq();
 	}
 
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@@ -1918,7 +1918,7 @@ void hrtimer_run_queues(void)
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_hrtimer_softirq();
 	}
 
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1769,7 +1769,7 @@ static void run_local_timers(void)
 		if (time_before(jiffies, base->next_expiry))
 			return;
 	}
-	raise_softirq(TIMER_SOFTIRQ);
+	raise_timer_softirq();
 }
 
 /*