From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 27 Apr 2023 13:19:35 +0200
Subject: [PATCH 2/4] locking/rtmutex: Submit/resume work explicitly
before/after blocking

schedule() invokes sched_submit_work() before scheduling and
sched_resume_work() afterwards to ensure that queued block requests are
flushed and the (IO) worker machinery can instantiate new workers if
required. This avoids deadlocks and starvation.
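
For illustration, after the preparatory change earlier in this series the
shape of schedule() is roughly the following. This is a simplified sketch
assuming the argument-less sched_submit_work()/sched_resume_work() helpers,
not a verbatim copy of the scheduler code:

	/*
	 * Simplified sketch only: the real function operates on current
	 * and carries additional checks.
	 */
	asmlinkage __visible void __sched schedule(void)
	{
		/* Flush the block plug, let the (io)worker machinery react */
		sched_submit_work();
		do {
			preempt_disable();
			__schedule(SM_NONE);
			sched_preempt_enable_no_resched();
		} while (need_resched());
		/* Worker side bookkeeping after sleeping */
		sched_resume_work();
	}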
With rt_mutexes this can lead to a subtle problem:

When a task blocks on an rtmutex, current::pi_blocked_on points to the
rtmutex it blocks on. If one of the functions invoked from
sched_submit/resume_work() then contends on an rtmutex-based lock, that
corrupts current::pi_blocked_on.

Let rtmutex and the RT lock variants which are based on it invoke
sched_submit/resume_work() explicitly before and after the slowpath so
it's guaranteed that current::pi_blocked_on cannot be corrupted by blocking
on two locks.
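
Condensed from the rtmutex.c hunks below, the resulting pattern in
rt_mutex_slowlock() is (sketch, comments from the diff omitted):

	static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
					     struct ww_acquire_ctx *ww_ctx,
					     unsigned int state)
	{
		unsigned long flags;
		int ret;

		/* Flush pending work before pi_blocked_on can be set below */
		sched_submit_work();

		raw_spin_lock_irqsave(&lock->wait_lock, flags);
		ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
		raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

		/* Resume the worker machinery after the slowpath returned */
		sched_resume_work();
		return ret;
	}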
This does not apply to the PREEMPT_RT variants of spinlock_t and rwlock_t
as their scheduling slowpath is separate and cannot invoke the work-related
functions due to potential deadlocks anyway.
[ tglx: Make it explicit and symmetric. Massage changelog ]
Fixes: e17ba59b7e8e1 ("locking/rtmutex: Guard regular sleeping locks specific functions")
Reported-by: Crystal Wood <swood@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/4b4ab374d3e24e6ea8df5cadc4297619a6d945af.camel@redhat.com
Link: https://lore.kernel.org/r/20230427111937.2745231-3-bigeasy@linutronix.de
---
kernel/locking/rtmutex.c | 11 +++++++++--
kernel/locking/rwbase_rt.c | 18 ++++++++++++++++--
kernel/locking/rwsem.c | 6 ++++++
kernel/locking/spinlock_rt.c | 3 +++
4 files changed, 34 insertions(+), 4 deletions(-)
Index: linux-6.3.0-rt11/kernel/locking/rtmutex.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rtmutex.c
+++ linux-6.3.0-rt11/kernel/locking/rtmutex.c
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1558 @ static int __sched rt_mutex_slowlock_blo
raw_spin_unlock_irq(&lock->wait_lock);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
- schedule();
+ schedule_rtmutex();
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1587 @ static void __sched rt_mutex_handle_dead
WARN(1, "rtmutex deadlock detected\n");
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ schedule_rtmutex();
}
}
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1683 @ static int __sched rt_mutex_slowlock(str
int ret;
/*
+ * The task is about to sleep. Invoke sched_submit_work() before
+ * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+ */
+ sched_submit_work();
+
+ /*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
* be called in early boot if the cmpxchg() fast path is disabled
* (debug, no architecture support). In this case we will acquire the
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1700 @ static int __sched rt_mutex_slowlock(str
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ sched_resume_work();
return ret;
}
Index: linux-6.3.0-rt11/kernel/locking/rwbase_rt.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rwbase_rt.c
+++ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c
@ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c:134 @ static int __sched __rwbase_read_lock(st
static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
unsigned int state)
{
+ int ret;
+
if (rwbase_read_trylock(rwb))
return 0;
- return __rwbase_read_lock(rwb, state);
+ /*
+ * The task is about to sleep. For rwsems this submits work as that
+ * might take locks and corrupt tsk::pi_blocked_on. Must be
+ * explicit here because __rwbase_read_lock() cannot invoke
+ * rt_mutex_slowlock(). NOP for rwlocks.
+ */
+ rwbase_sched_submit_work();
+ ret = __rwbase_read_lock(rwb, state);
+ rwbase_sched_resume_work();
+ return ret;
}
static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c:244 @ static int __sched rwbase_write_lock(str
struct rt_mutex_base *rtm = &rwb->rtmutex;
unsigned long flags;
- /* Take the rtmutex as a first step */
+ /*
+ * Take the rtmutex as a first step. For rwsem this will also
+ * invoke sched_submit_work() to flush IO and workers.
+ */
if (rwbase_rtmutex_lock_state(rtm, state))
return -EINTR;
Index: linux-6.3.0-rt11/kernel/locking/rwsem.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rwsem.c
+++ linux-6.3.0-rt11/kernel/locking/rwsem.c
@ linux-6.3.0-rt11/kernel/locking/rwsem.c:1418 @ static inline void __downgrade_write(str
#define rwbase_rtmutex_lock_state(rtm, state) \
__rt_mutex_lock(rtm, state)
+#define rwbase_sched_submit_work() \
+ sched_submit_work()
+
+#define rwbase_sched_resume_work() \
+ sched_resume_work()
+
#define rwbase_rtmutex_slowlock_locked(rtm, state) \
__rt_mutex_slowlock_locked(rtm, NULL, state)
Index: linux-6.3.0-rt11/kernel/locking/spinlock_rt.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/spinlock_rt.c
+++ linux-6.3.0-rt11/kernel/locking/spinlock_rt.c
@ linux-6.3.0-rt11/kernel/locking/spinlock_rt.c:162 @ rwbase_rtmutex_lock_state(struct rt_mute
return 0;
}
+static __always_inline void rwbase_sched_submit_work(void) { }
+static __always_inline void rwbase_sched_resume_work(void) { }
+
static __always_inline int
rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
{