#include <linux/sched/debug.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include "lock_events.h"
/*
- * The least significant 2 bits of the owner value has the following
+ * The least significant 3 bits of the owner value has the following
* meanings when set.
* - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
- * - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem
- * The rwsem is anonymously owned, i.e. the owner(s) cannot be
- * readily determined. It can be reader owned or the owning writer
- * is indeterminate.
+ * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
+ * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
*
+ * When the rwsem is either owned by an anonymous writer, or it is
+ * reader-owned, but a spinning writer has timed out, both nonspinnable
+ * bits will be set to disable optimistic spinning by readers and writers.
+ * In the later case, the last unlocking reader should then check the
+ * writer nonspinnable bit and clear it only to give writers preference
+ * to acquire the lock via optimistic spinning, but not readers. Similar
+ * action is also done in the reader slowpath.
+
* When a writer acquires a rwsem, it puts its task_struct pointer
* into the owner field. It is cleared after an unlock.
*
* When a reader acquires a rwsem, it will also puts its task_struct
- * pointer into the owner field with both the RWSEM_READER_OWNED and
- * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will
- * largely be left untouched. So for a free or reader-owned rwsem,
- * the owner value may contain information about the last reader that
- * acquires the rwsem. The anonymous bit is set because that particular
- * reader may or may not still own the lock.
+ * pointer into the owner field with the RWSEM_READER_OWNED bit set.
+ * On unlock, the owner field will largely be left untouched. So
+ * for a free or reader-owned rwsem, the owner value may contain
+ * information about the last reader that acquires the rwsem.
*
* That information may be helpful in debugging cases where the system
* seems to hang on a reader owned rwsem especially if only one reader
* a rwsem, but the overhead is simply too big.
*/
#define RWSEM_READER_OWNED (1UL << 0)
-#define RWSEM_NONSPINNABLE (1UL << 1)
+#define RWSEM_RD_NONSPINNABLE (1UL << 1)
+#define RWSEM_WR_NONSPINNABLE (1UL << 2)
+#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
#ifdef CONFIG_DEBUG_RWSEMS
static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
struct task_struct *owner)
{
- unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | RWSEM_NONSPINNABLE;
+ unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED;
atomic_long_set(&sem->owner, val);
}
}
#endif
+/*
+ * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
+ * remains set. Otherwise, the operation will be aborted.
+ */
+static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ do {
+ if (!(owner & RWSEM_READER_OWNED))
+ break;
+ if (owner & RWSEM_NONSPINNABLE)
+ break;
+ } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
+ owner | RWSEM_NONSPINNABLE));
+}
+
/*
* Return just the real task structure pointer of the owner
*/
return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
}
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ unsigned long nonspinnable)
{
struct task_struct *owner;
unsigned long flags;
preempt_disable();
rcu_read_lock();
owner = rwsem_owner_flags(sem, &flags);
- if ((flags & RWSEM_NONSPINNABLE) || (owner && !owner_on_cpu(owner)))
+ if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
ret = false;
rcu_read_unlock();
preempt_enable();
OWNER_READER = 1 << 2,
OWNER_NONSPINNABLE = 1 << 3,
};
-#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER)
+#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
static inline enum owner_state
-rwsem_owner_state(struct task_struct *owner, unsigned long flags)
+rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable)
{
- if (flags & RWSEM_NONSPINNABLE)
+ if (flags & nonspinnable)
return OWNER_NONSPINNABLE;
if (flags & RWSEM_READER_OWNED)
return owner ? OWNER_WRITER : OWNER_NULL;
}
-static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
+static noinline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
{
struct task_struct *new, *owner;
unsigned long flags, new_flags;
enum owner_state state;
owner = rwsem_owner_flags(sem, &flags);
- state = rwsem_owner_state(owner, flags);
+ state = rwsem_owner_state(owner, flags, nonspinnable);
if (state != OWNER_WRITER)
return state;
new = rwsem_owner_flags(sem, &new_flags);
if ((new != owner) || (new_flags != flags)) {
- state = rwsem_owner_state(new, new_flags);
+ state = rwsem_owner_state(new, new_flags, nonspinnable);
break;
}
return state;
}
+/*
+ * Calculate reader-owned rwsem spinning threshold for writer
+ *
+ * The more readers own the rwsem, the longer it will take for them to
+ * wind down and free the rwsem. So the empirical formula used to
+ * determine the actual spinning time limit here is:
+ *
+ * Spinning threshold = (10 + nr_readers/2)us
+ *
+ * The limit is capped to a maximum of 25us (30 readers). This is just
+ * a heuristic and is subjected to change in the future.
+ */
+static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+ int readers = count >> RWSEM_READER_SHIFT;
+ u64 delta;
+
+ if (readers > 30)
+ readers = 30;
+ delta = (20 + readers) * NSEC_PER_USEC / 2;
+
+ return sched_clock() + delta;
+}
+
static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
{
bool taken = false;
int prev_owner_state = OWNER_NULL;
+ int loop = 0;
+ u64 rspin_threshold = 0;
+ unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE
+ : RWSEM_RD_NONSPINNABLE;
preempt_disable();
* Optimistically spin on the owner field and attempt to acquire the
* lock whenever the owner changes. Spinning will be stopped when:
* 1) the owning writer isn't running; or
- * 2) readers own the lock as we can't determine if they are
- * actively running or not.
+ * 2) readers own the lock and spinning time has exceeded limit.
*/
for (;;) {
- enum owner_state owner_state = rwsem_spin_on_owner(sem);
+ enum owner_state owner_state;
+ owner_state = rwsem_spin_on_owner(sem, nonspinnable);
if (!(owner_state & OWNER_SPINNABLE))
break;
if (taken)
break;
+ /*
+ * Time-based reader-owned rwsem optimistic spinning
+ */
+ if (wlock && (owner_state == OWNER_READER)) {
+ /*
+ * Re-initialize rspin_threshold every time when
+ * the owner state changes from non-reader to reader.
+ * This allows a writer to steal the lock in between
+ * 2 reader phases and have the threshold reset at
+ * the beginning of the 2nd reader phase.
+ */
+ if (prev_owner_state != OWNER_READER) {
+ if (rwsem_test_oflags(sem, nonspinnable))
+ break;
+ rspin_threshold = rwsem_rspin_threshold(sem);
+ loop = 0;
+ }
+
+ /*
+ * Check time threshold once every 16 iterations to
+ * avoid calling sched_clock() too frequently so
+ * as to reduce the average latency between the times
+ * when the lock becomes free and when the spinner
+ * is ready to do a trylock.
+ */
+ else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
+ rwsem_set_nonspinnable(sem);
+ lockevent_inc(rwsem_opt_nospin);
+ break;
+ }
+ }
+
/*
* An RT task cannot do optimistic spinning if it cannot
* be sure the lock holder is running or live-lock may
lockevent_cond_inc(rwsem_opt_fail, !taken);
return taken;
}
+
+/*
+ * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should
+ * only be called when the reader count reaches 0.
+ *
+ * This give writers better chance to acquire the rwsem first before
+ * readers when the rwsem was being held by readers for a relatively long
+ * period of time. Race can happen that an optimistic spinner may have
+ * just stolen the rwsem and set the owner, but just clearing the
+ * RWSEM_WR_NONSPINNABLE bit will do no harm anyway.
+ */
+static inline void clear_wr_nonspinnable(struct rw_semaphore *sem)
+{
+ if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE))
+ atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner);
+}
#else
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ unsigned long nonspinnable)
{
return false;
}
{
return false;
}
+
+static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { }
#endif
/*
rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
{
long count, adjustment = -RWSEM_READER_BIAS;
+ bool wake = false;
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
- if (!rwsem_can_spin_on_owner(sem))
+ if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE))
goto queue;
/*
* If there are no writers and we are first in the queue,
* wake our own waiter to join the existing active readers !
*/
- if (!(count & RWSEM_LOCK_MASK) ||
- (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS)))
+ if (!(count & RWSEM_LOCK_MASK)) {
+ clear_wr_nonspinnable(sem);
+ wake = true;
+ }
+ if (wake || (!(count & RWSEM_WRITER_MASK) &&
+ (adjustment & RWSEM_FLAG_WAITERS)))
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
- if (rwsem_can_spin_on_owner(sem) &&
+ if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
rwsem_optimistic_spin(sem, true))
return sem;
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
- RWSEM_FLAG_WAITERS))
+ RWSEM_FLAG_WAITERS)) {
+ clear_wr_nonspinnable(sem);
rwsem_wake(sem, tmp);
+ }
}
/*