I did a few experiments with the original 10.3 as of 7cb3520c0632ad912b309489ad86a90f9fc9bd0b: oltp_index_updates, 64 threads, ARM.
vanilla: 143981 TPS
mutex: 78897 TPS
spin: 70085 TPS
nospin: 64573 TPS
nornd_max_delay: 132392 TPS
nornd_1: 160053 TPS
nornd_3: 148662 TPS
nornd_10: 118859 TPS
nornd_0: 74063 TPS
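For reference, all of these variants patch the same spin loop in TTASEventMutex::enter(). Below is a paraphrased, non-compilable skeleton reconstructed from the diff context further down (not a verbatim copy of the 10.3 source), with comments marking what each experiment changes:

void enter(uint32_t max_spins, uint32_t max_delay,
	   const char* filename, uint32_t line)
{
	uint32_t	n_spins = 0;
	uint32_t	n_waits = 0;
	const uint32_t	step = max_spins;

	/* "mutex": the whole loop is bypassed by pthread_mutex_lock(). */
	while (!try_lock()) {
		/* "spin": a `continue` here, so neither branch below ever runs. */
		if (n_spins++ == max_spins) {
			/* "nospin": condition forced to true, taken on every iteration. */
			max_spins += step;
			n_waits++;
			os_thread_yield();
			/* ... reserve a sync array cell, then ... */
			/* sync_array_wait_event(sync_arr, cell); */
		} else {
			/* "nornd_X": ut_delay(X) with a fixed X instead of
			a random interval in [0, max_delay]. */
			ut_delay(ut_rnd_interval(0, max_delay));
		}
	}
	/* n_spins / n_waits bookkeeping elided */
}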
There is also potential false sharing between srv_spin_wait_delay/srv_n_spin_wait_rounds and the statistics variables. I doubt it contributes much to the scalability problem, but it is still worth checking.
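A self-contained illustration of that concern (the struct and field names are hypothetical, not the actual server layout): if a read-mostly tuning variable shares a 64-byte cache line with a counter that every contended acquisition increments, the writers keep invalidating the line the spinning readers need.

#include <atomic>
#include <cstdint>

/* Hypothetical layout: tuning knobs next to a hot counter. */
struct SpinTuningAndStats {
	uint64_t		spin_wait_delay;	/* read on every spin */
	uint64_t		n_spin_wait_rounds;	/* read on every spin */
	std::atomic<uint64_t>	spin_rounds_total;	/* written on contention */
};

/* One way to rule the effect out: put the hot counter on its own cache line. */
struct SpinTuningAndStatsPadded {
	uint64_t		spin_wait_delay;
	uint64_t		n_spin_wait_rounds;
	alignas(64) std::atomic<uint64_t> spin_rounds_total;
};

If padding the counters like this moves the measured TPS, the false sharing is real; if not, it can be ruled out.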
mutex (use pthread_mutex instead of ib mutex)
diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h
index 76f02cc..edbb434 100644
--- a/storage/innobase/include/ib0mutex.h
+++ b/storage/innobase/include/ib0mutex.h
@@ -433,6 +433,7 @@ struct TTASEventMutex {
 		ut_a(m_lock_word == MUTEX_STATE_UNLOCKED);
 
 		m_event = os_event_create(sync_latch_get_name(id));
+		pthread_mutex_init(&m_mutex, NULL);
 	}
 
 	/** This is the real desctructor. This mutex can be created in BSS and
@@ -446,6 +447,7 @@ struct TTASEventMutex {
 		/* We have to free the event before InnoDB shuts down. */
 		os_event_destroy(m_event);
 		m_event = 0;
+		pthread_mutex_destroy(&m_mutex);
 	}
 
 	/** Try and lock the mutex. Note: POSIX returns 0 on success.
@@ -453,6 +455,7 @@ struct TTASEventMutex {
 	bool try_lock()
 		UNIV_NOTHROW
 	{
+		return(pthread_mutex_trylock(&m_mutex) == 0);
 		int32 oldval = MUTEX_STATE_UNLOCKED;
 		return(my_atomic_cas32_strong_explicit(&m_lock_word, &oldval,
 						       MUTEX_STATE_LOCKED,
@@ -464,6 +467,8 @@ struct TTASEventMutex {
 	void exit()
 		UNIV_NOTHROW
 	{
+		pthread_mutex_unlock(&m_mutex);
+		return;
 		if (my_atomic_fas32_explicit(&m_lock_word,
 					     MUTEX_STATE_UNLOCKED,
 					     MY_MEMORY_ORDER_RELEASE)
@@ -485,6 +490,9 @@ struct TTASEventMutex {
 		uint32_t	line)
 		UNIV_NOTHROW
 	{
+		pthread_mutex_lock(&m_mutex);
+		return;
+
 		uint32_t	n_spins = 0;
 		uint32_t	n_waits = 0;
 		const uint32_t	step = max_spins;
@@ -553,6 +561,7 @@ struct TTASEventMutex {
 	}
 
 private:
+	sys_mutex_t		m_mutex;
 	/** Disable copying */
 	TTASEventMutex(const TTASEventMutex&);
 	TTASEventMutex& operator=(const TTASEventMutex&);
spin (just spin, never delay, never go to sleep)
diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h
index 76f02cc..5ab6127 100644
--- a/storage/innobase/include/ib0mutex.h
+++ b/storage/innobase/include/ib0mutex.h
@@ -490,6 +490,7 @@ struct TTASEventMutex {
 		const uint32_t	step = max_spins;
 
 		while (!try_lock()) {
+			continue;
 			if (n_spins++ == max_spins) {
 				max_spins += step;
 				n_waits++;
nospin (go to sleep immediately, no delay)
diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h
index 76f02cc..f7c9827 100644
--- a/storage/innobase/include/ib0mutex.h
+++ b/storage/innobase/include/ib0mutex.h
@@ -490,7 +490,7 @@ struct TTASEventMutex {
 		const uint32_t	step = max_spins;
 
 		while (!try_lock()) {
-			if (n_spins++ == max_spins) {
+			if (1 || n_spins++ == max_spins) {
 				max_spins += step;
 				n_waits++;
 				os_thread_yield();
nornd_X (pass different values to ut_delay(), either hardcoded or max_delay)
diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h
index 76f02cc..d48b213 100644
--- a/storage/innobase/include/ib0mutex.h
+++ b/storage/innobase/include/ib0mutex.h
@@ -516,7 +516,7 @@ struct TTASEventMutex {
 					sync_array_wait_event(sync_arr, cell);
 				}
 			} else {
-				ut_delay(ut_rnd_interval(0, max_delay));
+				ut_delay(X);
 			}
 		}
 
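For context on what the X in nornd_X controls: ut_delay(n) is, roughly, a busy loop proportional to n that executes a CPU relax hint on each iteration. A rough model only (not the actual ut_delay source; the inner multiplier and the relax instruction are platform- and version-specific):

#include <cstdint>

static inline void cpu_relax()
{
#if defined(__x86_64__) || defined(__i386__)
	__asm__ __volatile__("pause");
#elif defined(__aarch64__)
	__asm__ __volatile__("yield");
#else
	__asm__ __volatile__("" ::: "memory");	/* compiler barrier only */
#endif
}

static inline void delay_approx(uint64_t delay)
{
	/* the real code uses a version-specific inner multiplier */
	for (uint64_t i = 0; i < delay * 50; i++) {
		cpu_relax();
	}
}

So nornd_1 and nornd_10 differ by an order of magnitude in how long a contending thread backs off before retrying, which is why the choice of X moves the TPS numbers so much.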
MySQL switched from global RNG state to per-thread state using thread-local storage (TLS), but TLS performance as such is far from perfect. If my idea explained in MDEV-14374 proves to be faster, we won't need an RNG in the spin loop at all.
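A minimal sketch of where the generator's state can live, with hypothetical names (this is not the actual ut_rnd or MySQL code): the global variant turns the backoff randomization itself into a contended cache line, while the TLS variant avoids the sharing but pays a thread-local access on every call.

#include <cstdint>

/* (a) Global state: every spinning thread read-modify-writes the same word.
The unsynchronized update is intentional here; it mirrors the "one global
counter" design, where exact reproducibility of the sequence does not matter. */
static uint64_t rnd_state_global = 1;

static inline uint64_t rnd_global()
{
	rnd_state_global = rnd_state_global * 6364136223846793005ULL + 1;
	return rnd_state_global >> 33;
}

/* (b) Per-thread state via TLS: no sharing between threads, but each call
goes through the thread-local storage machinery. Per-thread seeding is
omitted for brevity; in practice each thread would start differently. */
static inline uint64_t rnd_tls()
{
	static thread_local uint64_t state = 0x9E3779B97F4A7C15ULL;
	state = state * 6364136223846793005ULL + 1;
	return state >> 33;
}

With a fixed delay (the nornd_X variants above), neither is needed.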