Details
-
Bug
-
Status: Open (View Workflow)
-
Major
-
Resolution: Unresolved
-
None
-
None
Description
We installed the galera cluster using the bitnami mariadb-helm charts.
Affected Version 10.7.7 (not able to select it through the select field above)
After a while (few days) we always see deadlocks in the logs:
Caused by: java.sql.SQLException: (conn=371621) Lock wait timeout exceeded; try restarting transaction
|
The values.yaml file:
image:
|
tag: 10.7.7-debian-11-r30
|
|
podAnnotations:
|
vault.security.banzaicloud.io/vault-addr: "http://censored-for-compliance-reasons.com"
|
vault.security.banzaicloud.io/vault-skip-verify: "true"
|
vault.security.banzaicloud.io/vault-role: "cluster-nonprod"
|
vault.security.banzaicloud.io/vault-path: "cluster-nonprod"
|
|
serviceAccount:
|
create: true
|
automountServiceAccountToken: true
|
|
fullnameOverride: dev-mariadb
|
|
rbac:
|
create: true
|
|
podManagementPolicy: Parallel
|
|
replicaCount: 3
|
|
db:
|
user: technical
|
name: ""
|
|
persistence:
|
size: 20Gi
|
|
usePasswordFiles: true
|
customPasswordFiles:
|
root: "/vault-secrets/rootpassword"
|
user: "/vault-secrets/userpassword"
|
mariabackup: "/vault-secrets/mariabackuppassword"
|
|
extraVolumes:
|
- name: vault-secrets-volume
|
emptyDir: {}
|
|
extraVolumeMounts:
|
- mountPath: /vault-secrets
|
name: vault-secrets-volume
|
|
initContainers:
|
- command:
|
- bash
|
- -c
|
- |
|
echo -n "$PASSWORD" > /vault-secrets/userpassword
|
echo -n "$ROOTPASSWORD" > /vault-secrets/rootpassword
|
echo -n "$MARIABACKUPPASSWORD" > /vault-secrets/mariabackuppassword
|
image: ubuntu:jammy
|
name: init-container
|
volumeMounts:
|
- mountPath: /vault-secrets
|
name: vault-secrets-volume
|
env:
|
- name: PASSWORD
|
value: vault:cluster-nonprod/data/dev/common/mariadb-galera/credentials#mariadb-password
|
- name: ROOTPASSWORD
|
value: vault:cluster-nonprod/data/dev/common/mariadb-galera/credentials#mariadb-root-password
|
- name: MARIABACKUPPASSWORD
|
value: vault:cluster-nonprod/data/dev/common/mariadb-galera/credentials#mariadb-galera-mariabackup-password
|
|
resources:
|
requests:
|
cpu: 0.5
|
memory: 4096Mi
|
limits:
|
cpu: 2
|
memory: 4096Mi
|
|
initdbScripts:
|
init_dbs.sql: |
|
CREATE DATABASE IF NOT EXISTS vault;
|
CREATE DATABASE IF NOT EXISTS vault_lock;
|
CREATE DATABASE IF NOT EXISTS keycloak;
|
|
mariadbConfiguration: |-
|
[client]
|
port=3306
|
socket=/opt/bitnami/mariadb/tmp/mysql.sock
|
plugin_dir=/opt/bitnami/mariadb/plugin
|
|
[mysqld]
|
default_storage_engine=InnoDB
|
basedir=/opt/bitnami/mariadb
|
datadir=/bitnami/mariadb/data
|
plugin_dir=/opt/bitnami/mariadb/plugin
|
tmpdir=/opt/bitnami/mariadb/tmp
|
socket=/opt/bitnami/mariadb/tmp/mysql.sock
|
pid_file=/opt/bitnami/mariadb/tmp/mysqld.pid
|
bind_address=0.0.0.0
|
|
## Character set
|
##
|
collation_server=utf8_unicode_ci
|
init_connect='SET NAMES utf8'
|
character_set_server=utf8
|
|
## MyISAM
|
##
|
key_buffer_size=32M
|
myisam_recover_options=FORCE,BACKUP
|
|
## Safety
|
##
|
skip_host_cache
|
skip_name_resolve
|
max_allowed_packet=16M
|
max_connect_errors=1000000
|
sql_mode=IGNORE_SPACE,ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION
|
sysdate_is_now=1
|
|
## Binary Logging
|
##
|
log_bin=mysql-bin
|
expire_logs_days=14
|
# Disabling for performance per http://severalnines.com/blog/9-tips-going-production-galera-cluster-mysql
|
sync_binlog=0
|
# Required for Galera
|
binlog_format=row
|
|
## Caches and Limits
|
##
|
tmp_table_size=32M
|
max_heap_table_size=32M
|
# Re-enabling as now works with Maria 10.1.2
|
query_cache_type=1
|
query_cache_limit=4M
|
query_cache_size=256M
|
max_connections=500
|
thread_cache_size=50
|
open_files_limit=65535
|
table_definition_cache=4096
|
table_open_cache=4096
|
|
## InnoDB
|
##
|
innodb=FORCE
|
innodb_strict_mode=1
|
# Mandatory per https://github.com/codership/documentation/issues/25
|
innodb_autoinc_lock_mode=2
|
# Per https://www.percona.com/blog/2006/08/04/innodb-double-write/
|
innodb_doublewrite=1
|
innodb_flush_method=O_DIRECT
|
innodb_log_files_in_group=2
|
innodb_log_file_size=128M
|
innodb_flush_log_at_trx_commit=1
|
innodb_file_per_table=1
|
# 80% Memory is default reco.
|
# Need to re-evaluate when DB size grows
|
innodb_buffer_pool_size=2G
|
innodb_file_format=Barracuda
|
|
## Logging
|
##
|
log_error=/opt/bitnami/mariadb/logs/mysqld.log
|
slow_query_log_file=/opt/bitnami/mariadb/logs/mysqld.log
|
log_queries_not_using_indexes=1
|
slow_query_log=1
|
|
[galera]
|
wsrep_on=ON
|
wsrep_provider=/opt/bitnami/mariadb/lib/libgalera_smm.so
|
wsrep_sst_method=mariabackup
|
wsrep_slave_threads=4
|
wsrep_cluster_address=gcomm://
|
wsrep_cluster_name=galera
|
wsrep_sst_auth="root:"
|
# Enabled for performance per https://mariadb.com/kb/en/innodb-system-variables/#innodb_flush_log_at_trx_commit
|
innodb_flush_log_at_trx_commit=2
|
# MYISAM REPLICATION SUPPORT #
|
wsrep_mode=REPLICATE_MYISAM
|
|
[mariadb]
|
plugin_load_add=auth_pam
|
|
|
diagnosticMode:
|
enabled: false
|
We checked the nodes for there state and we saw that node dev-mariadb-0 has a process in "Waiting for certification" that never ends:
| 370905 | root | localhost | anonym | Query | 140221 | Waiting for certification | INSERT INTO `ACT_RU_VARIABLE` VALUES ('0018a8dc-afc8-11ee-8401-d2c062af09ed',1,'string','temporaryRa | 0.000 |
|
It seems all the other processes waiting for this one to complete.
We have tried to kill this process, but a KILL 370905 just ends up with an additional process waiting for the one to complete.
We are only able to fix this problem by restarting the node again. The other both nodes working as expected and read and write is possible.
Any idea how to fix this problem, or how we are able to ensure that this didn`t happen again?
Additional Information
Full Processlist of node0:
MariaDB [(none)]> show processlist;
|
+--------+-------------+-------------------+---------+------------+---------+---------------------------------+------------------------------------------------------------------------------------------------------+----------+
|
| Id | User | Host | db | Command | Time | State | Info | Progress |
|
+--------+-------------+-------------------+---------+------------+---------+---------------------------------+------------------------------------------------------------------------------------------------------+----------+
|
| 1 | system user | | NULL | Sleep | 1977667 | wsrep aborter idle | NULL | 0.000 |
|
| 2 | system user | | NULL | Sleep | 140221 | Waiting for TOI DDL | update ACT_RU_EXECUTION set
|
REV_ = 2,
|
PROC_DEF_ID_ = 'd5618a2e-9aa3-11ee-84ad-06434dd3b4 | 0.000 |
|
| 7 | system user | | NULL | Sleep | 140674 | After apply log event | NULL | 0.000 |
|
| 8 | system user | | NULL | Sleep | 140665 | After apply log event | NULL | 0.000 |
|
| 9 | system user | | NULL | Sleep | 140681 | After apply log event | NULL | 0.000 |
|
| 370905 | root | localhost | anonym | Query | 140221 | Waiting for certification | INSERT INTO `ACT_RU_VARIABLE` VALUES
|
('0018a8dc-afc8-11ee-8401-d2c062af09ed',1,'string','temporaryRa | 0.000 |
|
| 370942 | root | localhost | NULL | Query | 140044 | Waiting to execute in isolation | DROP DATABASE anonym | 0.000 |
|
| 371448 | root | localhost | NULL | Query | 137554 | Waiting to execute in isolation | DROP DATABASE anonym | 0.000 |
|
| 371473 | root | localhost | NULL | Query | 137424 | Waiting to execute in isolation | DROP DATABASE anonym | 0.000 |
|
| 371585 | root | localhost | NULL | Query | 136880 | Waiting to execute in isolation | KILL QUERY 370942 | 0.000 |
|
| 371619 | root | 10.42.1.163:57406 | anonym | Query | 135847 | starting | COMMIT | 0.000 |
|
| 377012 | root | 10.42.1.163:45958 | anonym | Query | 108463 | starting | COMMIT | 0.000 |
|
| 397698 | root | localhost | NULL | Query | 10473 | Waiting to execute in isolation | KILL QUERY 397677 | 0.000 |
|
| 397701 | root | localhost | NULL | Query | 10429 | Waiting to execute in isolation | CREATE DATABASE test | 0.000 |
|
| 398502 | root | localhost | anonym | Field List | 6647 | Waiting for table flush | NULL | 0.000 |
|
| 398522 | root | localhost | NULL | Sleep | 6570 | | NULL | 0.000 |
|
| 399845 | root | 10.42.0.36:45956 | anonym | Sleep | 285 | | NULL | 0.000 |
|
| 399846 | root | 10.42.0.36:46010 | anonym | Sleep | 285 | | NULL | 0.000 |
|
| 399883 | root | localhost | NULL | Query | 0 | starting | show processlist | 0.000 |
|
+--------+-------------+-------------------+---------+------------+---------+---------------------------------+------------------------------------------------------------------------------------------------------+----------+
|
20 rows in set (0.000 sec)
|
I checked all the nodes for the status
|
Node0:
MariaDB [(none)]> SHOW GLOBAL STATUS LIKE 'wsrep_%';
|
+-------------------------------+------------------------------------
|
| Variable_name | Value
|
| wsrep_local_state_uuid | a11a7c71-9843-11ee-bf01-3e8006b3a5a5
|
| wsrep_protocol_version | 10
|
| wsrep_last_committed | 7458
|
| wsrep_replicated | 1034
|
| wsrep_replicated_bytes | 85241264
|
| wsrep_repl_keys | 160959
|
| wsrep_repl_keys_bytes | 1312816
|
| wsrep_repl_data_bytes | 83857720
|
| wsrep_repl_other_bytes | 0
|
| wsrep_received | 3771
|
| wsrep_received_bytes | 854031936
|
| wsrep_local_commits | 753
|
| wsrep_local_cert_failures | 0
|
| wsrep_local_replays | 0
|
| wsrep_local_send_queue | 0
|
| wsrep_local_send_queue_max | 2
|
| wsrep_local_send_queue_min | 0
|
| wsrep_local_send_queue_avg | 0.000679348
|
| wsrep_local_recv_queue | 9
|
| wsrep_local_recv_queue_max | 9
|
| wsrep_local_recv_queue_min | 0
|
| wsrep_local_recv_queue_avg | 0.0285714
|
| wsrep_local_cached_downto | 7143
|
| wsrep_flow_control_paused_ns | 0
|
| wsrep_flow_control_paused | 0
|
| wsrep_flow_control_sent | 0
|
| wsrep_flow_control_recv | 0
|
| wsrep_flow_control_active | false
|
| wsrep_flow_control_requested | false
|
| wsrep_cert_deps_distance | 20.7702
|
| wsrep_apply_oooe | 0.0141204
|
| wsrep_apply_oool | 0.00347222
|
| wsrep_apply_window | 1.04051
|
| wsrep_apply_waits | 225
|
| wsrep_commit_oooe | 0
|
| wsrep_commit_oool | 0
|
| wsrep_commit_window | 1.00834
|
| wsrep_local_state | 4
|
| wsrep_local_state_comment | Synced
|
| wsrep_cert_index_size | 3781
|
| wsrep_causal_reads | 0
|
| wsrep_cert_interval | 15.1052
|
| wsrep_open_transactions | 3
|
| wsrep_open_connections | 7
|
| wsrep_incoming_addresse | 10.42.1.156:0,10.42.6.39:0,10.42.17.43:0
|
| wsrep_cluster_weight | 3
|
| wsrep_desync_count | 1
|
| wsrep_evs_delayed |
|
| wsrep_evs_evict_list |
|
| wsrep_evs_repl_latency | 0/0/0/0/0
|
| wsrep_evs_state | OPERATIONAL
|
| wsrep_gcomm_uuid | b123cdcc-9f46-11ee-924e-b2b1bdb1b017
|
| wsrep_gmcast_segment | 0
|
| wsrep_applier_thread_count | 4
|
| wsrep_cluster_capabilities |
|
| wsrep_cluster_conf_id | 27
|
| wsrep_cluster_size | 3
|
| wsrep_cluster_state_uui | a11a7c71-9843-11ee-bf01-3e8006b3a5a5
|
| wsrep_cluster_status | Primary
|
| wsrep_connected | ON
|
| wsrep_local_bf_aborts | 0
|
| wsrep_local_index | 1
|
| wsrep_provider_capabilities | :MULTI_MASTER:CERTIFICATION:PARALLEL_APPLYING:TRX_REPLAY:ISOLATION:PAUSE:CAUSAL_READS:INCREMENTAL_WRITESET:UNORDERED:PREORDERED:STREAMING:NBO: |
|
| wsrep_provider_name | Galera
|
| wsrep_provider_vendor | Codership Oy <info@codership.com>
|
| wsrep_provider_version | 4.13(r0f7af31)
|
| wsrep_ready | ON
|
| wsrep_rollbacker_thread_count | 1
|
| wsrep_thread_count | 5
|
69 rows in set (0.001 sec)
|
Node1:
MariaDB [(none)]> SHOW GLOBAL STATUS LIKE 'wsrep_%';
|
+-------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|
| Variable_name | Value |
|
+-------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|
| wsrep_local_state_uuid | a11a7c71-9843-11ee-bf01-3e8006b3a5a5 |
|
| wsrep_protocol_version | 10 |
|
| wsrep_last_committed | 7480 |
|
| wsrep_replicated | 1 |
|
| wsrep_replicated_bytes | 464 |
|
| wsrep_repl_keys | 1 |
|
| wsrep_repl_keys_bytes | 32 |
|
| wsrep_repl_data_bytes | 362 |
|
| wsrep_repl_other_bytes | 0 |
|
| wsrep_received | 19 |
|
| wsrep_received_bytes | 14723 |
|
| wsrep_local_commits | 0 |
|
| wsrep_local_cert_failures | 0 |
|
| wsrep_local_replays | 0 |
|
| wsrep_local_send_queue | 0 |
|
| wsrep_local_send_queue_max | 1 |
|
| wsrep_local_send_queue_min | 0 |
|
| wsrep_local_send_queue_avg | 0 |
|
| wsrep_local_recv_queue | 0 |
|
| wsrep_local_recv_queue_max | 1 |
|
| wsrep_local_recv_queue_min | 0 |
|
| wsrep_local_recv_queue_avg | 0 |
|
| wsrep_local_cached_downto | 7454 |
|
| wsrep_flow_control_paused_ns | 0 |
|
| wsrep_flow_control_paused | 0 |
|
| wsrep_flow_control_sent | 0 |
|
| wsrep_flow_control_recv | 0 |
|
| wsrep_flow_control_active | false |
|
| wsrep_flow_control_requested | false |
|
| wsrep_cert_deps_distance | 1.21053 |
|
| wsrep_apply_oooe | 0 |
|
| wsrep_apply_oool | 0 |
|
| wsrep_apply_window | 1 |
|
| wsrep_apply_waits | 0 |
|
| wsrep_commit_oooe | 0 |
|
| wsrep_commit_oool | 0 |
|
| wsrep_commit_window | 1 |
|
| wsrep_local_state | 4 |
|
| wsrep_local_state_comment | Synced |
|
| wsrep_cert_index_size | 3785 |
|
| wsrep_causal_reads | 0 |
|
| wsrep_cert_interval | 4323.37 |
|
| wsrep_open_transactions | 0 |
|
| wsrep_open_connections | 0 |
|
| wsrep_incoming_addresses | 10.42.4.127:0,10.42.17.60:0,10.42.6.39:0 |
|
| wsrep_cluster_weight | 3 |
|
| wsrep_desync_count | 0 |
|
| wsrep_evs_delayed | |
|
| wsrep_evs_evict_list | |
|
| wsrep_evs_repl_latency | 0/0/0/0/0 |
|
| wsrep_evs_state | OPERATIONAL |
|
| wsrep_gcomm_uuid | 499450f3-affe-11ee-aab6-9f2e869c0a93 |
|
| wsrep_gmcast_segment | 0 |
|
| wsrep_applier_thread_count | 4 |
|
| wsrep_cluster_capabilities | |
|
| wsrep_cluster_conf_id | 33 |
|
| wsrep_cluster_size | 3 |
|
| wsrep_cluster_state_uuid | a11a7c71-9843-11ee-bf01-3e8006b3a5a5 |
|
| wsrep_cluster_status | Primary |
|
| wsrep_connected | ON |
|
| wsrep_local_bf_aborts | 0 |
|
| wsrep_local_index | 1 |
|
| wsrep_provider_capabilities | :MULTI_MASTER:CERTIFICATION:PARALLEL_APPLYING:TRX_REPLAY:ISOLATION:PAUSE:CAUSAL_READS:INCREMENTAL_WRITESET:UNORDERED:PREORDERED:STREAMING:NBO: |
|
| wsrep_provider_name | Galera |
|
| wsrep_provider_vendor | Codership Oy <info@codership.com> |
|
| wsrep_provider_version | 4.13(r0f7af31) |
|
| wsrep_ready | ON |
|
| wsrep_rollbacker_thread_count | 1 |
|
| wsrep_thread_count | 5 |
|
+-------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|
69 rows in set (0.002 sec)
|
Node2:
MariaDB [(none)]> SHOW GLOBAL STATUS LIKE 'wsrep_%';
|
+-------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|
| Variable_name | Value |
|
+-------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|
| wsrep_local_state_uuid | a11a7c71-9843-11ee-bf01-3e8006b3a5a5 |
|
| wsrep_protocol_version | 10 |
|
| wsrep_last_committed | 7480 |
|
| wsrep_replicated | 1 |
|
| wsrep_replicated_bytes | 456 |
|
| wsrep_repl_keys | 1 |
|
| wsrep_repl_keys_bytes | 32 |
|
| wsrep_repl_data_bytes | 360 |
|
| wsrep_repl_other_bytes | 0 |
|
| wsrep_received | 12 |
|
| wsrep_received_bytes | 13784 |
|
| wsrep_local_commits | 0 |
|
| wsrep_local_cert_failures | 0 |
|
| wsrep_local_replays | 0 |
|
| wsrep_local_send_queue | 0 |
|
| wsrep_local_send_queue_max | 1 |
|
| wsrep_local_send_queue_min | 0 |
|
| wsrep_local_send_queue_avg | 0 |
|
| wsrep_local_recv_queue | 0 |
|
| wsrep_local_recv_queue_max | 1 |
|
| wsrep_local_recv_queue_min | 0 |
|
| wsrep_local_recv_queue_avg | 0 |
|
| wsrep_local_cached_downto | 7143 |
|
| wsrep_flow_control_paused_ns | 0 |
|
| wsrep_flow_control_paused | 0 |
|
| wsrep_flow_control_sent | 0 |
|
| wsrep_flow_control_recv | 0 |
|
| wsrep_flow_control_active | false |
|
| wsrep_flow_control_requested | false |
|
| wsrep_cert_deps_distance | 1.21053 |
|
| wsrep_apply_oooe | 0 |
|
| wsrep_apply_oool | 0 |
|
| wsrep_apply_window | 1 |
|
| wsrep_apply_waits | 0 |
|
| wsrep_commit_oooe | 0 |
|
| wsrep_commit_oool | 0 |
|
| wsrep_commit_window | 1 |
|
| wsrep_local_state | 4 |
|
| wsrep_local_state_comment | Synced |
|
| wsrep_cert_index_size | 3785 |
|
| wsrep_causal_reads | 0 |
|
| wsrep_cert_interval | 4323.37 |
|
| wsrep_open_transactions | 0 |
|
| wsrep_open_connections | 0 |
|
| wsrep_incoming_addresses | 10.42.4.127:0,10.42.17.60:0,10.42.6.39:0 |
|
| wsrep_cluster_weight | 3 |
|
| wsrep_desync_count | 0 |
|
| wsrep_evs_delayed | |
|
| wsrep_evs_evict_list | |
|
| wsrep_evs_repl_latency | 0/0/0/0/0 |
|
| wsrep_evs_state | OPERATIONAL |
|
| wsrep_gcomm_uuid | 2993d30e-afff-11ee-89f7-e7f3cd26fa41 |
|
| wsrep_gmcast_segment | 0 |
|
| wsrep_applier_thread_count | 4 |
|
| wsrep_cluster_capabilities | |
|
| wsrep_cluster_conf_id | 33 |
|
| wsrep_cluster_size | 3 |
|
| wsrep_cluster_state_uuid | a11a7c71-9843-11ee-bf01-3e8006b3a5a5 |
|
| wsrep_cluster_status | Primary |
|
| wsrep_connected | ON |
|
| wsrep_local_bf_aborts | 0 |
|
| wsrep_local_index | 0 |
|
| wsrep_provider_capabilities | :MULTI_MASTER:CERTIFICATION:PARALLEL_APPLYING:TRX_REPLAY:ISOLATION:PAUSE:CAUSAL_READS:INCREMENTAL_WRITESET:UNORDERED:PREORDERED:STREAMING:NBO: |
|
| wsrep_provider_name | Galera |
|
| wsrep_provider_vendor | Codership Oy <info@codership.com> |
|
| wsrep_provider_version | 4.13(r0f7af31) |
|
| wsrep_ready | ON |
|
| wsrep_rollbacker_thread_count | 1 |
|
| wsrep_thread_count | 5 |
|
+-------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|
69 rows in set (0.002 sec)
|
Attachments
Issue Links
- relates to
-
MDEV-34784 MariaDB Stuck in "Waiting for certification"
- Open