|
Page flush is detecting a corrupted page:
rr replay rr-profile-prepare-1
|
(rr) p/x page[16]@8
|
$2 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x33, 0x2a}
|
(rr) set print elements 65536
|
(rr) p *page@srv_page_size
|
$3 = "\000\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\000\000\000\000\200\063*", '\000' <repeats 13 times>, "T", '\000' <repeats 32727 times>, "\200\063*"
|
(rr) watch -l *(uint64_t*)(page+16)
|
Hardware watchpoint 1: -location *(uint64_t*)(page+16)
|
(rr) watch -l page[24+13]
|
Hardware watchpoint 2: -location page[24+13]
|
(rr) when
|
Current event: 6566
|
(rr) rc
|
Continuing.
|
…
|
(rr) rc
|
Continuing.
|
|
Thread 1 hit Hardware watchpoint 2: -location page[24+13]
|
|
Old value = 84 'T'
|
New value = 0 '\000'
|
0x000055cc884a87a1 in mach_write_to_4 (b=0x7fd5b5060022 "", n=84)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/include/mach0data.ic:166
|
166 b[3] = (byte) n;
|
(rr) bt
|
#0 0x000055cc884a87a1 in mach_write_to_4 (b=0x7fd5b5060022 "", n=84)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/include/mach0data.ic:166
|
#1 0x000055cc884aa48a in mlog_parse_nbytes (type=MLOG_4BYTES,
|
ptr=0x7fd5b4ab0b4b "", end_ptr=0x7fd5b4ab0b4b "", page=0x7fd5b5060000 "",
|
page_zip=0x0)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/mtr/mtr0log.cc:235
|
#2 0x000055cc88496f52 in recv_parse_or_apply_log_rec_body (type=MLOG_4BYTES,
|
ptr=0x7fd5b4ab0b48 "", end_ptr=0x7fd5b4ab0b4b "", page_id=..., apply=true,
|
block=0x7fd5b4157e78, mtr=0x7fff2dcc4480)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/log/log0recv.cc:1486
|
#3 0x000055cc8849964f in recv_recover_page (block=0x7fd5b4157e78, mtr=...,
|
recv_addr=0x7fd5b464a788, init=0x55cc8bcb8b28)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/log/log0recv.cc:2025
|
#4 0x000055cc8849a522 in recv_recovery_create_page_low (page_id=...,
|
recv_addr=0x7fd5b464a788)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/log/log0recv.cc:2255
|
#5 0x000055cc8849ad7b in recv_apply_hashed_log_recs (last_batch=true)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/log/log0recv.cc:2385
|
…
|
(rr) when
|
Current event: 3725
|
The write of the LSN is inevitable, and the actual root cause is the write of the 4 bytes at offset 34 (tablespace ID).
I suspect that this is the pre-10.5 counterpart of MDEV-24695. Nothing else than encryption (fil_crypt_thread) should be issuing dummy updates of the tablespace ID. It seems that in this case, encryption is dirtying an unused page.
Now, let us validate this guess by checking what happened in the server on tablespace 84, page 4:
rr replay rr-profile-server
|
break fil0crypt.cc:2070
|
cond 1 space_id==84
|
continue
|
…
|
Thread 2 hit Breakpoint 1, fil_crypt_rotate_page (key_state=0x7f5b07dfee14,
|
state=0x7f5b07dfee20)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/fil/fil0crypt.cc:2070
|
2070 mlog_write_ulint(frame + FIL_PAGE_SPACE_ID,
|
(rr) when
|
Current event: 441932
|
(rr) cond 1 block->page.id.m_page_no==4
|
(rr) c
|
Continuing.
|
|
Thread 2 hit Breakpoint 1, fil_crypt_rotate_page (key_state=0x7f5b07dfee14,
|
state=0x7f5b07dfee20)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/fil/fil0crypt.cc:2070
|
2070 mlog_write_ulint(frame + FIL_PAGE_SPACE_ID,
|
1: space_id = 84
|
(rr) when
|
Current event: 441935
|
(rr) p block->page.id
|
$1 = {m_space = 84, m_page_no = 4}
|
(rr) p *frame@srv_page_size
|
$2 = "*\350%\315\000\000\000\004", '\000' <repeats 13 times>, "h\240\260\000\002", '\000' <repeats 11 times>,…
|
As we can see, my guess is not entirely valid. The page is not all-zero. Let us find more:
(rr) watch -l block->page.id.m_page_no
|
Hardware watchpoint 2: -location block->page.id.m_page_no
|
(rr) disable 1
|
(rr) rc
|
Continuing.
|
Continuing.
|
[Switching to Thread 9588.9636]
|
|
Thread 37 hit Hardware watchpoint 2: -location block->page.id.m_page_no
|
|
Old value = 4
|
New value = 0
|
0x000055a2206e77fa in buf_block_set_file_page (block=0x7f5b5c0b1150,
|
page_id=...)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/include/buf0buf.ic:496
|
496 block->page.id = page_id;
|
(rr) when
|
Current event: 30186
|
(rr) bt
|
#0 0x000055a2206e77fa in buf_block_set_file_page (block=0x7f5b5c0b1150,
|
page_id=...)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/include/buf0buf.ic:496
|
#1 0x000055a2206fb5c9 in buf_page_init (buf_pool=0x55a223dbdb10, page_id=...,
|
zip_size=0, block=0x7f5b5c0b1150)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/buf/buf0buf.cc:5218
|
#2 0x000055a2206fd4ab in buf_page_create (page_id=..., zip_size=0,
|
mtr=0x7f5b580f5150)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/buf/buf0buf.cc:5644
|
#3 0x000055a2207b2906 in fsp_page_create (space=0x55a2244037f0, offset=4,
|
mtr=0x7f5b580f5150, init_mtr=0x7f5b580f5150)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/fsp/fsp0fsp.cc:1143
|
#4 0x000055a2207b2ee9 in fsp_alloc_free_page (space=0x55a2244037f0, hint=0,
|
mtr=0x7f5b580f5150, init_mtr=0x7f5b580f5150)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/fsp/fsp0fsp.cc:1261
|
…
|
(rr) p log_sys.lsn
|
$3 = 466582
|
vlad.lesin, please find out why this page creation was not recovered properly. Also try to dump all log records for this page 84:4.
|
|
So what is going on:
1. fsp_free_page() writes MLOG_INIT_FREE_PAGE in redo log, updates the page's LSN:
Thread 6 hit Breakpoint 5, mlog_write_initial_log_record_low (type=MLOG_INIT_FREE_PAGE, space_id=84, page_no=4, log_ptr=0x7f5b17ffdf18 "\377\377", mtr=0x7f5b17ffdc60)
|
at /home/mdbe/MDEV-24792/10.4/storage/innobase/include/mtr0log.ic:160
|
160 ut_ad(type <= MLOG_BIGGEST_TYPE || EXTRA_CHECK_MLOG_NUMBER(type));
|
(rr) bt
|
#0 mlog_write_initial_log_record_low (type=MLOG_INIT_FREE_PAGE, space_id=84, page_no=4, log_ptr=0x7f5b17ffdf18 "\377\377", mtr=0x7f5b17ffdc60) at /home/mdbe/MDEV-24792/10.4/storage/innobase/include/mtr0log.ic:160
|
#1 0x000055a2207b3351 in fsp_free_page (space=0x55a2244037f0, offset=4, log=true, mtr=0x7f5b17ffdc60) at /home/mdbe/MDEV-24792/10.4/storage/innobase/fsp/fsp0fsp.cc:1331
|
#2 0x000055a2207b8909 in fseg_free_page_low (seg_inode=0x7f5b5d4e80f2 "", space=0x55a2244037f0, offset=4, log=true, mtr=0x7f5b17ffdc60) at /home/mdbe/MDEV-24792/10.4/storage/innobase/fsp/fsp0fsp.cc:2724
|
#3 0x000055a2207b99d2 in fseg_free_step (header=0x7f5b5e91003c "", mtr=0x7f5b17ffdc60) at /home/mdbe/MDEV-24792/10.4/storage/innobase/fsp/fsp0fsp.cc:2969
|
#4 0x000055a220633a60 in trx_purge_free_segment (rseg=0x55a22471dac0, hdr_addr=...) at /home/mdbe/MDEV-24792/10.4/storage/innobase/trx/trx0purge.cc:403
|
#5 0x000055a220633fab in trx_purge_truncate_rseg_history (rseg=..., limit=...) at /home/mdbe/MDEV-24792/10.4/storage/innobase/trx/trx0purge.cc:488
|
#6 0x000055a2206344a8 in trx_purge_truncate_history () at /home/mdbe/MDEV-24792/10.4/storage/innobase/trx/trx0purge.cc:564
|
#7 0x000055a2206370f6 in trx_purge (n_purge_threads=4, truncate=true, slot=0x55a2215fed08 <srv_sys+392>) at /home/mdbe/MDEV-24792/10.4/storage/innobase/trx/trx0purge.cc:1329
|
2. fil_crypt_rotate_page() writes dummy record which updates page's space id to force keys rotation during recovery:
Thread 7 hit Breakpoint 2, mlog_write_ulint (ptr=0x7f5b5e910022 "", val=84, type=MLOG_4BYTES, mtr=0x7f5b07dfe8c0) at /home/mdbe/MDEV-24792/10.4/storage/innobase/mtr/mtr0log.cc:258
|
258 switch (type) {
|
(rr) bt
|
#0 mlog_write_ulint (ptr=0x7f5b5e910022 "", val=84, type=MLOG_4BYTES, mtr=0x7f5b07dfe8c0) at /home/mdbe/MDEV-24792/10.4/storage/innobase/mtr/mtr0log.cc:258
|
#1 0x000055a2207aa62c in fil_crypt_rotate_page (key_state=0x7f5b07dfee14, state=0x7f5b07dfee20) at /home/mdbe/MDEV-24792/10.4/storage/innobase/fil/fil0crypt.cc:2070
|
#2 0x000055a2207aad7f in fil_crypt_rotate_pages (key_state=0x7f5b07dfee14, state=0x7f5b07dfee20) at /home/mdbe/MDEV-24792/10.4/storage/innobase/fil/fil0crypt.cc:2214
|
#3 0x000055a2207ab84f in fil_crypt_thread () at /home/mdbe/MDEV-24792/10.4/storage/innobase/fil/fil0crypt.cc:2438
|
3. Recovery executes MLOG_INIT_FREE_PAGE, creates new page, and then executes dummy record written on (2), updates the page LSN, and this causes assertion crash when the page is flushed, as it's supposed that page LSN must be 0 for freshly allocated pages.
The reason of the bug is the same as MDEV-24695, and the fix is similiar, i.e. mark page as freed and do not write dummy records for freed pages in fil_crypt_rotate_page(). But 10.4 code is quite different, so we can't backport MDEV-24695 as is.
|