commit 1e67093cdb304ab18d83d2b2a86aa5a4e5f92be3 Author: Monty Date: Fri May 5 13:08:23 2017 +0300 MDEV-10104 Table lock race condition with replication Problem was two race condtion in Aria page cache: - find_block() didn't inform free_block() that it had released requests - free_block() didn't handle pinned blocks, which could happen if free_block() was called as part of flush. This is fixed by not freeing blocks that are pinned. This is safe as when maria_close() is called when last thread is using a table, there can be no pinned blocks. For other flush calls it's safe to ignore pinned blocks. diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c index 5043978..34daf1d 100644 --- a/storage/maria/ma_pagecache.c +++ b/storage/maria/ma_pagecache.c @@ -493,7 +493,8 @@ static my_bool info_check_lock(PAGECACHE_BLOCK_LINK *block, #define FLUSH_CACHE 2000 /* sort this many blocks at once */ -static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block); +static my_bool free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + my_bool abort_if_pinned); static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link); #ifndef DBUG_OFF static void test_key_cache(PAGECACHE *pagecache, @@ -1939,7 +1940,7 @@ static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache, removed from the cache as we set the PCBLOCK_REASSIGNED flag (see the code below that handles reading requests). */ - free_block(pagecache, block); + free_block(pagecache, block, 0); return 0; } /* Wait until the page is flushed on disk */ @@ -1950,7 +1951,7 @@ static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache, /* Invalidate page in the block if it has not been done yet */ DBUG_ASSERT(block->status); /* Should always be true */ if (block->status) - free_block(pagecache, block); + free_block(pagecache, block, 0); return 0; } @@ -1975,8 +1976,13 @@ static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache, } else { - DBUG_ASSERT(hash_link->requests > 0); - hash_link->requests--; + /* + When we come here either PCBLOCK_REASSIGNED or PCBLOCK_IN_SWITCH are + active. In both cases wqueue_release_queue() is called when the + state changes. + */ + DBUG_ASSERT(block->hash_link == hash_link); + remove_reader(block); KEYCACHE_DBUG_PRINT("find_block", ("request waiting for old page to be saved")); { @@ -3635,7 +3641,7 @@ static my_bool pagecache_delete_internal(PAGECACHE *pagecache, DBUG_ASSERT(block->hash_link->requests > 0); page_link->requests--; /* See NOTE for pagecache_unlock() about registering requests. */ - free_block(pagecache, block); + free_block(pagecache, block, 0); dec_counter_for_resize_op(pagecache); return 0; @@ -4227,7 +4233,8 @@ my_bool pagecache_write_part(PAGECACHE *pagecache, and add it to the free list. */ -static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) +static my_bool free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + my_bool abort_if_pinned) { uint status= block->status; KEYCACHE_THREAD_TRACE("free block"); @@ -4241,11 +4248,27 @@ static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) /* While waiting for readers to finish, new readers might request the block. But since we set block->status|= PCBLOCK_REASSIGNED, they - will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled + will wait on block->wqueue[COND_FOR_SAVED]. They must be signaled later. */ block->status|= PCBLOCK_REASSIGNED; wait_for_readers(pagecache, block); + if (unlikely(abort_if_pinned) && unlikely(block->pins)) + { + /* + Block got pinned while waiting for readers. + This can only happens when called from flush_pagecache_blocks_int() + when flushing blocks as part of prepare for maria_close() or from + flush_cached_blocks() + */ + block->status&= ~PCBLOCK_REASSIGNED; + unreg_request(pagecache, block, 0); + + /* All pending requests for this page must be resubmitted. */ + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); + return 1; + } unlink_hash(pagecache, block->hash_link); } @@ -4296,6 +4319,8 @@ static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) /* All pending requests for this page must be resubmitted. */ if (block->wqueue[COND_FOR_SAVED].last_thread) wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); + + return 0; } @@ -4431,9 +4456,16 @@ static int flush_cached_blocks(PAGECACHE *pagecache, if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY || type == FLUSH_FORCE_WRITE)) { - pagecache->blocks_changed--; - pagecache->global_blocks_changed--; - free_block(pagecache, block); + if (!free_block(pagecache, block, 1)) + { + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + } + else + { + block->status&= ~PCBLOCK_IN_FLUSH; + link_to_file_list(pagecache, block, file, 1); + } } else { @@ -4671,7 +4703,7 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache, /* It's a temporary file */ pagecache->blocks_changed--; pagecache->global_blocks_changed--; - free_block(pagecache, block); + free_block(pagecache, block, 0); } } else if (type != FLUSH_KEEP_LAZY) @@ -4741,11 +4773,12 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache, #endif next= block->next_changed; if (block->hash_link->file.file == file->file && + !block->pins && (! (block->status & PCBLOCK_CHANGED) || type == FLUSH_IGNORE_CHANGED)) { reg_requests(pagecache, block, 1); - free_block(pagecache, block); + free_block(pagecache, block, 1); } } }