Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-18415

mariabackup.mdev-14447 test case fails with Table 'test.t' doesn't exist in engine

Details

    Description

      The mariabackup.mdev-14447 test case has been failing in buildbot recently

      (http://buildbot.askmonty.org/buildbot/builders/kvm-zyp-opensuse423-amd64/builds/1084/steps/mtr/logs/stdio)

      The root cause is that mariabackup's validation of the first page fails with a checksum mismatch error. (This can be due to a partial write or a zero-filled page.)

      In xb_load_single_table_tablespace(), mariabackup allows a corrupted page 0:

              if (err != DB_SUCCESS && err != DB_CORRUPTION && xtrabackup_backup) {
                      /* allow corrupted first page for xtrabackup, it could be just
                      zero-filled page, which we restore from redo log later */
                      die("Failed to not validate first page of the file %s, error %d",name, (int)err);
              }
      

      Test case for it:

      call mtr.add_suppression("InnoDB: New log files created");
       
      let $basedir=$MYSQLTEST_VARDIR/tmp/backup;
      let $incremental_dir=$MYSQLTEST_VARDIR/tmp/backup_inc1;
       
      CREATE TABLE t(a varchar(40) PRIMARY KEY, b varchar(40), c varchar(40), d varchar(40), index(b,c,d)) ENGINE INNODB;
       
      set global innodb_log_checkpoint_now = true;
       
      let $backuplog=$MYSQLTEST_VARDIR/tmp/backup.log;
      echo # Create full backup , modify table, then create incremental/differential backup;
      --disable_result_log
       
      --echo # dbug execute makes checksum mismatch while reading page 0 of table t.
       
      exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf  --backup --target-dir=$basedir --dbug=+d,page_intermittent_checksum_mismatch >$backuplog;
      --enable_result_log
       
      --disable_result_log
      echo # Prepare full backup, apply incremental one;
      exec $XTRABACKUP --prepare --verbose --apply-log-only --target-dir=$basedir;
       
      echo # Restore and check results;
      let $targetdir=$basedir;
      -- source include/restart_and_restore.inc
       
       
      --enable_result_log
      SELECT count(*) FROM t;
      DROP TABLE t;
       
      # Cleanup
      rmdir $basedir;
      

      DBUG_EXECUTE_IF debug code that makes the validation fail:

       
      diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
      index 8dc2fd2..8f00c83 100644
      --- a/storage/innobase/buf/buf0buf.cc
      +++ b/storage/innobase/buf/buf0buf.cc
      @@ -1103,6 +1103,17 @@ buf_page_is_corrupted(
       
                              if (srv_checksum_algorithm
                                  == SRV_CHECKSUM_ALGORITHM_CRC32) {
      +
      +                               DBUG_EXECUTE_IF(
      +                                       "page_intermittent_checksum_mismatch", {
      +                                       static int page_counter;
      +                                       if (page_counter++ == 4) {
      +                                               fprintf(stderr, "mismatch\n");
      +                                               checksum_field2++;
      +                                       }
      +                               });
      +
      +
                                      crc32 = buf_page_check_crc32(read_buf,
      

      Attachments

        Issue Links

          Activity

            diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc
            index 87f8822..1a45b1b 100644
            --- a/extra/mariabackup/xtrabackup.cc
            +++ b/extra/mariabackup/xtrabackup.cc
            @@ -3092,8 +3092,19 @@ xb_load_single_table_tablespace(
                            die("Can't open datafile %s", name);
                    }
             
            +       ulint   retry_count = 10;
            +retry_read:
                    err = file->validate_first_page(&flush_lsn);
             
            +       if (err == DB_CORRUPTION) {
            +               retry_count--;
            +
            +               if (retry_count != 0) {
            +                       os_thread_sleep(10000);
            +                       goto retry_read;
            +               }
            +       }
            +
                    if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) {
                            os_offset_t     node_size = os_file_get_size(file->handle());
                            os_offset_t     n_pages;
            @@ -3124,9 +3135,7 @@ xb_load_single_table_tablespace(
             
                    delete file;
             
            -       if (err != DB_SUCCESS && err != DB_CORRUPTION && xtrabackup_backup) {
            -               /* allow corrupted first page for xtrabackup, it could be just
            -               zero-filled page, which we restore from redo log later */
            +       if (err != DB_SUCCESS && xtrabackup_backup) {
                            die("Failed to not validate first page of the file %s, error %d",name, (int)err);
                    }
             }
            

            This could solve the problem, but it is unclear whether the delay could degrade performance.

            thiru Thirunarayanan Balathandayuthapani added a comment - diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 87f8822..1a45b1b 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -3092,8 +3092,19 @@ xb_load_single_table_tablespace( die("Can't open datafile %s", name); } + ulint retry_count = 10; +retry_read: err = file->validate_first_page(&flush_lsn); + if (err == DB_CORRUPTION) { + retry_count--; + + if (retry_count != 0) { + os_thread_sleep(10000); + goto retry_read; + } + } + if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) { os_offset_t node_size = os_file_get_size(file->handle()); os_offset_t n_pages; @@ -3124,9 +3135,7 @@ xb_load_single_table_tablespace( delete file; - if (err != DB_SUCCESS && err != DB_CORRUPTION && xtrabackup_backup) { - /* allow corrupted first page for xtrabackup, it could be just - zero-filled page, which we restore from redo log later */ + if (err != DB_SUCCESS && xtrabackup_backup) { die("Failed to not validate first page of the file %s, error %d",name, (int)err); } } This could solve the problem. But whether the delay can bring down the performance
            wlad Vladislav Vaintroub added a comment - - edited

            For a couple of lines like this, I'd prefer a loop over goto (similar to the one below, as was already done for the system tablespace elsewhere in this file). I have no big concerns about performance; usually the looping would be unnecessary.

            for (int i= 0; i < 10; i++) {
            	err = file->validate_first_page(&flush_lsn); 
                    if (err  != DB_CORRUPTION)  break;
                    my_sleep(1000);
            }
             
            
            

            wlad Vladislav Vaintroub added a comment - - edited For a couple of lines like this, I'd prefer a loop over goto (similar to below, like it was already done for system tablespace elsewhere in this file) . I've no big concerns about performance, usually looping would be unnecessary. for (int i= 0; i < 10; i++) { err = file->validate_first_page(&flush_lsn); if (err != DB_CORRUPTION) break; my_sleep(1000); }  

            People

              thiru Thirunarayanan Balathandayuthapani
              thiru Thirunarayanan Balathandayuthapani
              Votes:
              0 Vote for this issue
              Watchers:
              2 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Git Integration

                  Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.