Uploaded image for project: 'MariaDB Server'
  1. MariaDB Server
  2. MDEV-32268

GNU libc posix_fallocate() may be extremely slow

Details

    Description

      After the "fix" for MDEV-11687 we always use posix_fallocate() in os_file_set_size() (see https://github.com/MariaDB/server/blob/b0763f509a3f120e882cee2810495d169ff344a4/storage/innobase/os/os0file.cc#L4926):

      ...
      # ifdef HAVE_POSIX_FALLOCATE
      	int err;
      	do {
      		if (fstat(file, &statbuf)) {
      			err = errno;
      		} else {
      			os_offset_t current_size = statbuf.st_size;
      			if (current_size >= size) {
      				return true;
      			}
      			current_size &= ~4095ULL;
      			err = posix_fallocate(file, current_size,
      					      size - current_size);
      		}
      	} while (err == EINTR
      		 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
      ...
      

      while in some cases (like ALTER TABLE ... ALGORITHM=COPY for a COMPRESSED table with the datadir on a NAS/NFS mount) using an alternative allocation method provides performance benefits.

      Consider the following test case on 10.1 that proves the point to some extent on a local NFS mount on current Ubuntu 22.04:

      openxs@ao756:~/dbs/maria10.1/mysql-test$ ./mtr --mysqld="--innodb_file_format=Barracuda" --mysqld="--innodb_buffer_pool_size=32M" --mysqld="--innodb_use_fallocate=1" innodb.slow_alter
      Logging: ./mtr  --mysqld=--innodb_file_format=Barracuda --mysqld=--innodb_buffer_pool_size=32M --mysqld=--innodb_use_fallocate=1 innodb.slow_alter
      vardir: /home/openxs/dbs/maria10.1/mysql-test/var
      Checking leftover processes...
      Removing old var directory...
       - WARNING: Using the 'mysql-test/var' symlink
      Creating var directory '/home/openxs/dbs/maria10.1/mysql-test/var'...
      Checking supported features...
      MariaDB Version 10.1.49-MariaDB
       - SSL connections supported
      Sphinx 'indexer' binary not found, sphinx suite will be skipped
      Collecting tests...
      Installing system database...
       
      ==============================================================================
       
      TEST                                      RESULT   TIME (ms) or COMMENT
      --------------------------------------------------------------------------
       
      worker[1] Using MTR_BUILD_THREAD 300, with reserved ports 16000..16019
      set default_storage_engine=innodb;
      show global variables like '%fallocate'||
      Variable_name	Value
      innodb_use_fallocate	ON
      create table tbig(id int auto_increment primary key, c1 char(100)) ROW_FORMAT=COMPRESSED;
      insert into tbig(c1) values (repeat('a', 100));
      select @now := now(6)||
      @now := now(6)
      2023-09-27 13:31:58.256905
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      select timestampdiff(microsecond, @now, now(6))||
      timestampdiff(microsecond, @now, now(6))
      86978103
      show table status like 'tbig'||
      Name	Engine	Version	Row_format	Rows	Avg_row_length	Data_length	Max_data_length	Index_length	Data_free	Auto_increment	Create_time	Update_time	Check_time	Collation	Checksum	Create_options	Comment
      tbig	InnoDB	10	Compressed	1048731	60	63258624	0	0	3145728	1376221	2023-09-27 13:31:57	NULL	NULL	latin1_swedish_ci	NULL	row_format=COMPRESSED	
      set session profiling = ON||
      select count(*) from tbig||
      count(*)
      1048576
      alter table tbig add column c2 char(200) default 'b'||
      set session profiling = OFF||
      show profiles||
      Query_ID	Duration	Query
      1	1.69141316	select count(*) from tbig
      2	385.22975249	alter table tbig add column c2 char(200) default 'b'
      drop table tbig||
      innodb.slow_alter 'xtradb'               [ pass ]  474460
      --------------------------------------------------------------------------
      The servers were restarted 0 times
      Spent 474.460 of 562 seconds executing testcases
       
      Completed: All 1 tests were successful.
       
      openxs@ao756:~/dbs/maria10.1/mysql-test$ ./mtr --mysqld="--innodb_file_format=Barracuda" --mysqld="--innodb_buffer_pool_size=32M" --mysqld="--innodb_use_fallocate=0" innodb.slow_alter
      Logging: ./mtr  --mysqld=--innodb_file_format=Barracuda --mysqld=--innodb_buffer_pool_size=32M --mysqld=--innodb_use_fallocate=0 innodb.slow_alter
      vardir: /home/openxs/dbs/maria10.1/mysql-test/var
      Checking leftover processes...
      Removing old var directory...
       - WARNING: Using the 'mysql-test/var' symlink
      Creating var directory '/home/openxs/dbs/maria10.1/mysql-test/var'...
      Checking supported features...
      MariaDB Version 10.1.49-MariaDB
       - SSL connections supported
      Sphinx 'indexer' binary not found, sphinx suite will be skipped
      Collecting tests...
      Installing system database...
       
      ==============================================================================
       
      TEST                                      RESULT   TIME (ms) or COMMENT
      --------------------------------------------------------------------------
       
      worker[1] Using MTR_BUILD_THREAD 300, with reserved ports 16000..16019
      set default_storage_engine=innodb;
      show global variables like '%fallocate'||
      Variable_name	Value
      innodb_use_fallocate	OFF
      create table tbig(id int auto_increment primary key, c1 char(100)) ROW_FORMAT=COMPRESSED;
      insert into tbig(c1) values (repeat('a', 100));
      select @now := now(6)||
      @now := now(6)
      2023-09-27 13:46:34.280800
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      insert into tbig(c1) select c1 from tbig||
      select timestampdiff(microsecond, @now, now(6))||
      timestampdiff(microsecond, @now, now(6))
      86275891
      show table status like 'tbig'||
      Name	Engine	Version	Row_format	Rows	Avg_row_length	Data_length	Max_data_length	Index_length	Data_free	Auto_increment	Create_time	Update_time	Check_time	Collation	Checksum	Create_options	Comment
      tbig	InnoDB	10	Compressed	1048681	59	62734336	0	0	3145728	1376221	2023-09-27 13:46:33	NULL	NULL	latin1_swedish_ci	NULL	row_format=COMPRESSED	
      set session profiling = ON||
      select count(*) from tbig||
      count(*)
      1048576
      alter table tbig add column c2 char(200) default 'b'||
      set session profiling = OFF||
      show profiles||
      Query_ID	Duration	Query
      1	1.91823956	select count(*) from tbig
      2	377.63216720	alter table tbig add column c2 char(200) default 'b'
      drop table tbig||
      innodb.slow_alter 'xtradb'               [ pass ]  466519
      --------------------------------------------------------------------------
      The servers were restarted 0 times
      Spent 466.519 of 555 seconds executing testcases
       
      Completed: All 1 tests were successful.
       
      openxs@ao756:~/dbs/maria10.1/mysql-test$ ls -l | grep var
      drwxrwxr-x  8 openxs openxs   4096 вер 27 10:56 oldvar
      lrwxrwxrwx  1 openxs openxs      4 вер 27 12:09 var -> /mnt
      openxs@ao756:~/dbs/maria10.1/mysql-test$ mount | grep /mnt
      127.0.0.1:/home/openxs/share on /mnt type nfs4 (rw,relatime,vers=4.2,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=none,addr=127.0.0.1)
      

      Here it takes 378 seconds to ALTER a table that is twice as large as the buffer pool with innodb_use_fallocate = OFF vs. 385 seconds with it ON. I expect a bigger difference as the table size increases, but I wanted to keep the test fast enough to be practical on my slow HDD.

      Users reported a much more serious impact, one that prevents the use of 10.4+ for tables terabytes in size and prevents upgrading from pre-10.2.4 versions.

      Attachments

        Issue Links

          Activity

            The customer replied that with a custom build that disables the posix_fallocate() calls, their ALTER TABLE data rate would be improved by almost 4×. The patch was as follows:

            diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
            index 71e48da0dec..8b62b391afe 100644
            --- a/storage/innobase/os/os0file.cc
            +++ b/storage/innobase/os/os0file.cc
            @@ -4923,7 +4923,7 @@ os_file_set_size(
             		return(success);
             	}
             
            -# ifdef HAVE_POSIX_FALLOCATE
            +# if 0
             	int err;
             	do {
             		if (fstat(file, &statbuf)) {
            

            Based on this, it makes sense to add back the Boolean parameter innodb_use_fallocate, and default it to ON. This customer would set that parameter to OFF in their environment.

            ralf.gebhardt and serg, I wonder if we could handle this rather trivial change as a bug fix. After all, it is fixing a regression that had been introduced in MariaDB Server 10.2. Basically, we would introduce a Boolean parameter and add a runtime check for it in the infrequently called function os_file_set_size().

            marko Marko Mäkelä added a comment - The customer replied that with a custom build that disables the posix_fallocate() calls, their ALTER TABLE data rate would be improved by almost 4×. The patch was as follows: diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 71e48da0dec..8b62b391afe 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -4923,7 +4923,7 @@ os_file_set_size( return(success); } -# ifdef HAVE_POSIX_FALLOCATE +# if 0 int err; do { if (fstat(file, &statbuf)) { Based on this, it makes sense to add back the Boolean parameter innodb_use_fallocate , and default it to ON . This customer would set that parameter to OFF in their environment. ralf.gebhardt and serg , I wonder if we could handle this rather trivial change as a bug fix. After all, it is fixing a regression that had been introduced in MariaDB Server 10.2. Basically, we would introduce a Boolean parameter and add a runtime check for it in the infrequently called function os_file_set_size() .

            There is an alternative that on Linux, we invoke fallocate(2) instead of posix_fallocate(). We do not know yet if the performance problem is caused by the fallocate() system call itself or by the EOPNOTSUPP fallback in GNU libc posix_fallocate(). I hope that the customer can test another build that implements this. Fewer configuration parameters would be better.

            marko Marko Mäkelä added a comment - There is an alternative that on Linux, we invoke fallocate(2) instead of posix_fallocate() . We do not know yet if the performance problem is caused by the fallocate() system call itself or by the EOPNOTSUPP fallback in GNU libc posix_fallocate() . I hope that the customer can test another build that implements this. Fewer configuration parameters would be better.

            For the record, in GNU libc there is a generic implementation sysdeps/posix/posix_fallocate64.c that invokes pwrite() of 1 byte at a time, as well as a Linux specific sysdeps/unix/sysv/linux/posix_fallocate64.c that first tries fallocate() and then falls back to the generic implementation, which a #define renames to internal_fallocate64. There also exist variants without the 64 suffix.

            In case the second custom build (of invoking fallocate() instead of posix_fallocate()) turns out to fix the performance regression, I have a hypothesis that could explain it. Quoting sysdeps/posix/posix_fallocate64.c:

              unsigned increment;
              {
                struct statfs64 f;
             
                if (__fstatfs64 (fd, &f) != 0)
                  return errno;
                if (f.f_bsize == 0)
                  increment = 512;
                else if (f.f_bsize < 4096)
                  increment = f.f_bsize;
                else
                  /* NFS clients do not propagate the block size of the underlying
                     storage and may report a much larger value which would still
                     leave holes after the loop below, so we cap the increment at
                     4096.  */
                  increment = 4096;
              }
            

            Let us assume we write 1 byte every 4096 bytes. If the file is extended to 4 megabytes, it would be done by 1024 writes of 1 byte, every 4096 bytes. In case the NFS server uses 512-byte allocation block size, the physical size of the file would be 1024*512 = 0.5 megabytes instead of 4 megabytes, and the file would consist of 1024 fragments. When the file is being written with actual data later, it would have to be "defragmented" or "unsparsed" by the file system, which can be very expensive and depending on the file system implementation, block any concurrent I/O on that file system.

            marko Marko Mäkelä added a comment - For the record, in GNU libc there is a generic implementation sysdeps/posix/posix_fallocate64.c that invokes pwrite() of 1 byte at a time, as well as a Linux specific sysdeps/unix/sysv/linux/posix_fallocate64.c that first tries fallocate() and then falls back to the generic implementation, which a #define renames to internal_fallocate64 . There also exist variants without the 64 suffix. In case the second custom build (of invoking fallocate() instead of posix_fallocate() ) turns out to fix the performance regression, I have a hypothesis that could explain it. Quoting sysdeps/posix/posix_fallocate64.c : unsigned increment; { struct statfs64 f;   if (__fstatfs64 (fd, &f) != 0) return errno ; if (f.f_bsize == 0) increment = 512; else if (f.f_bsize < 4096) increment = f.f_bsize; else /* NFS clients do not propagate the block size of the underlying storage and may report a much larger value which would still leave holes after the loop below, so we cap the increment at 4096. */ increment = 4096; } Let us assume we write 1 byte every 4096 bytes. If the file is extended to 4 megabytes, it would be done by 1024 writes of 1 byte, every 4096 bytes. In case the NFS server uses 512-byte allocation block size, the physical size of the file would be 1024*512 = 0.5 megabytes instead of 4 megabytes, and the file would consist of 1024 fragments. When the file is being written with actual data later, it would have to be "defragmented" or "unsparsed" by the file system, which can be very expensive and depending on the file system implementation, block any concurrent I/O on that file system.

            Just for the record, I think that preallocating space only ever makes sense for local storage, we don't know anything about physical layout of the network storage and cannot assume that preallocating there helps or even that it does anything at all.

            That is, I think InnoDB should call fallocate() and, if that fails, not fall back to pwrite().

            serg Sergei Golubchik added a comment - Just for the record, I think that preallocating space only ever makes sense for local storage, we don't know anything about physical layout of the network storage and cannot assume that preallocating there helps or even that it does anything at all. That is, I think InnoDB should call fallocate() and, if that fails, not fall back to pwrite().

            We got feedback from the customer. With the second custom build, they are back to 4× the write rate of the normal build that uses posix_fallocate(). So, my hypothesis about the sparse files seems to be right. The patch was as follows:

            diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
            index 71e48da0dec..075f838ebb1 100644
            --- a/storage/innobase/os/os0file.cc
            +++ b/storage/innobase/os/os0file.cc
            @@ -4934,8 +4934,18 @@ os_file_set_size(
             				return true;
             			}
             			current_size &= ~4095ULL;
            +#  ifdef __linux__
            +			if (!fallocate(file, 0, current_size,
            +				       size - current_size)) {
            +				err = 0;
            +				break;
            +			}
            +
            +			err = errno;
            +#  else
             			err = posix_fallocate(file, current_size,
             					      size - current_size);
            +#  endif
             		}
             	} while (err == EINTR
             		 && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
            

            I think that we’d better apply the fix to all our releases. We would not need any new configuration option at this point. The fallback code at the end of os_file_set_size(), which is writing 1-megabyte blocks of NUL bytes, has been demonstrated to work on Alpine Linux (which uses musl libc) and on some version of Docker for Microsoft Windows, if you look at MDEV-16015.

            serg, sure, in a separate feature request we could introduce a configuration option to logically extend files (ftruncate(), which we currently use for extending files of page_compressed tables). On an update-in-place file system such as Linux ext4, that could make it more likely to break a design assumption of InnoDB: that writes cannot fail due to running out of space.

            marko Marko Mäkelä added a comment - We got feedback from the customer. With the second custom build, they are back to 4× the write rate of the normal build that uses posix_fallocate() . So, my hypothesis about the sparse files seems to be right. The patch was as follows: diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 71e48da0dec..075f838ebb1 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -4934,8 +4934,18 @@ os_file_set_size( return true; } current_size &= ~4095ULL; +# ifdef __linux__ + if (!fallocate(file, 0, current_size, + size - current_size)) { + err = 0; + break; + } + + err = errno; +# else err = posix_fallocate(file, current_size, size - current_size); +# endif } } while (err == EINTR && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED); I think that we’d better apply the fix to all our releases. We would not need any new configuration option at this point. The fallback code at the end of os_file_set_size() , which is writing 1-megabyte blocks of NUL bytes, has been demonstrated to work on Alpine Linux (which uses musl libc) and on some version of Docker for Microsoft Windows, if you look at MDEV-16015 . serg , sure, in a separate feature request we could introduce a configuration option to logically extend files ( ftruncate() , which we currently use for extending files of page_compressed tables). On an update-in-place file system such as Linux ext4 , that could make it more likely to break a design assumption of InnoDB: that writes cannot fail due to running out of space.

            People

              marko Marko Mäkelä
              valerii Valerii Kravchuk
              Votes:
              0 Vote for this issue
              Watchers:
              5 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved:

                Git Integration

                  Error rendering 'com.xiplink.jira.git.jira_git_plugin:git-issue-webpanel'. Please contact your Jira administrators.