|
Its still the same reason as in MDEV-11079:
The loading code tries to treat the binary data also as UTF-8 - thereby ignoring any escape sequeces.
The parser is basically:
- get next character
- if its an escape character, handle escape sequence
- otherwise determine multi-byte character length from the character and try copy as much characters [ignoring any escape sequence]
This breaks as soon as binary data is read using the UTF-8 characterset. Binary data will always contain non-utf8 conformant byte sequences. The start byte of a 2 byte UTF8 symbol may even be followed by a escape sequence.
Escape sequences starting in the middle of a multi-byte character are causing this bug.
The patch in MDEV-11079 is still valid for this bug. It switches the parser character set to a binary (single byte) while reading blob fields.
|
|
--- sql/sql_load.cc.old 2017-07-20 19:11:17.038746970 +0200
|
+++ sql/sql_load.cc 2017-07-21 15:09:38.039931561 +0200
|
@@ -165,7 +165,7 @@
|
String &field_term,String &line_start,String &line_term,
|
String &enclosed,int escape,bool get_it_from_net, bool is_fifo);
|
~READ_INFO();
|
- int read_field();
|
+ int read_field(CHARSET_INFO *field_charset);
|
int read_fixed_length(void);
|
int next_line(void);
|
char unescape(char chr);
|
@@ -1116,7 +1116,7 @@
|
uchar *pos;
|
Item_field *real_item;
|
|
- if (read_info.read_field())
|
+ if (read_info.read_field(item->real_item()->collation.collation))
|
break;
|
|
/* If this line is to be skipped we don't want to fill field or var */
|
@@ -1651,10 +1651,13 @@
|
must make sure to use escapes properly.
|
*/
|
|
-int READ_INFO::read_field()
|
+int READ_INFO::read_field(CHARSET_INFO *field_charset)
|
{
|
int chr,found_enclosed_char;
|
uchar *to,*new_buffer;
|
+ CHARSET_INFO *read_charset = this->read_charset;
|
+ if (field_charset == &my_charset_bin)
|
+ read_charset = &my_charset_bin;
|
|
found_null=0;
|
if (found_end_of_line)
|
|
|
MariaDB assumes that the entire file has the same encoding, because it is specified during 'SELECT .. INTO OUTFILE .. CHARACTER SET ...' on per file (not per column) level. Determining character set per column at LOAD DATA type would break symmetry.
To avoid escape/unescape problems on a mixture of character and binary data one should use 'CHARACTER SET binary' both in 'SELECT INTO OUTFILE' and 'LOAD DATA INFILE' queries.
This test demonstrates that a mixture of utf8 and binary data gets restored without problems:
let $MYSQLD_DATADIR= `SELECT @@datadir`;
|
|
SET NAMES utf8;
|
|
CREATE TABLE t1 (
|
x VARCHAR(10) CHARACTER SET UTF8 NOT NULL,
|
y MEDIUMBLOB NOT NULL
|
);
|
INSERT INTO t1 VALUES ('ÄÖÜ', X'8FE95C725427');
|
|
SELECT *
|
INTO OUTFILE 't1.txt' CHARACTER SET BINARY
|
FIELDS TERMINATED BY ';' ENCLOSED BY '"' ESCAPED BY '\\'
|
LINES TERMINATED BY '\n'
|
FROM t1;
|
|
CREATE TABLE t2 LIKE t1;
|
|
LOAD DATA INFILE 't1.txt' INTO TABLE t2 CHARSET BINARY
|
FIELDS TERMINATED BY ';' ENCLOSED BY '"' ESCAPED BY '\\'
|
LINES TERMINATED BY '\n';
|
SELECT 't1' AS tab, HEX(x), HEX(y) FROM t1
|
UNION ALL
|
SELECT 't2', HEX(x), HEX(y) FROM t2;
|
|
DROP TABLE t1, t2;
|
|
--remove_file $MYSQLD_DATADIR/test/t1.txt
|
The output is:
SET NAMES utf8;
|
CREATE TABLE t1 (
|
x VARCHAR(10) CHARACTER SET UTF8 NOT NULL,
|
y MEDIUMBLOB NOT NULL
|
);
|
INSERT INTO t1 VALUES ('ÄÖÜ', X'8FE95C725427');
|
SELECT *
|
INTO OUTFILE 't1.txt' CHARACTER SET BINARY
|
FIELDS TERMINATED BY ';' ENCLOSED BY '"' ESCAPED BY '\\'
|
LINES TERMINATED BY '\n'
|
FROM t1;
|
CREATE TABLE t2 LIKE t1;
|
LOAD DATA INFILE 't1.txt' INTO TABLE t2 CHARSET BINARY
|
FIELDS TERMINATED BY ';' ENCLOSED BY '"' ESCAPED BY '\\'
|
LINES TERMINATED BY '\n';
|
SELECT 't1' AS tab, HEX(x), HEX(y) FROM t1
|
UNION ALL
|
SELECT 't2', HEX(x), HEX(y) FROM t2;
|
tab HEX(x) HEX(y)
|
t1 C384C396C39C 8FE95C725427
|
t2 C384C396C39C 8FE95C725427
|
DROP TABLE t1, t2;
|
|