Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

table, parser: check for invalid GBK characters before insertion #28814

Merged
merged 25 commits into from
Nov 22, 2021
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8783c7d
table, parser: check for invalid GBK characters before insertion
tangenta Oct 14, 2021
76da0c5
fix encode char length calculation
tangenta Oct 14, 2021
062beea
format parser/charset/encoding_table.go
tangenta Oct 14, 2021
8b2e958
address comments
tangenta Oct 15, 2021
4a17d28
remove unnecessary field
tangenta Oct 15, 2021
3097c3a
format file
tangenta Oct 15, 2021
ad7cb70
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Oct 21, 2021
9ddd077
Merge branch 'master' into gbk-invalid
tangenta Nov 1, 2021
dda97b3
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 1, 2021
e8f180a
replace the invalid chars with '?'
tangenta Nov 2, 2021
13cf296
fix the truncation for ascii
tangenta Nov 2, 2021
650b63b
fix integration test TestStringValidatorUTF8
tangenta Nov 2, 2021
7a74b29
fix truncate algo
tangenta Nov 13, 2021
b1911e7
fix go import order
tangenta Nov 13, 2021
35e203f
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 13, 2021
c5ed6a8
fix integration test TestInsertWrongValueForField
tangenta Nov 13, 2021
ddb172a
Merge branch 'master' into gbk-invalid
tangenta Nov 18, 2021
fefc72c
Merge branch 'master' into gbk-invalid
tangenta Nov 18, 2021
bbded58
parser/charset: avoid unnecessary copying for valid ASCII string
tangenta Nov 19, 2021
3b685fc
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 19, 2021
7743db1
fix truncate bugs and add test
tangenta Nov 20, 2021
1b917dd
Merge remote-tracking branch 'upstream/master' into gbk-invalid
tangenta Nov 20, 2021
74f1b49
charset: make format and address comment
tangenta Nov 22, 2021
25bb576
Merge branch 'master' into gbk-invalid
ti-chi-bot Nov 22, 2021
48dcf12
Merge branch 'master' into gbk-invalid
ti-chi-bot Nov 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions cmd/explaintest/r/new_character_set_invalid.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
set @@sql_mode = 'strict_trans_tables';
drop table if exists t;
create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8);
insert into t values ('中文', 'asdf', '字符集');
insert into t values ('À', 'ø', '😂');
Error 1366: Incorrect string value '\xC3\x80' for column 'a'
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
Error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a'
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
Error 1366: Incorrect string value '\xFF\xFF' for column 'a'
select * from t;
a b c
中文 asdf 字符集
set @@sql_mode = '';
insert into t values ('À', 'ø', '😂');
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
select * from t;
a b c
中文 asdf 字符集
? ? ?
中文?中文 asdf?fdsa 字符集?字符集
@@ @@ @@
17 changes: 17 additions & 0 deletions cmd/explaintest/t/new_character_set_invalid.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
set @@sql_mode = 'strict_trans_tables';
drop table if exists t;
create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8);
insert into t values ('中文', 'asdf', '字符集');
-- error 1366: Incorrect string value '\xC3\x80' for column 'a'
insert into t values ('À', 'ø', '😂');
-- error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a'
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
-- error 1366: Incorrect string value '\xFF\xFF' for column 'a'
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
select * from t;

set @@sql_mode = '';
insert into t values ('À', 'ø', '😂');
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
select * from t;
2 changes: 1 addition & 1 deletion executor/insert_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ func (s *testSuite3) TestInsertWrongValueForField(c *C) {
tk.MustExec(`create table t1(a char(10) charset utf8);`)
tk.MustExec(`insert into t1 values('我');`)
tk.MustExec(`alter table t1 add column b char(10) charset ascii as ((a));`)
tk.MustQuery(`select * from t1;`).Check(testkit.Rows(`我 `))
tk.MustQuery(`select * from t1;`).Check(testkit.Rows("我 ?"))

tk.MustExec(`drop table if exists t;`)
tk.MustExec(`create table t (a year);`)
Expand Down
11 changes: 2 additions & 9 deletions expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,12 +327,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression)
func isValidString(str string, dstChs string) bool {
switch dstChs {
case charset.CharsetASCII:
for _, c := range str {
if c >= 0x80 {
return false
}
}
return true
return charset.StringValidatorASCII{}.Validate(str) == -1
case charset.CharsetLatin1:
// For backward compatibility, we do not block SQL like select '啊' = convert('a' using latin1) collate latin1_bin;
return true
Expand All @@ -343,9 +338,7 @@ func isValidString(str string, dstChs string) bool {
// Convert to binary is always safe.
return true
default:
e, _ := charset.Lookup(dstChs)
_, err := e.NewEncoder().String(str)
return err == nil
return charset.StringValidatorOther{Charset: dstChs}.Validate(str) == -1
}
}

Expand Down
186 changes: 181 additions & 5 deletions parser/charset/encoding_table.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ package charset

import (
"strings"
go_unicode "unicode"
"unicode/utf8"

"github.com/cznic/mathutil"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
Expand Down Expand Up @@ -273,11 +276,10 @@ func FindNextCharacterLength(label string) func([]byte) int {

var encodingNextCharacterLength = map[string]func([]byte) int{
// https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram
"gbk": characterLengthGBK,
"utf-8": characterLengthUTF8,
"binary": func(bs []byte) int {
return 1
},
"gbk": characterLengthGBK,
"utf-8": characterLengthUTF8,
"binary": characterLengthOne,
"windows-1252": characterLengthOne,
}

func characterLengthGBK(bs []byte) int {
Expand All @@ -298,3 +300,177 @@ func characterLengthUTF8(bs []byte) int {
}
return 4
}

// characterLengthOne reports a fixed width of one byte for the next character,
// regardless of the bytes it is given.
func characterLengthOne([]byte) int {
	const width = 1
	return width
}

// TruncateStrategy selects how a string that is invalid in a specific charset
// is handled.
type TruncateStrategy int8

// The available truncate strategies.
const (
	// TruncateStrategyEmpty returns an empty string.
	TruncateStrategyEmpty TruncateStrategy = 0
	// TruncateStrategyTrim returns the valid prefix part of the string.
	TruncateStrategyTrim TruncateStrategy = 1
	// TruncateStrategyReplace returns the whole string, but the invalid
	// characters are replaced with '?'.
	TruncateStrategyReplace TruncateStrategy = 2
)

// Compile-time checks that every validator type satisfies StringValidator.
var (
	_ StringValidator = StringValidatorASCII{}
	_ StringValidator = StringValidatorUTF8{}
	_ StringValidator = StringValidatorOther{}
)

// StringValidator is used to check if a string is valid in the specific charset.
type StringValidator interface {
// Validate reports where str stops being valid in the charset.
// invalidPos is -1 when the whole string is valid; otherwise it is a byte
// offset into str (the package tests expect the offset of the first invalid
// character).
Validate(str string) (invalidPos int)
// Truncate checks str like Validate and additionally returns a sanitized
// string chosen by strategy (see TruncateStrategy). When str is entirely
// valid, the implementations in this file return invalidPos == -1 and a
// result equal to str.
Truncate(str string, strategy TruncateStrategy) (result string, invalidPos int)
}

// StringValidatorASCII checks whether a string is a valid ASCII string.
type StringValidatorASCII struct{}

// Validate returns the byte offset of the first non-ASCII byte in str, or -1
// when every byte of str is ASCII.
func (s StringValidatorASCII) Validate(str string) int {
	_, invalidPos := s.Truncate(str, TruncateStrategyEmpty)
	return invalidPos
}

// Truncate implements the interface StringValidator.
//
// It returns str unchanged together with -1 when str contains only ASCII
// bytes. Otherwise the second result is the byte offset of the first
// non-ASCII byte and the first result depends on strategy:
//   - TruncateStrategyEmpty: the empty string.
//   - TruncateStrategyTrim: the ASCII prefix str[:invalidPos].
//   - TruncateStrategyReplace: str with each non-ASCII character replaced by a
//     single '?'. The width of a non-ASCII character is taken from
//     characterLengthUTF8, clamped to the bytes that remain.
//
// An unrecognized strategy returns str and -1.
func (s StringValidatorASCII) Truncate(str string, strategy TruncateStrategy) (string, int) {
	invalidPos := -1
	for i := 0; i < len(str); i++ {
		if str[i] > go_unicode.MaxASCII {
			invalidPos = i
			// Stop at the first offending byte. Without this the loop keeps
			// overwriting invalidPos and reports the last non-ASCII byte,
			// which also makes the trimmed prefix contain invalid bytes.
			break
		}
	}
	if invalidPos == -1 {
		// Quick check passed.
		return str, -1
	}
	switch strategy {
	case TruncateStrategyEmpty:
		return "", invalidPos
	case TruncateStrategyTrim:
		return str[:invalidPos], invalidPos
	case TruncateStrategyReplace:
		result := make([]byte, 0, len(str))
		// Everything before invalidPos is known to be ASCII: copy it in bulk
		// and resume the per-character scan from the first invalid byte.
		result = append(result, str[:invalidPos]...)
		for i, w := invalidPos, 0; i < len(str); i += w {
			w = 1
			if str[i] > go_unicode.MaxASCII {
				// Skip the whole multi-byte character so that it yields a
				// single '?', never running past the end of the string.
				w = characterLengthUTF8(Slice(str)[i:])
				w = mathutil.Min(w, len(str)-i)
				result = append(result, '?')
				continue
			}
			result = append(result, str[i])
		}
		return string(result), invalidPos
	}
	return str, -1
}

// StringValidatorUTF8 checks whether a string is a valid UTF8 string.
type StringValidatorUTF8 struct {
	// IsUTF8MB4 distinguishes between "utf8" (false) and "utf8mb4" (true).
	IsUTF8MB4 bool
	// CheckMB4ValueInUTF8, when set while IsUTF8MB4 is false, makes
	// characters encoded in more than 3 bytes count as invalid.
	CheckMB4ValueInUTF8 bool
}

// Validate returns the byte offset of the first invalid character in str, or
// -1 when str is valid for the configured charset.
func (s StringValidatorUTF8) Validate(str string) int {
	_, pos := s.Truncate(str, TruncateStrategyEmpty)
	return pos
}

// Truncate implements the interface StringValidator.
//
// A character is invalid when it is a malformed UTF-8 sequence, or when it is
// wider than 3 bytes and 4-byte characters are rejected (CheckMB4ValueInUTF8
// set on a non-utf8mb4 validator). A well-formed U+FFFD in the input is valid.
// The second result is the offset of the first invalid character, or -1.
func (s StringValidatorUTF8) Truncate(str string, strategy TruncateStrategy) (string, int) {
	if s.IsUTF8MB4 && utf8.ValidString(str) {
		// Quick check passed.
		return str, -1
	}
	rejectMB4 := s.CheckMB4ValueInUTF8 && !s.IsUTF8MB4
	replacing := strategy == TruncateStrategyReplace
	var out []byte
	if replacing {
		out = make([]byte, 0, len(str))
	}
	firstBad := -1
	for pos := 0; pos < len(str); {
		rest := str[pos:]
		r, size := utf8.DecodeRuneInString(rest)
		start := pos
		pos += size
		// DecodeRuneInString yields RuneError both for broken input and for
		// a real U+FFFD; only the former is malformed.
		malformed := r == utf8.RuneError && !strings.HasPrefix(rest, string(utf8.RuneError))
		if !malformed && !(rejectMB4 && size > 3) {
			if replacing {
				out = append(out, rest[:size]...)
			}
			continue
		}
		if firstBad == -1 {
			firstBad = start
		}
		switch strategy {
		case TruncateStrategyEmpty:
			return "", firstBad
		case TruncateStrategyTrim:
			return str[:start], firstBad
		case TruncateStrategyReplace:
			out = append(out, '?')
		}
	}
	if replacing {
		return string(out), firstBad
	}
	return str, -1
}

// StringValidatorOther checks whether a string is a valid string in the given charset.
type StringValidatorOther struct {
	// Charset is the charset name passed to NewEncoding.
	Charset string
}

// Validate returns the byte offset of the first character of str that cannot
// be encoded in the charset, or -1 when the whole string is valid.
func (s StringValidatorOther) Validate(str string) int {
	_, invalidPos := s.Truncate(str, TruncateStrategyEmpty)
	return invalidPos
}

// Truncate implements the interface StringValidator.
//
// str is walked one character at a time (width from characterLengthUTF8) and
// each character is run through the charset's encoder; a character whose
// transformation reports an error is invalid. The second result is the offset
// of the first invalid character, or -1. The first result depends on strategy:
// empty string, valid prefix, or str with invalid characters replaced by '?'.
// When the encoding for the charset is not enabled, str is returned unchanged
// with -1.
func (s StringValidatorOther) Truncate(str string, strategy TruncateStrategy) (string, int) {
	enc := NewEncoding(s.Charset)
	if !enc.enabled() {
		return str, -1
	}
	var result []byte
	if strategy == TruncateStrategyReplace {
		result = make([]byte, 0, len(str))
	}
	// Scratch destination for a single encoded character; its contents are
	// never read, only the transformation error matters.
	var buf [4]byte
	strBytes := Slice(str)
	transformer := enc.enc.NewEncoder()
	invalidPos := -1
	for i, w := 0, 0; i < len(str); i += w {
		w = characterLengthUTF8(strBytes[i:])
		// Plain assignment, not ":=": the clamped width must also be the
		// one the loop's "i += w" uses, rather than a shadowed copy.
		w = mathutil.Min(w, len(str)-i)
		_, _, err := transformer.Transform(buf[:], strBytes[i:i+w], true)
		if err != nil {
			if invalidPos == -1 {
				invalidPos = i
			}
			switch strategy {
			case TruncateStrategyEmpty:
				return "", invalidPos
			case TruncateStrategyTrim:
				return str[:i], invalidPos
			case TruncateStrategyReplace:
				result = append(result, '?')
				continue
			}
		}
		if strategy == TruncateStrategyReplace {
			result = append(result, strBytes[i:i+w]...)
		}
	}
	if strategy == TruncateStrategyReplace {
		return string(result), invalidPos
	}
	return str, -1
}
38 changes: 38 additions & 0 deletions parser/charset/encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,41 @@ func TestEncoding(t *testing.T) {
require.Equal(t, tc.result, string(result), cmt)
}
}

// TestStringValidatorASCII checks the invalid position reported for pure
// ASCII input and for input containing multi-byte characters.
func TestStringValidatorASCII(t *testing.T) {
	validator := charset.StringValidatorASCII{}
	cases := []struct {
		input string
		want  int
	}{
		{"qwerty", -1},
		{"qwÊrty", 2},
		{"中文", 0},
	}
	for _, tc := range cases {
		require.Equal(t, tc.want, validator.Validate(tc.input))
	}
}

// TestStringValidatorUTF8 checks Validate for "utf8mb4", for "utf8" without
// the 4-byte check, and for "utf8" with the 4-byte check enabled.
func TestStringValidatorUTF8(t *testing.T) {
	invalid := string([]byte{0xff, 0xfe, 0xfd})
	var (
		// Charset "utf8mb4".
		mb4 = charset.StringValidatorUTF8{IsUTF8MB4: true}
		// Charset "utf8" without checking mb4 value.
		lenient = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: false}
		// Charset "utf8" with checking mb4 value.
		strict = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: true}
	)
	cases := []struct {
		validator charset.StringValidatorUTF8
		input     string
		want      int
	}{
		{mb4, "qwerty", -1},
		{mb4, "qwÊrty", -1},
		{mb4, "qwÊ合法字符串", -1},
		{mb4, "😂", -1},
		{mb4, invalid, 0},
		{lenient, "qwerty", -1},
		{lenient, "qwÊrty", -1},
		{lenient, "qwÊ合法字符串", -1},
		{lenient, "😂", -1},
		{lenient, invalid, 0},
		{strict, "😂", 0}, // 4-bytes character is invalid.
		{strict, invalid, 0},
	}
	for _, tc := range cases {
		require.Equal(t, tc.want, tc.validator.Validate(tc.input))
	}
}

// TestStringValidatorGBK checks the invalid position reported by the generic
// validator when it is configured for the "gbk" charset.
func TestStringValidatorGBK(t *testing.T) {
	validator := charset.StringValidatorOther{Charset: "gbk"}
	cases := []struct {
		input string
		want  int
	}{
		{"asdf", -1},
		{"中文", -1},
		{"À", 0},
		{"asdfÀ", 4},
		{"中文À", 6},
	}
	for _, tc := range cases {
		require.Equal(t, tc.want, validator.Validate(tc.input))
	}
}
Loading