Skip to content

Commit

Permalink
table, parser: check for invalid GBK characters before insertion (#28814)
Browse files Browse the repository at this point in the history
  • Loading branch information
tangenta authored Nov 22, 2021
1 parent 723242a commit c835349
Show file tree
Hide file tree
Showing 7 changed files with 374 additions and 67 deletions.
23 changes: 23 additions & 0 deletions cmd/explaintest/r/new_character_set_invalid.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
set @@sql_mode = 'strict_trans_tables';
drop table if exists t;
create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8);
insert into t values ('中文', 'asdf', '字符集');
insert into t values ('À', 'ø', '😂');
Error 1366: Incorrect string value '\xC3\x80' for column 'a'
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
Error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a'
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
Error 1366: Incorrect string value '\xFF\xFF' for column 'a'
select * from t;
a b c
中文 asdf 字符集
set @@sql_mode = '';
insert into t values ('À', 'ø', '😂');
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
select * from t;
a b c
中文 asdf 字符集
? ? ?
中文?中文 asdf?fdsa 字符集?字符集
@@ @@ @@
17 changes: 17 additions & 0 deletions cmd/explaintest/t/new_character_set_invalid.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
set @@sql_mode = 'strict_trans_tables';
drop table if exists t;
create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8);
insert into t values ('中文', 'asdf', '字符集');
-- error 1366: Incorrect string value '\xC3\x80' for column 'a'
insert into t values ('À', 'ø', '😂');
-- error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a'
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
-- error 1366: Incorrect string value '\xFF\xFF' for column 'a'
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
select * from t;

set @@sql_mode = '';
insert into t values ('À', 'ø', '😂');
insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集');
insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff);
select * from t;
2 changes: 1 addition & 1 deletion executor/insert_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ func (s *testSuite3) TestInsertWrongValueForField(c *C) {
tk.MustExec(`create table t1(a char(10) charset utf8);`)
tk.MustExec(`insert into t1 values('我');`)
tk.MustExec(`alter table t1 add column b char(10) charset ascii as ((a));`)
tk.MustQuery(`select * from t1;`).Check(testkit.Rows(`我 `))
tk.MustQuery(`select * from t1;`).Check(testkit.Rows("我 ?"))

tk.MustExec(`drop table if exists t;`)
tk.MustExec(`create table t (a year);`)
Expand Down
11 changes: 2 additions & 9 deletions expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,12 +327,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression)
func isValidString(str string, dstChs string) bool {
switch dstChs {
case charset.CharsetASCII:
for _, c := range str {
if c >= 0x80 {
return false
}
}
return true
return charset.StringValidatorASCII{}.Validate(str) == -1
case charset.CharsetLatin1:
// For backward compatibility, we do not block SQL like select '啊' = convert('a' using latin1) collate latin1_bin;
return true
Expand All @@ -343,9 +338,7 @@ func isValidString(str string, dstChs string) bool {
// Convert to binary is always safe.
return true
default:
e, _ := charset.Lookup(dstChs)
_, err := e.NewEncoder().String(str)
return err == nil
return charset.StringValidatorOther{Charset: dstChs}.Validate(str) == -1
}
}

Expand Down
192 changes: 187 additions & 5 deletions parser/charset/encoding_table.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ package charset

import (
"strings"
go_unicode "unicode"
"unicode/utf8"

"github.com/cznic/mathutil"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
Expand Down Expand Up @@ -273,11 +276,10 @@ func FindNextCharacterLength(label string) func([]byte) int {

var encodingNextCharacterLength = map[string]func([]byte) int{
// https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram
"gbk": characterLengthGBK,
"utf-8": characterLengthUTF8,
"binary": func(bs []byte) int {
return 1
},
"gbk": characterLengthGBK,
"utf-8": characterLengthUTF8,
"binary": characterLengthOne,
"windows-1252": characterLengthOne,
}

func characterLengthGBK(bs []byte) int {
Expand All @@ -298,3 +300,183 @@ func characterLengthUTF8(bs []byte) int {
}
return 4
}

// characterLengthOne is a next-character-length function that ignores its
// input and always reports a length of one byte. It is the entry registered
// for the "binary" and "windows-1252" labels.
func characterLengthOne([]byte) int {
	const oneByte = 1
	return oneByte
}

// TruncateStrategy tells a StringValidator's Truncate method what to return
// when the input contains characters that are invalid in the target charset.
type TruncateStrategy int8

// Supported truncation strategies.
const (
	// TruncateStrategyEmpty returns an empty string.
	TruncateStrategyEmpty TruncateStrategy = iota
	// TruncateStrategyTrim returns the valid prefix part of the string.
	TruncateStrategyTrim
	// TruncateStrategyReplace returns the whole string, but with the invalid
	// characters replaced with '?'.
	TruncateStrategyReplace
)

// Compile-time checks that every validator type satisfies StringValidator.
var (
	_ StringValidator = StringValidatorASCII{}
	_ StringValidator = StringValidatorUTF8{}
	_ StringValidator = StringValidatorOther{}
)

// StringValidator is used to check if a string is valid in the specific charset.
type StringValidator interface {
// Validate returns the byte offset of the first character of str that is
// invalid in the charset, or -1 when no invalid character is found.
Validate(str string) (invalidPos int)
// Truncate returns str adjusted according to strategy together with the byte
// offset of the first invalid character. When str contains no invalid
// character, the implementations in this file return str unchanged and -1.
Truncate(str string, strategy TruncateStrategy) (result string, invalidPos int)
}

// StringValidatorASCII is the StringValidator for the ASCII charset.
type StringValidatorASCII struct{}

// Validate returns the position reported by Truncate for str: the byte offset
// of the first invalid byte, or -1 when the string is valid ASCII.
func (s StringValidatorASCII) Validate(str string) int {
	// Only the position matters here, so ask for the Empty strategy and
	// discard the returned string.
	_, pos := s.Truncate(str, TruncateStrategyEmpty)
	return pos
}

// Truncate implements the StringValidator interface for ASCII.
//
// If every byte of str is ASCII, it returns str and -1. Otherwise the second
// result is the byte offset of the first non-ASCII byte and the first result
// depends on strategy:
//   - TruncateStrategyEmpty: the empty string.
//   - TruncateStrategyTrim: the prefix of str before the first non-ASCII byte.
//   - TruncateStrategyReplace: str with every non-ASCII character replaced by '?'.
//
// For any other strategy value str is returned unchanged together with -1.
func (s StringValidatorASCII) Truncate(str string, strategy TruncateStrategy) (string, int) {
	// Scan for the first byte outside the ASCII range.
	firstBad := 0
	for firstBad < len(str) && str[firstBad] <= go_unicode.MaxASCII {
		firstBad++
	}
	if firstBad == len(str) {
		// Quick check passed: nothing to truncate.
		return str, -1
	}

	if strategy == TruncateStrategyEmpty {
		return "", firstBad
	}
	if strategy == TruncateStrategyTrim {
		return str[:firstBad], firstBad
	}
	if strategy != TruncateStrategyReplace {
		// Unrecognized strategy: hand the input back untouched.
		return str, -1
	}

	replaced := make([]byte, 0, len(str))
	for pos := 0; pos < len(str); {
		if str[pos] <= go_unicode.MaxASCII {
			replaced = append(replaced, str[pos])
			pos++
			continue
		}
		// A non-ASCII byte is treated as the lead byte of a UTF-8 sequence.
		// The whole sequence (clamped to the end of the string) becomes a
		// single '?'.
		width := mathutil.Min(characterLengthUTF8(Slice(str)[pos:]), len(str)-pos)
		replaced = append(replaced, '?')
		pos += width
	}
	return string(replaced), firstBad
}

// StringValidatorUTF8 is the StringValidator for the "utf8" and "utf8mb4"
// charsets.
type StringValidatorUTF8 struct {
	// IsUTF8MB4 distinguishes between "utf8" (false) and "utf8mb4" (true).
	IsUTF8MB4 bool
	// CheckMB4ValueInUTF8, when set while IsUTF8MB4 is false, makes Truncate
	// treat characters encoded in more than three bytes as invalid.
	CheckMB4ValueInUTF8 bool
}

// Validate returns the position reported by Truncate for str: the byte offset
// of the first invalid character, or -1 when the string is valid.
func (s StringValidatorUTF8) Validate(str string) int {
	// Only the position matters here, so ask for the Empty strategy and
	// discard the returned string.
	_, pos := s.Truncate(str, TruncateStrategyEmpty)
	return pos
}

// Truncate implements the StringValidator interface for UTF-8.
//
// A character is invalid when it cannot be decoded as UTF-8, or when it takes
// more than three bytes while the validator is for "utf8" (IsUTF8MB4 false)
// with CheckMB4ValueInUTF8 set. If no character is invalid, str and -1 are
// returned. Otherwise the second result is the byte offset of the first
// invalid character and the first result depends on strategy:
//   - TruncateStrategyEmpty: the empty string.
//   - TruncateStrategyTrim: the prefix of str before the first invalid character.
//   - TruncateStrategyReplace: str with every invalid character replaced by '?'.
//
// For any other strategy value str is returned unchanged together with -1.
func (s StringValidatorUTF8) Truncate(str string, strategy TruncateStrategy) (string, int) {
	if len(str) == 0 {
		return str, -1
	}
	if s.IsUTF8MB4 && utf8.ValidString(str) {
		// Quick check passed: utf8mb4 accepts every well-formed string.
		return str, -1
	}

	rejectMB4 := s.CheckMB4ValueInUTF8 && !s.IsUTF8MB4
	replacing := strategy == TruncateStrategyReplace
	var out []byte
	if replacing {
		out = make([]byte, 0, len(str))
	}

	firstBad := -1
	for off := 0; off < len(str); {
		r, size := utf8.DecodeRuneInString(str[off:])
		// (RuneError, 1) is how the decoder signals an undecodable byte.
		malformed := r == utf8.RuneError && size == 1
		tooWide := rejectMB4 && size > 3
		if !malformed && !tooWide {
			if replacing {
				out = append(out, str[off:off+size]...)
			}
			off += size
			continue
		}

		if firstBad < 0 {
			firstBad = off
		}
		switch strategy {
		case TruncateStrategyEmpty:
			return "", firstBad
		case TruncateStrategyTrim:
			return str[:off], firstBad
		}
		if replacing {
			out = append(out, '?')
		}
		off += size
	}

	if replacing {
		return string(out), firstBad
	}
	return str, -1
}

// StringValidatorOther is the StringValidator for a charset selected by name.
type StringValidatorOther struct {
	// Charset is the charset name handed to NewEncoding by Truncate.
	Charset string
}

// Validate returns the position reported by Truncate for str: the byte offset
// of the first invalid character, or -1 when the string is valid.
func (s StringValidatorOther) Validate(str string) int {
	// Only the position matters here, so ask for the Empty strategy and
	// discard the returned string.
	_, pos := s.Truncate(str, TruncateStrategyEmpty)
	return pos
}

// Truncate implements the StringValidator interface for the charset named by
// s.Charset.
//
// str is walked one UTF-8 character at a time. A character is invalid when
// the charset's encoder returns an error for it. The empty string, and any
// string when the encoding reports itself as not enabled, is returned
// unchanged with -1. If no character is invalid, str and -1 are returned.
// Otherwise the second result is the byte offset of the first invalid
// character and the first result depends on strategy:
//   - TruncateStrategyEmpty: the empty string.
//   - TruncateStrategyTrim: the prefix of str before the first invalid character.
//   - TruncateStrategyReplace: str with every invalid character replaced by '?'.
//
// For any other strategy value str is returned unchanged together with -1.
func (s StringValidatorOther) Truncate(str string, strategy TruncateStrategy) (string, int) {
	if len(str) == 0 {
		return str, -1
	}
	target := NewEncoding(s.Charset)
	if !target.enabled() {
		return str, -1
	}

	replacing := strategy == TruncateStrategyReplace
	var out []byte
	if replacing {
		out = make([]byte, 0, len(str))
	}

	// scratch only gives the encoder somewhere to write; the encoded bytes
	// are discarded and just the error is inspected.
	var scratch [4]byte
	raw := Slice(str)
	encoder := target.enc.NewEncoder()
	firstBad := -1
	for off := 0; off < len(str); {
		// Clamp the character width so a truncated trailing sequence does
		// not run past the end of the input.
		width := mathutil.Min(characterLengthUTF8(raw[off:]), len(str)-off)
		chunk := raw[off : off+width]
		if _, _, err := encoder.Transform(scratch[:], chunk, true); err == nil {
			if replacing {
				// Valid characters are kept in their original bytes.
				out = append(out, chunk...)
			}
			off += width
			continue
		}

		if firstBad < 0 {
			firstBad = off
		}
		switch strategy {
		case TruncateStrategyEmpty:
			return "", firstBad
		case TruncateStrategyTrim:
			return str[:off], firstBad
		}
		if replacing {
			out = append(out, '?')
		}
		off += width
	}

	if replacing {
		return string(out), firstBad
	}
	return str, -1
}
Loading

0 comments on commit c835349

Please sign in to comment.