From 8783c7d359c2d49ecb8854e8453d4da6324dc73f Mon Sep 17 00:00:00 2001 From: tangenta Date: Thu, 14 Oct 2021 14:44:27 +0800 Subject: [PATCH 01/15] table, parser: check for invalid GBK characters before insertion --- .gitignore | 2 + .../r/new_character_set_invalid.result | 19 +++ .../t/new_character_set_invalid.test | 14 +++ parser/charset/encoding.go | 29 +++++ parser/charset/encoding_table.go | 118 +++++++++++++++--- parser/charset/encoding_test.go | 43 +++++++ table/column.go | 71 ++++------- 7 files changed, 231 insertions(+), 65 deletions(-) create mode 100644 cmd/explaintest/r/new_character_set_invalid.result create mode 100644 cmd/explaintest/t/new_character_set_invalid.test diff --git a/.gitignore b/.gitignore index ef4a4e7052c04..594f123a5d51a 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ cmd/explaintest/explain-test.out cmd/explaintest/explaintest_tidb-server cmd/explaintest/portgenerator cmd/explaintest/s/ +cmd/explaintest/importer +cmd/pluginpkg/pluginpkg *.fail.go tools/bin/ vendor diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result new file mode 100644 index 0000000000000..cce03cf556084 --- /dev/null +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -0,0 +1,19 @@ +set @@sql_mode = 'strict_trans_tables'; +drop table if exists t; +create table t (a varchar(255) charset gbk); +insert into t values ('中文'); +insert into t values ('À'); +Error 1366: Incorrect string value '\xC3\x80' for column 'a' +insert into t values ('中文À中文'); +Error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' +select a from t; +a +中文 +set @@sql_mode = ''; +insert into t values ('À'); +insert into t values ('中文À中文'); +select a from t; +a +中文 + +中文 diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test new file mode 100644 index 0000000000000..d0d9429c99706 --- /dev/null +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -0,0 +1,14 @@ +set @@sql_mode = 'strict_trans_tables'; +drop table if exists t; +create table t (a varchar(255) charset gbk); +insert into t values ('中文'); +-- error 1366: Incorrect string value '\xC3\x80' for column 'a' +insert into t values ('À'); +-- error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' +insert into t values ('中文À中文'); +select a from t; + +set @@sql_mode = ''; +insert into t values ('À'); +insert into t values ('中文À中文'); +select a from t; diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index fe99fb10691ee..a5a4e2ec9f22b 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -97,6 +97,35 @@ func (e *Encoding) Decode(dest, src []byte) ([]byte, error) { return e.transform(e.enc.NewDecoder(), dest, src, true) } +// IsValid checks whether src(utf8) bytes can be encode into a string with given charset. +// Return -1 if it decodes successfully. +func (e *Encoding) IsValid(src []byte) (invalidPos int) { + dec := e.enc.NewEncoder() + dest := [4]byte{} + var srcOffset int + for srcOffset < len(src) { + srcNextLen := characterLengthUTF8(src[srcOffset:]) + srcEnd := mathutil.Min(srcOffset+srcNextLen, len(src)) + _, nSrc, err := dec.Transform(dest[:], src[srcOffset:srcEnd], false) + if err != nil { + return srcOffset + } + srcOffset += nSrc + } + return -1 +} + +func nextLengthUTF8(bs []byte) int { + if len(bs) == 0 || bs[0] < 0x80 { + return 1 + } else if bs[0] < 0xe0 { + return 2 + } else if bs[0] < 0xf0 { + return 3 + } + return 4 +} + func (e *Encoding) transform(transformer transform.Transformer, dest, src []byte, isDecoding bool) ([]byte, error) { if len(dest) < len(src) { dest = make([]byte, len(src)*2) diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index e3cb4c9bca721..2df8e090f564a 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -15,6 +15,8 @@ package charset import ( "strings" + go_unicode "unicode" + "unicode/utf8" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" @@ -273,24 +275,106 @@ func FindNextCharacterLength(label string) func([]byte) int { var encodingNextCharacterLength = map[string]func([]byte) int{ // https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram - "gbk": func(bs []byte) int { - if len(bs) == 0 || bs[0] < 0x80 { - // A byte in the range 00–7F is a single byte that means the same thing as it does in ASCII. - return 1 - } - return 2 - }, - "utf-8": func(bs []byte) int { - if len(bs) == 0 || bs[0] < 0x80 { - return 1 - } else if bs[0] < 0xe0 { - return 2 - } else if bs[0] < 0xf0 { - return 3 - } - return 4 - }, + "gbk": characterLengthGBK, + "utf-8": characterLengthUTF8, "binary": func(bs []byte) int { return 1 }, } + +func characterLengthGBK(bs []byte) int { + if len(bs) == 0 || bs[0] < 0x80 { + // A byte in the range 00–7F is a single byte that means the same thing as it does in ASCII. + return 1 + } + return 2 +} + +func characterLengthUTF8(bs []byte) int { + if len(bs) == 0 || bs[0] < 0x80 { + return 1 + } else if bs[0] < 0xe0 { + return 2 + } else if bs[0] < 0xf0 { + return 3 + } + return 4 +} + +var _ StringValidator = StringValidatorASCII{} +var _ StringValidator = StringValidatorUTF8{} +var _ StringValidator = StringValidatorOther{} + +// StringValidator is used to check if a string is valid in the specific charset. +type StringValidator interface { + Validate(str string) (invalidPos int) +} + +// StringValidatorASCII checks whether a string is valid ASCII string. +type StringValidatorASCII struct { + Enabled bool +} + +// Validate checks whether the string is valid in the given charset. +// It returns the first invalid byte offset. +func (s StringValidatorASCII) Validate(str string) (invalidPos int) { + if !s.Enabled { + return -1 + } + for i := 0; i < len(str); i++ { + if str[i] > go_unicode.MaxASCII { + return i + } + } + return -1 +} + +// StringValidatorUTF8 checks whether a string is valid UTF8 string. +type StringValidatorUTF8 struct { + Enabled bool + IsUTF8MB4 bool // Distinguish between "utf8" and "utf8mb4" + CheckMB4ValueInUTF8 bool +} + +// Validate checks whether the string is valid in the given charset. +// It returns the first invalid byte offset. +func (s StringValidatorUTF8) Validate(str string) (invalidPos int) { + if !s.Enabled { + return -1 + } + if s.IsUTF8MB4 && utf8.ValidString(str) { + // Quick check passed. + return -1 + } + doMB4CharCheck := !s.IsUTF8MB4 && s.CheckMB4ValueInUTF8 + for i, w := 0, 0; i < len(str); i += w { + runeValue, width := utf8.DecodeRuneInString(str[i:]) + if runeValue == utf8.RuneError { + if strings.HasPrefix(str[i:], string(utf8.RuneError)) { + w = width + continue + } + return i + } else if width > 3 && doMB4CharCheck { + // Meet non-BMP characters. + return i + } + w = width + } + return -1 +} + +// StringValidatorOther checks whether a string is valid string in given charset. +type StringValidatorOther struct { + Charset string +} + +// Validate checks whether the string is valid in the given charset. +// It returns the first invalid byte offset. +func (s StringValidatorOther) Validate(str string) (invalidPos int) { + enc := NewEncoding(s.Charset) + if !enc.Enabled() { + return -1 + } + return enc.IsValid([]byte(str)) +} diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index 49b65f0dfcc58..0f1ee1bd4f325 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -74,3 +74,46 @@ func (s *testEncodingSuite) TestEncoding(c *C) { c.Assert(string(result), Equals, tc.result, Commentf("%v", tc)) } } + +func (s *testEncodingSuite) TestValidatorASCII(c *C) { + v := charset.StringValidatorASCII{Enabled: false} + c.Assert(v.Validate("qwerty"), Equals, -1) + c.Assert(v.Validate("qwÊrty"), Equals, -1) + v.Enabled = true + c.Assert(v.Validate("qwerty"), Equals, -1) + c.Assert(v.Validate("qwÊrty"), Equals, 2) +} + +func (s *testEncodingSuite) TestValidatorUTF8(c *C) { + v := charset.StringValidatorUTF8{Enabled: false} + c.Assert(v.Validate("qwerty"), Equals, -1) + // Test charset "utf8mb4". + v = charset.StringValidatorUTF8{Enabled: true, IsUTF8MB4: true} + c.Assert(v.Validate("qwerty"), Equals, -1) + c.Assert(v.Validate("qwÊrty"), Equals, -1) + c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) + c.Assert(v.Validate("😂"), Equals, -1) + invalid := string([]byte{0xff, 0xfe, 0xfd}) + c.Assert(v.Validate(invalid), Equals, 0) + // Test charset "utf8" without checking mb4 value. + v = charset.StringValidatorUTF8{Enabled: true, IsUTF8MB4: false, CheckMB4ValueInUTF8: false} + c.Assert(v.Validate("qwerty"), Equals, -1) + c.Assert(v.Validate("qwÊrty"), Equals, -1) + c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) + c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) + c.Assert(v.Validate("😂"), Equals, -1) + c.Assert(v.Validate(invalid), Equals, 0) + // Test charset "utf8" with checking mb4 value. + v = charset.StringValidatorUTF8{Enabled: true, IsUTF8MB4: false, CheckMB4ValueInUTF8: true} + c.Assert(v.Validate("😂"), Equals, 0) // 4-bytes character is invalid. + c.Assert(v.Validate(invalid), Equals, 0) +} + +func (s *testEncodingSuite) TestValidatorGBK(c *C) { + v := charset.StringValidatorOther{Charset: "gbk"} + c.Assert(v.Validate("asdf"), Equals, -1) + c.Assert(v.Validate("中文"), Equals, -1) + c.Assert(v.Validate("À"), Equals, 0) + c.Assert(v.Validate("asdfÀ"), Equals, 4) + c.Assert(v.Validate("中文À"), Equals, 6) +} diff --git a/table/column.go b/table/column.go index 142d83883ae8e..afd0548102642 100644 --- a/table/column.go +++ b/table/column.go @@ -24,7 +24,6 @@ import ( "strings" "time" "unicode" - "unicode/utf8" "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/expression" @@ -171,7 +170,7 @@ func truncateTrailingSpaces(v *types.Datum) { v.SetString(str, v.Collation()) } -func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, casted *types.Datum, str string, i int) (types.Datum, error) { +func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, str string, i int) (types.Datum, error) { sc := ctx.GetSessionVars().StmtCtx var strval strings.Builder @@ -319,61 +318,37 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r truncateTrailingSpaces(&casted) } - if col.Charset == charset.CharsetASCII { - if ctx.GetSessionVars().SkipASCIICheck { - return casted, nil - } - + if v := makeStringValidator(ctx, col); v != nil { str := casted.GetString() - for i := 0; i < len(str); i++ { - if str[i] > unicode.MaxASCII { - casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i) - break - } - } - if forceIgnoreTruncate { - err = nil - } - return casted, err - } - - if ctx.GetSessionVars().SkipUTF8Check { - return casted, nil - } - - if !mysql.IsUTF8Charset(col.Charset) { - return casted, nil - } - str := casted.GetString() - utf8Charset := col.Charset == mysql.UTF8Charset - doMB4CharCheck := utf8Charset && config.GetGlobalConfig().CheckMb4ValueInUTF8 - fastCheck := (col.Charset == mysql.UTF8MB4Charset) && utf8.ValidString(str) - if !fastCheck { - // The following check is slow, if we fast check success, we can avoid this. - for i, w := 0, 0; i < len(str); i += w { - runeValue, width := utf8.DecodeRuneInString(str[i:]) - if runeValue == utf8.RuneError { - if strings.HasPrefix(str[i:], string(utf8.RuneError)) { - w = width - continue - } - casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i) - break - } else if width > 3 && doMB4CharCheck { - // Handle non-BMP characters. - casted, err = handleWrongCharsetValue(ctx, col, &casted, str, i) - break - } - w = width + if invalidPos := v.Validate(str); invalidPos >= 0 { + casted, err = handleWrongCharsetValue(ctx, col, str, invalidPos) } } - if forceIgnoreTruncate { err = nil } return casted, err } +func makeStringValidator(ctx sessionctx.Context, col *model.ColumnInfo) charset.StringValidator { + var validator charset.StringValidator + switch col.Charset { + case charset.CharsetASCII: + enabled := !ctx.GetSessionVars().SkipASCIICheck + validator = charset.StringValidatorASCII{Enabled: enabled} + case charset.CharsetUTF8: + enabled := !ctx.GetSessionVars().SkipUTF8Check + needCheckMB4 := config.GetGlobalConfig().CheckMb4ValueInUTF8 + validator = charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: false, CheckMB4ValueInUTF8: needCheckMB4} + case charset.CharsetUTF8MB4: + enabled := !ctx.GetSessionVars().SkipUTF8Check + validator = charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: true} + case charset.CharsetGBK: + validator = charset.StringValidatorOther{Charset: "gbk"} + } + return validator +} + // ColDesc describes column information like MySQL desc and show columns do. type ColDesc struct { Field string From 76da0c5ce359f90f31d7bef82772253622683ac0 Mon Sep 17 00:00:00 2001 From: tangenta Date: Thu, 14 Oct 2021 15:09:29 +0800 Subject: [PATCH 02/15] fix encode char length calculation --- parser/charset/encoding.go | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index a5a4e2ec9f22b..5c60c7f627f97 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -115,17 +115,6 @@ func (e *Encoding) IsValid(src []byte) (invalidPos int) { return -1 } -func nextLengthUTF8(bs []byte) int { - if len(bs) == 0 || bs[0] < 0x80 { - return 1 - } else if bs[0] < 0xe0 { - return 2 - } else if bs[0] < 0xf0 { - return 3 - } - return 4 -} - func (e *Encoding) transform(transformer transform.Transformer, dest, src []byte, isDecoding bool) ([]byte, error) { if len(dest) < len(src) { dest = make([]byte, len(src)*2) @@ -155,10 +144,14 @@ func (e *Encoding) transform(transformer transform.Transformer, dest, src []byte } func (e *Encoding) nextCharLenInSrc(srcRest []byte, isDecoding bool) int { - if isDecoding && e.charLength != nil { - return e.charLength(srcRest) + if isDecoding { + if e.charLength != nil { + return e.charLength(srcRest) + } + return len(srcRest) + } else { + return characterLengthUTF8(srcRest) } - return len(srcRest) } func enlargeCapacity(dest []byte) []byte { From 062beea8fcfe4ab2e18fa9e82c0c924c3267102a Mon Sep 17 00:00:00 2001 From: tangenta Date: Thu, 14 Oct 2021 16:03:44 +0800 Subject: [PATCH 03/15] format parser/charset/encoding_table.go --- parser/charset/encoding_table.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index 2df8e090f564a..f824259f08bea 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -275,7 +275,7 @@ func FindNextCharacterLength(label string) func([]byte) int { var encodingNextCharacterLength = map[string]func([]byte) int{ // https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram - "gbk": characterLengthGBK, + "gbk": characterLengthGBK, "utf-8": characterLengthUTF8, "binary": func(bs []byte) int { return 1 @@ -331,8 +331,8 @@ func (s StringValidatorASCII) Validate(str string) (invalidPos int) { // StringValidatorUTF8 checks whether a string is valid UTF8 string. type StringValidatorUTF8 struct { - Enabled bool - IsUTF8MB4 bool // Distinguish between "utf8" and "utf8mb4" + Enabled bool + IsUTF8MB4 bool // Distinguish between "utf8" and "utf8mb4" CheckMB4ValueInUTF8 bool } From 8b2e95836f6d995643d4ddbfa930c233db9094c1 Mon Sep 17 00:00:00 2001 From: tangenta Date: Fri, 15 Oct 2021 12:14:19 +0800 Subject: [PATCH 04/15] address comments --- expression/collation.go | 13 ++++--------- table/column.go | 14 +++++++------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/expression/collation.go b/expression/collation.go index 1ce5f4b25c5a7..39d8b4c9f56fb 100644 --- a/expression/collation.go +++ b/expression/collation.go @@ -339,12 +339,8 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) func isValidString(str string, dstChs string) bool { switch dstChs { case charset.CharsetASCII: - for _, c := range str { - if c >= 0x80 { - return false - } - } - return true + v := charset.StringValidatorASCII{Enabled: true} + return v.Validate(str) == -1 case charset.CharsetLatin1: // For backward compatibility, we do not block SQL like select '啊' = convert('a' using latin1) collate latin1_bin; return true @@ -355,9 +351,8 @@ func isValidString(str string, dstChs string) bool { // Convert to binary is always safe. return true default: - e, _ := charset.Lookup(dstChs) - _, err := e.NewEncoder().String(str) - return err == nil + v := charset.StringValidatorOther{Charset: dstChs} + return v.Validate(str) == -1 } } diff --git a/table/column.go b/table/column.go index afd0548102642..5624e8991ca61 100644 --- a/table/column.go +++ b/table/column.go @@ -331,22 +331,22 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r } func makeStringValidator(ctx sessionctx.Context, col *model.ColumnInfo) charset.StringValidator { - var validator charset.StringValidator switch col.Charset { case charset.CharsetASCII: enabled := !ctx.GetSessionVars().SkipASCIICheck - validator = charset.StringValidatorASCII{Enabled: enabled} + return charset.StringValidatorASCII{Enabled: enabled} case charset.CharsetUTF8: enabled := !ctx.GetSessionVars().SkipUTF8Check needCheckMB4 := config.GetGlobalConfig().CheckMb4ValueInUTF8 - validator = charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: false, CheckMB4ValueInUTF8: needCheckMB4} + return charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: false, CheckMB4ValueInUTF8: needCheckMB4} case charset.CharsetUTF8MB4: enabled := !ctx.GetSessionVars().SkipUTF8Check - validator = charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: true} - case charset.CharsetGBK: - validator = charset.StringValidatorOther{Charset: "gbk"} + return charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: true} + case charset.CharsetLatin1, charset.CharsetBinary: + return nil + default: + return charset.StringValidatorOther{Charset: col.Charset} } - return validator } // ColDesc describes column information like MySQL desc and show columns do. From 4a17d2890cde6226cdb11b3dfbfcfd403446d000 Mon Sep 17 00:00:00 2001 From: tangenta Date: Fri, 15 Oct 2021 14:44:50 +0800 Subject: [PATCH 05/15] remove unnecessary field --- expression/collation.go | 6 ++---- parser/charset/encoding_table.go | 11 +---------- parser/charset/encoding_test.go | 14 +++++--------- table/column.go | 18 ++++++++++++------ 4 files changed, 20 insertions(+), 29 deletions(-) diff --git a/expression/collation.go b/expression/collation.go index 39d8b4c9f56fb..215e61d1da407 100644 --- a/expression/collation.go +++ b/expression/collation.go @@ -339,8 +339,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) func isValidString(str string, dstChs string) bool { switch dstChs { case charset.CharsetASCII: - v := charset.StringValidatorASCII{Enabled: true} - return v.Validate(str) == -1 + return charset.StringValidatorASCII{}.Validate(str) == -1 case charset.CharsetLatin1: // For backward compatibility, we do not block SQL like select '啊' = convert('a' using latin1) collate latin1_bin; return true @@ -351,8 +350,7 @@ func isValidString(str string, dstChs string) bool { // Convert to binary is always safe. return true default: - v := charset.StringValidatorOther{Charset: dstChs} - return v.Validate(str) == -1 + return charset.StringValidatorOther{Charset: dstChs}.Validate(str) == -1 } } diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index f824259f08bea..8389696bae5bf 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -311,16 +311,11 @@ type StringValidator interface { } // StringValidatorASCII checks whether a string is valid ASCII string. -type StringValidatorASCII struct { - Enabled bool -} +type StringValidatorASCII struct {} // Validate checks whether the string is valid in the given charset. // It returns the first invalid byte offset. func (s StringValidatorASCII) Validate(str string) (invalidPos int) { - if !s.Enabled { - return -1 - } for i := 0; i < len(str); i++ { if str[i] > go_unicode.MaxASCII { return i @@ -331,7 +326,6 @@ func (s StringValidatorASCII) Validate(str string) (invalidPos int) { // StringValidatorUTF8 checks whether a string is valid UTF8 string. type StringValidatorUTF8 struct { - Enabled bool IsUTF8MB4 bool // Distinguish between "utf8" and "utf8mb4" CheckMB4ValueInUTF8 bool } @@ -339,9 +333,6 @@ type StringValidatorUTF8 struct { // Validate checks whether the string is valid in the given charset. // It returns the first invalid byte offset. func (s StringValidatorUTF8) Validate(str string) (invalidPos int) { - if !s.Enabled { - return -1 - } if s.IsUTF8MB4 && utf8.ValidString(str) { // Quick check passed. return -1 diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index 0f1ee1bd4f325..cd8387256024f 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -76,19 +76,15 @@ func (s *testEncodingSuite) TestEncoding(c *C) { } func (s *testEncodingSuite) TestValidatorASCII(c *C) { - v := charset.StringValidatorASCII{Enabled: false} - c.Assert(v.Validate("qwerty"), Equals, -1) - c.Assert(v.Validate("qwÊrty"), Equals, -1) - v.Enabled = true + v := charset.StringValidatorASCII{} c.Assert(v.Validate("qwerty"), Equals, -1) c.Assert(v.Validate("qwÊrty"), Equals, 2) + c.Assert(v.Validate("中文"), Equals, 0) } func (s *testEncodingSuite) TestValidatorUTF8(c *C) { - v := charset.StringValidatorUTF8{Enabled: false} - c.Assert(v.Validate("qwerty"), Equals, -1) // Test charset "utf8mb4". - v = charset.StringValidatorUTF8{Enabled: true, IsUTF8MB4: true} + v := charset.StringValidatorUTF8{IsUTF8MB4: true} c.Assert(v.Validate("qwerty"), Equals, -1) c.Assert(v.Validate("qwÊrty"), Equals, -1) c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) @@ -96,7 +92,7 @@ func (s *testEncodingSuite) TestValidatorUTF8(c *C) { invalid := string([]byte{0xff, 0xfe, 0xfd}) c.Assert(v.Validate(invalid), Equals, 0) // Test charset "utf8" without checking mb4 value. - v = charset.StringValidatorUTF8{Enabled: true, IsUTF8MB4: false, CheckMB4ValueInUTF8: false} + v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: false} c.Assert(v.Validate("qwerty"), Equals, -1) c.Assert(v.Validate("qwÊrty"), Equals, -1) c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) @@ -104,7 +100,7 @@ func (s *testEncodingSuite) TestValidatorUTF8(c *C) { c.Assert(v.Validate("😂"), Equals, -1) c.Assert(v.Validate(invalid), Equals, 0) // Test charset "utf8" with checking mb4 value. - v = charset.StringValidatorUTF8{Enabled: true, IsUTF8MB4: false, CheckMB4ValueInUTF8: true} + v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: true} c.Assert(v.Validate("😂"), Equals, 0) // 4-bytes character is invalid. c.Assert(v.Validate(invalid), Equals, 0) } diff --git a/table/column.go b/table/column.go index 5624e8991ca61..81d667c2e24ef 100644 --- a/table/column.go +++ b/table/column.go @@ -333,15 +333,21 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r func makeStringValidator(ctx sessionctx.Context, col *model.ColumnInfo) charset.StringValidator { switch col.Charset { case charset.CharsetASCII: - enabled := !ctx.GetSessionVars().SkipASCIICheck - return charset.StringValidatorASCII{Enabled: enabled} + if ctx.GetSessionVars().SkipASCIICheck { + return nil + } + return charset.StringValidatorASCII{} case charset.CharsetUTF8: - enabled := !ctx.GetSessionVars().SkipUTF8Check + if ctx.GetSessionVars().SkipUTF8Check { + return nil + } needCheckMB4 := config.GetGlobalConfig().CheckMb4ValueInUTF8 - return charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: false, CheckMB4ValueInUTF8: needCheckMB4} + return charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: needCheckMB4} case charset.CharsetUTF8MB4: - enabled := !ctx.GetSessionVars().SkipUTF8Check - return charset.StringValidatorUTF8{Enabled: enabled, IsUTF8MB4: true} + if ctx.GetSessionVars().SkipUTF8Check { + return nil + } + return charset.StringValidatorUTF8{IsUTF8MB4: true} case charset.CharsetLatin1, charset.CharsetBinary: return nil default: From 3097c3aa2808558b73cf77ba0860328ae0458d14 Mon Sep 17 00:00:00 2001 From: tangenta Date: Fri, 15 Oct 2021 15:16:39 +0800 Subject: [PATCH 06/15] format file --- parser/charset/encoding_table.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index 8389696bae5bf..a8b1fd92e629e 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -311,7 +311,7 @@ type StringValidator interface { } // StringValidatorASCII checks whether a string is valid ASCII string. -type StringValidatorASCII struct {} +type StringValidatorASCII struct{} // Validate checks whether the string is valid in the given charset. // It returns the first invalid byte offset. From e8f180a2ba2a30957d73937819affb0e6d91006a Mon Sep 17 00:00:00 2001 From: tangenta Date: Tue, 2 Nov 2021 10:35:25 +0800 Subject: [PATCH 07/15] replace the invalid chars with '?' --- .../r/new_character_set_invalid.result | 28 +++++------ .../t/new_character_set_invalid.test | 16 +++---- parser/charset/encoding.go | 3 +- parser/charset/encoding_table.go | 44 +++++++++++++++-- parser/charset/encoding_test.go | 47 +++++++++---------- table/column.go | 20 ++++---- 6 files changed, 93 insertions(+), 65 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index cce03cf556084..14d8c747b81d0 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -1,19 +1,19 @@ set @@sql_mode = 'strict_trans_tables'; drop table if exists t; -create table t (a varchar(255) charset gbk); -insert into t values ('中文'); -insert into t values ('À'); +create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8); +insert into t values ('中文', 'asdf', '字符集'); +insert into t values ('À', 'ø', '😂'); Error 1366: Incorrect string value '\xC3\x80' for column 'a' -insert into t values ('中文À中文'); +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); Error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' -select a from t; -a -中文 +select * from t; +a b c +中文 asdf 字符集 set @@sql_mode = ''; -insert into t values ('À'); -insert into t values ('中文À中文'); -select a from t; -a -中文 - -中文 +insert into t values ('À', 'ø', '😂'); +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +select * from t; +a b c +中文 asdf 字符集 +? ø ? +中文?中文 asdføfdsa 字符集?字符集 diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test index d0d9429c99706..c5f738fac790b 100644 --- a/cmd/explaintest/t/new_character_set_invalid.test +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -1,14 +1,14 @@ set @@sql_mode = 'strict_trans_tables'; drop table if exists t; -create table t (a varchar(255) charset gbk); -insert into t values ('中文'); +create table t (a varchar(255) charset gbk, b varchar(255) charset ascii, c varchar(255) charset utf8); +insert into t values ('中文', 'asdf', '字符集'); -- error 1366: Incorrect string value '\xC3\x80' for column 'a' -insert into t values ('À'); +insert into t values ('À', 'ø', '😂'); -- error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' -insert into t values ('中文À中文'); -select a from t; +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +select * from t; set @@sql_mode = ''; -insert into t values ('À'); -insert into t values ('中文À中文'); -select a from t; +insert into t values ('À', 'ø', '😂'); +insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +select * from t; diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 6fffea1b4c1f1..f7878e6f2d567 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -157,9 +157,8 @@ func (e *Encoding) DecodeString(src string) (string, error) { return string(bs), err } - // IsValid checks whether src(utf8) bytes can be encode into a string with given charset. -// Return -1 if it decodes successfully. +// Return -1 if it encodes successfully. func (e *Encoding) IsValid(src []byte) (invalidPos int) { dec := e.enc.NewEncoder() dest := [4]byte{} diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index 99f387d8d6d1d..c7f2e6c10ad56 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -275,11 +275,10 @@ func FindNextCharacterLength(label string) func([]byte) int { var encodingNextCharacterLength = map[string]func([]byte) int{ // https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram - "gbk": characterLengthGBK, - "utf-8": characterLengthUTF8, - "binary": func(bs []byte) int { - return 1 - }, + "gbk": characterLengthGBK, + "utf-8": characterLengthUTF8, + "binary": characterLengthOne, + "windows-1252": characterLengthOne, } func characterLengthGBK(bs []byte) int { @@ -301,6 +300,10 @@ func characterLengthUTF8(bs []byte) int { return 4 } +func characterLengthOne(_ []byte) int { + return 1 +} + var _ StringValidator = StringValidatorASCII{} var _ StringValidator = StringValidatorUTF8{} var _ StringValidator = StringValidatorOther{} @@ -308,6 +311,7 @@ var _ StringValidator = StringValidatorOther{} // StringValidator is used to check if a string is valid in the specific charset. type StringValidator interface { Validate(str string) (invalidPos int) + Truncate(str string) string } // StringValidatorASCII checks whether a string is valid ASCII string. @@ -324,6 +328,13 @@ func (s StringValidatorASCII) Validate(str string) (invalidPos int) { return -1 } +// Truncate implement the interface StringValidator. +func (s StringValidatorASCII) Truncate(str string) string { + enc := NewEncoding("ascii") + truncated := enc.EncodeInternal(nil, []byte(str)) + return string(truncated) +} + // StringValidatorUTF8 checks whether a string is valid UTF8 string. type StringValidatorUTF8 struct { IsUTF8MB4 bool // Distinguish between "utf8" and "utf8mb4" @@ -355,6 +366,22 @@ func (s StringValidatorUTF8) Validate(str string) (invalidPos int) { return -1 } +// Truncate implement the interface StringValidator. +func (s StringValidatorUTF8) Truncate(str string) string { + r := make([]byte, 0, len(str)) + for i, w := 0, 0; i < len(str); i += w { + rv, width := utf8.DecodeRuneInString(str[i:]) + w = width + if (rv == utf8.RuneError && !strings.HasPrefix(str[i:], string(utf8.RuneError))) || + width > 3 && !s.IsUTF8MB4 { + r = append(r, '?') + } else { + r = append(r, str[i:i+w]...) + } + } + return string(r) +} + // StringValidatorOther checks whether a string is valid string in given charset. type StringValidatorOther struct { Charset string @@ -369,3 +396,10 @@ func (s StringValidatorOther) Validate(str string) (invalidPos int) { } return enc.IsValid([]byte(str)) } + +// Truncate implement the interface StringValidator. +func (s StringValidatorOther) Truncate(str string) string { + enc := NewEncoding(s.Charset) + truncated := enc.EncodeInternal(nil, []byte(str)) + return string(truncated) +} diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index 37be21aa77e50..6b5a3855b623c 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -92,41 +92,40 @@ func TestEncoding(t *testing.T) { } } -func (s *testEncodingSuite) TestValidatorASCII(c *C) { +func TestStringValidatorASCII(t *testing.T) { v := charset.StringValidatorASCII{} - c.Assert(v.Validate("qwerty"), Equals, -1) - c.Assert(v.Validate("qwÊrty"), Equals, 2) - c.Assert(v.Validate("中文"), Equals, 0) + require.Equal(t, -1, v.Validate("qwerty")) + require.Equal(t, 2, v.Validate("qwÊrty")) + require.Equal(t, 0, v.Validate("中文")) } -func (s *testEncodingSuite) TestValidatorUTF8(c *C) { +func TestStringValidatorUTF8(t *testing.T) { // Test charset "utf8mb4". v := charset.StringValidatorUTF8{IsUTF8MB4: true} - c.Assert(v.Validate("qwerty"), Equals, -1) - c.Assert(v.Validate("qwÊrty"), Equals, -1) - c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) - c.Assert(v.Validate("😂"), Equals, -1) + require.Equal(t, -1, v.Validate("qwerty")) + require.Equal(t, -1, v.Validate("qwÊrty")) + require.Equal(t, -1, v.Validate("qwÊ合法字符串")) + require.Equal(t, -1, v.Validate("😂")) invalid := string([]byte{0xff, 0xfe, 0xfd}) - c.Assert(v.Validate(invalid), Equals, 0) + require.Equal(t, 0, v.Validate(invalid)) // Test charset "utf8" without checking mb4 value. v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: false} - c.Assert(v.Validate("qwerty"), Equals, -1) - c.Assert(v.Validate("qwÊrty"), Equals, -1) - c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) - c.Assert(v.Validate("qwÊ合法字符串"), Equals, -1) - c.Assert(v.Validate("😂"), Equals, -1) - c.Assert(v.Validate(invalid), Equals, 0) + require.Equal(t, -1, "qwerty") + require.Equal(t, -1, "qwÊrty") + require.Equal(t, -1, "qwÊ合法字符串") + require.Equal(t, -1, "😂") + require.Equal(t, 0, v.Validate(invalid)) // Test charset "utf8" with checking mb4 value. v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: true} - c.Assert(v.Validate("😂"), Equals, 0) // 4-bytes character is invalid. - c.Assert(v.Validate(invalid), Equals, 0) + require.Equal(t, 0, v.Validate("😂")) // 4-bytes character is invalid. + require.Equal(t, 0, v.Validate(invalid)) } -func (s *testEncodingSuite) TestValidatorGBK(c *C) { +func TestStringValidatorGBK(t *testing.T) { v := charset.StringValidatorOther{Charset: "gbk"} - c.Assert(v.Validate("asdf"), Equals, -1) - c.Assert(v.Validate("中文"), Equals, -1) - c.Assert(v.Validate("À"), Equals, 0) - c.Assert(v.Validate("asdfÀ"), Equals, 4) - c.Assert(v.Validate("中文À"), Equals, 6) + require.Equal(t, -1, v.Validate("asdf")) + require.Equal(t, -1, v.Validate("中文")) + require.Equal(t, 0, v.Validate("À")) + require.Equal(t, 4, v.Validate("asdfÀ")) + require.Equal(t, 6, v.Validate("中文À")) } diff --git a/table/column.go b/table/column.go index 680a0a2123b1e..dbef6f6e0cdaf 100644 --- a/table/column.go +++ b/table/column.go @@ -20,11 +20,6 @@ package table import ( "fmt" - "strconv" - "strings" - "time" - "unicode" - "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/expression" "github.com/pingcap/tidb/parser" @@ -41,6 +36,10 @@ import ( "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/timeutil" "go.uber.org/zap" + "strconv" + "strings" + "time" + "unicode" ) // Column provides meta data describing a table column. @@ -170,9 +169,8 @@ func truncateTrailingSpaces(v *types.Datum) { v.SetString(str, v.Collation()) } -func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, str string, i int) (types.Datum, error) { +func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, str string, i int) error { sc := ctx.GetSessionVars().StmtCtx - var strval strings.Builder for j := 0; j < 6; j++ { if len(str) > (i + j) { @@ -186,14 +184,11 @@ func handleWrongCharsetValue(ctx sessionctx.Context, col *model.ColumnInfo, str if len(str) > i+6 { strval.WriteString(`...`) } - // TODO: Add 'at row %d' err := ErrTruncatedWrongValueForField.FastGen("Incorrect string value '%s' for column '%s'", strval.String(), col.Name) logutil.BgLogger().Error("incorrect string value", zap.Uint64("conn", ctx.GetSessionVars().ConnectionID), zap.Error(err)) - // Truncate to valid utf8 string. - truncateVal := types.NewStringDatum(str[:i]) err = sc.HandleTruncate(err) - return truncateVal, err + return err } func handleZeroDatetime(ctx sessionctx.Context, col *model.ColumnInfo, casted types.Datum, str string, tmIsInvalid bool) (types.Datum, bool, error) { @@ -321,7 +316,8 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r if v := makeStringValidator(ctx, col); v != nil { str := casted.GetString() if invalidPos := v.Validate(str); invalidPos >= 0 { - casted, err = handleWrongCharsetValue(ctx, col, str, invalidPos) + casted = types.NewStringDatum(v.Truncate(str)) + err = handleWrongCharsetValue(ctx, col, str, invalidPos) } } if forceIgnoreTruncate { From 13cf296a17045def808c21674727b104bc49696b Mon Sep 17 00:00:00 2001 From: tangenta Date: Tue, 2 Nov 2021 15:07:16 +0800 Subject: [PATCH 08/15] fix the truncation for ascii --- cmd/explaintest/r/new_character_set_invalid.result | 4 ++-- parser/charset/encoding_table.go | 14 +++++++++++--- table/column.go | 9 +++++---- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index 14d8c747b81d0..8751183549c38 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -15,5 +15,5 @@ insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); select * from t; a b c 中文 asdf 字符集 -? ø ? -中文?中文 asdføfdsa 字符集?字符集 +? ? ? +中文?中文 asdf?fdsa 字符集?字符集 diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index c7f2e6c10ad56..5c96cb482840b 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -330,9 +330,17 @@ func (s StringValidatorASCII) Validate(str string) (invalidPos int) { // Truncate implement the interface StringValidator. func (s StringValidatorASCII) Truncate(str string) string { - enc := NewEncoding("ascii") - truncated := enc.EncodeInternal(nil, []byte(str)) - return string(truncated) + strBytes := Slice(str) + r := make([]byte, 0, len(str)) + for i, w := 0, 0; i < len(str); i += w { + w = characterLengthUTF8(strBytes[i:]) + if w > 1 || (w == 1 && str[i] > go_unicode.MaxASCII) { + r = append(r, '?') + } else { + r = append(r, str[i:i+w]...) + } + } + return string(r) } // StringValidatorUTF8 checks whether a string is valid UTF8 string. diff --git a/table/column.go b/table/column.go index dbef6f6e0cdaf..bb30218123477 100644 --- a/table/column.go +++ b/table/column.go @@ -20,6 +20,11 @@ package table import ( "fmt" + "strconv" + "strings" + "time" + "unicode" + "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/expression" "github.com/pingcap/tidb/parser" @@ -36,10 +41,6 @@ import ( "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/timeutil" "go.uber.org/zap" - "strconv" - "strings" - "time" - "unicode" ) // Column provides meta data describing a table column. From 650b63bf9a649dbab74e50f101085672c9de2098 Mon Sep 17 00:00:00 2001 From: tangenta Date: Tue, 2 Nov 2021 15:16:18 +0800 Subject: [PATCH 09/15] fix integration test TestStringValidatorUTF8 --- parser/charset/encoding_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index 6b5a3855b623c..957175fda265d 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -110,10 +110,10 @@ func TestStringValidatorUTF8(t *testing.T) { require.Equal(t, 0, v.Validate(invalid)) // Test charset "utf8" without checking mb4 value. v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: false} - require.Equal(t, -1, "qwerty") - require.Equal(t, -1, "qwÊrty") - require.Equal(t, -1, "qwÊ合法字符串") - require.Equal(t, -1, "😂") + require.Equal(t, -1, v.Validate("qwerty")) + require.Equal(t, -1, v.Validate("qwÊrty")) + require.Equal(t, -1, v.Validate("qwÊ合法字符串")) + require.Equal(t, -1, v.Validate("😂")) require.Equal(t, 0, v.Validate(invalid)) // Test charset "utf8" with checking mb4 value. v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: true} From 7a74b297786263f1eb9fb16da800d17a10ee03be Mon Sep 17 00:00:00 2001 From: tangenta Date: Sat, 13 Nov 2021 19:50:23 +0800 Subject: [PATCH 10/15] fix truncate algo --- .../r/new_character_set_invalid.result | 4 + .../t/new_character_set_invalid.test | 3 + parser/charset/encoding.go | 18 -- parser/charset/encoding_table.go | 172 ++++++++++++------ table/column.go | 8 +- 5 files changed, 129 insertions(+), 76 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index 8751183549c38..aaeb66a8d9b44 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -6,14 +6,18 @@ insert into t values ('À', 'ø', '😂'); Error 1366: Incorrect string value '\xC3\x80' for column 'a' insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); Error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); +Error 1366: Incorrect string value '\xFF\xFF' for column 'a' select * from t; a b c 中文 asdf 字符集 set @@sql_mode = ''; insert into t values ('À', 'ø', '😂'); insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); select * from t; a b c 中文 asdf 字符集 ? ? ? 中文?中文 asdf?fdsa 字符集?字符集 +@@ @@ @@ diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test index c5f738fac790b..34031d0b83ef8 100644 --- a/cmd/explaintest/t/new_character_set_invalid.test +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -6,9 +6,12 @@ insert into t values ('中文', 'asdf', '字符集'); insert into t values ('À', 'ø', '😂'); -- error 1366: Incorrect string value '\xC3\x80\xE4\xB8\xAD\xE6...' for column 'a' insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +-- error 1366: Incorrect string value '\xFF\xFF' for column 'a' +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); select * from t; set @@sql_mode = ''; insert into t values ('À', 'ø', '😂'); insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); +insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); select * from t; diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index f7878e6f2d567..72bff5b6cf7e9 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -157,24 +157,6 @@ func (e *Encoding) DecodeString(src string) (string, error) { return string(bs), err } -// IsValid checks whether src(utf8) bytes can be encode into a string with given charset. -// Return -1 if it encodes successfully. -func (e *Encoding) IsValid(src []byte) (invalidPos int) { - dec := e.enc.NewEncoder() - dest := [4]byte{} - var srcOffset int - for srcOffset < len(src) { - srcNextLen := characterLengthUTF8(src[srcOffset:]) - srcEnd := mathutil.Min(srcOffset+srcNextLen, len(src)) - _, nSrc, err := dec.Transform(dest[:], src[srcOffset:srcEnd], false) - if err != nil { - return srcOffset - } - srcOffset += nSrc - } - return -1 -} - func (e *Encoding) transform(transformer transform.Transformer, dest, src []byte, isDecoding bool) ([]byte, error) { if len(dest) < len(src) { dest = make([]byte, len(src)*2) diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index 5c96cb482840b..86ab3f4cc7401 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -14,6 +14,7 @@ package charset import ( + "github.com/cznic/mathutil" "strings" go_unicode "unicode" "unicode/utf8" @@ -304,6 +305,14 @@ func characterLengthOne(_ []byte) int { return 1 } +type TruncateStrategy int8 + +const ( + TruncateStrategyEmpty TruncateStrategy = iota + TruncateStrategyTrim + TruncateStrategyReplace +) + var _ StringValidator = StringValidatorASCII{} var _ StringValidator = StringValidatorUTF8{} var _ StringValidator = StringValidatorOther{} @@ -311,36 +320,51 @@ var _ StringValidator = StringValidatorOther{} // StringValidator is used to check if a string is valid in the specific charset. type StringValidator interface { Validate(str string) (invalidPos int) - Truncate(str string) string + Truncate(str string, strategy TruncateStrategy) (result string, invalidPos int) } // StringValidatorASCII checks whether a string is valid ASCII string. type StringValidatorASCII struct{} // Validate checks whether the string is valid in the given charset. -// It returns the first invalid byte offset. -func (s StringValidatorASCII) Validate(str string) (invalidPos int) { - for i := 0; i < len(str); i++ { - if str[i] > go_unicode.MaxASCII { - return i - } - } - return -1 +func (s StringValidatorASCII) Validate(str string) int { + _, invalidPos := s.Truncate(str, TruncateStrategyEmpty) + return invalidPos } // Truncate implement the interface StringValidator. -func (s StringValidatorASCII) Truncate(str string) string { - strBytes := Slice(str) - r := make([]byte, 0, len(str)) +func (s StringValidatorASCII) Truncate(str string, strategy TruncateStrategy) (string, int) { + var result []byte + if strategy == TruncateStrategyReplace { + result = make([]byte, 0, len(str)) + } + invalidPos := -1 for i, w := 0, 0; i < len(str); i += w { - w = characterLengthUTF8(strBytes[i:]) - if w > 1 || (w == 1 && str[i] > go_unicode.MaxASCII) { - r = append(r, '?') - } else { - r = append(r, str[i:i+w]...) + w = 1 + if str[i] > go_unicode.MaxASCII { + if invalidPos == -1 { + invalidPos = i + } + switch strategy { + case TruncateStrategyEmpty: + return "", invalidPos + case TruncateStrategyTrim: + return str[:i], invalidPos + case TruncateStrategyReplace: + w = characterLengthUTF8(Slice(str)[i:]) + w = mathutil.Min(w, len(str)-i) + result = append(result, '?') + continue + } + } + if strategy == TruncateStrategyReplace { + result = append(result, str[i:i+w]...) } } - return string(r) + if strategy == TruncateStrategyReplace { + return string(result), invalidPos + } + return str, -1 } // StringValidatorUTF8 checks whether a string is valid UTF8 string. @@ -350,44 +374,49 @@ type StringValidatorUTF8 struct { } // Validate checks whether the string is valid in the given charset. -// It returns the first invalid byte offset. -func (s StringValidatorUTF8) Validate(str string) (invalidPos int) { +func (s StringValidatorUTF8) Validate(str string) int { + _, invalidPos := s.Truncate(str, TruncateStrategyEmpty) + return invalidPos +} + +// Truncate implement the interface StringValidator. +func (s StringValidatorUTF8) Truncate(str string, strategy TruncateStrategy) (string, int) { if s.IsUTF8MB4 && utf8.ValidString(str) { // Quick check passed. - return -1 + return str, -1 } doMB4CharCheck := !s.IsUTF8MB4 && s.CheckMB4ValueInUTF8 + var result []byte + if strategy == TruncateStrategyReplace { + result = make([]byte, 0, len(str)) + } + invalidPos := -1 for i, w := 0, 0; i < len(str); i += w { - runeValue, width := utf8.DecodeRuneInString(str[i:]) - if runeValue == utf8.RuneError { - if strings.HasPrefix(str[i:], string(utf8.RuneError)) { - w = width + var rv rune + rv, w = utf8.DecodeRuneInString(str[i:]) + if (rv == utf8.RuneError && !strings.HasPrefix(str[i:], string(utf8.RuneError))) || + w > 3 && doMB4CharCheck { + if invalidPos == -1 { + invalidPos = i + } + switch strategy { + case TruncateStrategyEmpty: + return "", invalidPos + case TruncateStrategyTrim: + return str[:i], invalidPos + case TruncateStrategyReplace: + result = append(result, '?') continue } - return i - } else if width > 3 && doMB4CharCheck { - // Meet non-BMP characters. - return i } - w = width - } - return -1 -} - -// Truncate implement the interface StringValidator. -func (s StringValidatorUTF8) Truncate(str string) string { - r := make([]byte, 0, len(str)) - for i, w := 0, 0; i < len(str); i += w { - rv, width := utf8.DecodeRuneInString(str[i:]) - w = width - if (rv == utf8.RuneError && !strings.HasPrefix(str[i:], string(utf8.RuneError))) || - width > 3 && !s.IsUTF8MB4 { - r = append(r, '?') - } else { - r = append(r, str[i:i+w]...) + if strategy == TruncateStrategyReplace { + result = append(result, str[i:i+w]...) } } - return string(r) + if strategy == TruncateStrategyReplace { + return string(result), invalidPos + } + return str, -1 } // StringValidatorOther checks whether a string is valid string in given charset. @@ -396,18 +425,49 @@ type StringValidatorOther struct { } // Validate checks whether the string is valid in the given charset. -// It returns the first invalid byte offset. -func (s StringValidatorOther) Validate(str string) (invalidPos int) { - enc := NewEncoding(s.Charset) - if !enc.enabled() { - return -1 - } - return enc.IsValid([]byte(str)) +func (s StringValidatorOther) Validate(str string) int { + _, invalidPos := s.Truncate(str, TruncateStrategyEmpty) + return invalidPos } // Truncate implement the interface StringValidator. -func (s StringValidatorOther) Truncate(str string) string { +func (s StringValidatorOther) Truncate(str string, strategy TruncateStrategy) (string, int) { enc := NewEncoding(s.Charset) - truncated := enc.EncodeInternal(nil, []byte(str)) - return string(truncated) + if !enc.enabled() { + return str, -1 + } + var result []byte + if strategy == TruncateStrategyReplace { + result = make([]byte, 0, len(str)) + } + var buf [4]byte + strBytes := Slice(str) + transformer := enc.enc.NewEncoder() + invalidPos := -1 + for i, w := 0, 0; i < len(str); i += w { + w = characterLengthUTF8(strBytes[i:]) + w := mathutil.Min(w, len(str)-i) + _, _, err := transformer.Transform(buf[:], strBytes[i:i+w], true) + if err != nil { + if invalidPos == -1 { + invalidPos = i + } + switch strategy { + case TruncateStrategyEmpty: + return "", invalidPos + case TruncateStrategyTrim: + return str[:i], invalidPos + case TruncateStrategyReplace: + result = append(result, '?') + continue + } + } + if strategy == TruncateStrategyReplace { + result = append(result, strBytes[i:i+w]...) + } + } + if strategy == TruncateStrategyReplace { + return string(result), invalidPos + } + return str, -1 } diff --git a/table/column.go b/table/column.go index bb30218123477..445a169a82b59 100644 --- a/table/column.go +++ b/table/column.go @@ -316,8 +316,12 @@ func CastValue(ctx sessionctx.Context, val types.Datum, col *model.ColumnInfo, r if v := makeStringValidator(ctx, col); v != nil { str := casted.GetString() - if invalidPos := v.Validate(str); invalidPos >= 0 { - casted = types.NewStringDatum(v.Truncate(str)) + strategy := charset.TruncateStrategyReplace + if val.Collation() == charset.CollationBin { + strategy = charset.TruncateStrategyTrim + } + if newStr, invalidPos := v.Truncate(str, strategy); invalidPos >= 0 { + casted = types.NewStringDatum(newStr) err = handleWrongCharsetValue(ctx, col, str, invalidPos) } } From b1911e7fbbb16bd6823894d355303f590a2b4d75 Mon Sep 17 00:00:00 2001 From: tangenta Date: Sat, 13 Nov 2021 23:21:43 +0800 Subject: [PATCH 11/15] fix go import order --- .gitignore | 2 -- parser/charset/encoding_table.go | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 54f2ad05fc371..b1c7ad4934527 100644 --- a/.gitignore +++ b/.gitignore @@ -13,8 +13,6 @@ cmd/explaintest/explain-test.out cmd/explaintest/explaintest_tidb-server cmd/explaintest/portgenerator cmd/explaintest/s/ -cmd/explaintest/importer -cmd/pluginpkg/pluginpkg *.fail.go tools/bin/ vendor diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index 86ab3f4cc7401..5a67a1817822e 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -14,11 +14,11 @@ package charset import ( - "github.com/cznic/mathutil" "strings" go_unicode "unicode" "unicode/utf8" + "github.com/cznic/mathutil" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/japanese" From c5ed6a85f3f967b8e719650691762bb30b700b0d Mon Sep 17 00:00:00 2001 From: tangenta Date: Sat, 13 Nov 2021 23:35:39 +0800 Subject: [PATCH 12/15] fix integration test TestInsertWrongValueForField --- executor/insert_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executor/insert_test.go b/executor/insert_test.go index d2d5b232b53af..03f3ab6e1c65d 100644 --- a/executor/insert_test.go +++ b/executor/insert_test.go @@ -331,7 +331,7 @@ func (s *testSuite3) TestInsertWrongValueForField(c *C) { tk.MustExec(`create table t1(a char(10) charset utf8);`) tk.MustExec(`insert into t1 values('我');`) tk.MustExec(`alter table t1 add column b char(10) charset ascii as ((a));`) - tk.MustQuery(`select * from t1;`).Check(testkit.Rows(`我 `)) + tk.MustQuery(`select * from t1;`).Check(testkit.Rows("我 ?")) tk.MustExec(`drop table if exists t;`) tk.MustExec(`create table t (a year);`) From bbded5812a8670620d535a8d5fccd3f0cbd88812 Mon Sep 17 00:00:00 2001 From: tangenta Date: Fri, 19 Nov 2021 11:19:13 +0800 Subject: [PATCH 13/15] parser/charset: avoid unnecessary copying for valid ASCII string --- parser/charset/encoding_table.go | 41 +++++++++++++++++--------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index 5a67a1817822e..b5fa7b7986833 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -305,6 +305,10 @@ func characterLengthOne(_ []byte) int { return 1 } +// TruncateStrategy indicates the way to handle the invalid strings in specific charset. +// - TruncateStrategyEmpty: returns an empty string. +// - TruncateStrategyTrim: returns the valid prefix part of string. +// - TruncateStrategyReplace: returns the whole string, but the invalid characters are replaced with '?'. type TruncateStrategy int8 const ( @@ -334,34 +338,33 @@ func (s StringValidatorASCII) Validate(str string) int { // Truncate implement the interface StringValidator. func (s StringValidatorASCII) Truncate(str string, strategy TruncateStrategy) (string, int) { - var result []byte - if strategy == TruncateStrategyReplace { - result = make([]byte, 0, len(str)) - } invalidPos := -1 - for i, w := 0, 0; i < len(str); i += w { - w = 1 + for i := 0; i < len(str); i++ { if str[i] > go_unicode.MaxASCII { - if invalidPos == -1 { - invalidPos = i - } - switch strategy { - case TruncateStrategyEmpty: - return "", invalidPos - case TruncateStrategyTrim: - return str[:i], invalidPos - case TruncateStrategyReplace: + invalidPos = i + } + } + if invalidPos == -1 { + // Quick check passed. + return str, -1 + } + switch strategy { + case TruncateStrategyEmpty: + return "", invalidPos + case TruncateStrategyTrim: + return str[:invalidPos], invalidPos + case TruncateStrategyReplace: + result := make([]byte, 0, len(str)) + for i, w := 0, 0; i < len(str); i += w { + w = 1 + if str[i] > go_unicode.MaxASCII { w = characterLengthUTF8(Slice(str)[i:]) w = mathutil.Min(w, len(str)-i) result = append(result, '?') continue } - } - if strategy == TruncateStrategyReplace { result = append(result, str[i:i+w]...) } - } - if strategy == TruncateStrategyReplace { return string(result), invalidPos } return str, -1 From 7743db19d4dec51e9cb8412628cfa55424843f2b Mon Sep 17 00:00:00 2001 From: tangenta Date: Sat, 20 Nov 2021 16:13:34 +0800 Subject: [PATCH 14/15] fix truncate bugs and add test --- parser/charset/encoding.go | 1 + parser/charset/encoding_table.go | 13 +++- parser/charset/encoding_test.go | 112 +++++++++++++++++++++++++------ 3 files changed, 103 insertions(+), 23 deletions(-) diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index e0ddf19fe2b56..4702fd528961b 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -219,6 +219,7 @@ func (e *Encoding) generateErr(srcRest []byte, srcNextLen int) error { // replacementBytes are bytes for the replacement rune 0xfffd. var replacementBytes = []byte{0xEF, 0xBF, 0xBD} +var replacementStr = string(replacementBytes) // beginWithReplacementChar check if dst has the prefix '0xEFBFBD'. func beginWithReplacementChar(dst []byte) bool { diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index b5fa7b7986833..b7c16076e63b3 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -342,6 +342,7 @@ func (s StringValidatorASCII) Truncate(str string, strategy TruncateStrategy) (s for i := 0; i < len(str); i++ { if str[i] > go_unicode.MaxASCII { invalidPos = i + break } } if invalidPos == -1 { @@ -384,6 +385,9 @@ func (s StringValidatorUTF8) Validate(str string) int { // Truncate implement the interface StringValidator. func (s StringValidatorUTF8) Truncate(str string, strategy TruncateStrategy) (string, int) { + if str == "" { + return str, -1 + } if s.IsUTF8MB4 && utf8.ValidString(str) { // Quick check passed. return str, -1 @@ -397,8 +401,8 @@ func (s StringValidatorUTF8) Truncate(str string, strategy TruncateStrategy) (st for i, w := 0, 0; i < len(str); i += w { var rv rune rv, w = utf8.DecodeRuneInString(str[i:]) - if (rv == utf8.RuneError && !strings.HasPrefix(str[i:], string(utf8.RuneError))) || - w > 3 && doMB4CharCheck { + if (rv == utf8.RuneError && !strings.HasPrefix(str[i:], replacementStr)) || + (w > 3 && doMB4CharCheck) { if invalidPos == -1 { invalidPos = i } @@ -435,6 +439,9 @@ func (s StringValidatorOther) Validate(str string) int { // Truncate implement the interface StringValidator. func (s StringValidatorOther) Truncate(str string, strategy TruncateStrategy) (string, int) { + if str == "" { + return str, -1 + } enc := NewEncoding(s.Charset) if !enc.enabled() { return str, -1 @@ -449,7 +456,7 @@ func (s StringValidatorOther) Truncate(str string, strategy TruncateStrategy) (s invalidPos := -1 for i, w := 0, 0; i < len(str); i += w { w = characterLengthUTF8(strBytes[i:]) - w := mathutil.Min(w, len(str)-i) + w = mathutil.Min(w, len(str)-i) _, _, err := transformer.Transform(buf[:], strBytes[i:i+w], true) if err != nil { if invalidPos == -1 { diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index 957175fda265d..cf19dc48c1249 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -16,6 +16,7 @@ package charset_test import ( "fmt" "testing" + "unicode/utf8" "github.com/pingcap/tidb/parser/charset" "github.com/stretchr/testify/require" @@ -94,6 +95,27 @@ func TestEncoding(t *testing.T) { func TestStringValidatorASCII(t *testing.T) { v := charset.StringValidatorASCII{} + testCases := []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"qwerty", charset.TruncateStrategyEmpty, "qwerty", -1}, + {"qwÊrty", charset.TruncateStrategyEmpty, "", 2}, + {"qwÊrty", charset.TruncateStrategyTrim, "qw", 2}, + {"qwÊrty", charset.TruncateStrategyReplace, "qw?rty", 2}, + {"中文", charset.TruncateStrategyEmpty, "", 0}, + {"中文?qwert", charset.TruncateStrategyTrim, "", 0}, + {"中文?qwert", charset.TruncateStrategyReplace, "???qwert", 0}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } require.Equal(t, -1, v.Validate("qwerty")) require.Equal(t, 2, v.Validate("qwÊrty")) require.Equal(t, 0, v.Validate("中文")) @@ -102,30 +124,80 @@ func TestStringValidatorASCII(t *testing.T) { func TestStringValidatorUTF8(t *testing.T) { // Test charset "utf8mb4". v := charset.StringValidatorUTF8{IsUTF8MB4: true} - require.Equal(t, -1, v.Validate("qwerty")) - require.Equal(t, -1, v.Validate("qwÊrty")) - require.Equal(t, -1, v.Validate("qwÊ合法字符串")) - require.Equal(t, -1, v.Validate("😂")) - invalid := string([]byte{0xff, 0xfe, 0xfd}) - require.Equal(t, 0, v.Validate(invalid)) - // Test charset "utf8" without checking mb4 value. - v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: false} - require.Equal(t, -1, v.Validate("qwerty")) - require.Equal(t, -1, v.Validate("qwÊrty")) - require.Equal(t, -1, v.Validate("qwÊ合法字符串")) - require.Equal(t, -1, v.Validate("😂")) - require.Equal(t, 0, v.Validate(invalid)) + oxfffefd := string([]byte{0xff, 0xfe, 0xfd}) + testCases := []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"qwerty", charset.TruncateStrategyEmpty, "qwerty", -1}, + {"qwÊrty", charset.TruncateStrategyEmpty, "qwÊrty", -1}, + {"qwÊ合法字符串", charset.TruncateStrategyEmpty, "qwÊ合法字符串", -1}, + {"😂", charset.TruncateStrategyEmpty, "😂", -1}, + {oxfffefd, charset.TruncateStrategyEmpty, "", 0}, + {oxfffefd, charset.TruncateStrategyReplace, "???", 0}, + {"中文"+oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, + {"中文"+oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, + {string(utf8.RuneError), charset.TruncateStrategyEmpty, "�", -1}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } // Test charset "utf8" with checking mb4 value. v = charset.StringValidatorUTF8{IsUTF8MB4: false, CheckMB4ValueInUTF8: true} - require.Equal(t, 0, v.Validate("😂")) // 4-bytes character is invalid. - require.Equal(t, 0, v.Validate(invalid)) + testCases = []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"qwerty", charset.TruncateStrategyEmpty, "qwerty", -1}, + {"qwÊrty", charset.TruncateStrategyEmpty, "qwÊrty", -1}, + {"qwÊ合法字符串", charset.TruncateStrategyEmpty, "qwÊ合法字符串", -1}, + {"😂", charset.TruncateStrategyEmpty, "", 0}, + {"😂", charset.TruncateStrategyReplace, "?", 0}, + {"valid_str😂", charset.TruncateStrategyReplace, "valid_str?", 9}, + {oxfffefd, charset.TruncateStrategyEmpty, "", 0}, + {oxfffefd, charset.TruncateStrategyReplace, "???", 0}, + {"中文"+oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, + {"中文"+oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, + {string(utf8.RuneError), charset.TruncateStrategyEmpty, "�", -1}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } } func TestStringValidatorGBK(t *testing.T) { v := charset.StringValidatorOther{Charset: "gbk"} - require.Equal(t, -1, v.Validate("asdf")) - require.Equal(t, -1, v.Validate("中文")) - require.Equal(t, 0, v.Validate("À")) - require.Equal(t, 4, v.Validate("asdfÀ")) - require.Equal(t, 6, v.Validate("中文À")) + testCases := []struct { + str string + strategy charset.TruncateStrategy + expected string + invalidPos int + }{ + {"", charset.TruncateStrategyEmpty, "", -1}, + {"asdf", charset.TruncateStrategyEmpty, "asdf", -1}, + {"中文", charset.TruncateStrategyEmpty, "中文", -1}, + {"À", charset.TruncateStrategyEmpty, "", 0}, + {"À", charset.TruncateStrategyReplace, "?", 0}, + {"中文À中文", charset.TruncateStrategyTrim, "中文", 6}, + {"中文À中文", charset.TruncateStrategyReplace, "中文?中文", 6}, + {"asdfÀ", charset.TruncateStrategyReplace, "asdf?", 4}, + } + for _, tc := range testCases { + msg := fmt.Sprintf("%v", tc) + actual, invalidPos := v.Truncate(tc.str, tc.strategy) + require.Equal(t, tc.expected, actual, msg) + require.Equal(t, tc.invalidPos, invalidPos, msg) + } } From 74f1b49fe1375421b06e895dfeb8b67816361266 Mon Sep 17 00:00:00 2001 From: tangenta Date: Mon, 22 Nov 2021 10:59:16 +0800 Subject: [PATCH 15/15] charset: make format and address comment --- parser/charset/encoding.go | 1 - parser/charset/encoding_table.go | 3 +-- parser/charset/encoding_test.go | 8 ++++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/parser/charset/encoding.go b/parser/charset/encoding.go index 4702fd528961b..e0ddf19fe2b56 100644 --- a/parser/charset/encoding.go +++ b/parser/charset/encoding.go @@ -219,7 +219,6 @@ func (e *Encoding) generateErr(srcRest []byte, srcNextLen int) error { // replacementBytes are bytes for the replacement rune 0xfffd. var replacementBytes = []byte{0xEF, 0xBF, 0xBD} -var replacementStr = string(replacementBytes) // beginWithReplacementChar check if dst has the prefix '0xEFBFBD'. func beginWithReplacementChar(dst []byte) bool { diff --git a/parser/charset/encoding_table.go b/parser/charset/encoding_table.go index b7c16076e63b3..b1e1f1c293e4b 100644 --- a/parser/charset/encoding_table.go +++ b/parser/charset/encoding_table.go @@ -401,8 +401,7 @@ func (s StringValidatorUTF8) Truncate(str string, strategy TruncateStrategy) (st for i, w := 0, 0; i < len(str); i += w { var rv rune rv, w = utf8.DecodeRuneInString(str[i:]) - if (rv == utf8.RuneError && !strings.HasPrefix(str[i:], replacementStr)) || - (w > 3 && doMB4CharCheck) { + if (rv == utf8.RuneError && w == 1) || (w > 3 && doMB4CharCheck) { if invalidPos == -1 { invalidPos = i } diff --git a/parser/charset/encoding_test.go b/parser/charset/encoding_test.go index cf19dc48c1249..4adfd916655db 100644 --- a/parser/charset/encoding_test.go +++ b/parser/charset/encoding_test.go @@ -138,8 +138,8 @@ func TestStringValidatorUTF8(t *testing.T) { {"😂", charset.TruncateStrategyEmpty, "😂", -1}, {oxfffefd, charset.TruncateStrategyEmpty, "", 0}, {oxfffefd, charset.TruncateStrategyReplace, "???", 0}, - {"中文"+oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, - {"中文"+oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, + {"中文" + oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, + {"中文" + oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, {string(utf8.RuneError), charset.TruncateStrategyEmpty, "�", -1}, } for _, tc := range testCases { @@ -165,8 +165,8 @@ func TestStringValidatorUTF8(t *testing.T) { {"valid_str😂", charset.TruncateStrategyReplace, "valid_str?", 9}, {oxfffefd, charset.TruncateStrategyEmpty, "", 0}, {oxfffefd, charset.TruncateStrategyReplace, "???", 0}, - {"中文"+oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, - {"中文"+oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, + {"中文" + oxfffefd, charset.TruncateStrategyTrim, "中文", 6}, + {"中文" + oxfffefd, charset.TruncateStrategyReplace, "中文???", 6}, {string(utf8.RuneError), charset.TruncateStrategyEmpty, "�", -1}, } for _, tc := range testCases {