diff --git a/textsplitter/options.go b/textsplitter/options.go index 363ae9fbb..1342f2fb9 100644 --- a/textsplitter/options.go +++ b/textsplitter/options.go @@ -1,5 +1,7 @@ package textsplitter +import "unicode/utf8" + // Options is a struct that contains options for a text splitter. type Options struct { ChunkSize int @@ -118,5 +120,5 @@ func WithReferenceLinks(referenceLinks bool) Option { } func defaultLenFunc(s string) int { - return len(s) + return utf8.RuneCountInString(s) } diff --git a/textsplitter/recursive_character_test.go b/textsplitter/recursive_character_test.go index 943bd1511..8a0a880f8 100644 --- a/textsplitter/recursive_character_test.go +++ b/textsplitter/recursive_character_test.go @@ -2,7 +2,6 @@ package textsplitter import ( "testing" - "unicode/utf8" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -18,7 +17,6 @@ func TestRecursiveCharacterSplitter(t *testing.T) { chunkSize int separators []string expectedDocs []schema.Document - lenFunc func(string) int } testCases := []testCase{ { @@ -26,7 +24,6 @@ func TestRecursiveCharacterSplitter(t *testing.T) { chunkOverlap: 0, chunkSize: 10, separators: []string{"\n\n", "\n", " "}, - lenFunc: utf8.RuneCountInString, expectedDocs: []schema.Document{ {PageContent: "哈里森\n很高兴遇见你", Metadata: map[string]any{}}, {PageContent: "欢迎来中国", Metadata: map[string]any{}}, @@ -115,9 +112,6 @@ Bye! splitter.ChunkOverlap = tc.chunkOverlap splitter.ChunkSize = tc.chunkSize splitter.Separators = tc.separators - if tc.lenFunc != nil { - splitter.LenFunc = tc.lenFunc - } docs, err := CreateDocuments(splitter, []string{tc.text}, nil) require.NoError(t, err)