fix(text): fix bug and replace markdown chunking (#221)

Because - there is out of the range of the slice when the chunks are in some conditions - the markdown chunking is not controllable in LangChainGo This commit - Rescan the raw text to make the chunk scanning will not be out of the slice - Replace markdown chunking with custom logic that required for RAG system Please check Markdown Chunking Logic Document in Linear as reference.
instill-ai · Jul 18, 2024 · 298c91a · 298c91a
1 parent ef43beb
commit 298c91a
Show file tree

Hide file tree

Showing 3 changed files with 441 additions and 9 deletions.
diff --git a/operator/text/v0/chunk_text.go b/operator/text/v0/chunk_text.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"reflect"
 
-	"github.com/pkoukk/tiktoken-go"
+	tiktoken "github.com/pkoukk/tiktoken-go"
 	"github.com/tmc/langchaingo/textsplitter"
 )
 
@@ -65,8 +65,12 @@ func (s *Setting) SetDefault() {
 	}
 }
 
+type TextSplitter interface {
+	SplitText(text string) ([]string, error)
+}
+
 func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
-	var split textsplitter.TextSplitter
+	var split TextSplitter
 	setting := input.Strategy.Setting
 	// TODO: Take this out when we fix the error in frontend side.
 	// Bug: The default value is not set from frontend side.
@@ -92,10 +96,9 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
 		)
 	case "Markdown":
 		positionCalculator = MarkdownPositionCalculator{}
-		split = textsplitter.NewMarkdownTextSplitter(
+		split = NewMarkdownTextSplitter(
 			textsplitter.WithChunkSize(setting.ChunkSize),
 			textsplitter.WithChunkOverlap(setting.ChunkOverlap),
-			textsplitter.WithCodeBlocks(setting.CodeBlocks),
 		)
 	case "Recursive":
 		positionCalculator = PositionCalculator{}
@@ -122,10 +125,20 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
 
 	startScanPosition := 0
 	rawRunes := []rune(input.Text)
-	for _, chunk := range chunks {
+	for i, chunk := range chunks {
 		chunkRunes := []rune(chunk)
 
 		startPosition, endPosition := positionCalculator.getChunkPositions(rawRunes, chunkRunes, startScanPosition)
+
+		if shouldScanRawTextFromPreviousChunk(startPosition, endPosition) {
+			previousChunk := output.TextChunks[i-1]
+			startPosition, endPosition = positionCalculator.getChunkPositions(rawRunes, chunkRunes, previousChunk.StartPosition)
+		}
+
+		if startPosition == endPosition {
+			continue
+		}
+
 		output.TextChunks = append(output.TextChunks, TextChunk{
 			Text:          chunk,
 			StartPosition: startPosition,
@@ -134,9 +147,20 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
 		startScanPosition = startPosition + 1
 	}
 
+	if len(output.TextChunks) == 0 {
+		output.TextChunks = append(output.TextChunks, TextChunk{
+			Text:          input.Text,
+			StartPosition: 0,
+			EndPosition:   len(rawRunes) - 1,
+		})
+	}
 	return output, nil
 }
 
+func shouldScanRawTextFromPreviousChunk(startPosition, endPosition int) bool {
+	return startPosition == 0 && endPosition == 0
+}
+
 type ChunkPositionCalculator interface {
 	getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int)
 }
@@ -147,6 +171,11 @@ func (PositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosi
 
 	for i := startScanPosition; i < len(rawText); i++ {
 		if rawText[i] == chunk[0] {
+
+			if i+len(chunk) > len(rawText) {
+				break
+			}
+
 			if reflect.DeepEqual(rawText[i:i+len(chunk)], chunk) {
 				startPosition = i
 				endPosition = len(chunk) + i - 1
@@ -166,6 +195,11 @@ func (MarkdownPositionCalculator) getChunkPositions(rawText, chunk []rune, start
 	for i := startScanPosition; i < len(rawText); i++ {
 
 		if rawText[i] == chunk[skipHeaderIndex] {
+
+			if i+len(chunk)-skipHeaderIndex > len(rawText) {
+				break
+			}
+
 			if reflect.DeepEqual(rawText[i:(i+len(chunk)-skipHeaderIndex)], chunk[skipHeaderIndex:]) {
 				startPosition = i
 				endPosition = len(chunk) + i - 1 - skipHeaderIndex
@@ -178,14 +212,16 @@ func (MarkdownPositionCalculator) getChunkPositions(rawText, chunk []rune, start
 
 func getSkipHeaderIndex(chunk []rune) int {
 	hashtagCount := 0
+	skipPosition := 0
 	for i := 0; i < len(chunk); i++ {
 		if chunk[i] == '#' {
 			hashtagCount++
 		}
 
 		if hashtagCount >= 1 && chunk[i] == '\n' {
-			return i + 1
+			skipPosition = i + 1
+			hashtagCount = 0
 		}
 	}
-	return 0
+	return skipPosition
 }
diff --git a/operator/text/v0/chunk_text_test.go b/operator/text/v0/chunk_text_test.go
@@ -55,12 +55,17 @@ func TestChunkText(t *testing.T) {
 			output: ChunkTextOutput{
 				TextChunks: []TextChunk{
 					{
-						Text:          "Hello world.",
+						Text:          "Hello",
 						StartPosition: 0,
+						EndPosition:   4,
+					},
+					{
+						Text:          "world.",
+						StartPosition: 6,
 						EndPosition:   11,
 					},
 				},
-				ChunkNum:   1,
+				ChunkNum:   2,
 				TokenCount: 3,
 			},
 		},