Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

Commit

Permalink
fix(text): fix bug and replace markdown chunking (#221)
Browse files Browse the repository at this point in the history
Because

- a slice index can go out of range while scanning the raw text for
certain chunks
- the markdown chunking provided by LangChainGo cannot be controlled

This commit

- Rescan the raw text so that chunk scanning never goes past the end of
the slice
- Replace the markdown chunking with the custom logic required for the RAG
system

Please see the Markdown Chunking Logic document in Linear for reference.
  • Loading branch information
chuang8511 authored Jul 18, 2024
1 parent ef43beb commit 298c91a
Show file tree
Hide file tree
Showing 3 changed files with 441 additions and 9 deletions.
50 changes: 43 additions & 7 deletions operator/text/v0/chunk_text.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"fmt"
"reflect"

"github.com/pkoukk/tiktoken-go"
tiktoken "github.com/pkoukk/tiktoken-go"
"github.com/tmc/langchaingo/textsplitter"
)

Expand Down Expand Up @@ -65,8 +65,12 @@ func (s *Setting) SetDefault() {
}
}

// TextSplitter is the splitting contract chunkText relies on: anything that
// can turn one text into a list of strings or fail with an error.
// NOTE(review): presumably declared locally so that both the langchaingo
// splitters and the value returned by NewMarkdownTextSplitter can be assigned
// to the same variable in chunkText — confirm against their definitions.
type TextSplitter interface {
// SplitText splits text into pieces and returns them, or an error on failure.
SplitText(text string) ([]string, error)
}

func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
var split textsplitter.TextSplitter
var split TextSplitter
setting := input.Strategy.Setting
// TODO: Take this out when we fix the error in frontend side.
// Bug: The default value is not set from frontend side.
Expand All @@ -92,10 +96,9 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
)
case "Markdown":
positionCalculator = MarkdownPositionCalculator{}
split = textsplitter.NewMarkdownTextSplitter(
split = NewMarkdownTextSplitter(
textsplitter.WithChunkSize(setting.ChunkSize),
textsplitter.WithChunkOverlap(setting.ChunkOverlap),
textsplitter.WithCodeBlocks(setting.CodeBlocks),
)
case "Recursive":
positionCalculator = PositionCalculator{}
Expand All @@ -122,10 +125,20 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {

startScanPosition := 0
rawRunes := []rune(input.Text)
for _, chunk := range chunks {
for i, chunk := range chunks {
chunkRunes := []rune(chunk)

startPosition, endPosition := positionCalculator.getChunkPositions(rawRunes, chunkRunes, startScanPosition)

if shouldScanRawTextFromPreviousChunk(startPosition, endPosition) {
previousChunk := output.TextChunks[i-1]
startPosition, endPosition = positionCalculator.getChunkPositions(rawRunes, chunkRunes, previousChunk.StartPosition)
}

if startPosition == endPosition {
continue
}

output.TextChunks = append(output.TextChunks, TextChunk{
Text: chunk,
StartPosition: startPosition,
Expand All @@ -134,9 +147,20 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
startScanPosition = startPosition + 1
}

if len(output.TextChunks) == 0 {
output.TextChunks = append(output.TextChunks, TextChunk{
Text: input.Text,
StartPosition: 0,
EndPosition: len(rawRunes) - 1,
})
}
return output, nil
}

// shouldScanRawTextFromPreviousChunk reports whether both positions are zero.
// chunkText uses a true result as the signal to search for the chunk again,
// this time starting from the previous chunk's StartPosition.
// NOTE(review): a (0, 0) pair presumably means the position lookup found no
// match, but a one-rune chunk located at index 0 would produce the same pair —
// confirm that callers can tolerate the ambiguity.
func shouldScanRawTextFromPreviousChunk(startPosition, endPosition int) bool {
	if startPosition != 0 {
		return false
	}
	return endPosition == 0
}

// ChunkPositionCalculator locates a chunk inside the raw text. Both inputs are
// rune slices, so the returned positions are rune indices, not byte offsets.
type ChunkPositionCalculator interface {
// getChunkPositions searches rawText for chunk, beginning at index
// startScanPosition, and returns where the match starts and ends.
// NOTE(review): the implementations visible in this file set endPosition to
// the index of the last matched rune (inclusive), and both results look like
// they stay 0 when nothing matches — confirm against the full source.
getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int)
}
Expand All @@ -147,6 +171,11 @@ func (PositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosi

for i := startScanPosition; i < len(rawText); i++ {
if rawText[i] == chunk[0] {

if i+len(chunk) > len(rawText) {
break
}

if reflect.DeepEqual(rawText[i:i+len(chunk)], chunk) {
startPosition = i
endPosition = len(chunk) + i - 1
Expand All @@ -166,6 +195,11 @@ func (MarkdownPositionCalculator) getChunkPositions(rawText, chunk []rune, start
for i := startScanPosition; i < len(rawText); i++ {

if rawText[i] == chunk[skipHeaderIndex] {

if i+len(chunk)-skipHeaderIndex > len(rawText) {
break
}

if reflect.DeepEqual(rawText[i:(i+len(chunk)-skipHeaderIndex)], chunk[skipHeaderIndex:]) {
startPosition = i
endPosition = len(chunk) + i - 1 - skipHeaderIndex
Expand All @@ -178,14 +212,16 @@ func (MarkdownPositionCalculator) getChunkPositions(rawText, chunk []rune, start

// getSkipHeaderIndex scans chunk rune by rune, counting '#' characters, and
// reports an index just past a newline that follows at least one '#'.
// MarkdownPositionCalculator.getChunkPositions presumably uses the result as
// the offset into chunk from which it starts matching against the raw text.
//
// NOTE(review): this listing is a diff hunk shown without +/- markers, so
// statements from before and after the change are interleaved. The hunk
// header (14 old lines, 16 new lines) is consistent with the two early
// return statements being the removed lines and the skipPosition statements
// being the added ones — confirm against the committed file.
func getSkipHeaderIndex(chunk []rune) int {
hashtagCount := 0
// Index just past the most recent qualifying newline; 0 if none was seen.
skipPosition := 0
for i := 0; i < len(chunk); i++ {
if chunk[i] == '#' {
hashtagCount++
}

// A newline reached after one or more '#' is what this function treats as
// the end of a header.
if hashtagCount >= 1 && chunk[i] == '\n' {
// Presumably the pre-change behaviour: stop at the first such newline.
return i + 1
// Presumably the post-change behaviour: remember this position, reset the
// count, and keep scanning so that the last such newline wins.
skipPosition = i + 1
hashtagCount = 0
}
}
// Presumably the pre-change fallback when no header newline was found.
return 0
// Presumably the post-change result.
return skipPosition
}
9 changes: 7 additions & 2 deletions operator/text/v0/chunk_text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,17 @@ func TestChunkText(t *testing.T) {
output: ChunkTextOutput{
TextChunks: []TextChunk{
{
Text: "Hello world.",
Text: "Hello",
StartPosition: 0,
EndPosition: 4,
},
{
Text: "world.",
StartPosition: 6,
EndPosition: 11,
},
},
ChunkNum: 1,
ChunkNum: 2,
TokenCount: 3,
},
},
Expand Down
Loading

0 comments on commit 298c91a

Please sign in to comment.