diff --git a/operator/text/v0/markdown_document.go b/operator/text/v0/markdown_document.go index aac9eb7b7..f955bb17e 100644 --- a/operator/text/v0/markdown_document.go +++ b/operator/text/v0/markdown_document.go @@ -128,14 +128,19 @@ func buildDocument(rawRunes []rune, previousDocument *MarkdownDocument, startPos currentContent.Type = "plaintext" currentContent.BlockStartPosition = currentPosition currentContent.BlockEndPosition = currentPosition - + meetHeaderTimes := 0 for currentPosition < endPositionOfBlock { line := readLine(rawRunes, &currentPosition) currentContent.BlockEndPosition += sizeOfString(line) + 1 if isHeader(line) { - header := parseHeader(line) + meetHeaderTimes++ + + if meetHeaderTimes > 0 && len(paragraph) > 0 { + currentContent.PlainText = paragraph + doc.Contents = append(doc.Contents, currentContent) + } if endOfDocument(doc) { currentPosition -= sizeOfString(line) + 1 currentContent.PlainText = paragraph @@ -145,6 +150,7 @@ func buildDocument(rawRunes []rune, previousDocument *MarkdownDocument, startPos end = true break } + header := parseHeader(line) currentHeaderLevel = header.Level headers[header.Level-1] = &header } else { diff --git a/operator/text/v0/markdown_splitter.go b/operator/text/v0/markdown_splitter.go index 8b8b2fbba..21b755b20 100644 --- a/operator/text/v0/markdown_splitter.go +++ b/operator/text/v0/markdown_splitter.go @@ -5,6 +5,8 @@ import ( "reflect" "strings" + "log" + "github.com/tmc/langchaingo/textsplitter" ) @@ -432,6 +434,10 @@ func (sp MarkdownTextSplitter) chunkPlainText(content Content, headers []Header) if shouldScanRawTextFromPreviousChunk(startPosition, endPosition) { previousChunkIndex := len(contentChunks) - 1 + if previousChunkIndex < 0 { + log.Println("There may be missing chunks in the content because of parsing errors in the markdown_document") + continue + } previousChunk := contentChunks[previousChunkIndex] startPosition, endPosition = getChunkPositions(rawRunes, chunkRunes, 
previousChunk.ContentStartPosition+1) }