Skip to content

Commit

Permalink
better chunking
Browse files: browse the repository at this point in the history
  • Loading branch information
mishig25 committed Jun 26, 2024
1 parent 0a863af commit 483cb8b
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions src/doc_builder/build_embeddings.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -86,23 +86,23 @@ def get_chunks(self, page_info, chunk_len_chars, prefix=[]):
split_content = self.split_markdown(self.content)
if not len(split_content):
return []
chunk = prefix_str
chunk_str = ""
for content in split_content:
if len(chunk) > chunk_len_chars:
# todo: add source dawg
if len(chunk_str) > chunk_len_chars and len(chunk_str) > len(prefix_str):
chunks.append(
Chunk(
text=chunk.strip(),
text=prefix_str.strip() + "\n\n" + chunk_str.strip(),
source=f"{page_info['page']}#{self.anchor}",
package_name=page_info["package_name"],
)
)
chunk = prefix_str
chunk += content + " "
if len(chunk) > len(prefix_str):
chunk_str = ""
chunk_str += content + " "

if len(chunk_str) > len(prefix_str):
chunks.append(
Chunk(
text=chunk.strip(),
text=prefix_str.strip() + "\n\n" + chunk_str.strip(),
source=f"{page_info['page']}#{self.anchor}",
package_name=page_info["package_name"],
)
Expand Down

0 comments on commit 483cb8b

Please sign in to comment.