
Commit

feat(crawler): add functionality to fetch and index LLMS files from WebDocuments (#3880)

* feat(crawler): add functionality to fetch and index LLMS files from a given URL

* fix(webcrawler): update logging for document fetching and streamline indexing process

* fix(webcrawler): add logging for crawled llms.txt documents

* feat(crawler): enhance crawler_llms to split llms-full.txt into multiple documents

* refactor(crawler): move split_llms_content to llms_txt_parser module

* [autofix.ci] apply automated fixes

* test(crawler): add integration tests for crawler_llms with Cloudflare and Perplexity sources

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Sma1lboy and autofix-ci[bot] authored Feb 23, 2025
1 parent 86cd7bd commit 0c882b7
Showing 5 changed files with 281 additions and 3 deletions.
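
To situate the diff below: the commit adds a fast path for sites that publish an llms-full.txt file. The crawler fetches that single file, splits it into one document per "# " heading, and only falls back to the page-by-page crawl when the file is missing. Here is a minimal sketch of that flow, assuming the tabby_crawler public API exactly as it appears in the diff below; fetch_docs and the println! calls are illustrative placeholders, not part of the commit.

use futures::StreamExt;
use tabby_crawler::{crawl_pipeline, crawler_llms};

// Sketch only: the real job in web_crawler.rs indexes each document instead of printing.
async fn fetch_docs(url: &str) -> anyhow::Result<()> {
    match crawler_llms(url).await {
        // Fast path: one HTTP GET for llms-full.txt, split into per-heading documents.
        Ok(docs) => {
            for doc in docs {
                println!("llms-full.txt section: {}", doc.url);
            }
        }
        // Fallback: the existing page-by-page crawl when llms-full.txt is unavailable.
        Err(_) => {
            let mut pipeline = Box::pin(crawl_pipeline(url, url).await?);
            while let Some(doc) = pipeline.next().await {
                println!("crawled page: {}", doc.url);
            }
        }
    }
    Ok(())
}
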
1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions crates/tabby-crawler/Cargo.toml
@@ -18,6 +18,7 @@ serde_json.workspace = true
logkit.workspace = true
htmd = "0.1"
regex.workspace = true
reqwest.workspace = true

[dev-dependencies]
tracing-test.workspace = true
56 changes: 56 additions & 0 deletions crates/tabby-crawler/src/lib.rs
@@ -1,3 +1,4 @@
mod llms_txt_parser;
mod types;

use std::process::Stdio;
@@ -148,6 +149,31 @@ pub async fn crawl_pipeline(
.filter_map(move |data| async move { to_document(data) }))
}

/// Attempts to fetch `llms-full.txt` from the given base URL,
/// then splits its markdown content into multiple sections based on H1 headings.
/// Each section becomes a separate `CrawledDocument`.
/// Returns a vector of `CrawledDocument`s if successful.
pub async fn crawler_llms(start_url: &str) -> anyhow::Result<Vec<CrawledDocument>> {
// Remove trailing slash from the base URL if present.
let base_url = start_url.trim_end_matches('/');

let llms_full_url = format!("{}/llms-full.txt", base_url);
let resp = reqwest::get(&llms_full_url).await?;
if !resp.status().is_success() {
anyhow::bail!("Unable to fetch llms-full.txt from {}", base_url);
}
let body = resp.text().await?;
debug!("Successfully fetched llms-full.txt: {}", llms_full_url);

// Split the fetched markdown content into sections.
let docs = llms_txt_parser::split_llms_content(&body, start_url);
if docs.is_empty() {
anyhow::bail!("No sections found in llms-full.txt from {}", base_url);
}

Ok(docs)
}

#[cfg(test)]
mod tests {

@@ -182,4 +208,34 @@ mod tests {
assert_eq!(doc.url, "https://example.com");
assert_eq!(doc.markdown, "Hello, World!");
}

#[tokio::test]
#[traced_test]
async fn test_crawler_llms_success_developers_cloudflare_with_url() {
let base_url = "https://developers.cloudflare.com";
let result = crawler_llms(base_url).await;
assert!(result.is_ok(), "Expected success from {}", base_url);
let docs = result.unwrap();
assert!(
!docs.is_empty(),
"Expected at least one section from llms-full.txt at {}",
base_url
);
println!("Fetched {} documents from {}", docs.len(), base_url);
}

#[tokio::test]
#[traced_test]
async fn test_crawler_llms_success_docs_perplexity_with_source() {
let base_url = "https://docs.perplexity.ai";
let result = crawler_llms(base_url).await;
assert!(result.is_ok(), "Expected success from {}", base_url);
let docs = result.unwrap();
assert!(
!docs.is_empty(),
"Expected at least one section from llms-full.txt at {}",
base_url
);
println!("Fetched {} documents from {}", docs.len(), base_url);
}
}
177 changes: 177 additions & 0 deletions crates/tabby-crawler/src/llms_txt_parser.rs
@@ -0,0 +1,177 @@
use crate::types::{CrawledDocument, CrawledMetadata};

pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument> {
let mut docs = Vec::new();
let mut current_title: Option<String> = None;
let mut current_url: Option<String> = None;
let mut current_body = String::new();

// Process the content line by line.
for line in content.lines() {
// Check if the line starts with a heading-1 marker.
if line.starts_with("# ") {
// If we already have a section in progress, finalize it.
if let Some(title) = current_title.take() {
// Use the URL from the section if available; otherwise, fallback to base_url.
let url = current_url.take().unwrap_or_else(|| base_url.to_owned());
let metadata = CrawledMetadata {
title: title.into(),
description: url.clone().into(),
};
docs.push(CrawledDocument::new(
url,
current_body.trim().to_owned(),
metadata,
));
current_body = String::new();
}
current_title = Some(line[2..].trim().to_owned());
current_url = None;
} else if line.starts_with("URL:") || line.starts_with("Source:") {
let prefix_len = if line.starts_with("URL:") { 4 } else { 7 };
let url_str = line[prefix_len..].trim();
current_url = Some(url_str.to_owned());
} else {
current_body.push_str(line);
current_body.push('\n');
}
}

// Finalize the last section if any.
if let Some(title) = current_title {
let url = current_url.unwrap_or_else(|| base_url.to_owned());
let metadata = CrawledMetadata {
title: title.into(),
description: url.clone().into(),
};
docs.push(CrawledDocument::new(
url,
current_body.trim().to_owned(),
metadata,
));
}

docs
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_split_llms_content_with_url() {
// Test a section that provides a URL.
let content = "\
# Test Title with URL
URL: https://developers.cloudflare.com
This is a test body.
More text on the same section.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 1, "Should produce one document");

let doc = &docs[0];
// The title is taken from the heading.
assert_eq!(doc.metadata.title, Some("Test Title with URL".to_string()));
// The URL should be extracted from the URL: line.
assert_eq!(doc.url, "https://developers.cloudflare.com");
// The body should contain the text after the URL line.
assert_eq!(
doc.markdown,
"This is a test body.\nMore text on the same section."
);
}

#[test]
fn test_split_llms_content_with_source() {
// Test a section that provides a Source.
let content = "\
# Test Title with Source
Source: https://docs.perplexity.ai
This is another test body.
Line two of body.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 1, "Should produce one document");

let doc = &docs[0];
assert_eq!(
doc.metadata.title,
Some("Test Title with Source".to_string())
);
// The URL should be extracted from the Source: line.
assert_eq!(doc.url, "https://docs.perplexity.ai");
assert_eq!(
doc.markdown,
"This is another test body.\nLine two of body."
);
}

#[test]
fn test_split_llms_content_without_metadata() {
// Test a section with no URL or Source line; should fallback to base_url.
let content = "\
# Test Title without URL or Source
This is test body with no explicit URL.
Additional content line.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 1, "Should produce one document");

let doc = &docs[0];
assert_eq!(
doc.metadata.title,
Some("Test Title without URL or Source".to_string())
);
// Fallback to the provided base_url.
assert_eq!(doc.url, "example.com");
assert_eq!(
doc.markdown,
"This is test body with no explicit URL.\nAdditional content line."
);
}

#[test]
fn test_split_llms_content_multiple_sections() {
// Test multiple sections with mixed metadata.
let content = "\
# Section One
URL: https://developers.cloudflare.com
Content for section one.
# Section Two
Source: https://docs.perplexity.ai
Content for section two.
# Section Three
Content for section three with no metadata.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 3, "Should produce three documents");

// Section One.
let doc1 = &docs[0];
assert_eq!(doc1.metadata.title, Some("Section One".to_string()));
assert_eq!(doc1.url, "https://developers.cloudflare.com");
assert!(doc1.markdown.contains("Content for section one."));

// Section Two.
let doc2 = &docs[1];
assert_eq!(doc2.metadata.title, Some("Section Two".to_string()));
assert_eq!(doc2.url, "https://docs.perplexity.ai");
assert!(doc2.markdown.contains("Content for section two."));

// Section Three.
let doc3 = &docs[2];
assert_eq!(doc3.metadata.title, Some("Section Three".to_string()));
// Since no URL/Source is provided, fallback to base_url.
assert_eq!(doc3.url, "example.com");
assert!(doc3
.markdown
.contains("Content for section three with no metadata."));
}
}
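
For orientation, an illustrative llms-full.txt layout that split_llms_content is written to handle (headings and URLs here are hypothetical): each line starting with "# " opens a new section, an optional URL: or Source: line supplies that section's link, and the remaining lines form the section body; sections without a URL/Source line fall back to the crawl's base URL.

# Getting Started
URL: https://example.com/docs/getting-started
Introductory content for the first section.

# API Reference
Source: https://example.com/docs/api
Content for the second section.

# Changelog
Content for a section with no explicit link; its document URL becomes the base URL.
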
49 changes: 46 additions & 3 deletions ee/tabby-webserver/src/service/background_job/web_crawler.rs
@@ -3,7 +3,7 @@ use std::{sync::Arc, time::Duration};
use chrono::Utc;
use futures::StreamExt;
use serde::{Deserialize, Serialize};
use tabby_crawler::crawl_pipeline;
use tabby_crawler::{crawl_pipeline, crawler_llms};
use tabby_index::public::{
StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocState,
StructuredDocWebFields,
@@ -37,9 +37,52 @@ impl WebCrawlerJob {
pub async fn run_impl(self, embedding: Arc<dyn Embedding>) -> tabby_schema::Result<()> {
logkit::info!("Starting doc index pipeline for {}", self.url);
let embedding = embedding.clone();
let mut num_docs = 0;
let indexer = StructuredDocIndexer::new(embedding.clone());
let mut num_docs = 0;

// attempt to fetch the LLMS file using crawler_llms.
match crawler_llms(&self.url).await {
Ok(docs) => {
logkit::info!(
"Fetched and split llms-full.txt successfully. Indexing {} sections.",
docs.len()
);
// Index each section separately.
for doc in docs {
let source_doc = StructuredDoc {
source_id: self.source_id.clone(),
fields: StructuredDocFields::Web(StructuredDocWebFields {
title: doc.metadata.title.unwrap_or_default(),
link: doc.url,
body: doc.markdown,
}),
};

if indexer
.presync(&StructuredDocState {
id: source_doc.id().to_string(),
updated_at: Utc::now(),
deleted: false,
})
.await
{
indexer.sync(source_doc).await;
num_docs += 1;
}
}
indexer.commit();
logkit::info!("Indexed {} documents from '{}'", num_docs, self.url);
return Ok(());
}
Err(err) => {
logkit::info!(
"No LLMS file found, continuing with normal indexing. Error: {:?}",
err
);
}
}

// if no LLMS file was found, use the regular crawl_pipeline.
let url_prefix = self.url_prefix.as_ref().unwrap_or(&self.url);
let mut pipeline = Box::pin(crawl_pipeline(&self.url, url_prefix).await?);
while let Some(doc) = pipeline.next().await {
@@ -52,7 +95,6 @@ impl WebCrawlerJob {
body: doc.markdown,
}),
};

num_docs += 1;

if indexer
@@ -70,6 +112,7 @@
indexer.commit();
Ok(())
}

pub async fn run(self, embedding: Arc<dyn Embedding>) -> tabby_schema::Result<()> {
let url = self.url.clone();
if tokio::time::timeout(
