
Commit

feat(crawler): add functionality to fetch and index LLMS files from WebDocuments (#3880)

* feat(crawler): add functionality to fetch and index LLMS files from a given URL

* fix(webcrawler): update logging for document fetching and streamline indexing process

* fix(webcrawler): add logging for crawled llms.txt documents

* feat(crawler): enhance crawler_llms to split llms-full.txt into multiple documents

* refactor(crawler): move split_llms_content to llms_txt_parser module

* [autofix.ci] apply automated fixes

* test(crawler): add integration tests for crawler_llms with Cloudflare and Perplexity sources

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Sma1lboy and autofix-ci[bot] authored Feb 23, 2025
1 parent 86cd7bd commit 0c882b7
Showing 5 changed files with 281 additions and 3 deletions.
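
To situate the diff below: the commit adds a fast path for sites that publish an llms-full.txt file. The crawler fetches that single file, splits it into one document per "# " heading, and only falls back to the page-by-page crawl when the file is missing. Here is a minimal sketch of that flow, assuming the tabby_crawler public API exactly as it appears in the diff below; fetch_docs and the println! calls are illustrative placeholders, not part of the commit.

use futures::StreamExt;
use tabby_crawler::{crawl_pipeline, crawler_llms};

// Sketch only: the real job in web_crawler.rs indexes each document instead of printing.
async fn fetch_docs(url: &str) -> anyhow::Result<()> {
    match crawler_llms(url).await {
        // Fast path: one HTTP GET for llms-full.txt, split into per-heading documents.
        Ok(docs) => {
            for doc in docs {
                println!("llms-full.txt section: {}", doc.url);
            }
        }
        // Fallback: the existing page-by-page crawl when llms-full.txt is unavailable.
        Err(_) => {
            let mut pipeline = Box::pin(crawl_pipeline(url, url).await?);
            while let Some(doc) = pipeline.next().await {
                println!("crawled page: {}", doc.url);
            }
        }
    }
    Ok(())
}
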
1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions crates/tabby-crawler/Cargo.toml
@@ -18,6 +18,7 @@ serde_json.workspace = true
logkit.workspace = true
htmd = "0.1"
regex.workspace = true
reqwest.workspace = true

[dev-dependencies]
tracing-test.workspace = true
56 changes: 56 additions & 0 deletions crates/tabby-crawler/src/lib.rs
@@ -1,3 +1,4 @@
mod llms_txt_parser;
mod types;

use std::process::Stdio;
@@ -148,6 +149,31 @@ pub async fn crawl_pipeline(
.filter_map(move |data| async move { to_document(data) }))
}

/// Attempts to fetch `llms-full.txt` from the given base URL,
/// then splits its markdown content into multiple sections based on H1 headings.
/// Each section becomes a separate `CrawledDocument`.
/// Returns a vector of `CrawledDocument`s if successful.
pub async fn crawler_llms(start_url: &str) -> anyhow::Result<Vec<CrawledDocument>> {
// Remove trailing slash from the base URL if present.
let base_url = start_url.trim_end_matches('/');

let llms_full_url = format!("{}/llms-full.txt", base_url);
let resp = reqwest::get(&llms_full_url).await?;
if !resp.status().is_success() {
anyhow::bail!("Unable to fetch llms-full.txt from {}", base_url);
}
let body = resp.text().await?;
debug!("Successfully fetched llms-full.txt: {}", llms_full_url);

// Split the fetched markdown content into sections.
let docs = llms_txt_parser::split_llms_content(&body, start_url);
if docs.is_empty() {
anyhow::bail!("No sections found in llms-full.txt from {}", base_url);
}

Ok(docs)
}

#[cfg(test)]
mod tests {

@@ -182,4 +208,34 @@ mod tests {
assert_eq!(doc.url, "https://example.com");
assert_eq!(doc.markdown, "Hello, World!");
}

#[tokio::test]
#[traced_test]
async fn test_crawler_llms_success_developers_cloudflare_with_url() {
let base_url = "https://developers.cloudflare.com";
let result = crawler_llms(base_url).await;
assert!(result.is_ok(), "Expected success from {}", base_url);
let docs = result.unwrap();
assert!(
!docs.is_empty(),
"Expected at least one section from llms-full.txt at {}",
base_url
);
println!("Fetched {} documents from {}", docs.len(), base_url);
}

#[tokio::test]
#[traced_test]
async fn test_crawler_llms_success_docs_perplexity_with_source() {
let base_url = "https://docs.perplexity.ai";
let result = crawler_llms(base_url).await;
assert!(result.is_ok(), "Expected success from {}", base_url);
let docs = result.unwrap();
assert!(
!docs.is_empty(),
"Expected at least one section from llms-full.txt at {}",
base_url
);
println!("Fetched {} documents from {}", docs.len(), base_url);
}
}
177 changes: 177 additions & 0 deletions crates/tabby-crawler/src/llms_txt_parser.rs
@@ -0,0 +1,177 @@
use crate::types::{CrawledDocument, CrawledMetadata};

pub fn split_llms_content(content: &str, base_url: &str) -> Vec<CrawledDocument> {
let mut docs = Vec::new();
let mut current_title: Option<String> = None;
let mut current_url: Option<String> = None;
let mut current_body = String::new();

// Process the content line by line.
for line in content.lines() {
// Check if the line starts with a heading-1 marker.
if line.starts_with("# ") {
// If we already have a section in progress, finalize it.
if let Some(title) = current_title.take() {
// Use the URL from the section if available; otherwise, fallback to base_url.
let url = current_url.take().unwrap_or_else(|| base_url.to_owned());
let metadata = CrawledMetadata {
title: title.into(),
description: url.clone().into(),
};
docs.push(CrawledDocument::new(
url,
current_body.trim().to_owned(),
metadata,
));
current_body = String::new();
}
current_title = Some(line[2..].trim().to_owned());
current_url = None;
} else if line.starts_with("URL:") || line.starts_with("Source:") {
let prefix_len = if line.starts_with("URL:") { 4 } else { 7 };
let url_str = line[prefix_len..].trim();
current_url = Some(url_str.to_owned());
} else {
current_body.push_str(line);
current_body.push('\n');
}
}

// Finalize the last section if any.
if let Some(title) = current_title {
let url = current_url.unwrap_or_else(|| base_url.to_owned());
let metadata = CrawledMetadata {
title: title.into(),
description: url.clone().into(),
};
docs.push(CrawledDocument::new(
url,
current_body.trim().to_owned(),
metadata,
));
}

docs
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_split_llms_content_with_url() {
// Test a section that provides a URL.
let content = "\
# Test Title with URL
URL: https://developers.cloudflare.com
This is a test body.
More text on the same section.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 1, "Should produce one document");

let doc = &docs[0];
// The title is taken from the heading.
assert_eq!(doc.metadata.title, Some("Test Title with URL".to_string()));
// The URL should be extracted from the URL: line.
assert_eq!(doc.url, "https://developers.cloudflare.com");
// The body should contain the text after the URL line.
assert_eq!(
doc.markdown,
"This is a test body.\nMore text on the same section."
);
}

#[test]
fn test_split_llms_content_with_source() {
// Test a section that provides a Source.
let content = "\
# Test Title with Source
Source: https://docs.perplexity.ai
This is another test body.
Line two of body.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 1, "Should produce one document");

let doc = &docs[0];
assert_eq!(
doc.metadata.title,
Some("Test Title with Source".to_string())
);
// The URL should be extracted from the Source: line.
assert_eq!(doc.url, "https://docs.perplexity.ai");
assert_eq!(
doc.markdown,
"This is another test body.\nLine two of body."
);
}

#[test]
fn test_split_llms_content_without_metadata() {
// Test a section with no URL or Source line; should fallback to base_url.
let content = "\
# Test Title without URL or Source
This is test body with no explicit URL.
Additional content line.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 1, "Should produce one document");

let doc = &docs[0];
assert_eq!(
doc.metadata.title,
Some("Test Title without URL or Source".to_string())
);
// Fallback to the provided base_url.
assert_eq!(doc.url, "example.com");
assert_eq!(
doc.markdown,
"This is test body with no explicit URL.\nAdditional content line."
);
}

#[test]
fn test_split_llms_content_multiple_sections() {
// Test multiple sections with mixed metadata.
let content = "\
# Section One
URL: https://developers.cloudflare.com
Content for section one.
# Section Two
Source: https://docs.perplexity.ai
Content for section two.
# Section Three
Content for section three with no metadata.
";
let base_url = "example.com";
let docs = split_llms_content(content, base_url);
assert_eq!(docs.len(), 3, "Should produce three documents");

// Section One.
let doc1 = &docs[0];
assert_eq!(doc1.metadata.title, Some("Section One".to_string()));
assert_eq!(doc1.url, "https://developers.cloudflare.com");
assert!(doc1.markdown.contains("Content for section one."));

// Section Two.
let doc2 = &docs[1];
assert_eq!(doc2.metadata.title, Some("Section Two".to_string()));
assert_eq!(doc2.url, "https://docs.perplexity.ai");
assert!(doc2.markdown.contains("Content for section two."));

// Section Three.
let doc3 = &docs[2];
assert_eq!(doc3.metadata.title, Some("Section Three".to_string()));
// Since no URL/Source is provided, fallback to base_url.
assert_eq!(doc3.url, "example.com");
assert!(doc3
.markdown
.contains("Content for section three with no metadata."));
}
}
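
For orientation, an illustrative llms-full.txt layout that split_llms_content is written to handle (headings and URLs here are hypothetical): each line starting with "# " opens a new section, an optional URL: or Source: line supplies that section's link, and the remaining lines form the section body; sections without a URL/Source line fall back to the crawl's base URL.

# Getting Started
URL: https://example.com/docs/getting-started
Introductory content for the first section.

# API Reference
Source: https://example.com/docs/api
Content for the second section.

# Changelog
Content for a section with no explicit link; its document URL becomes the base URL.
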
49 changes: 46 additions & 3 deletions ee/tabby-webserver/src/service/background_job/web_crawler.rs
@@ -3,7 +3,7 @@ use std::{sync::Arc, time::Duration};
use chrono::Utc;
use futures::StreamExt;
use serde::{Deserialize, Serialize};
use tabby_crawler::crawl_pipeline;
use tabby_crawler::{crawl_pipeline, crawler_llms};
use tabby_index::public::{
StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocState,
StructuredDocWebFields,
@@ -37,9 +37,52 @@ impl WebCrawlerJob {
pub async fn run_impl(self, embedding: Arc<dyn Embedding>) -> tabby_schema::Result<()> {
logkit::info!("Starting doc index pipeline for {}", self.url);
let embedding = embedding.clone();
let mut num_docs = 0;
let indexer = StructuredDocIndexer::new(embedding.clone());
let mut num_docs = 0;

// attempt to fetch the LLMS file using crawler_llms.
match crawler_llms(&self.url).await {
Ok(docs) => {
logkit::info!(
"Fetched and split llms-full.txt successfully. Indexing {} sections.",
docs.len()
);
// Index each section separately.
for doc in docs {
let source_doc = StructuredDoc {
source_id: self.source_id.clone(),
fields: StructuredDocFields::Web(StructuredDocWebFields {
title: doc.metadata.title.unwrap_or_default(),
link: doc.url,
body: doc.markdown,
}),
};

if indexer
.presync(&StructuredDocState {
id: source_doc.id().to_string(),
updated_at: Utc::now(),
deleted: false,
})
.await
{
indexer.sync(source_doc).await;
num_docs += 1;
}
}
indexer.commit();
logkit::info!("Indexed {} documents from '{}'", num_docs, self.url);
return Ok(());
}
Err(err) => {
logkit::info!(
"No LLMS file found, continuing with normal indexing. Error: {:?}",
err
);
}
}

// if no LLMS file was found, use the regular crawl_pipeline.
let url_prefix = self.url_prefix.as_ref().unwrap_or(&self.url);
let mut pipeline = Box::pin(crawl_pipeline(&self.url, url_prefix).await?);
while let Some(doc) = pipeline.next().await {
@@ -52,7 +95,6 @@ impl WebCrawlerJob {
body: doc.markdown,
}),
};

num_docs += 1;

if indexer
@@ -70,6 +112,7 @@
indexer.commit();
Ok(())
}

pub async fn run(self, embedding: Arc<dyn Embedding>) -> tabby_schema::Result<()> {
let url = self.url.clone();
if tokio::time::timeout(
