Skip to content

Commit

Permalink
feat(query): Documents as first class citizens (#504)
Browse files Browse the repository at this point in the history
For simple RAG, just adding the content of a retrieved document might be
enough. However, in more complex use cases, you might want to add
metadata as well, as is or for conditional formatting.

For instance, when dealing with large amounts of chunked code, providing
the path goes a long way. If generated metadata is good enough, could be
useful as well.

With this retrieved Documents are treated as first class citizens,
including any metadata as well. Additionally, this also paves the way
for multi retrieval (and multi modal).
  • Loading branch information
timonv authored Dec 31, 2024
1 parent 2831101 commit 235780b
Show file tree
Hide file tree
Showing 15 changed files with 491 additions and 189 deletions.
168 changes: 168 additions & 0 deletions swiftide-core/src/document.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
//! Documents are the main data structure that is retrieved via the query pipeline
//!
//! Retrievers are expected to eagerly set any configured metadata on the document, with the same
//! field name used during indexing if applicable.
use std::fmt;

use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use crate::{metadata::Metadata, util::debug_long_utf8};

/// A document represents a single unit of retrieved text
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Builder)]
#[builder(setter(into))]
pub struct Document {
#[builder(default)]
metadata: Metadata,
content: String,
}

impl From<Document> for serde_json::Value {
fn from(document: Document) -> Self {
serde_json::json!({
"metadata": document.metadata,
"content": document.content,
})
}
}

impl From<&Document> for serde_json::Value {
fn from(document: &Document) -> Self {
serde_json::json!({
"metadata": document.metadata,
"content": document.content,
})
}
}

impl PartialOrd for Document {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.content.cmp(&other.content))
}
}

impl Ord for Document {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.content.cmp(&other.content)
}
}

impl fmt::Debug for Document {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Document")
.field("metadata", &self.metadata)
.field("content", &debug_long_utf8(&self.content, 100))
.finish()
}
}

impl<T: AsRef<str>> From<T> for Document {
fn from(value: T) -> Self {
Document::new(value.as_ref(), None)
}
}

impl Document {
pub fn new(content: impl Into<String>, metadata: Option<Metadata>) -> Self {
Self {
metadata: metadata.unwrap_or_default(),
content: content.into(),
}
}

pub fn builder() -> DocumentBuilder {
DocumentBuilder::default()
}

pub fn content(&self) -> &str {
&self.content
}

pub fn metadata(&self) -> &Metadata {
&self.metadata
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::metadata::Metadata;

#[test]
fn test_document_creation() {
let content = "Test content";
let metadata = Metadata::from([("some", "metadata")]);
let document = Document::new(content, Some(metadata.clone()));

assert_eq!(document.content(), content);
assert_eq!(document.metadata(), &metadata);
}

#[test]
fn test_document_default_metadata() {
let content = "Test content";
let document = Document::new(content, None);

assert_eq!(document.content(), content);
assert_eq!(document.metadata(), &Metadata::default());
}

#[test]
fn test_document_from_str() {
let content = "Test content";
let document: Document = content.into();

assert_eq!(document.content(), content);
assert_eq!(document.metadata(), &Metadata::default());
}

#[test]
fn test_document_partial_ord() {
let doc1 = Document::new("A", None);
let doc2 = Document::new("B", None);

assert!(doc1 < doc2);
}

#[test]
fn test_document_ord() {
let doc1 = Document::new("A", None);
let doc2 = Document::new("B", None);

assert!(doc1.cmp(&doc2) == std::cmp::Ordering::Less);
}

#[test]
fn test_document_debug() {
let content = "Test content";
let document = Document::new(content, None);
let debug_str = format!("{document:?}");

assert!(debug_str.contains("Document"));
assert!(debug_str.contains("metadata"));
assert!(debug_str.contains("content"));
}

#[test]
fn test_document_to_json() {
let content = "Test content";
let metadata = Metadata::from([("some", "metadata")]);
let document = Document::new(content, Some(metadata.clone()));
let json_value: serde_json::Value = document.into();

assert_eq!(json_value["content"], content);
assert_eq!(json_value["metadata"], serde_json::json!(metadata));
}

#[test]
fn test_document_ref_to_json() {
let content = "Test content";
let metadata = Metadata::from([("some", "metadata")]);
let document = Document::new(content, Some(metadata.clone()));
let json_value: serde_json::Value = (&document).into();

assert_eq!(json_value["content"], content);
assert_eq!(json_value["metadata"], serde_json::json!(metadata));
}
}
1 change: 1 addition & 0 deletions swiftide-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub mod query_traits;
mod search_strategies;
pub mod type_aliases;

pub mod document;
pub mod prompt;
pub use type_aliases::*;

Expand Down
6 changes: 5 additions & 1 deletion swiftide-core/src/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use serde::Deserializer;

use crate::util::debug_long_utf8;

#[derive(Clone, Default, PartialEq)]
#[derive(Clone, Default, PartialEq, Eq)]
pub struct Metadata {
inner: BTreeMap<String, serde_json::Value>,
}
Expand Down Expand Up @@ -53,6 +53,10 @@ impl Metadata {
pub fn into_values(self) -> IntoValues<String, serde_json::Value> {
self.inner.into_values()
}

pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
}

impl<K, V> Extend<(K, V)> for Metadata
Expand Down
Loading

0 comments on commit 235780b

Please sign in to comment.