diff --git a/docs/_sidebar.md b/docs/_sidebar.md index ee2c472a..27aaf9be 100644 --- a/docs/_sidebar.md +++ b/docs/_sidebar.md @@ -80,6 +80,7 @@ - [Text](/modules/retrieval/document_loaders/how_to/text.md) - [JSON](/modules/retrieval/document_loaders/how_to/json.md) - [Web page](/modules/retrieval/document_loaders/how_to/web.md) + - [Directory](/modules/retrieval/document_loaders/how_to/directory.md) - [Document transformers](/modules/retrieval/document_transformers/document_transformers.md) - Text splitters - [Split by character](/modules/retrieval/document_transformers/text_splitters/character_text_splitter.md) diff --git a/docs/modules/retrieval/document_loaders/how_to/directory.md b/docs/modules/retrieval/document_loaders/how_to/directory.md new file mode 100644 index 00000000..1ef96c20 --- /dev/null +++ b/docs/modules/retrieval/document_loaders/how_to/directory.md @@ -0,0 +1,214 @@ +# Directory + +Use `DirectoryLoader` to load `Document`s from multiple files in a directory with extensive customization options. + +## Overview + +The `DirectoryLoader` is a versatile document loader that allows you to load documents from a directory with powerful filtering, sampling, and customization capabilities. It supports multiple file types out of the box and provides extensive configuration options. + +## Basic Usage + +```dart +// Load all text files from a directory recursively +final loader = DirectoryLoader( + '/path/to/documents', + glob: '*.txt', + recursive: true, +); +final documents = await loader.load(); +``` + +## Constructor Parameters + +### `filePath` (required) +- Type: `String` +- Description: The path to the directory containing documents to load. + +### `glob` +- Type: `String` +- Default: `'*'` (all files) +- Description: A glob pattern to match files. Only files matching this pattern will be loaded. 
+- Examples: + ```dart + // Load only JSON and text files + DirectoryLoader('/path', glob: '*.{txt,json}') + + // Load files starting with 'report' + DirectoryLoader('/path', glob: 'report*') + ``` + +### `recursive` +- Type: `bool` +- Default: `true` +- Description: Whether to search recursively in subdirectories. + +### `exclude` +- Type: `List<String>` +- Default: `[]` +- Description: Glob patterns (matched against the file name) to exclude from loading. +- Example: + ```dart + DirectoryLoader( + '/path', + exclude: ['*.tmp', 'draft*'], + ) + ``` + +### `loaderMap` +- Type: `Map<String, BaseDocumentLoader Function(String)>` +- Default: `{}` (an empty map falls back to `DirectoryLoader.defaultLoaderMap`) +- Description: A map from file extension (including the dot) to the loader used for it. A non-empty map replaces the defaults entirely; extensions without an entry are loaded with `TextLoader`. +- Default Supported Types: + - `.txt`: TextLoader + - `.json`: JsonLoader (with root schema) + - `.csv` and `.tsv`: CsvLoader +- Example of extending loaders: + ```dart + final loader = DirectoryLoader( + '/path/to/docs', + loaderMap: { + // Start from the default loaders (later entries win on conflicts) + ...DirectoryLoader.defaultLoaderMap, + + // Add a custom loader for XML files + '.xml': (path) => CustomXmlLoader(path), + }, + ); + ``` + +### `loadHidden` +- Type: `bool` +- Default: `false` +- Description: Whether to load hidden files. +- Platform Specific: + - On Unix-like systems (Linux, macOS): Identifies hidden files by names starting with '.' + - On Windows: Files flagged with the "hidden" attribute are not detected; only names starting with '.' are treated as hidden + - Recommended to use platform-specific checks for comprehensive hidden file handling across different operating systems +- Example of platform-aware hidden file checking: + ```dart + import 'dart:io' show File, Platform; + import 'package:path/path.dart' as path; + + bool isHiddenFile(File file) { + if (Platform.isWindows) { + // dart:io does not expose the Windows "hidden" attribute (`modeString()` only reports permission bits); query it via FFI or the `attrib` command here. + return false; + } else { + // Unix-like systems + return path.basename(file.path).startsWith('.'); + } + } + ``` + +### `sampleSize` +- Type: `int` +- Default: `0` (load all files) +- Description: Maximum number of files to load. 
+- Example: + ```dart + // Load only 10 files + DirectoryLoader('/path', sampleSize: 10) + ``` + +### `randomizeSample` +- Type: `bool` +- Default: `false` +- Description: Whether to randomize the sample of files. Only takes effect when `sampleSize` is greater than 0. + +### `sampleSeed` +- Type: `int?` +- Default: `null` +- Description: Seed for random sampling to ensure reproducibility. When `null`, the current time is used as the seed. +- Example: + ```dart + // Consistent random sampling + DirectoryLoader( + '/path', + sampleSize: 10, + randomizeSample: true, + sampleSeed: 42, + ) + ``` + +### `metadataBuilder` +- Type: `Map<String, dynamic> Function(File file, Map<String, dynamic> defaultMetadata)?` +- Default: `null` +- Description: A custom function to build metadata for each document. +- Example: + ```dart + final loader = DirectoryLoader( + '/path', + metadataBuilder: (file, defaultMetadata) { + return { + ...defaultMetadata, + 'custom_tag': 'important_document', + 'processing_date': DateTime.now().toIso8601String(), + }; + }, + ); + ``` + +## Default Metadata + +By default, each document receives metadata including: +- `source`: Path of the file +- `name`: Filename +- `extension`: File extension (including the leading dot) +- `size`: File size in bytes +- `lastModified`: Last modification timestamp (milliseconds since epoch) + +## Lazy Loading + +The `DirectoryLoader` supports lazy loading through the `lazyLoad()` method, which returns a `Stream<Document>`. This is useful for processing large numbers of documents without loading everything into memory at once. 
+ +```dart +final loader = DirectoryLoader('/path/to/documents'); +await for (final document in loader.lazyLoad()) { + // Process each document as it's loaded + print(document.pageContent); +} +``` + +## Error Handling + +- Throws an `ArgumentError` if `glob` is empty, and a `FileSystemException` if the specified directory does not exist +- A `sampleSize` larger than the number of matching files is not an error: all matching files are loaded + +## Advanced Example + +```dart +final loader = DirectoryLoader( + '/path/to/documents', + glob: '*.{txt,json,csv}', // Multiple file types + recursive: true, // Search subdirectories + exclude: ['temp*', '*.backup'], // Exclude temp and backup files + loadHidden: false, // Ignore hidden files + sampleSize: 50, // Load only 50 files + randomizeSample: true, // Randomize the sample + sampleSeed: 123, // Reproducible random sampling + loaderMap: { + ...DirectoryLoader.defaultLoaderMap, // Keep the defaults, then override JSON + '.json': (path) => CustomJsonLoader(path), + }, + metadataBuilder: (file, defaultMetadata) { + // Add custom metadata + return { + ...defaultMetadata, + 'category': _categorizeFile(file), + }; + }, +); + +final documents = await loader.load(); +``` + +## Best Practices + +- Use `lazyLoad()` for large directories to manage memory efficiently +- Provide specific glob patterns to reduce unnecessary file processing +- Customize loaders for specialized file types +- Use `metadataBuilder` to add context-specific information to documents + +## Limitations + +- Relies on file system access +- Performance may vary with large directories \ No newline at end of file diff --git a/packages/langchain_community/lib/src/document_loaders/directory.dart b/packages/langchain_community/lib/src/document_loaders/directory.dart new file mode 100644 index 00000000..aa3c6e6b --- /dev/null +++ b/packages/langchain_community/lib/src/document_loaders/directory.dart @@ -0,0 +1,241 @@ +import 'dart:async'; +import 'dart:io'; +import 'dart:math'; +import 'package:glob/glob.dart'; +import 
'package:langchain_core/document_loaders.dart';
+import 'package:langchain_core/documents.dart';
+import 'package:path/path.dart' as path;
+
+import 'csv.dart';
+import 'json.dart';
+import 'text.dart';
+
+/// {@template directory_loader}
+/// A versatile document loader that loads [Document]s from a directory.
+///
+/// This loader can:
+/// - Load files from a specified directory
+/// - Apply glob patterns to filter files
+/// - Recursively search subdirectories
+/// - Exclude specific files or patterns
+/// - Use custom loaders for different file types
+/// - Sample files randomly or by a specific count
+/// - Build custom metadata for loaded documents
+///
+/// ## Default Supported File Types
+/// By default, the DirectoryLoader supports the following file types:
+/// - `.txt`: Text files (loaded using [TextLoader])
+///   - Loads the entire file content as a single document
+/// - `.json`: JSON files (loaded using [JsonLoader] with root schema)
+///   - Extracts all JSON objects or values at the root level
+/// - `.csv` and `.tsv`: CSV/TSV files (loaded using [CsvLoader])
+///   - Converts each row into a separate document
+///
+/// Example usage:
+/// ```dart
+/// // Load all text and JSON files from a directory recursively
+/// final loader = DirectoryLoader(
+///   '/path/to/documents',
+///   glob: '*.{txt,json}',
+///   recursive: true,
+/// );
+/// final documents = await loader.load();
+///
+/// // Load a random sample of 10 CSV files, excluding hidden files
+/// final sampleLoader = DirectoryLoader(
+///   '/path/to/csvs',
+///   glob: '*.csv',
+///   loadHidden: false,
+///   sampleSize: 10,
+///   randomizeSample: true,
+/// );
+/// final sampleDocuments = await sampleLoader.load();
+/// ```
+///
+/// The loader supports customization through various parameters:
+/// - [filePath]: The directory path to load documents from
+/// - [glob]: Glob pattern to match files (defaults to all files)
+/// - [recursive]: Whether to search recursively in subdirectories
+/// - [exclude]: Patterns to exclude from loading
+/// - [loaderMap]: Map of file extensions to specific loaders
+/// - [loadHidden]: Whether to load hidden files
+/// - [sampleSize]: Maximum number of files to load
+/// - [randomizeSample]: Whether to randomize the file sample
+/// - [sampleSeed]: Seed for reproducible random sampling
+/// - [metadataBuilder]: Custom metadata building function
+///
+/// You can extend the default loader support by providing a custom [loaderMap].
+/// {@endtemplate}
+class DirectoryLoader extends BaseDocumentLoader {
+  /// {@macro directory_loader}
+  const DirectoryLoader(
+    this.filePath, {
+    this.glob = '*',
+    this.recursive = true,
+    this.exclude = const [],
+    this.loaderMap = const {},
+    this.loadHidden = false,
+    this.sampleSize = 0,
+    this.randomizeSample = false,
+    this.sampleSeed,
+    this.metadataBuilder,
+  });
+
+  /// The path to the directory to load documents from
+  final String filePath;
+
+  /// Glob pattern to match files (matched against the file name only)
+  /// Defaults to '*' (all files)
+  final String glob;
+
+  /// Whether to search recursively in subdirectories
+  /// Defaults to true
+  final bool recursive;
+
+  /// Glob patterns (matched against the file name only) to exclude from
+  /// loading
+  final List<String> exclude;
+
+  /// Map of file extensions to specific loaders
+  ///
+  /// This map allows customization of how different file types are loaded:
+  /// - Keys are lower-case file extensions (including the dot, e.g., '.txt',
+  ///   '.json'); the extension of each file is lower-cased before lookup
+  /// - Values are functions that create a [BaseDocumentLoader] for a given
+  ///   file path
+  ///
+  /// If empty (the default), [defaultLoaderMap] will be used, which supports:
+  /// - `.txt`: TextLoader
+  /// - `.json`: JsonLoader (with root schema)
+  /// - `.csv` and `.tsv`: CsvLoader
+  ///
+  /// A non-empty map replaces the defaults entirely. To extend them, spread
+  /// [defaultLoaderMap] first so that your own entries take precedence:
+  /// ```dart
+  /// final loader = DirectoryLoader(
+  ///   '/path/to/docs',
+  ///   loaderMap: {
+  ///     // Start from the default loaders
+  ///     ...DirectoryLoader.defaultLoaderMap,
+  ///
+  ///     // Add a custom loader for XML files
+  ///     '.xml': (path) => CustomXmlLoader(path),
+  ///
+  ///     // Override default JSON loader with a custom implementation
+  ///     '.json': (path) => CustomJsonLoader(path),
+  ///   },
+  /// );
+  /// ```
+  ///
+  /// If no loader is found for a file type, [TextLoader] will be used as a
+  /// fallback.
+  final Map<String, BaseDocumentLoader Function(String)> loaderMap;
+
+  /// Whether to load hidden files (file name starting with '.')
+  /// Defaults to false
+  final bool loadHidden;
+
+  /// Maximum number of files to load
+  /// Defaults to 0 (load all files)
+  final int sampleSize;
+
+  /// Whether to randomize the sample of files
+  /// Only takes effect when [sampleSize] is greater than 0
+  /// Defaults to false
+  final bool randomizeSample;
+
+  /// Seed for random sampling to ensure reproducibility
+  /// When null, the current time is used as the seed
+  final int? sampleSeed;
+
+  /// Optional function to build custom metadata for each document
+  final Map<String, dynamic> Function(
+    File file,
+    Map<String, dynamic> defaultMetadata,
+  )? metadataBuilder;
+
+  /// Default loader map with common file type loaders
+  ///
+  /// Provides out-of-the-box support for:
+  /// - Plain text files (`.txt`)
+  /// - JSON files (`.json`) - uses root schema
+  /// - CSV and TSV files (`.csv`, `.tsv`)
+  ///
+  /// Can be extended or overridden when creating a [DirectoryLoader]
+  // NOTE(review): `.tsv` uses CsvLoader's default settings; confirm that its
+  // default field delimiter is appropriate for tab-separated files.
+  static final Map<String, BaseDocumentLoader Function(String)>
+      defaultLoaderMap = {
+    '.txt': TextLoader.new,
+    '.json': (path) => JsonLoader(path, jpSchema: r'$'),
+    '.csv': CsvLoader.new,
+    '.tsv': CsvLoader.new,
+  };
+
+  /// Returns whether [file] passes the hidden-file, include and exclude
+  /// filters. All checks look at the file name only, not the full path.
+  bool _shouldLoadFile(
+    File file,
+    Glob includeGlob,
+    List<Glob> excludeGlobs,
+  ) {
+    final name = path.basename(file.path);
+    if (!loadHidden && name.startsWith('.')) {
+      return false;
+    }
+    if (!includeGlob.matches(name)) {
+      return false;
+    }
+    return !excludeGlobs.any((pattern) => pattern.matches(name));
+  }
+
+  /// Builds the metadata attached to every document loaded from [file].
+  Map<String, dynamic> _buildDefaultMetadata(File file) {
+    return {
+      'source': file.path,
+      'name': path.basename(file.path),
+      'extension': path.extension(file.path),
+      'size': file.lengthSync(),
+      'lastModified': file.lastModifiedSync().millisecondsSinceEpoch,
+    };
+  }
+
+  /// Lists the matching files in [filePath] and streams the documents
+  /// produced by the loader selected for each file.
+  ///
+  /// Throws an [ArgumentError] if [glob] is empty. Listing a directory that
+  /// does not exist throws a [FileSystemException].
+  @override
+  Stream<Document> lazyLoad() async* {
+    if (glob.isEmpty) {
+      throw ArgumentError('Glob pattern must not be empty');
+    }
+
+    // Compile the patterns once instead of once per candidate file.
+    final includeGlob = Glob(glob);
+    final excludeGlobs =
+        exclude.map((pattern) => Glob(pattern)).toList(growable: false);
+
+    var files = Directory(filePath)
+        .listSync(recursive: recursive)
+        .whereType<File>()
+        .where((file) => _shouldLoadFile(file, includeGlob, excludeGlobs))
+        .toList();
+
+    if (sampleSize > 0) {
+      if (randomizeSample) {
+        // Directory listing order is platform dependent; sort first so that
+        // the same seed selects the same files everywhere.
+        files.sort((a, b) => a.path.compareTo(b.path));
+        final seed = sampleSeed ?? DateTime.now().millisecondsSinceEpoch;
+        files.shuffle(Random(seed));
+      }
+      // `take` never fails when fewer files are available.
+      files = files.take(sampleSize).toList();
+    }
+
+    // An empty loader map means "use the defaults".
+    final loaders = loaderMap.isNotEmpty ? loaderMap : defaultLoaderMap;
+
+    for (final file in files) {
+      final ext = path.extension(file.path).toLowerCase();
+      // Unknown extensions are read as plain text.
+      final loader = (loaders[ext] ?? TextLoader.new)(file.path);
+
+      final defaultMetadata = _buildDefaultMetadata(file);
+      final metadata =
+          metadataBuilder?.call(file, defaultMetadata) ?? defaultMetadata;
+
+      await for (final doc in loader.lazyLoad()) {
+        yield Document(
+          pageContent: doc.pageContent,
+          // When the inner loader reports this file as its source, its
+          // metadata is replaced by ours; otherwise both are merged and the
+          // inner loader's values win on conflicting keys.
+          metadata: doc.metadata['source'] == file.path
+              ? metadata
+              : {...metadata, ...doc.metadata},
+        );
+      }
+    }
+  }
+}
diff --git a/packages/langchain_community/lib/src/document_loaders/document_loaders.dart b/packages/langchain_community/lib/src/document_loaders/document_loaders.dart
index 468fd816..ce735c6d 100644
--- a/packages/langchain_community/lib/src/document_loaders/document_loaders.dart
+++ b/packages/langchain_community/lib/src/document_loaders/document_loaders.dart
@@ -1,4 +1,5 @@
 export 'csv.dart';
+export 'directory.dart';
 export 'json.dart';
 export 'text.dart';
 export 'web.dart';
diff --git a/packages/langchain_community/pubspec.yaml b/packages/langchain_community/pubspec.yaml
index c77ab536..99e93796 100644
--- a/packages/langchain_community/pubspec.yaml
+++ b/packages/langchain_community/pubspec.yaml
@@ -20,12 +20,14 @@ dependencies:
   cross_file: ^0.3.4+2
   csv: ^6.0.0
   flat_buffers: ^23.5.26
+  glob: ^2.1.2
   http: ^1.2.2
   json_path: ^0.7.4
   langchain_core: 0.3.6
   math_expressions: ^2.6.0
   meta: ^1.11.0
   objectbox: ^4.0.3
+  path: ^1.9.1
   tavily_dart: ^0.1.0
   uuid: ^4.5.1
 
diff --git a/packages/langchain_community/test/document_loaders/directory_test.dart b/packages/langchain_community/test/document_loaders/directory_test.dart
new file mode 100644
index 00000000..9771be53
--- /dev/null
+++ b/packages/langchain_community/test/document_loaders/directory_test.dart
@@ -0,0 +1,317 @@
+import 'dart:io';
+import 'package:langchain_community/langchain_community.dart';
+import 'package:langchain_core/documents.dart';
+import 'package:test/test.dart';
+
+void main() {
+  group('DirectoryLoader tests', () {
+    test(
+        'Test loading directory with multiple file types and multiple documents per file',
+        () async {
+      const filePath = './test/document_loaders/assets';
+      const loader = DirectoryLoader(filePath, glob: '*.{txt,json,csv,tsv}');
+
+      final List<Document> docs = await loader.lazyLoad().toList();
+
+      expect(docs, isNotEmpty);
+
+      final Set<String> processedFiles =
+          docs.map((doc) => doc.metadata['source'] as
String).toSet();
+
+      final directory = Directory(filePath);
+
+      // Every matching file in the assets directory must have produced at
+      // least one document.
+      final Set<String> expectedFiles = directory
+          .listSync()
+          .where(
+            (entity) {
+              return entity is File &&
+                  RegExp(r'\.(txt|json|csv|tsv)$').hasMatch(entity.path);
+            },
+          )
+          .map((file) => file.path)
+          .toSet();
+
+      expect(
+        processedFiles,
+        equals(expectedFiles),
+      );
+
+      final textDocs =
+          docs.where((doc) => doc.metadata['name'] == 'example.txt').toList();
+
+      expect(
+        textDocs.length,
+        greaterThanOrEqualTo(1),
+      );
+
+      expect(
+        textDocs.any((doc) => doc.pageContent.contains('Foo\nBar\nBaz\n')),
+        isTrue,
+        reason: 'Text content should match for example.txt',
+      );
+
+      final jsonDocs = docs
+          .where((doc) => doc.metadata['name'] == 'example_2.json')
+          .toList();
+
+      expect(
+        jsonDocs.length,
+        greaterThanOrEqualTo(1),
+      );
+
+      expect(
+        jsonDocs
+            .any((doc) => doc.pageContent.contains('Sayings of the Century')),
+        isTrue,
+        reason: 'JSON content should match for example_2.json',
+      );
+    });
+
+    test('Test directory loader with specific loader map', () async {
+      const filePath = './test/document_loaders/assets';
+
+      final loader = DirectoryLoader(
+        filePath,
+        glob: '*.json',
+        loaderMap: {
+          '.json': (path) => JsonLoader(path, jpSchema: r'$..text'),
+        },
+      );
+
+      expect(
+        loader.lazyLoad(),
+        emitsInOrder([
+          (final Document doc) => doc.pageContent == 'Foo\nBar\nBaz\n',
+          emitsDone,
+        ]),
+      );
+    });
+
+    test('Test directory loader with sample size', () async {
+      const filePath = './test/document_loaders/assets';
+
+      const loader = DirectoryLoader(
+        filePath,
+        glob: '*.{txt,json,csv}',
+        sampleSize: 2,
+        randomizeSample: true,
+        sampleSeed: 42,
+      );
+
+      final loadedDocs = await loader.load();
+
+      expect(loadedDocs, hasLength(2));
+    });
+
+    test('Test directory loader with custom metadata builder', () async {
+      const filePath = './test/document_loaders/assets';
+
+      final loader = DirectoryLoader(
+        filePath,
+        glob: '*.txt',
+        metadataBuilder: (file, defaultMetadata) {
+          return {
+            ...defaultMetadata,
+            'custom_info': 'custom_value',
+          };
+        },
+      );
+
+      expect(
+        loader.lazyLoad(),
+        emitsInOrder([
+          (final Document doc) {
+            expect(doc.pageContent, 'Foo\nBar\nBaz\n');
+            expect(doc.metadata['custom_info'], 'custom_value');
+            return true;
+          },
+          emitsDone,
+        ]),
+      );
+    });
+
+    test('Test directory loader with non-existent directory', () {
+      const filePath = './non_existent_directory';
+
+      const loader = DirectoryLoader(filePath);
+
+      // `isA()` without a type argument matches any thrown object, so pin the
+      // expected error: listing a missing directory is a file-system failure.
+      expect(
+        () async => loader.lazyLoad().toList(),
+        throwsA(isA<FileSystemException>()),
+      );
+    });
+
+    test('Test directory loader with empty directory', () async {
+      final tempDir = await Directory.systemTemp.createTemp('empty_test_dir');
+
+      try {
+        final loader = DirectoryLoader(
+          tempDir.path,
+          glob: '*.txt',
+        );
+
+        final loadedDocs = await loader.load();
+
+        expect(loadedDocs, isEmpty);
+      } finally {
+        await tempDir.delete();
+      }
+    });
+
+    test('Test directory loader with exclude patterns', () async {
+      const filePath = './test/document_loaders/assets';
+
+      const loader = DirectoryLoader(
+        filePath,
+        glob: '*.txt',
+        exclude: ['example.txt'],
+      );
+
+      final loadedDocs = await loader.load();
+
+      expect(
+        loadedDocs.any((doc) => doc.metadata['name'] == 'example.txt'),
+        isFalse,
+      );
+    });
+
+    test('Test directory loader non-recursive mode', () async {
+      const filePath = './test/document_loaders/assets';
+
+      const loader = DirectoryLoader(filePath, glob: '*.txt', recursive: false);
+
+      final loadedDocs = await loader.load();
+
+      expect(loadedDocs, isNotEmpty);
+    });
+
+    test('Sample size of 0 loads all files', () async {
+      const filePath = './test/document_loaders/assets';
+
+      const loader = DirectoryLoader(
+        filePath,
+        glob: '*.{txt,json}',
+        sampleSize: 0,
+      );
+
+      final loadedDocs = await loader.load();
+
+      final allFiles = Directory(filePath)
+          .listSync()
+          .where(
+            (entity) =>
+                entity is File &&
+                RegExp(r'\.(txt|json)$').hasMatch(entity.path),
+          )
+          .length;
+
+      expect(loadedDocs, hasLength(allFiles));
+    });
+
+    test('Sample size larger than total files loads all files', () async {
+      const filePath = './test/document_loaders/assets';
+
+      const loader = DirectoryLoader(
+        filePath,
+        glob: '*.{txt,json}',
+        sampleSize: 1000,
+      );
+
+      final loadedDocs = await loader.load();
+
+      final allFiles = Directory(filePath)
+          .listSync()
+          .where(
+            (entity) =>
+                entity is File &&
+                RegExp(r'\.(txt|json)$').hasMatch(entity.path),
+          )
+          .length;
+
+      expect(loadedDocs, hasLength(allFiles));
+    });
+
+    test('Reproducible random sampling', () async {
+      const filePath = './test/document_loaders/assets';
+
+      const loader1 = DirectoryLoader(
+        filePath,
+        glob: '*.{txt,json}',
+        sampleSize: 2,
+        randomizeSample: true,
+        sampleSeed: 42,
+      );
+
+      const loader2 = DirectoryLoader(
+        filePath,
+        glob: '*.{txt,json}',
+        sampleSize: 2,
+        randomizeSample: true,
+        sampleSeed: 42,
+      );
+
+      final docs1 = await loader1.load();
+
+      final docs2 = await loader2.load();
+
+      expect(
+        docs1.map((doc) => doc.metadata['source']),
+        equals(docs2.map((doc) => doc.metadata['source'])),
+      );
+    });
+
+    test('Glob matching no files yields no documents', () async {
+      const filePath = './test/document_loaders/assets';
+
+      const loader = DirectoryLoader(
+        filePath,
+        glob: '*.md',
+        loaderMap: {'.xml': TextLoader.new},
+      );
+
+      final loadedDocs = await loader.load();
+
+      expect(loadedDocs, isEmpty);
+    });
+
+    test('Loader map with no matching loader uses fallback', () async {
+      final tempDir = await Directory.systemTemp.createTemp('fallback_test');
+
+      try {
+        await File('${tempDir.path}/notes.md')
+            .writeAsString('fallback content');
+
+        // The map has no '.md' entry, so the file must be read by the
+        // TextLoader fallback.
+        final loader = DirectoryLoader(
+          tempDir.path,
+          glob: '*.md',
+          loaderMap: {'.xml': TextLoader.new},
+        );
+
+        final loadedDocs = await loader.load();
+
+        expect(loadedDocs, hasLength(1));
+        expect(loadedDocs.first.pageContent, 'fallback content');
+      } finally {
+        await tempDir.delete(recursive: true);
+      }
+    });
+
+    test('Mixed hidden and non-hidden files', () async {
+      final tempDir = await Directory.systemTemp.createTemp('mixed_files_test');
+
+      try {
+        await File('${tempDir.path}/.hidden.txt')
+            .writeAsString('hidden content');
+
+        await File('${tempDir.path}/visible.txt')
+            .writeAsString('visible content');
+
+        final loader1 = DirectoryLoader(
+          tempDir.path,
+          glob: '*.txt',
+          loadHidden: false,
+        );
+
+        final docs1 = await loader1.load();
+
+        expect(
+          docs1.any((doc) => doc.metadata['name'] == '.hidden.txt'),
+          isFalse,
+        );
+
+        final loader2 = DirectoryLoader(
+          tempDir.path,
+          glob: '*.txt',
+          loadHidden: true,
+        );
+
+        final docs2 = await loader2.load();
+
+        expect(
+          docs2.any((doc) => doc.metadata['name'] == '.hidden.txt'),
+          isTrue,
+        );
+      } finally {
+        await tempDir.delete(recursive: true);
+      }
+    });
+  });
+}