Skip to content

Commit

Permalink
Improve artifact type detection during pipeline execution, e.g. don't …
Browse files Browse the repository at this point in the history
…try to summarize

generated files, and partition only the text extracted from original files.
  • Loading branch information
dluc committed Aug 24, 2023
1 parent b995086 commit c1ce50b
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@
"**/.DS_Store": true,
"**/Thumbs.db": true
},
"dotnet.defaultSolution": ".\\SemanticMemory.sln"
"dotnet.defaultSolution": "SemanticMemory.sln"
}
13 changes: 7 additions & 6 deletions dotnet/CoreLib/Handlers/GenerateEmbeddingsHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,16 @@ public GenerateEmbeddingsHandler(
continue;
}

// Calc embeddings only for partitions
if (!partitionFile.IsPartition)
// Calc embeddings only for partitions (text chunks) and synthetic data
if (partitionFile.ArtifactType != DataPipeline.ArtifactTypes.TextPartition
&& partitionFile.ArtifactType != DataPipeline.ArtifactTypes.SyntheticData)
{
this._log.LogTrace("Skipping file {0} (not a partition)", partitionFile.Name);
this._log.LogTrace("Skipping file {0} (not a partition, not synthetic data)", partitionFile.Name);
continue;
}

// TODO: cost/perf: if the partition SHA256 is the same and the embedding exists, avoid generating it again
switch (partitionFile.Type)
switch (partitionFile.MimeType)
{
case MimeTypes.PlainText:
case MimeTypes.MarkDown:
Expand Down Expand Up @@ -133,8 +134,8 @@ public GenerateEmbeddingsHandler(
ParentId = uploadedFile.Id,
Name = embeddingFileName,
Size = text.Length,
Type = MimeTypes.TextEmbeddingVector,
IsPartition = false
MimeType = MimeTypes.TextEmbeddingVector,
ArtifactType = DataPipeline.ArtifactTypes.TextEmbeddingVector
};
embeddingFileNameDetails.MarkProcessedBy(this);
newFiles.Add(embeddingFileName, embeddingFileNameDetails);
Expand Down
11 changes: 7 additions & 4 deletions dotnet/CoreLib/Handlers/SaveEmbeddingsHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ public SaveEmbeddingsHandler(
pipeline.PreviousExecutionsToPurge = new List<DataPipeline>();

// For each embedding file => For each Vector DB => Store vector (collections ==> tags)
foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> embeddingFile in pipeline.Files.SelectMany(x => x.GeneratedFiles.Where(f => f.Value.IsEmbeddingFile())))
foreach (KeyValuePair<string, DataPipeline.GeneratedFileDetails> embeddingFile in
pipeline.Files.SelectMany(x => x.GeneratedFiles.Where(f => f.Value.ArtifactType == DataPipeline.ArtifactTypes.TextEmbeddingVector)))
{
if (embeddingFile.Value.AlreadyProcessedBy(this))
{
Expand All @@ -79,7 +80,7 @@ public SaveEmbeddingsHandler(
record.Tags.Add(Constants.ReservedDocumentIdTag, pipeline.DocumentId);
record.Tags.Add(Constants.ReservedFileIdTag, embeddingFile.Value.ParentId);
record.Tags.Add(Constants.ReservedFilePartitionTag, embeddingFile.Value.Id);
record.Tags.Add(Constants.ReservedFileTypeTag, pipeline.GetFile(embeddingFile.Value.ParentId).Type);
record.Tags.Add(Constants.ReservedFileTypeTag, pipeline.GetFile(embeddingFile.Value.ParentId).MimeType);

pipeline.Tags.CopyTo(record.Tags);

Expand Down Expand Up @@ -116,7 +117,8 @@ private async Task DeletePreviousEmbeddingsAsync(DataPipeline pipeline, Cancella
var embeddingsToKeep = new HashSet<string>();

// Decide which embeddings not to delete, looking at the current pipeline
foreach (DataPipeline.GeneratedFileDetails embeddingFile in pipeline.Files.SelectMany(f1 => f1.GeneratedFiles.Where(f2 => f2.Value.IsEmbeddingFile()).Select(x => x.Value)))
foreach (DataPipeline.GeneratedFileDetails embeddingFile
in pipeline.Files.SelectMany(f1 => f1.GeneratedFiles.Where(f2 => f2.Value.ArtifactType == DataPipeline.ArtifactTypes.TextEmbeddingVector).Select(x => x.Value)))
{
string recordId = GetEmbeddingRecordId(pipeline.DocumentId, embeddingFile.Id);
embeddingsToKeep.Add(recordId);
Expand All @@ -125,7 +127,8 @@ private async Task DeletePreviousEmbeddingsAsync(DataPipeline pipeline, Cancella
// Purge old pipelines data, unless it's still relevant in the current pipeline
foreach (DataPipeline oldPipeline in pipeline.PreviousExecutionsToPurge)
{
foreach (DataPipeline.GeneratedFileDetails embeddingFile in oldPipeline.Files.SelectMany(f1 => f1.GeneratedFiles.Where(f2 => f2.Value.IsEmbeddingFile()).Select(x => x.Value)))
foreach (DataPipeline.GeneratedFileDetails embeddingFile
in oldPipeline.Files.SelectMany(f1 => f1.GeneratedFiles.Where(f2 => f2.Value.ArtifactType == DataPipeline.ArtifactTypes.TextEmbeddingVector).Select(x => x.Value)))
{
string recordId = GetEmbeddingRecordId(oldPipeline.DocumentId, embeddingFile.Id);
if (embeddingsToKeep.Contains(recordId)) { continue; }
Expand Down
10 changes: 5 additions & 5 deletions dotnet/CoreLib/Handlers/SummarizationHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,14 @@ public SummarizationHandler(
continue;
}

// Don't summarize partitions
if (file.IsPartition)
// Summarize only the original content
if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
{
this._log.LogTrace("Skipping partition file {0}", file.Name);
continue;
}

switch (file.Type)
switch (file.MimeType)
{
case MimeTypes.PlainText:
case MimeTypes.MarkDown:
Expand All @@ -99,8 +99,8 @@ public SummarizationHandler(
ParentId = uploadedFile.Id,
Name = destFile,
Size = summary.Length,
Type = MimeTypes.PlainText,
IsPartition = true, // TODO
MimeType = MimeTypes.PlainText,
ArtifactType = DataPipeline.ArtifactTypes.SyntheticData,
ContentSHA256 = CalculateSHA256(summary),
});
}
Expand Down
8 changes: 4 additions & 4 deletions dotnet/CoreLib/Handlers/TextExtractionHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public TextExtractionHandler(
string text = string.Empty;
string extractType = MimeTypes.PlainText;

switch (uploadedFile.Type)
switch (uploadedFile.MimeType)
{
case MimeTypes.PlainText:
this._log.LogDebug("Extracting text from plain text file {0}", uploadedFile.Name);
Expand Down Expand Up @@ -97,7 +97,7 @@ public TextExtractionHandler(
break;

default:
throw new NotSupportedException($"File type not supported: {uploadedFile.Type}");
throw new NotSupportedException($"File type not supported: {uploadedFile.MimeType}");
}

this._log.LogDebug("Saving extracted text file {0}", destFile);
Expand All @@ -109,8 +109,8 @@ public TextExtractionHandler(
ParentId = uploadedFile.Id,
Name = destFile,
Size = text.Length,
Type = extractType,
IsPartition = false
MimeType = extractType,
ArtifactType = DataPipeline.ArtifactTypes.ExtractedText
};
destFileDetails.MarkProcessedBy(this);

Expand Down
13 changes: 10 additions & 3 deletions dotnet/CoreLib/Handlers/TextPartitioningHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,17 @@ public TextPartitioningHandler(
continue;
}

// Partition only the original text
if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
{
this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
continue;
}

// Use a different partitioning strategy depending on the file type
List<string> paragraphs;
List<string> lines;
switch (file.Type)
switch (file.MimeType)
{
case MimeTypes.PlainText:
{
Expand Down Expand Up @@ -113,8 +120,8 @@ public TextPartitioningHandler(
ParentId = uploadedFile.Id,
Name = destFile,
Size = text.Length,
Type = MimeTypes.PlainText,
IsPartition = true,
MimeType = MimeTypes.PlainText,
ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
ContentSHA256 = CalculateSHA256(text),
};
newFiles.Add(destFile, destFileDetails);
Expand Down
2 changes: 1 addition & 1 deletion dotnet/CoreLib/Pipeline/BaseOrchestrator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ private async Task UploadFormFilesAsync(DataPipeline pipeline, CancellationToken
Id = Guid.NewGuid().ToString("N"),
Name = file.FileName,
Size = size,
Type = this.MimeTypeDetection.GetFileType(file.FileName),
MimeType = this.MimeTypeDetection.GetFileType(file.FileName),
});

this.Log.LogInformation("File uploaded: {0}, {1} bytes", file.FileName, size);
Expand Down
31 changes: 16 additions & 15 deletions dotnet/CoreLib/Pipeline/DataPipeline.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ namespace Microsoft.SemanticMemory.Pipeline;
/// </summary>
public class DataPipeline
{
/// <summary>
/// Kind of artifact a pipeline file represents. Handlers check this value to decide
/// which files to process (e.g. only extracted text is partitioned and summarized).
/// The converter attribute makes the value serialize to JSON as its name, not its number.
/// </summary>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum ArtifactTypes
{
/// <summary>Default value, used when no artifact type has been assigned to the file.</summary>
Undefined = 0,
/// <summary>Partition (text chunk) created by the text partitioning handler; embeddings are generated for these.</summary>
TextPartition = 1,
/// <summary>Text extracted from an uploaded file by the text extraction handler; the only type that is partitioned and summarized.</summary>
ExtractedText = 2,
/// <summary>Embedding vector file created by the embedding generation handler and stored by the save-embeddings handler.</summary>
TextEmbeddingVector = 3,
/// <summary>Generated content such as a summary created by the summarization handler; embeddings are generated for these.</summary>
SyntheticData = 4,
}

public abstract class FileDetailsBase
{
/// <summary>
Expand Down Expand Up @@ -43,17 +53,15 @@ public abstract class FileDetailsBase
/// File (MIME) type
/// </summary>
[JsonPropertyOrder(3)]
[JsonPropertyName("type")]
public string Type { get; set; } = string.Empty;
[JsonPropertyName("mime_type")]
public string MimeType { get; set; } = string.Empty;

/// <summary>
/// Check if this is an embedding file (checking the file extension)
/// Type of artifact: extracted text, text partition, embedding vector, or synthetic data
/// </summary>
/// <returns>True if the file contains an embedding</returns>
public bool IsEmbeddingFile()
{
return this.Type == MimeTypes.TextEmbeddingVector;
}
[JsonPropertyOrder(4)]
[JsonPropertyName("artifact_type")]
public ArtifactTypes ArtifactType { get; set; } = ArtifactTypes.Undefined;

/// <summary>
/// List of handlers who have already processed this file
Expand Down Expand Up @@ -91,13 +99,6 @@ public class GeneratedFileDetails : FileDetailsBase
[JsonPropertyName("parent_id")]
public string ParentId { get; set; } = string.Empty;

/// <summary>
/// Whether this is a partition/chunk/piece of the original content
/// </summary>
[JsonPropertyOrder(15)]
[JsonPropertyName("is_partition")]
public bool IsPartition { get; set; } = false;

/// <summary>
/// Deduplication hash used for consolidation tasks
/// </summary>
Expand Down
2 changes: 1 addition & 1 deletion examples/001-dotnet-Serverless/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ await memory.ImportDocumentAsync(new Document("doc003")
// =======================

// Question without filters
var question = "What's mc^2?";
var question = "What's E = m*c^2?";
Console.WriteLine($"\n\nQuestion: {question}");

var answer = await memory.AskAsync(question);
Expand Down

0 comments on commit c1ce50b

Please sign in to comment.