From 4ed315fe3bd1aab77934ba45443187848868930a Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Sun, 30 Jul 2023 01:03:50 -0700 Subject: [PATCH] Adopt "Document" naming, one document per pipeline, pipeline id = document id. Complete support for Tags, supersedes Collections. Move Upload Endpoint to Core lib, start reorg project folders. Start centralizing constants in one class. Azure Search: handle concurrent index creations. Remove NameValueCollection because it's not serializable. Reduce Info log verbosity. --- SemanticMemory.sln | 55 ++----- SemanticMemory.sln.DotSettings | 2 + .../MemoryPipelineClient.csproj | 40 ----- .../dotnet/MemoryWebClient/MemoryWebClient.cs | 96 ------------ .../MemoryWebClient/MemoryWebClient.csproj | 40 ----- ...Example1_ImportWithMemoryPipelineClient.cs | 42 +++-- .../Example2_ImportWithMemoryWebClient.cs | 42 ++++- .../Example3_CustomInProcessPipeline.cs | 7 +- .../FileImportExamples.csproj | 2 - lib/dotnet/Core.NetStandard20/Constants.cs | 25 +++ lib/dotnet/Core.NetStandard20/Document.cs | 20 +++ .../Core.NetStandard20/DocumentDetails.cs | 52 +++++++ .../ISemanticMemoryClient.cs | 10 +- .../Core.NetStandard20/ImportFileOptions.cs | 59 ------- .../Core.NetStandard20/MemoryWebClient.cs | 129 +++++++++++++++ .../Core.NetStandard20/TagCollection.cs | 147 ++++++++++++++++++ .../Configuration/SemanticMemoryConfig.cs | 1 - lib/dotnet/Core/ContentStorage/AzureBlob.cs | 1 + lib/dotnet/Core/Core.csproj | 4 + .../Core/Handlers/SaveEmbeddingsHandler.cs | 19 ++- .../AzureCognitiveSearchMemory.cs | 21 ++- .../AzureCognitiveSearchMemoryRecord.cs | 23 +-- lib/dotnet/Core/MemoryStorage/MemoryRecord.cs | 4 +- lib/dotnet/Core/Pipeline/BaseOrchestrator.cs | 26 ++-- lib/dotnet/Core/Pipeline/DataPipeline.cs | 6 +- .../Core/Pipeline/IPipelineOrchestrator.cs | 13 +- .../Core/Pipeline}/MemoryPipelineClient.cs | 91 +++++++---- lib/dotnet/Core/WebService/Endpoints.cs | 70 +++++++++ lib/dotnet/Core/WebService/UploadRequest.cs | 36 +++-- server/combinedservices-dotnet/Program.cs | 52 +------ server/webservice-dotnet/Program.cs | 51 +----- 31 files changed, 678 insertions(+), 508 deletions(-) delete mode 100644 clients/dotnet/MemoryPipelineClient/MemoryPipelineClient.csproj delete mode 100644 clients/dotnet/MemoryWebClient/MemoryWebClient.cs delete mode 100644 clients/dotnet/MemoryWebClient/MemoryWebClient.csproj create mode 100644 lib/dotnet/Core.NetStandard20/Constants.cs create mode 100644 lib/dotnet/Core.NetStandard20/Document.cs create mode 100644 lib/dotnet/Core.NetStandard20/DocumentDetails.cs delete mode 100644 lib/dotnet/Core.NetStandard20/ImportFileOptions.cs create mode 100644 lib/dotnet/Core.NetStandard20/MemoryWebClient.cs create mode 100644 lib/dotnet/Core.NetStandard20/TagCollection.cs rename {clients/dotnet/MemoryPipelineClient => lib/dotnet/Core/Pipeline}/MemoryPipelineClient.cs (65%) create mode 100644 lib/dotnet/Core/WebService/Endpoints.cs diff --git a/SemanticMemory.sln b/SemanticMemory.sln index cbe4b6085..e5d559e34 100644 --- a/SemanticMemory.sln +++ b/SemanticMemory.sln @@ -14,22 +14,12 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DataFormats.Pdf", "lib\dotn EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "clients", "clients", "{B0F7938A-1D0F-44C9-AF2D-A89946E94972}" - ProjectSection(SolutionItems) = preProject - clients\README.md = clients\README.md - EndProjectSection -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "curl", "curl", "{7761EC04-9E17-44EE-A6D8-9FB97F14AF61}" ProjectSection(SolutionItems) = preProject clients\curl\README.md = clients\curl\README.md clients\curl\upload-file.sh = clients\curl\upload-file.sh EndProjectSection EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "samples", "samples", "{6335B02C-9964-4B39-9795-C5F5F0392515}" - ProjectSection(SolutionItems) = preProject - clients\samples\README.md = clients\samples\README.md - EndProjectSection -EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FileImportExamples", "clients\samples\FileImportExamples\FileImportExamples.csproj", "{DF03D46C-D50D-4709-89C5-BB596A94096D}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TextExtractionFromDocsExamples", "clients\samples\TextExtractionFromDocsExamples\TextExtractionFromDocsExamples.csproj", "{F3E26445-0850-4FA0-8C23-947CD260D617}" @@ -38,15 +28,8 @@ EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "server", "server", "{6CADEE0D-3600-4AA6-A49C-24AA96456ECC}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "samples", "samples", "{5B0FAB7B-F674-4322-9D40-8902C7D4CF15}" -EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CustomHandlerExample", "server\samples\CustomHandlerExample\CustomHandlerExample.csproj", "{C7E85644-7558-470D-AA38-1E9AD9F9169D}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{E097BACD-C329-4FF6-A4AC-260F7BA143FF}" - ProjectSection(SolutionItems) = preProject - server\tools\run-rabbitmq.sh = server\tools\run-rabbitmq.sh - EndProjectSection -EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PipelineService", "server\pipelineservice-dotnet\PipelineService.csproj", "{3700B51B-B34A-4AC9-9895-FF422B346E6E}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WebService", "server\webservice-dotnet\WebService.csproj", "{409046D9-64C4-49A9-9E93-6F5475ECC6D4}" @@ -68,10 +51,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "root", "root", "{6EF76FD8-4 .editorconfig = .editorconfig EndProjectSection EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MemoryPipelineClient", "clients\dotnet\MemoryPipelineClient\MemoryPipelineClient.csproj", "{7325C2AE-E7B2-4866-96BF-8E0385DA06C6}" -EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MemoryWebClient", "clients\dotnet\MemoryWebClient\MemoryWebClient.csproj", "{BA4B40FF-7C98-46BE-8B34-C87D609F66E4}" -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "curl", "curl", "{56AB71BB-335A-41BF-92A1-105359D9DC38}" ProjectSection(SolutionItems) = preProject clients\samples\curl\example.sh = clients\samples\curl\example.sh @@ -83,6 +62,17 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CombinedServices", "server\ EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "InteractiveSetup", "lib\dotnet\InteractiveSetup\InteractiveSetup.csproj", "{25A5E2C3-A16E-4888-9B4B-07BD23C7E1B0}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "samples", "samples", "{0A43C65C-6007-4BB4-B3FE-8D439FC91841}" + ProjectSection(SolutionItems) = preProject + clients\samples\README.md = clients\samples\README.md + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{CA49F1A1-C3FA-4E99-ACB3-D7FF33D47976}" + ProjectSection(SolutionItems) = preProject + clients\README.md = clients\README.md + server\tools\run-rabbitmq.sh = server\tools\run-rabbitmq.sh + EndProjectSection +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -91,22 +81,17 @@ Global GlobalSection(NestedProjects) = preSolution {3700B51B-B34A-4AC9-9895-FF422B346E6E} = {6CADEE0D-3600-4AA6-A49C-24AA96456ECC} {409046D9-64C4-49A9-9E93-6F5475ECC6D4} = {6CADEE0D-3600-4AA6-A49C-24AA96456ECC} - {E097BACD-C329-4FF6-A4AC-260F7BA143FF} = {6CADEE0D-3600-4AA6-A49C-24AA96456ECC} - {6335B02C-9964-4B39-9795-C5F5F0392515} = {B0F7938A-1D0F-44C9-AF2D-A89946E94972} - {7761EC04-9E17-44EE-A6D8-9FB97F14AF61} = {B0F7938A-1D0F-44C9-AF2D-A89946E94972} {066053BB-FFC2-43FF-A912-61C6ABCC7B4E} = {8E36E21B-00CC-4B85-98FB-6CB2CA5F978C} {08A995C6-C5E7-461A-A14C-467141C37157} = {8E36E21B-00CC-4B85-98FB-6CB2CA5F978C} {74E4482A-590B-4243-AC08-1F81BBF0A0AF} = {8E36E21B-00CC-4B85-98FB-6CB2CA5F978C} {53D3C75F-3C40-4D55-8C3F-062607217204} = {8E36E21B-00CC-4B85-98FB-6CB2CA5F978C} - {DF03D46C-D50D-4709-89C5-BB596A94096D} = {6335B02C-9964-4B39-9795-C5F5F0392515} - {F3E26445-0850-4FA0-8C23-947CD260D617} = {6335B02C-9964-4B39-9795-C5F5F0392515} - {5B0FAB7B-F674-4322-9D40-8902C7D4CF15} = {6CADEE0D-3600-4AA6-A49C-24AA96456ECC} - {C7E85644-7558-470D-AA38-1E9AD9F9169D} = {5B0FAB7B-F674-4322-9D40-8902C7D4CF15} - {7325C2AE-E7B2-4866-96BF-8E0385DA06C6} = {B0F7938A-1D0F-44C9-AF2D-A89946E94972} - {BA4B40FF-7C98-46BE-8B34-C87D609F66E4} = {B0F7938A-1D0F-44C9-AF2D-A89946E94972} - {56AB71BB-335A-41BF-92A1-105359D9DC38} = {6335B02C-9964-4B39-9795-C5F5F0392515} {9292FE05-6206-4046-8C7F-E1DCA25DEAAE} = {6CADEE0D-3600-4AA6-A49C-24AA96456ECC} {25A5E2C3-A16E-4888-9B4B-07BD23C7E1B0} = {8E36E21B-00CC-4B85-98FB-6CB2CA5F978C} + {56AB71BB-335A-41BF-92A1-105359D9DC38} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841} + {DF03D46C-D50D-4709-89C5-BB596A94096D} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841} + {F3E26445-0850-4FA0-8C23-947CD260D617} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841} + {7761EC04-9E17-44EE-A6D8-9FB97F14AF61} = {CA49F1A1-C3FA-4E99-ACB3-D7FF33D47976} + {C7E85644-7558-470D-AA38-1E9AD9F9169D} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841} EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {3700B51B-B34A-4AC9-9895-FF422B346E6E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU @@ -145,14 +130,6 @@ Global {C7E85644-7558-470D-AA38-1E9AD9F9169D}.Debug|Any CPU.Build.0 = Debug|Any CPU {C7E85644-7558-470D-AA38-1E9AD9F9169D}.Release|Any CPU.ActiveCfg = Release|Any CPU {C7E85644-7558-470D-AA38-1E9AD9F9169D}.Release|Any CPU.Build.0 = Release|Any CPU - {7325C2AE-E7B2-4866-96BF-8E0385DA06C6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {7325C2AE-E7B2-4866-96BF-8E0385DA06C6}.Debug|Any CPU.Build.0 = Debug|Any CPU - {7325C2AE-E7B2-4866-96BF-8E0385DA06C6}.Release|Any CPU.ActiveCfg = Release|Any CPU - {7325C2AE-E7B2-4866-96BF-8E0385DA06C6}.Release|Any CPU.Build.0 = Release|Any CPU - {BA4B40FF-7C98-46BE-8B34-C87D609F66E4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {BA4B40FF-7C98-46BE-8B34-C87D609F66E4}.Debug|Any CPU.Build.0 = Debug|Any CPU - {BA4B40FF-7C98-46BE-8B34-C87D609F66E4}.Release|Any CPU.ActiveCfg = Release|Any CPU - {BA4B40FF-7C98-46BE-8B34-C87D609F66E4}.Release|Any CPU.Build.0 = Release|Any CPU {9292FE05-6206-4046-8C7F-E1DCA25DEAAE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {9292FE05-6206-4046-8C7F-E1DCA25DEAAE}.Debug|Any CPU.Build.0 = Debug|Any CPU {9292FE05-6206-4046-8C7F-E1DCA25DEAAE}.Release|Any CPU.ActiveCfg = Release|Any CPU diff --git a/SemanticMemory.sln.DotSettings b/SemanticMemory.sln.DotSettings index 983aaefec..a58f271aa 100644 --- a/SemanticMemory.sln.DotSettings +++ b/SemanticMemory.sln.DotSettings @@ -219,11 +219,13 @@ public void It$SOMENAME$() True True True + True True True True True True + True True True True diff --git a/clients/dotnet/MemoryPipelineClient/MemoryPipelineClient.csproj b/clients/dotnet/MemoryPipelineClient/MemoryPipelineClient.csproj deleted file mode 100644 index 4f705661a..000000000 --- a/clients/dotnet/MemoryPipelineClient/MemoryPipelineClient.csproj +++ /dev/null @@ -1,40 +0,0 @@ - - - - net7.0 - Microsoft.SemanticMemory.PipelineClient - Microsoft.SemanticMemory.PipelineClient - - - - - - - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - - diff --git a/clients/dotnet/MemoryWebClient/MemoryWebClient.cs b/clients/dotnet/MemoryWebClient/MemoryWebClient.cs deleted file mode 100644 index 31629b1ad..000000000 --- a/clients/dotnet/MemoryWebClient/MemoryWebClient.cs +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System; -using System.Collections.Generic; -using System.IO; -using System.Net.Http; -using System.Threading.Tasks; -using Microsoft.SemanticMemory.Core20; - -namespace Microsoft.SemanticMemory.WebClient; - -public class MemoryWebClient : ISemanticMemoryClient -{ - private readonly HttpClient _client; - - public MemoryWebClient(string endpoint) : this(endpoint, new HttpClient()) - { - } - - public MemoryWebClient(string endpoint, HttpClient client) - { - this._client = client; - this._client.BaseAddress = new Uri(endpoint); - } - - public Task ImportFileAsync(string file, ImportFileOptions options) - { - return this.ImportFilesInternalAsync(new[] { file }, options); - } - - public Task ImportFilesAsync(string[] files, ImportFileOptions options) - { - return this.ImportFilesInternalAsync(files, options); - } - - public async Task AskAsync(string question) - { - await Task.Delay(0).ConfigureAwait(false); - return "...work in progress..."; - } - - private async Task ImportFilesInternalAsync(string[] files, ImportFileOptions options) - { - options.Sanitize(); - options.Validate(); - - // Populate form with values and files from disk - using var formData = new MultipartFormDataContent(); - - using var documentIdContent = new StringContent(options.DocumentId); - using (var userContent = new StringContent(options.UserId)) - { - List disposables = new(); - formData.Add(documentIdContent, "documentId"); - formData.Add(userContent, "user"); - foreach (var collectionId in options.CollectionIds) - { - var content = new StringContent(collectionId); - disposables.Add(content); - formData.Add(content, "collections"); - } - - for (int index = 0; index < files.Length; index++) - { - string filename = files[index]; - byte[] bytes = File.ReadAllBytes(filename); - var content = new ByteArrayContent(bytes, 0, bytes.Length); - disposables.Add(content); - formData.Add(content, $"file{index + 1}", filename); - } - - // Send HTTP request - try - { - HttpResponseMessage? response = await this._client.PostAsync("/upload", formData).ConfigureAwait(false); - formData.Dispose(); - response.EnsureSuccessStatusCode(); - } - catch (HttpRequestException e) when (e.Data.Contains("StatusCode")) - { - throw new SemanticMemoryWebException($"{e.Message} [StatusCode: {e.Data["StatusCode"]}]", e); - } - catch (Exception e) - { - throw new SemanticMemoryWebException(e.Message, e); - } - finally - { - foreach (var disposable in disposables) - { - disposable.Dispose(); - } - } - } - } -} diff --git a/clients/dotnet/MemoryWebClient/MemoryWebClient.csproj b/clients/dotnet/MemoryWebClient/MemoryWebClient.csproj deleted file mode 100644 index 7a8c18dde..000000000 --- a/clients/dotnet/MemoryWebClient/MemoryWebClient.csproj +++ /dev/null @@ -1,40 +0,0 @@ - - - - netstandard2.0 - Microsoft.SemanticMemory.WebClient - Microsoft.SemanticMemory.WebClient - - - - - - - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - - diff --git a/clients/samples/FileImportExamples/Example1_ImportWithMemoryPipelineClient.cs b/clients/samples/FileImportExamples/Example1_ImportWithMemoryPipelineClient.cs index fb4bdd337..4598e8094 100644 --- a/clients/samples/FileImportExamples/Example1_ImportWithMemoryPipelineClient.cs +++ b/clients/samples/FileImportExamples/Example1_ImportWithMemoryPipelineClient.cs @@ -1,8 +1,8 @@ // Copyright (c) Microsoft. All rights reserved. using Microsoft.SemanticMemory.Core.Configuration; +using Microsoft.SemanticMemory.Core.Pipeline; using Microsoft.SemanticMemory.Core20; -using Microsoft.SemanticMemory.PipelineClient; public static class Example1_ImportWithMemoryPipelineClient { @@ -12,27 +12,43 @@ public static async Task RunAsync() var memory = new MemoryPipelineClient(config); + // Uploading one file - This will create + // a new upload every time because no file ID is specified, and + // stored under the "default" user because no User ID is specified. + await memory.ImportFileAsync("file1.txt"); + + // Uploading one file specifying IDs await memory.ImportFileAsync("file1.txt", - new ImportFileOptions(userId: "user1", collectionId: "collection01", documentId: "doc1")); + new DocumentDetails(documentId: "f01", userId: "user1")); - await memory.ImportFilesAsync(new[] { "file2.txt", "file3.docx", "file4.pdf" }, - new ImportFileOptions(userId: "user2", collectionId: "collection01", documentId: "doc2")); + // Uploading multiple files + await memory.ImportFilesAsync(new[] + { + new Document("file2.txt", new DocumentDetails("f02", "user1")), + new Document("file3.docx", new DocumentDetails("f03", "user1")), + new Document("file4.pdf", new DocumentDetails("f04", "user1")), + }); + // Categorizing files with tags await memory.ImportFileAsync("file5.pdf", - new ImportFileOptions(userId: "user3", collectionId: "collection01", documentId: "doc1")); + new DocumentDetails("f05", "user2") + .AddTag("collection", "samples") + .AddTag("collection", "webClient") + .AddTag("collection", ".NET") + .AddTag("type", "news")); - // Test with User 2 memory - var question = "\n\nWhat's Semantic Kernel?"; - Console.WriteLine($"Question: {question}"); + // Test with User 1 memory + var question = "What's Semantic Kernel?"; + Console.WriteLine($"\n\nQuestion: {question}"); - string answer = await memory.AskAsync(question, "user2"); + string answer = await memory.AskAsync(question, "user1"); Console.WriteLine($"Answer: {answer}"); - // Test with User 3 memory - question = "\n\nAny news from NASA about Orion?"; - Console.WriteLine($"Question: {question}"); + // Test with User 2 memory + question = "Any news from NASA about Orion?"; + Console.WriteLine($"\n\nQuestion: {question}"); - answer = await memory.AskAsync(question, "user3"); + answer = await memory.AskAsync(question, "user2"); Console.WriteLine($"Answer: {answer}"); } } diff --git a/clients/samples/FileImportExamples/Example2_ImportWithMemoryWebClient.cs b/clients/samples/FileImportExamples/Example2_ImportWithMemoryWebClient.cs index 8be2b1902..e0eab6a0e 100644 --- a/clients/samples/FileImportExamples/Example2_ImportWithMemoryWebClient.cs +++ b/clients/samples/FileImportExamples/Example2_ImportWithMemoryWebClient.cs @@ -1,7 +1,6 @@ // Copyright (c) Microsoft. All rights reserved. using Microsoft.SemanticMemory.Core20; -using Microsoft.SemanticMemory.WebClient; public static class Example2_ImportWithMemoryWebClient { @@ -9,14 +8,45 @@ public static async Task RunAsync(string endpoint) { MemoryWebClient memory = new(endpoint); + // Uploading one file - This will create + // a new upload every time because no file ID is specified, and + // stored under the "default" user because no User ID is specified. + await memory.ImportFileAsync("file1.txt"); + + // Uploading one file specifying IDs await memory.ImportFileAsync("file1.txt", - new ImportFileOptions("example2-user", "collection01")); + new DocumentDetails(documentId: "f01", userId: "user1")); + + // Uploading multiple files + await memory.ImportFilesAsync(new[] + { + new Document("file2.txt", new DocumentDetails("f02", "user1")), + new Document("file3.docx", new DocumentDetails("f03", "user1")), + new Document("file4.pdf", new DocumentDetails("f04", "user1")), + }); + + // Categorizing files with tags + await memory.ImportFileAsync("file5.pdf", + new DocumentDetails("f05", "user2") + .AddTag("collection", "samples") + .AddTag("collection", "webClient") + .AddTag("collection", ".NET") + .AddTag("type", "news")); + + // TODO: wait for pipelines to complete + + // Test with User 1 memory + var question = "What's Semantic Kernel?"; + Console.WriteLine($"\n\nQuestion: {question}"); + + string answer = await memory.AskAsync(question, "user1"); + Console.WriteLine($"Answer: {answer}"); - await memory.ImportFilesAsync(new[] { "file2.txt", "file3.docx", "file4.pdf" }, - new ImportFileOptions("example2-user", "collection01")); + // Test with User 2 memory + question = "Any news from NASA about Orion?"; + Console.WriteLine($"\n\nQuestion: {question}"); - Console.WriteLine("Question: What's SK?"); - string answer = await memory.AskAsync("What's SK?"); + answer = await memory.AskAsync(question, "user2"); Console.WriteLine($"Answer: {answer}"); } } diff --git a/clients/samples/FileImportExamples/Example3_CustomInProcessPipeline.cs b/clients/samples/FileImportExamples/Example3_CustomInProcessPipeline.cs index 1ed5676c2..3aad8df3d 100644 --- a/clients/samples/FileImportExamples/Example3_CustomInProcessPipeline.cs +++ b/clients/samples/FileImportExamples/Example3_CustomInProcessPipeline.cs @@ -7,6 +7,7 @@ using Microsoft.SemanticMemory.Core.ContentStorage; using Microsoft.SemanticMemory.Core.Handlers; using Microsoft.SemanticMemory.Core.Pipeline; +using Microsoft.SemanticMemory.Core20; public static class Example3_CustomInProcessPipeline { @@ -36,13 +37,13 @@ public static async Task RunAsync() SaveEmbeddingsHandler saveEmbedding = new("save_embeddings", orchestrator, app.Services.GetService()!); await orchestrator.AddHandlerAsync(saveEmbedding); - // orchestrator.AttachHandlerAsync(...); - // orchestrator.AttachHandlerAsync(...); + // orchestrator.AddHandlerAsync(...); + // orchestrator.AddHandlerAsync(...); // Create sample pipeline with 4 files Console.WriteLine("* Defining pipeline with 4 files..."); var pipeline = orchestrator - .PrepareNewFileUploadPipeline("inProcessTest", "userId", new[] { "collection1" }) + .PrepareNewFileUploadPipeline("inProcessTest", "userZ", new TagCollection { { "testName", "example3" } }) .AddUploadFile("file1", "file1.txt", "file1.txt") .AddUploadFile("file2", "file2.txt", "file2.txt") .AddUploadFile("file3", "file3.docx", "file3.docx") diff --git a/clients/samples/FileImportExamples/FileImportExamples.csproj b/clients/samples/FileImportExamples/FileImportExamples.csproj index f2c8e9177..795a2ce15 100644 --- a/clients/samples/FileImportExamples/FileImportExamples.csproj +++ b/clients/samples/FileImportExamples/FileImportExamples.csproj @@ -12,8 +12,6 @@ - - diff --git a/lib/dotnet/Core.NetStandard20/Constants.cs b/lib/dotnet/Core.NetStandard20/Constants.cs new file mode 100644 index 000000000..7a02ab634 --- /dev/null +++ b/lib/dotnet/Core.NetStandard20/Constants.cs @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft. All rights reserved. + +namespace Microsoft.SemanticMemory.Core20; + +public static class Constants +{ + // Default User ID owning documents uploaded without specifying a user + public const string DefaultDocumentOwnerUserId = "defaultUser"; + + // Form field containing the User ID + public const string WebServiceUserIdField = "userId"; + + // Form field containing the Document ID + public const string WebServiceDocumentIdField = "documentId"; + + // Internal file used to track progress of asynchronous pipelines + public const string PipelineStatusFilename = "__pipeline_status.json"; + + // Tags reserved for internal logic + public const string ReservedUserIdTag = "__user"; + public const string ReservedDocIdTag = "__doc_id"; + public const string ReservedFileIdTag = "__file_id"; + public const string ReservedFilePartitionTag = "__file_part"; + public const string ReservedFileTypeTag = "__file_type"; +} diff --git a/lib/dotnet/Core.NetStandard20/Document.cs b/lib/dotnet/Core.NetStandard20/Document.cs new file mode 100644 index 000000000..605996a10 --- /dev/null +++ b/lib/dotnet/Core.NetStandard20/Document.cs @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft. All rights reserved. + +namespace Microsoft.SemanticMemory.Core20; + +public class Document +{ + public string FileName { get; set; } = string.Empty; + + public DocumentDetails Details { get; set; } = new(); + + public Document() { } + + public Document(string fileName) { this.FileName = fileName; } + + public Document(string fileName, DocumentDetails details) + { + this.FileName = fileName; + this.Details = details; + } +} diff --git a/lib/dotnet/Core.NetStandard20/DocumentDetails.cs b/lib/dotnet/Core.NetStandard20/DocumentDetails.cs new file mode 100644 index 000000000..bd0995925 --- /dev/null +++ b/lib/dotnet/Core.NetStandard20/DocumentDetails.cs @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Globalization; + +namespace Microsoft.SemanticMemory.Core20; + +public class DocumentDetails +{ + public string DocumentId + { + get { return this._documentId; } + set { this._documentId = string.IsNullOrWhiteSpace(value) ? RandomId() : value; } + } + + public string UserId + { + get { return this._userId; } + set { this._userId = string.IsNullOrWhiteSpace(value) ? Constants.DefaultDocumentOwnerUserId : value; } + } + + public TagCollection Tags { get; set; } = new(); + + public DocumentDetails( + string documentId = "", + string userId = Constants.DefaultDocumentOwnerUserId, + TagCollection? tags = null) + { + this.DocumentId = string.IsNullOrEmpty(documentId) ? RandomId() : documentId; + this.UserId = userId; + if (tags != null) { this.Tags = tags; } + } + + public DocumentDetails AddTag(string name, string value) + { + this.Tags.Add(name, value); + return this; + } + + #region private + + private string _userId = string.Empty; + private string _documentId = string.Empty; + + private static string RandomId() + { + const string LocalDateFormat = "yyyyMMddhhmmssfffffff"; + return Guid.NewGuid().ToString("N") + DateTimeOffset.Now.ToString(LocalDateFormat, CultureInfo.InvariantCulture); + } + + #endregion +} diff --git a/lib/dotnet/Core.NetStandard20/ISemanticMemoryClient.cs b/lib/dotnet/Core.NetStandard20/ISemanticMemoryClient.cs index ce2fae623..712acd456 100644 --- a/lib/dotnet/Core.NetStandard20/ISemanticMemoryClient.cs +++ b/lib/dotnet/Core.NetStandard20/ISemanticMemoryClient.cs @@ -1,13 +1,15 @@ // Copyright (c) Microsoft. All rights reserved. +using System.Collections.Generic; using System.Threading.Tasks; -// ReSharper disable CommentTypo - namespace Microsoft.SemanticMemory.Core20; public interface ISemanticMemoryClient { - public Task ImportFileAsync(string file, ImportFileOptions options); - public Task ImportFilesAsync(string[] files, ImportFileOptions options); + public Task ImportFileAsync(Document file); + public Task> ImportFilesAsync(Document[] files); + public Task ImportFileAsync(string fileName); + public Task ImportFileAsync(string fileName, DocumentDetails details); + public Task AskAsync(string question, string userId); } diff --git a/lib/dotnet/Core.NetStandard20/ImportFileOptions.cs b/lib/dotnet/Core.NetStandard20/ImportFileOptions.cs deleted file mode 100644 index aa39c65ac..000000000 --- a/lib/dotnet/Core.NetStandard20/ImportFileOptions.cs +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System; -using System.Collections.Generic; -using System.Globalization; - -namespace Microsoft.SemanticMemory.Core20; - -public class ImportFileOptions -{ - public string UserId { get; set; } = string.Empty; - public List CollectionIds { get; set; } = new(); - public string DocumentId { get; set; } = string.Empty; - - public ImportFileOptions() - { - } - - public ImportFileOptions(string userId, string collectionId) - : this(userId, collectionId, string.Empty) - { - } - - public ImportFileOptions(string userId, string collectionId, string documentId) - { - this.UserId = userId; - this.CollectionIds.Add(collectionId); - this.DocumentId = documentId; - } - - public ImportFileOptions(string userId, List collectionIds, string documentId) - { - this.UserId = userId; - this.CollectionIds = collectionIds; - this.DocumentId = documentId; - } - - public void Sanitize() - { - if (string.IsNullOrEmpty(this.DocumentId)) - { - // note: the ID doesn't include the full date, to avoid "personal" details - this.DocumentId = Guid.NewGuid().ToString("D") + "-" + DateTimeOffset.UtcNow.ToString("ss.fffffff", CultureInfo.InvariantCulture); - } - } - - public void Validate() - { - if (string.IsNullOrEmpty(this.UserId)) - { - throw new ArgumentNullException(nameof(this.UserId), "User ID is empty"); - } - - if (this.CollectionIds.Count < 1) - { - throw new ArgumentNullException(nameof(this.CollectionIds), "The list of collections is empty"); - } - } -} diff --git a/lib/dotnet/Core.NetStandard20/MemoryWebClient.cs b/lib/dotnet/Core.NetStandard20/MemoryWebClient.cs new file mode 100644 index 000000000..72f45126f --- /dev/null +++ b/lib/dotnet/Core.NetStandard20/MemoryWebClient.cs @@ -0,0 +1,129 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Net.Http; +using System.Threading.Tasks; + +namespace Microsoft.SemanticMemory.Core20; + +public class MemoryWebClient : ISemanticMemoryClient +{ + private readonly HttpClient _client; + + public MemoryWebClient(string endpoint) : this(endpoint, new HttpClient()) + { + } + + public MemoryWebClient(string endpoint, HttpClient client) + { + this._client = client; + this._client.BaseAddress = new Uri(endpoint); + } + + /// + public Task ImportFileAsync(Document file) + { + return this.ImportFileInternalAsync(file); + } + + /// + public Task> ImportFilesAsync(Document[] files) + { + return this.ImportFilesInternalAsync(files); + } + + /// + public Task ImportFileAsync(string fileName) + { + return this.ImportFileAsync(new Document(fileName)); + } + + /// + public Task ImportFileAsync(string fileName, DocumentDetails details) + { + return this.ImportFileInternalAsync(new Document(fileName) { Details = details }); + } + + public async Task AskAsync(string question, string userId) + { + // Work in progress + + await Task.Delay(0).ConfigureAwait(false); + + return "...work in progress..."; + } + + public async Task AskAsync(string question) + { + await Task.Delay(0).ConfigureAwait(false); + return "...work in progress..."; + } + + #region private + + private async Task> ImportFilesInternalAsync(Document[] files) + { + List docIds = new(); + foreach (Document file in files) + { + docIds.Add(await this.ImportFileInternalAsync(file).ConfigureAwait(false)); + } + + return docIds; + } + + private async Task ImportFileInternalAsync(Document file) + { + // Populate form with values and files from disk + using var formData = new MultipartFormDataContent(); + + using StringContent documentIdContent = new(file.Details.DocumentId); + using (StringContent userContent = new(file.Details.UserId)) + { + List disposables = new(); + formData.Add(documentIdContent, Constants.WebServiceDocumentIdField); + formData.Add(userContent, Constants.WebServiceUserIdField); + + foreach (var tag in file.Details.Tags.Pairs) + { + var tagContent = new StringContent(tag.Value); + disposables.Add(tagContent); + formData.Add(tagContent, tag.Key); + } + + byte[] bytes = File.ReadAllBytes(file.FileName); + var fileContent = new ByteArrayContent(bytes, 0, bytes.Length); + disposables.Add(fileContent); + formData.Add(fileContent, "file1", file.FileName); + + // Send HTTP request + try + { + HttpResponseMessage? response = await this._client.PostAsync("/upload", formData).ConfigureAwait(false); + formData.Dispose(); + response.EnsureSuccessStatusCode(); + } + catch (HttpRequestException e) when (e.Data.Contains("StatusCode")) + { + throw new SemanticMemoryWebException($"{e.Message} [StatusCode: {e.Data["StatusCode"]}]", e); + } + catch (Exception e) + { + throw new SemanticMemoryWebException(e.Message, e); + } + finally + { + foreach (var disposable in disposables) + { + disposable.Dispose(); + } + } + } + + return file.Details.DocumentId; + } + + #endregion +} diff --git a/lib/dotnet/Core.NetStandard20/TagCollection.cs b/lib/dotnet/Core.NetStandard20/TagCollection.cs new file mode 100644 index 000000000..31ae7ec20 --- /dev/null +++ b/lib/dotnet/Core.NetStandard20/TagCollection.cs @@ -0,0 +1,147 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Collections; +using System.Collections.Generic; +using System.Linq; + +namespace Microsoft.SemanticMemory.Core20; + +// JSON serializable alternative to NameValueCollection +public class TagCollection : IDictionary> +{ + private readonly IDictionary> _data = new Dictionary>(); + + public ICollection Keys { get { return this._data.Keys; } } + + public ICollection> Values { get { return this._data.Values; } } + + public IEnumerable> Pairs + { + get + { + return from key in this._data.Keys + from value in this._data[key] + select new KeyValuePair(key, value); + } + } + + public int Count { get { return this._data.Count; } } + + public bool IsReadOnly { get { return this._data.IsReadOnly; } } + + public List this[string key] + { + get => this._data[key]; + set + { + ValidateKey(key); + this._data[key] = value; + } + } + + public IEnumerator>> GetEnumerator() + { + return this._data.GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return this.GetEnumerator(); + } + + public void Add(KeyValuePair> item) + { + ValidateKey(item.Key); + this._data.Add(item); + } + + public void Add(string key) + { + if (!this._data.ContainsKey(key)) + { + this._data[key] = new List(); + } + } + + public void Add(string key, string? value) + { + ValidateKey(key); + // If the key exists + if (this._data.TryGetValue(key, out List list) && list != null) + { + if (value != null) { list.Add(value); } + } + else + { + // Add the key, but the value only if not null + this._data[key] = value == null ? new List() : new List { value }; + } + } + + public void Add(string key, List value) + { + ValidateKey(key); + this._data.Add(key, value); + } + + public bool TryGetValue(string key, out List value) + { + return this._data.TryGetValue(key, out value); + } + + public bool Contains(KeyValuePair> item) + { + return this._data.Contains(item); + } + + public bool ContainsKey(string key) + { + return this._data.ContainsKey(key); + } + + public void CopyTo(KeyValuePair>[] array, int arrayIndex) + { + this._data.CopyTo(array, arrayIndex); + } + + public void CopyTo(TagCollection tagCollection) + { + foreach (string key in this._data.Keys) + { + if (this._data[key] == null || this._data[key].Count == 0) + { + tagCollection.Add(key); + } + else + { + foreach (string? value in this._data[key]) + { + tagCollection.Add(key, value); + } + } + } + } + + public bool Remove(KeyValuePair> item) + { + return this._data.Remove(item); + } + + public bool Remove(string key) + { + return this._data.Remove(key); + } + + public void Clear() + { + this._data.Clear(); + } + + private static void ValidateKey(string key) + { + if (key.Contains("=")) + { + throw new SemanticMemoryException("A tag name cannot contain the '=' symbol"); + } + } +} diff --git a/lib/dotnet/Core/Configuration/SemanticMemoryConfig.cs b/lib/dotnet/Core/Configuration/SemanticMemoryConfig.cs index 3ff0b9d74..939ad19e0 100644 --- a/lib/dotnet/Core/Configuration/SemanticMemoryConfig.cs +++ b/lib/dotnet/Core/Configuration/SemanticMemoryConfig.cs @@ -42,7 +42,6 @@ public class SemanticMemoryConfig return new T(); } - // return section.GetSection(sectionName).Get() ?? new T(); return section.Get() ?? new T(); } diff --git a/lib/dotnet/Core/ContentStorage/AzureBlob.cs b/lib/dotnet/Core/ContentStorage/AzureBlob.cs index f7e499fb7..982bf956d 100644 --- a/lib/dotnet/Core/ContentStorage/AzureBlob.cs +++ b/lib/dotnet/Core/ContentStorage/AzureBlob.cs @@ -16,6 +16,7 @@ namespace Microsoft.SemanticMemory.Core.ContentStorage; +// TODO: a container can contain up to 50000 blocks public class AzureBlob : IContentStorage { private readonly BlobContainerClient _containerClient; diff --git a/lib/dotnet/Core/Core.csproj b/lib/dotnet/Core/Core.csproj index d0c41c168..38d879084 100644 --- a/lib/dotnet/Core/Core.csproj +++ b/lib/dotnet/Core/Core.csproj @@ -7,6 +7,10 @@ CA1711,CA1724,CA1308 + + + + diff --git a/lib/dotnet/Core/Handlers/SaveEmbeddingsHandler.cs b/lib/dotnet/Core/Handlers/SaveEmbeddingsHandler.cs index d2e7b834c..d2352f298 100644 --- a/lib/dotnet/Core/Handlers/SaveEmbeddingsHandler.cs +++ b/lib/dotnet/Core/Handlers/SaveEmbeddingsHandler.cs @@ -14,6 +14,7 @@ using Microsoft.SemanticMemory.Core.Diagnostics; using Microsoft.SemanticMemory.Core.MemoryStorage; using Microsoft.SemanticMemory.Core.Pipeline; +using Microsoft.SemanticMemory.Core20; namespace Microsoft.SemanticMemory.Core.Handlers; @@ -61,7 +62,7 @@ public SaveEmbeddingsHandler( public async Task<(bool success, DataPipeline updatedPipeline)> InvokeAsync(DataPipeline pipeline, CancellationToken cancellationToken) { // For each embedding file => For each Vector DB => Store vector (collections ==> tags) - foreach (var embeddingFile in pipeline.Files.SelectMany(x => x.GeneratedFiles.Where(f => f.Value.IsEmbeddingFile()))) + foreach (KeyValuePair embeddingFile in pipeline.Files.SelectMany(x => x.GeneratedFiles.Where(f => f.Value.IsEmbeddingFile()))) { foreach (object storageConfig in this._vectorDbs) { @@ -80,15 +81,13 @@ public SaveEmbeddingsHandler( Owner = pipeline.UserId, }; - record.Tags.Add("user", pipeline.UserId); - record.Tags.Add("upload_id", pipeline.Id); - record.Tags.Add("file", embeddingFile.Value.ParentId); - record.Tags.Add("file_type", pipeline.GetFile(embeddingFile.Value.ParentId).Type); - record.Tags.Add("file_partition", embeddingFile.Value.Id); - foreach (var collectionId in pipeline.CollectionIds) - { - record.Tags.Add("collection", collectionId); - } + record.Tags.Add(Constants.ReservedUserIdTag, pipeline.UserId); + record.Tags.Add(Constants.ReservedDocIdTag, pipeline.Id); + record.Tags.Add(Constants.ReservedFileIdTag, embeddingFile.Value.ParentId); + record.Tags.Add(Constants.ReservedFilePartitionTag, embeddingFile.Value.Id); + record.Tags.Add(Constants.ReservedFileTypeTag, pipeline.GetFile(embeddingFile.Value.ParentId).Type); + + pipeline.Tags.CopyTo(record.Tags); record.Metadata.Add("file_name", pipeline.GetFile(embeddingFile.Value.ParentId).Name); record.Metadata.Add("vector_provider", embeddingData.GeneratorProvider); diff --git a/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemory.cs b/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemory.cs index e6646af85..ad796e6ef 100644 --- a/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemory.cs +++ b/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemory.cs @@ -13,6 +13,8 @@ using Azure.Search.Documents.Indexes; using Azure.Search.Documents.Indexes.Models; using Azure.Search.Documents.Models; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; using Microsoft.SemanticKernel.AI.Embeddings; using Microsoft.SemanticMemory.Core.Configuration; using Microsoft.SemanticMemory.Core.Diagnostics; @@ -21,7 +23,12 @@ namespace Microsoft.SemanticMemory.Core.MemoryStorage; public class AzureCognitiveSearchMemory { - public AzureCognitiveSearchMemory(string endpoint, string apiKey) + private readonly ILogger _log; + + public AzureCognitiveSearchMemory( + string endpoint, + string apiKey, + ILogger? log = null) { if (string.IsNullOrEmpty(endpoint)) { @@ -33,6 +40,8 @@ public AzureCognitiveSearchMemory(string endpoint, string apiKey) throw new ConfigurationException("Azure Cognitive Search API key is empty"); } + this._log = log ?? NullLogger.Instance; + AzureKeyCredential credentials = new(apiKey); this._adminClient = new SearchIndexClient(new Uri(endpoint), credentials, GetClientOptions()); } @@ -54,7 +63,15 @@ public async Task CreateCollectionAsync(string collectionName, VectorDbSchema sc } var indexSchema = PrepareIndexSchema(collectionName, schema); - await this._adminClient.CreateIndexAsync(indexSchema, cancellationToken).ConfigureAwait(false); + + try + { + await this._adminClient.CreateIndexAsync(indexSchema, cancellationToken).ConfigureAwait(false); + } + catch (RequestFailedException e) when (e.Status == 409) + { + this._log.LogWarning(e, "Index already exists, nothing to do"); + } } public Task DeleteCollectionAsync(string collectionName, CancellationToken cancellationToken = default) diff --git a/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemoryRecord.cs b/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemoryRecord.cs index ef92f197a..a47d92fea 100644 --- a/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemoryRecord.cs +++ b/lib/dotnet/Core/MemoryStorage/AzureCognitiveSearchMemoryRecord.cs @@ -2,12 +2,12 @@ using System; using System.Collections.Generic; -using System.Collections.Specialized; using System.Linq; using System.Text; using System.Text.Json; using System.Text.Json.Serialization; using Microsoft.SemanticKernel.AI.Embeddings; +using Microsoft.SemanticMemory.Core20; namespace Microsoft.SemanticMemory.Core.MemoryStorage; @@ -81,7 +81,7 @@ public MemoryRecord ToMemoryRecord(bool withEmbedding = true) result.Vector = new Embedding(this.Vector); } - NameValueCollection tags = new(); + TagCollection tags = new(); foreach (string[] keyValue in this.Tags.Select(tag => tag.Split('=', 2))) { tags.Add(keyValue[0], keyValue.Length == 1 ? null : keyValue[1]); @@ -101,24 +101,9 @@ public static AzureCognitiveSearchMemoryRecord FromMemoryRecord(MemoryRecord rec Metadata = JsonSerializer.Serialize(record.Metadata, s_jsonOptions) }; - foreach (string? key in record.Tags.Keys) + foreach (var tag in record.Tags.Pairs) { - if (key == null) { continue; } - - if (key.Contains('=', StringComparison.Ordinal)) { throw new AzureCognitiveSearchMemoryException("A tag name cannot contain the '=' symbol"); } - - string[]? values = record.Tags.GetValues(key); - if (values == null) - { - result.Tags.Add($"{key}"); - } - else - { - foreach (var value in values) - { - result.Tags.Add($"{key}={value}"); - } - } + result.Tags.Add($"{tag.Key}={tag.Value}"); } return result; diff --git a/lib/dotnet/Core/MemoryStorage/MemoryRecord.cs b/lib/dotnet/Core/MemoryStorage/MemoryRecord.cs index e7c5eadaa..7d3193d79 100644 --- a/lib/dotnet/Core/MemoryStorage/MemoryRecord.cs +++ b/lib/dotnet/Core/MemoryStorage/MemoryRecord.cs @@ -1,8 +1,8 @@ // Copyright (c) Microsoft. All rights reserved. using System.Collections.Generic; -using System.Collections.Specialized; using Microsoft.SemanticKernel.AI.Embeddings; +using Microsoft.SemanticMemory.Core20; namespace Microsoft.SemanticMemory.Core.MemoryStorage; @@ -42,7 +42,7 @@ public class MemoryRecord /// * versioning, e.g. [ "LLM=AzureAda2", "Schema=1.0" ] /// * etc. /// - public NameValueCollection Tags { get; set; } = new(); + public TagCollection Tags { get; set; } = new(); /// /// Optional Non-Searchable metadata processed client side. diff --git a/lib/dotnet/Core/Pipeline/BaseOrchestrator.cs b/lib/dotnet/Core/Pipeline/BaseOrchestrator.cs index 87f25f5e6..201c2e423 100644 --- a/lib/dotnet/Core/Pipeline/BaseOrchestrator.cs +++ b/lib/dotnet/Core/Pipeline/BaseOrchestrator.cs @@ -10,13 +10,12 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.SemanticMemory.Core.ContentStorage; +using Microsoft.SemanticMemory.Core20; namespace Microsoft.SemanticMemory.Core.Pipeline; public abstract class BaseOrchestrator : IPipelineOrchestrator, IDisposable { - protected const string StatusFile = "__pipeline_status.json"; - protected IContentStorage ContentStorage { get; private set; } protected ILogger Log { get; private set; } protected CancellationTokenSource CancellationTokenSource { get; private set; } @@ -43,23 +42,26 @@ protected BaseOrchestrator( public abstract Task RunPipelineAsync(DataPipeline pipeline, CancellationToken cancellationToken = default); /// - public DataPipeline PrepareNewFileUploadPipeline(string id, string userId, IEnumerable collectionIds) + public DataPipeline PrepareNewFileUploadPipeline( + string documentId, + string userId, + TagCollection tags) { - return this.PrepareNewFileUploadPipeline(id, userId, collectionIds, new List()); + return this.PrepareNewFileUploadPipeline(documentId, userId, tags, new List()); } /// public DataPipeline PrepareNewFileUploadPipeline( - string id, + string documentId, string userId, - IEnumerable collectionIds, + TagCollection tags, IEnumerable filesToUpload) { var pipeline = new DataPipeline { - Id = id, + Id = documentId, UserId = userId, - CollectionIds = collectionIds.ToList(), + Tags = tags, Creation = DateTimeOffset.UtcNow, LastUpdate = DateTimeOffset.UtcNow, FilesToUpload = filesToUpload.ToList(), @@ -134,13 +136,13 @@ protected async Task UploadFilesAsync(DataPipeline pipeline, CancellationToken c /// Whether to throw exceptions or just log them protected async Task UpdatePipelineStatusAsync(DataPipeline pipeline, CancellationToken cancellationToken, bool ignoreExceptions = false) { - this.Log.LogInformation("Saving pipeline status to {0}/{1}", pipeline.Id, StatusFile); + this.Log.LogDebug("Saving pipeline status to {0}/{1}", pipeline.Id, Constants.PipelineStatusFilename); try { var dirPath = this.ContentStorage.JoinPaths(pipeline.UserId, pipeline.Id); await this.ContentStorage.WriteTextFileAsync( dirPath, - StatusFile, + Constants.PipelineStatusFilename, ToJson(pipeline, true), cancellationToken) .ConfigureAwait(false); @@ -177,13 +179,13 @@ private async Task UploadFormFilesAsync(DataPipeline pipeline, CancellationToken foreach (IFormFile file in pipeline.FilesToUpload) { - if (string.Equals(file.FileName, StatusFile, StringComparison.OrdinalIgnoreCase)) + if (string.Equals(file.FileName, Constants.PipelineStatusFilename, StringComparison.OrdinalIgnoreCase)) { this.Log.LogError("Invalid file name, upload not supported: {0}", file.FileName); continue; } - this.Log.LogInformation("Uploading file: {0}", file.FileName); + this.Log.LogDebug("Uploading file: {0}", file.FileName); var size = await this.ContentStorage.WriteStreamAsync(dirPath, file.FileName, file.OpenReadStream(), cancellationToken).ConfigureAwait(false); pipeline.Files.Add(new DataPipeline.FileDetails { diff --git a/lib/dotnet/Core/Pipeline/DataPipeline.cs b/lib/dotnet/Core/Pipeline/DataPipeline.cs index 8ccd76735..a8d907563 100644 --- a/lib/dotnet/Core/Pipeline/DataPipeline.cs +++ b/lib/dotnet/Core/Pipeline/DataPipeline.cs @@ -6,8 +6,8 @@ using System.Linq; using System.Text.Json.Serialization; using Microsoft.AspNetCore.Http; -using Microsoft.AspNetCore.Http.Internal; using Microsoft.SemanticMemory.Core.Diagnostics; +using Microsoft.SemanticMemory.Core20; namespace Microsoft.SemanticMemory.Core.Pipeline; @@ -151,8 +151,8 @@ public string GetPartitionFileName(int partitionNumber) public string UserId { get; set; } = string.Empty; [JsonPropertyOrder(6)] - [JsonPropertyName("collections")] - public List CollectionIds { get; set; } = new(); + [JsonPropertyName("tags")] + public TagCollection Tags { get; set; } = new(); [JsonPropertyOrder(7)] [JsonPropertyName("creation")] diff --git a/lib/dotnet/Core/Pipeline/IPipelineOrchestrator.cs b/lib/dotnet/Core/Pipeline/IPipelineOrchestrator.cs index 2fb7e6a99..68fd5d74f 100644 --- a/lib/dotnet/Core/Pipeline/IPipelineOrchestrator.cs +++ b/lib/dotnet/Core/Pipeline/IPipelineOrchestrator.cs @@ -5,6 +5,7 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.AspNetCore.Http; +using Microsoft.SemanticMemory.Core20; namespace Microsoft.SemanticMemory.Core.Pipeline; @@ -27,21 +28,21 @@ public interface IPipelineOrchestrator /// /// Create a new pipeline value object for files upload /// - /// Id of the pipeline instance. This value will persist throughout the pipeline and final data lineage used for citations. + /// Id of the pipeline instance. This value will persist throughout the pipeline and final data lineage used for citations. /// Primary user who the data belongs to. Other users, e.g. sharing, is not supported in the pipeline at this time. - /// List of collections where to store the semantic memory extracted from the files. E.g. "chat ID", "personal", etc. + /// List of key-value pairs, used to organize and label the memories. E.g. "type", "category", etc. Multiple values per key are allowed. /// List of files provided before starting the pipeline, to be uploaded into the container before starting. /// Pipeline representation - DataPipeline PrepareNewFileUploadPipeline(string id, string userId, IEnumerable collectionIds, IEnumerable filesToUpload); + DataPipeline PrepareNewFileUploadPipeline(string documentId, string userId, TagCollection tags, IEnumerable filesToUpload); /// /// Create a new pipeline value object, with an empty list of files /// - /// Id of the pipeline instance. This value will persist throughout the pipeline and final data lineage used for citations. + /// Id of the pipeline instance. This value will persist throughout the pipeline and final data lineage used for citations. /// Primary user who the data belongs to. Other users, e.g. sharing, is not supported in the pipeline at this time. - /// List of collections where to store the semantic memory extracted from the files. E.g. "chat ID", "personal", etc. + /// List of key-value pairs, used to organize and label the memories. E.g. "type", "category", etc. Multiple values per key are allowed. /// Pipeline representation - DataPipeline PrepareNewFileUploadPipeline(string id, string userId, IEnumerable collectionIds); + DataPipeline PrepareNewFileUploadPipeline(string documentId, string userId, TagCollection tags); /// /// Start a new data pipeline execution diff --git a/clients/dotnet/MemoryPipelineClient/MemoryPipelineClient.cs b/lib/dotnet/Core/Pipeline/MemoryPipelineClient.cs similarity index 65% rename from clients/dotnet/MemoryPipelineClient/MemoryPipelineClient.cs rename to lib/dotnet/Core/Pipeline/MemoryPipelineClient.cs index 67c823fc1..ba0d6bdf1 100644 --- a/clients/dotnet/MemoryPipelineClient/MemoryPipelineClient.cs +++ b/lib/dotnet/Core/Pipeline/MemoryPipelineClient.cs @@ -1,6 +1,8 @@ // Copyright (c) Microsoft. All rights reserved. using System; +using System.Collections.Generic; +using System.Linq; using System.Threading.Tasks; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; @@ -8,41 +10,49 @@ using Microsoft.SemanticMemory.Core.Configuration; using Microsoft.SemanticMemory.Core.Diagnostics; using Microsoft.SemanticMemory.Core.Handlers; -using Microsoft.SemanticMemory.Core.Pipeline; using Microsoft.SemanticMemory.Core20; -namespace Microsoft.SemanticMemory.PipelineClient; +namespace Microsoft.SemanticMemory.Core.Pipeline; public class MemoryPipelineClient : ISemanticMemoryClient { - private readonly SemanticMemoryConfig _config; - private readonly Lazy> _inProcessOrchestrator = new(BuildInProcessOrchestratorAsync); + public MemoryPipelineClient() : this(SemanticMemoryConfig.LoadFromAppSettings()) + { + } - private Task Orchestrator + public MemoryPipelineClient(SemanticMemoryConfig config) { - get { return this._inProcessOrchestrator.Value; } + this._config = config; } - public MemoryPipelineClient() : this(SemanticMemoryConfig.LoadFromAppSettings()) + /// + public async Task ImportFileAsync(Document file) { + var ids = await this.ImportFilesAsync(new[] { file }).ConfigureAwait(false); + return ids.First(); } - public MemoryPipelineClient(SemanticMemoryConfig config) + /// + public Task> ImportFilesAsync(Document[] files) { - this._config = config; + return this.ImportFilesInternalAsync(files); } - public Task ImportFileAsync(string file, ImportFileOptions options) + /// + public Task ImportFileAsync(string fileName) { - return this.ImportFilesInternalAsync(new[] { file }, options); + return this.ImportFileAsync(new Document(fileName)); } - public Task ImportFilesAsync(string[] files, ImportFileOptions options) + /// + public async Task ImportFileAsync(string fileName, DocumentDetails details) { - return this.ImportFilesInternalAsync(files, options); + var ids = await this.ImportFilesAsync(new[] { new Document(fileName) { Details = details } }).ConfigureAwait(false); + return ids.First(); } - public async Task AskAsync(string question, string owner) + /// + public async Task AskAsync(string question, string userId) { // Work in progress @@ -51,32 +61,45 @@ public async Task AskAsync(string question, string owner) return "...work in progress..."; } - private async Task ImportFilesInternalAsync(string[] files, ImportFileOptions options) + #region private + + private readonly SemanticMemoryConfig _config; + private readonly Lazy> _inProcessOrchestrator = new(BuildInProcessOrchestratorAsync); + + private Task Orchestrator { - options.Sanitize(); - options.Validate(); + get { return this._inProcessOrchestrator.Value; } + } + private async Task> ImportFilesInternalAsync(Document[] files) + { + List ids = new(); InProcessPipelineOrchestrator orchestrator = await this.Orchestrator.ConfigureAwait(false); - var pipeline = orchestrator - .PrepareNewFileUploadPipeline(options.DocumentId, options.UserId, options.CollectionIds); - - // Include all files - for (int index = 0; index < files.Length; index++) + foreach (Document file in files) { - string file = files[index]; - pipeline.AddUploadFile($"file{index + 1}", file, file); + var pipeline = orchestrator + .PrepareNewFileUploadPipeline( + documentId: file.Details.DocumentId, + userId: file.Details.UserId, file.Details.Tags); + + pipeline.AddUploadFile( + name: "file1", + filename: file.FileName, + sourceFile: file.FileName); + + pipeline + .Then("extract") + .Then("partition") + .Then("gen_embeddings") + .Then("save_embeddings") + .Build(); + + await orchestrator.RunPipelineAsync(pipeline).ConfigureAwait(false); + ids.Add(file.Details.DocumentId); } - pipeline - .Then("extract") - .Then("partition") - .Then("gen_embeddings") - .Then("save_embeddings") - .Build(); - - // Execute pipeline - await orchestrator.RunPipelineAsync(pipeline).ConfigureAwait(false); + return ids; } private static async Task BuildInProcessOrchestratorAsync() @@ -137,4 +160,6 @@ private static SemanticMemoryConfig GetConfig(IServiceProvider services) { return services.GetService>(); } + + #endregion } diff --git a/lib/dotnet/Core/WebService/Endpoints.cs b/lib/dotnet/Core/WebService/Endpoints.cs new file mode 100644 index 000000000..8a783ba04 --- /dev/null +++ b/lib/dotnet/Core/WebService/Endpoints.cs @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Http; +using Microsoft.Extensions.Logging; +using Microsoft.SemanticMemory.Core.Pipeline; + +namespace Microsoft.SemanticMemory.Core.WebService; + +public static class Endpoints +{ + public static async Task UploadAsync( + WebApplication app, + HttpRequest request, + IPipelineOrchestrator orchestrator, + ILogger log) + { + log.LogTrace("New upload request"); + + // Note: .NET doesn't yet support binding multipart forms including data and files + (UploadRequest input, bool isValid, string errMsg) = await UploadRequest.BindHttpRequestAsync(request).ConfigureAwait(false); + + if (!isValid) + { +#pragma warning disable CA2254 // The log msg template should be a constant + // ReSharper disable once TemplateIsNotCompileTimeConstantProblem + log.LogError(errMsg); +#pragma warning restore CA2254 + return Results.BadRequest(errMsg); + } + + log.LogInformation("Queueing upload of {0} files for further processing [request {1}]", input.Files.Count(), input.DocumentId); + + // Define all the steps in the pipeline + var pipeline = orchestrator + .PrepareNewFileUploadPipeline( + documentId: input.DocumentId, + userId: input.UserId, input.Tags, input.Files) + .Then("extract") + .Then("partition") + .Then("gen_embeddings") + .Then("save_embeddings") + .Build(); + + try + { + await orchestrator.RunPipelineAsync(pipeline).ConfigureAwait(false); + } +#pragma warning disable CA1031 // Must catch all to log and keep the process alive + catch (Exception e) + { + app.Logger.LogError(e, "Pipeline start failed"); + return Results.Problem( + title: "Upload failed", + detail: e.Message, + statusCode: 503); + } +#pragma warning restore CA1031 + + return Results.Accepted($"/upload-status?id={pipeline.Id}", new + { + Id = pipeline.Id, + Message = "Upload completed, pipeline started", + Count = input.Files.Count() + }); + } +} diff --git a/lib/dotnet/Core/WebService/UploadRequest.cs b/lib/dotnet/Core/WebService/UploadRequest.cs index 35c4fc49d..18c71abb7 100644 --- a/lib/dotnet/Core/WebService/UploadRequest.cs +++ b/lib/dotnet/Core/WebService/UploadRequest.cs @@ -7,6 +7,7 @@ using System.Threading.Tasks; using Microsoft.AspNetCore.Http; using Microsoft.Extensions.Primitives; +using Microsoft.SemanticMemory.Core20; namespace Microsoft.SemanticMemory.Core.WebService; @@ -14,7 +15,7 @@ public class UploadRequest { public string DocumentId { get; set; } = string.Empty; public string UserId { get; set; } = string.Empty; - public IEnumerable CollectionIds { get; set; } = new List(); + public TagCollection Tags { get; set; } = new(); public IEnumerable Files { get; set; } = new List(); /* Resources: @@ -25,9 +26,8 @@ public class UploadRequest */ public static async Task<(UploadRequest model, bool isValid, string errMsg)> BindHttpRequestAsync(HttpRequest httpRequest) { - const string UserField = "user"; - const string CollectionsField = "collections"; - const string DocumentIdField = "documentId"; + string userIdField = Constants.WebServiceUserIdField; + string documentIdField = Constants.WebServiceDocumentIdField; var result = new UploadRequest(); @@ -47,30 +47,32 @@ public class UploadRequest } // TODO: extract user ID from auth headers - if (!form.TryGetValue(UserField, out StringValues userIds) || userIds.Count != 1 || string.IsNullOrEmpty(userIds[0])) + if (!form.TryGetValue(userIdField, out StringValues userIds) || userIds.Count != 1 || string.IsNullOrEmpty(userIds[0])) { - return (result, false, $"Invalid or missing user ID, '{UserField}' value empty or not found, or multiple values provided"); + return (result, false, $"Invalid or missing user ID, '{userIdField}' value empty or not found, or multiple values provided"); } - // At least one collection must be specified. Note: the pipeline might decide to ignore the specified collections, - // i.e. custom pipelines can override/ignore this value, depending on the implementation chosen. - if (!form.TryGetValue(CollectionsField, out StringValues collectionIds) || collectionIds.Count == 0 || collectionIds.Any(string.IsNullOrEmpty)) + if (form.TryGetValue(documentIdField, out StringValues documentIds) && documentIds.Count > 1) { - return (result, false, $"Invalid or missing collection ID, '{CollectionsField}' list is empty or contains empty values"); - } - - if (form.TryGetValue(DocumentIdField, out StringValues documentIds) && documentIds.Count > 1) - { - return (result, false, $"Invalid document ID, '{DocumentIdField}' must be a single value, not a list"); + return (result, false, $"Invalid document ID, '{documentIdField}' must be a single value, not a list"); } // Document Id is optional, e.g. used if the client wants to retry the same upload, otherwise we generate a random/unique one result.DocumentId = documentIds.FirstOrDefault() ?? DateTimeOffset.Now.ToString("yyyyMMdd.HHmmss.", CultureInfo.InvariantCulture) + Guid.NewGuid().ToString("N"); - result.UserId = userIds[0]!; - result.CollectionIds = collectionIds; result.Files = form.Files; + // Store any extra field as a tag + foreach (string key in form.Keys) + { + if (key == documentIdField || key == userIdField || !form.TryGetValue(key, out StringValues values)) { continue; } + + foreach (var x in values) + { + result.Tags.Add(key, x); + } + } + return (result, true, string.Empty); } } diff --git a/server/combinedservices-dotnet/Program.cs b/server/combinedservices-dotnet/Program.cs index bd08e45be..aedfd9463 100644 --- a/server/combinedservices-dotnet/Program.cs +++ b/server/combinedservices-dotnet/Program.cs @@ -66,61 +66,11 @@ // Simple ping endpoint app.MapGet("/", () => Results.Ok("Ingestion service is running. " + "Uptime: " + (DateTimeOffset.UtcNow.ToUnixTimeSeconds() - start.ToUnixTimeSeconds()) + " secs")); - // File upload endpoint app.MapPost("/upload", async Task ( HttpRequest request, IPipelineOrchestrator orchestrator, - ILogger log) => -{ - log.LogTrace("New upload request"); - - // Note: .NET doesn't yet support binding multipart forms including data and files - (UploadRequest input, bool isValid, string errMsg) = await UploadRequest.BindHttpRequestAsync(request); - - if (!isValid) - { -#pragma warning disable CA2254 // The log msg template should be a constant - // ReSharper disable once TemplateIsNotCompileTimeConstantProblem - log.LogError(errMsg); -#pragma warning restore CA2254 - return Results.BadRequest(errMsg); - } - - log.LogInformation("Queueing upload of {0} files for further processing [request {1}]", input.Files.Count(), input.DocumentId); - var containerId = $"usr.{input.UserId}.op.{input.DocumentId}"; - - // Define all the steps in the pipeline - var pipeline = orchestrator - .PrepareNewFileUploadPipeline(containerId, input.UserId, input.CollectionIds, input.Files) - .Then("extract") - .Then("partition") - .Then("gen_embeddings") - .Then("save_embeddings") - .Build(); - - try - { - await orchestrator.RunPipelineAsync(pipeline); - } -#pragma warning disable CA1031 // Must catch all to log and keep the process alive - catch (Exception e) - { - app.Logger.LogError(e, "Pipeline start failed"); - return Results.Problem( - title: "Upload failed", - detail: e.Message, - statusCode: 503); - } -#pragma warning restore CA1031 - - return Results.Accepted($"/upload-status?id={pipeline.Id}", new - { - Id = pipeline.Id, - Message = "Upload completed, pipeline started", - Count = input.Files.Count() - }); -}); + ILogger log) => await Endpoints.UploadAsync(app, request, orchestrator, log)); app.Logger.LogInformation( "Starting web service, Log Level: {0}, .NET Env: {1}, Orchestration: {2}", diff --git a/server/webservice-dotnet/Program.cs b/server/webservice-dotnet/Program.cs index 382bf95f5..68b1a475f 100644 --- a/server/webservice-dotnet/Program.cs +++ b/server/webservice-dotnet/Program.cs @@ -64,56 +64,7 @@ app.MapPost("/upload", async Task ( HttpRequest request, IPipelineOrchestrator orchestrator, - ILogger log) => -{ - log.LogTrace("New upload request"); - - // Note: .NET doesn't yet support binding multipart forms including data and files - (UploadRequest input, bool isValid, string errMsg) = await UploadRequest.BindHttpRequestAsync(request); - - if (!isValid) - { -#pragma warning disable CA2254 // The log msg template should be a constant - // ReSharper disable once TemplateIsNotCompileTimeConstantProblem - log.LogError(errMsg); -#pragma warning restore CA2254 - return Results.BadRequest(errMsg); - } - - log.LogInformation("Queueing upload of {0} files for further processing [request {1}]", input.Files.Count(), input.DocumentId); - var containerId = $"usr.{input.UserId}.op.{input.DocumentId}"; - - // Define all the steps in the pipeline - var pipeline = orchestrator - .PrepareNewFileUploadPipeline(containerId, input.UserId, input.CollectionIds, input.Files) - .Then("extract") - .Then("partition") - .Then("gen_embeddings") - .Then("save_embeddings") - .Build(); - - try - { - await orchestrator.RunPipelineAsync(pipeline); - } -#pragma warning disable CA1031 // Must catch all to log and keep the process alive - catch (Exception e) - { - app.Logger.LogError(e, "Pipeline start failed"); - return Results.Problem( - title: "Upload failed", - detail: e.Message, - statusCode: 503); - } -#pragma warning restore CA1031 - - return Results.Accepted($"/upload-status?id={pipeline.Id}", new - { - Id = pipeline.Id, - Message = "Upload completed, pipeline started", - Count = input.Files.Count() - }); -}); + ILogger log) => await Endpoints.UploadAsync(app, request, orchestrator, log)); app.Logger.LogInformation( "Starting web service, Log Level: {0}, .NET Env: {1}, Orchestration: {2}",