Skip to content

Commit f6daebd

Browse files
luismanez“luismanez”dluc
authored
New handler added to allow the deletion of the generated files. (#177)
Added a new Handler that allows the deletion of the generated "physical" (disk, blob...) files after a document has been imported. ## Motivation and Context (Why the change? What's the scenario?) This should address this question: https://github.com/microsoft/kernel-memory/issues/107 ## High level description (Approach, Design) This new handler has not been added to any default pipeline, so nothing changes for current clients. However, the Handler is not present, so it can be passed as _step_ when calling any of the current ```ImportDocumentAsync```` i.e: ```csharp await memory.ImportDocumentAsync( "sample-Wikipedia-Moon.txt", steps: new[] { Constants.PipelineStepsDeleteGeneratedFiles }); ``` --------- Co-authored-by: “luismanez” <“[email protected]”> Co-authored-by: Devis Lucato <[email protected]>
1 parent 50e28fa commit f6daebd

File tree

5 files changed

+92
-20
lines changed

5 files changed

+92
-20
lines changed

service/Abstractions/Constants.cs

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,37 @@ public static class Constants
5757
public const string HttpIndexPlaceholder = "{index}";
5858
public const string HttpDocumentIdPlaceholder = "{documentId}";
5959

60-
// Handlers
61-
public const string DeleteDocumentPipelineStepName = "private_delete_document";
62-
public const string DeleteIndexPipelineStepName = "private_delete_index";
60+
// Pipeline Handlers, Step names
61+
public const string PipelineStepsExtract = "extract";
62+
public const string PipelineStepsPartition = "partition";
63+
public const string PipelineStepsGenEmbeddings = "gen_embeddings";
64+
public const string PipelineStepsSaveRecords = "save_records";
65+
public const string PipelineStepsSummarize = "summarize";
66+
public const string PipelineStepsDeleteGeneratedFiles = "delete_generated_files";
67+
public const string PipelineStepsDeleteDocument = "private_delete_document";
68+
public const string PipelineStepsDeleteIndex = "private_delete_index";
6369

6470
// Pipeline steps
65-
public static readonly string[] DefaultPipeline = { "extract", "partition", "gen_embeddings", "save_records" };
66-
public static readonly string[] PipelineWithoutSummary = { "extract", "partition", "gen_embeddings", "save_records" };
67-
public static readonly string[] PipelineWithSummary = { "extract", "partition", "gen_embeddings", "save_records", "summarize", "gen_embeddings", "save_records" };
68-
public static readonly string[] PipelineOnlySummary = { "extract", "summarize", "gen_embeddings", "save_records" };
71+
public static readonly string[] DefaultPipeline =
72+
{
73+
PipelineStepsExtract, PipelineStepsPartition, PipelineStepsGenEmbeddings, PipelineStepsSaveRecords
74+
};
75+
76+
public static readonly string[] PipelineWithoutSummary =
77+
{
78+
PipelineStepsExtract, PipelineStepsPartition, PipelineStepsGenEmbeddings, PipelineStepsSaveRecords
79+
};
80+
81+
public static readonly string[] PipelineWithSummary =
82+
{
83+
PipelineStepsExtract, PipelineStepsPartition, PipelineStepsGenEmbeddings, PipelineStepsSaveRecords,
84+
PipelineStepsSummarize, PipelineStepsGenEmbeddings, PipelineStepsSaveRecords
85+
};
86+
87+
public static readonly string[] PipelineOnlySummary =
88+
{
89+
PipelineStepsExtract, PipelineStepsSummarize, PipelineStepsGenEmbeddings, PipelineStepsSaveRecords
90+
};
6991

7092
// Standard prompt names
7193
public const string PromptNamesSummarize = "summarize";

service/Abstractions/Pipeline/DataPipeline.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,12 +368,12 @@ public string RollbackToPreviousStep()
368368

369369
public bool IsDocumentDeletionPipeline()
370370
{
371-
return this.Steps.Count == 1 && this.Steps.First() == Constants.DeleteDocumentPipelineStepName;
371+
return this.Steps.Count == 1 && this.Steps.First() == Constants.PipelineStepsDeleteDocument;
372372
}
373373

374374
public bool IsIndexDeletionPipeline()
375375
{
376-
return this.Steps.Count == 1 && this.Steps.First() == Constants.DeleteIndexPipelineStepName;
376+
return this.Steps.Count == 1 && this.Steps.First() == Constants.PipelineStepsDeleteIndex;
377377
}
378378

379379
public void Validate()
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System.Threading;
4+
using System.Threading.Tasks;
5+
using Microsoft.Extensions.Logging;
6+
using Microsoft.KernelMemory.ContentStorage;
7+
using Microsoft.KernelMemory.Diagnostics;
8+
using Microsoft.KernelMemory.Pipeline;
9+
10+
namespace Microsoft.KernelMemory.Handlers;
11+
12+
public class DeleteGeneratedFilesHandler : IPipelineStepHandler
13+
{
14+
private readonly IContentStorage _contentStorage;
15+
private readonly ILogger<DeleteGeneratedFilesHandler> _log;
16+
17+
public string StepName { get; }
18+
19+
public DeleteGeneratedFilesHandler(
20+
string stepName,
21+
IContentStorage contentStorage,
22+
ILogger<DeleteGeneratedFilesHandler>? log = null)
23+
{
24+
this.StepName = stepName;
25+
this._contentStorage = contentStorage;
26+
this._log = log ?? DefaultLogger<DeleteGeneratedFilesHandler>.Instance;
27+
28+
this._log.LogInformation("Handler '{0}' ready", stepName);
29+
}
30+
31+
/// <inheritdoc />
32+
public async Task<(bool success, DataPipeline updatedPipeline)> InvokeAsync(
33+
DataPipeline pipeline, CancellationToken cancellationToken = default)
34+
{
35+
this._log.LogDebug("Deleting generated files, pipeline '{0}/{1}'", pipeline.Index, pipeline.DocumentId);
36+
37+
// Delete files, leaving the status file
38+
await this._contentStorage.EmptyDocumentDirectoryAsync(
39+
index: pipeline.Index,
40+
documentId: pipeline.DocumentId,
41+
cancellationToken).ConfigureAwait(false);
42+
43+
return (true, pipeline);
44+
}
45+
}

service/Core/KernelMemoryBuilder.cs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -510,10 +510,13 @@ private MemoryServerless BuildServerlessClient()
510510
=> ActivatorUtilities.CreateInstance<SaveRecordsHandler>(serviceProvider, "save_records"));
511511

512512
this._memoryServiceCollection.AddTransient<DeleteDocumentHandler>(serviceProvider
513-
=> ActivatorUtilities.CreateInstance<DeleteDocumentHandler>(serviceProvider, Constants.DeleteDocumentPipelineStepName));
513+
=> ActivatorUtilities.CreateInstance<DeleteDocumentHandler>(serviceProvider, Constants.PipelineStepsDeleteDocument));
514514

515515
this._memoryServiceCollection.AddTransient<DeleteIndexHandler>(serviceProvider
516-
=> ActivatorUtilities.CreateInstance<DeleteIndexHandler>(serviceProvider, Constants.DeleteIndexPipelineStepName));
516+
=> ActivatorUtilities.CreateInstance<DeleteIndexHandler>(serviceProvider, Constants.PipelineStepsDeleteIndex));
517+
518+
this._memoryServiceCollection.AddTransient<DeleteGeneratedFilesHandler>(serviceProvider
519+
=> ActivatorUtilities.CreateInstance<DeleteGeneratedFilesHandler>(serviceProvider, Constants.PipelineStepsDeleteGeneratedFiles));
517520
}
518521

519522
var serviceProvider = this._memoryServiceCollection.BuildServiceProvider();
@@ -544,6 +547,7 @@ private MemoryServerless BuildServerlessClient()
544547
memoryClientInstance.AddHandler(serviceProvider.GetService<SaveRecordsHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(SaveRecordsHandler)));
545548
memoryClientInstance.AddHandler(serviceProvider.GetService<DeleteDocumentHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(DeleteDocumentHandler)));
546549
memoryClientInstance.AddHandler(serviceProvider.GetService<DeleteIndexHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(DeleteIndexHandler)));
550+
memoryClientInstance.AddHandler(serviceProvider.GetService<DeleteGeneratedFilesHandler>() ?? throw new ConfigurationException("Unable to build " + nameof(DeleteGeneratedFilesHandler)));
547551
}
548552

549553
return memoryClientInstance;
@@ -583,13 +587,14 @@ private MemoryService BuildAsyncClient()
583587

584588
// Handlers - Register these handlers to run as hosted services in the caller app.
585589
// At start each hosted handler calls IPipelineOrchestrator.AddHandlerAsync() to register in the orchestrator.
586-
this._hostServiceCollection.AddHandlerAsHostedService<TextExtractionHandler>("extract");
587-
this._hostServiceCollection.AddHandlerAsHostedService<SummarizationHandler>("summarize");
588-
this._hostServiceCollection.AddHandlerAsHostedService<TextPartitioningHandler>("partition");
589-
this._hostServiceCollection.AddHandlerAsHostedService<GenerateEmbeddingsHandler>("gen_embeddings");
590-
this._hostServiceCollection.AddHandlerAsHostedService<SaveRecordsHandler>("save_records");
591-
this._hostServiceCollection.AddHandlerAsHostedService<DeleteDocumentHandler>(Constants.DeleteDocumentPipelineStepName);
592-
this._hostServiceCollection.AddHandlerAsHostedService<DeleteIndexHandler>(Constants.DeleteIndexPipelineStepName);
590+
this._hostServiceCollection.AddHandlerAsHostedService<TextExtractionHandler>(Constants.PipelineStepsExtract);
591+
this._hostServiceCollection.AddHandlerAsHostedService<TextPartitioningHandler>(Constants.PipelineStepsPartition);
592+
this._hostServiceCollection.AddHandlerAsHostedService<GenerateEmbeddingsHandler>(Constants.PipelineStepsGenEmbeddings);
593+
this._hostServiceCollection.AddHandlerAsHostedService<SaveRecordsHandler>(Constants.PipelineStepsSaveRecords);
594+
this._hostServiceCollection.AddHandlerAsHostedService<SummarizationHandler>(Constants.PipelineStepsSummarize);
595+
this._hostServiceCollection.AddHandlerAsHostedService<DeleteDocumentHandler>(Constants.PipelineStepsDeleteDocument);
596+
this._hostServiceCollection.AddHandlerAsHostedService<DeleteIndexHandler>(Constants.PipelineStepsDeleteIndex);
597+
this._hostServiceCollection.AddHandlerAsHostedService<DeleteGeneratedFilesHandler>(Constants.PipelineStepsDeleteGeneratedFiles);
593598
}
594599

595600
this.CheckForMissingDependencies();

service/Core/Pipeline/BaseOrchestrator.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ protected static DataPipeline PrepareIndexDeletion(string? index)
279279
DocumentId = string.Empty,
280280
};
281281

282-
return pipeline.Then(Constants.DeleteIndexPipelineStepName).Build();
282+
return pipeline.Then(Constants.PipelineStepsDeleteIndex).Build();
283283
}
284284

285285
protected static DataPipeline PrepareDocumentDeletion(string? index, string documentId)
@@ -295,7 +295,7 @@ protected static DataPipeline PrepareDocumentDeletion(string? index, string docu
295295
DocumentId = documentId,
296296
};
297297

298-
return pipeline.Then(Constants.DeleteDocumentPipelineStepName).Build();
298+
return pipeline.Then(Constants.PipelineStepsDeleteDocument).Build();
299299
}
300300

301301
protected async Task UploadFilesAsync(DataPipeline currentPipeline, CancellationToken cancellationToken = default)

0 commit comments

Comments
 (0)