Skip to content

Commit fa2d656

Browse files
Eliminate ingestion cache from AI Chat Web template (#6428)
* Begin updating to the latest Microsoft.Extensions.VectorData (MEVD)
* Reimplement JsonVectorStore to match the updated MEVD APIs
* Remove the ingestion cache and track ingestion status inside the vector DB
* Track the document metadata in a separate collection so we don't have to fetch literally everything from the vector DB in order to update ingestion
* Fix equality comparison issue with the Qdrant connector
* Tidying
* More tidying
* Update MEAI.Templates test snapshots

---------

Co-authored-by: Jeff Handley <[email protected]>
1 parent 08fbb67 commit fa2d656

File tree

49 files changed

+597
-647
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

49 files changed

+597
-647
lines changed

eng/Versions.props

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,9 @@
165165
<CommunityToolkitAspireMicrosoftEntityFrameworkCoreSqliteVersion>9.4.1-beta.277</CommunityToolkitAspireMicrosoftEntityFrameworkCoreSqliteVersion>
166166
<CommunityToolkitAspireOllamaSharpVersion>9.4.1-beta.277</CommunityToolkitAspireOllamaSharpVersion>
167167
<MicrosoftExtensionsServiceDiscoveryVersion>9.2.0</MicrosoftExtensionsServiceDiscoveryVersion>
168-
<MicrosoftSemanticKernelConnectorsAzureAISearchVersion>1.47.0-preview</MicrosoftSemanticKernelConnectorsAzureAISearchVersion>
169-
<MicrosoftSemanticKernelConnectorsQdrantVersion>1.47.0-preview</MicrosoftSemanticKernelConnectorsQdrantVersion>
170-
<MicrosoftSemanticKernelCoreVersion>1.47.0</MicrosoftSemanticKernelCoreVersion>
168+
<MicrosoftSemanticKernelConnectorsAzureAISearchVersion>1.49.0-preview</MicrosoftSemanticKernelConnectorsAzureAISearchVersion>
169+
<MicrosoftSemanticKernelConnectorsQdrantVersion>1.49.0-preview</MicrosoftSemanticKernelConnectorsQdrantVersion>
170+
<MicrosoftSemanticKernelCoreVersion>1.49.0</MicrosoftSemanticKernelCoreVersion>
171171
<OllamaSharpVersion>5.1.13</OllamaSharpVersion>
172172
<OpenTelemetryVersion>1.9.0</OpenTelemetryVersion>
173173
<PdfPigVersion>0.1.9</PdfPigVersion>

src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.AppHost/ChatWithCustomData-CSharp.AppHost.csproj.in

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
<!--#if (UseQdrant)
1717
<PackageReference Include="Aspire.Hosting.Qdrant" Version="${AspireVersion}" />
1818
#endif -->
19-
<PackageReference Include="CommunityToolkit.Aspire.Hosting.Sqlite" Version="${CommunityToolkitAspireHostingSqliteVersion}" />
2019
<!--#if (IsOllama)
2120
<PackageReference Include="CommunityToolkit.Aspire.Hosting.Ollama" Version="${CommunityToolkitAspireHostingOllamaVersion}" />
2221
#endif -->

src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.AppHost/Program.cs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@
3838
#else // UseLocalVectorStore
3939
#endif
4040

41-
var ingestionCache = builder.AddSqlite("ingestionCache");
42-
4341
var webApp = builder.AddProject<Projects.ChatWithCustomData_CSharp_Web>("aichatweb-app");
4442
#if (IsOllama) // AI SERVICE PROVIDER REFERENCES
4543
webApp
@@ -58,8 +56,5 @@
5856
.WaitFor(vectorDB);
5957
#else // UseLocalVectorStore
6058
#endif
61-
webApp
62-
.WithReference(ingestionCache)
63-
.WaitFor(ingestionCache);
6459

6560
builder.Build().Run();

src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/ChatWithCustomData-CSharp.Web.csproj.in

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,6 @@
3030
<!--#endif -->
3131
<!--#if (UseManagedIdentity) -->
3232
<PackageReference Include="Azure.Identity" Version="${AzureIdentityVersion}" />
33-
<!--#endif -->
34-
<!--#if (IsAspire) -->
35-
<PackageReference Include="CommunityToolkit.Aspire.Microsoft.EntityFrameworkCore.Sqlite" Version="${CommunityToolkitAspireMicrosoftEntityFrameworkCoreSqliteVersion}" />
36-
<!--#else -->
37-
<PackageReference Include="Microsoft.EntityFrameworkCore.Sqlite" Version="${MicrosoftEntityFrameworkCoreSqliteVersion}" />
3833
<!--#endif -->
3934
<PackageReference Include="Microsoft.Extensions.AI" Version="${MicrosoftExtensionsAIVersion}" />
4035
<PackageReference Include="Microsoft.SemanticKernel.Core" Version="${MicrosoftSemanticKernelCoreVersion}" />

src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Components/Pages/Chat/Chat.razor

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@
118118
await InvokeAsync(StateHasChanged);
119119
var results = await Search.SearchAsync(searchPhrase, filenameFilter, maxResults: 5);
120120
return results.Select(result =>
121-
$"<result filename=\"{result.FileName}\" page_number=\"{result.PageNumber}\">{result.Text}</result>");
121+
$"<result filename=\"{result.DocumentId}\" page_number=\"{result.PageNumber}\">{result.Text}</result>");
122122
}
123123

124124
public void Dispose()

src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Program.Aspire.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,8 @@
5353
#endif
5454
builder.Services.AddScoped<DataIngestor>();
5555
builder.Services.AddSingleton<SemanticSearch>();
56-
builder.AddSqliteDbContext<IngestionCacheDbContext>("ingestionCache");
5756

5857
var app = builder.Build();
59-
IngestionCacheDbContext.Initialize(app.Services);
6058

6159
app.MapDefaultEndpoints();
6260

src/ProjectTemplates/Microsoft.Extensions.AI.Templates/src/ChatWithCustomData/ChatWithCustomData-CSharp.Web/Program.cs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
using Microsoft.EntityFrameworkCore;
21
using Microsoft.Extensions.AI;
32
using Microsoft.Extensions.VectorData;
43
using ChatWithCustomData_CSharp.Web.Components;
@@ -102,11 +101,7 @@
102101
builder.Services.AddChatClient(chatClient).UseFunctionInvocation().UseLogging();
103102
builder.Services.AddEmbeddingGenerator(embeddingGenerator);
104103

105-
builder.Services.AddDbContext<IngestionCacheDbContext>(options =>
106-
options.UseSqlite("Data Source=ingestioncache.db"));
107-
108104
var app = builder.Build();
109-
IngestionCacheDbContext.Initialize(app.Services);
110105

111106
// Configure the HTTP request pipeline.
112107
if (!app.Environment.IsDevelopment())
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
namespace ChatWithCustomData_CSharp.Web.Services;
44

5-
public class SemanticSearchRecord
5+
public class IngestedChunk
66
{
77
[VectorStoreRecordKey]
88
#if (UseQdrant)
@@ -11,8 +11,8 @@ public class SemanticSearchRecord
1111
public required string Key { get; set; }
1212
#endif
1313

14-
[VectorStoreRecordData(IsFilterable = true)]
15-
public required string FileName { get; set; }
14+
[VectorStoreRecordData(IsIndexed = true)]
15+
public required string DocumentId { get; set; }
1616

1717
[VectorStoreRecordData]
1818
public int PageNumber { get; set; }
@@ -21,9 +21,9 @@ public class SemanticSearchRecord
2121
public required string Text { get; set; }
2222

2323
#if (IsOllama)
24-
[VectorStoreRecordVector(384, DistanceFunction.CosineSimilarity)] // 384 is the default vector size for the all-minilm embedding model
24+
[VectorStoreRecordVector(384, DistanceFunction = DistanceFunction.CosineSimilarity)] // 384 is the default vector size for the all-minilm embedding model
2525
#else
26-
[VectorStoreRecordVector(1536, DistanceFunction.CosineSimilarity)] // 1536 is the default vector size for the OpenAI text-embedding-3-small model
26+
[VectorStoreRecordVector(1536, DistanceFunction = DistanceFunction.CosineSimilarity)] // 1536 is the default vector size for the OpenAI text-embedding-3-small model
2727
#endif
2828
public ReadOnlyMemory<float> Vector { get; set; }
2929
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
using Microsoft.Extensions.VectorData;
2+
3+
namespace ChatWithCustomData_CSharp.Web.Services;
4+
5+
public class IngestedDocument
6+
{
7+
[VectorStoreRecordKey]
8+
#if (UseQdrant)
9+
public required Guid Key { get; set; }
10+
#else
11+
public required string Key { get; set; }
12+
#endif
13+
14+
[VectorStoreRecordData(IsIndexed = true)]
15+
public required string SourceId { get; set; }
16+
17+
[VectorStoreRecordData]
18+
public required string DocumentId { get; set; }
19+
20+
[VectorStoreRecordData]
21+
public required string DocumentVersion { get; set; }
22+
23+
// The vector is not used but required for some vector databases
24+
[VectorStoreRecordVector(1, DistanceFunction = DistanceFunction.CosineSimilarity)]
25+
public ReadOnlyMemory<float> Vector { get; set; } = new ReadOnlyMemory<float>([0]);
26+
}
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
1-
using Microsoft.EntityFrameworkCore;
2-
using Microsoft.Extensions.AI;
1+
using Microsoft.Extensions.AI;
32
using Microsoft.Extensions.VectorData;
43

54
namespace ChatWithCustomData_CSharp.Web.Services.Ingestion;
65

76
public class DataIngestor(
87
ILogger<DataIngestor> logger,
98
IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator,
10-
IVectorStore vectorStore,
11-
IngestionCacheDbContext ingestionCacheDb)
9+
IVectorStore vectorStore)
1210
{
1311
public static async Task IngestDataAsync(IServiceProvider services, IIngestionSource source)
1412
{
@@ -20,48 +18,48 @@ public static async Task IngestDataAsync(IServiceProvider services, IIngestionSo
2018
public async Task IngestDataAsync(IIngestionSource source)
2119
{
2220
#if (UseQdrant)
23-
var vectorCollection = vectorStore.GetCollection<Guid, SemanticSearchRecord>("data-ChatWithCustomData-CSharp.Web-ingestion");
21+
var chunksCollection = vectorStore.GetCollection<Guid, IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks");
22+
var documentsCollection = vectorStore.GetCollection<Guid, IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents");
2423
#else
25-
var vectorCollection = vectorStore.GetCollection<string, SemanticSearchRecord>("data-ChatWithCustomData-CSharp.Web-ingestion");
24+
var chunksCollection = vectorStore.GetCollection<string, IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks");
25+
var documentsCollection = vectorStore.GetCollection<string, IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents");
2626
#endif
27-
await vectorCollection.CreateCollectionIfNotExistsAsync();
27+
await chunksCollection.CreateCollectionIfNotExistsAsync();
28+
await documentsCollection.CreateCollectionIfNotExistsAsync();
2829

29-
var documentsForSource = ingestionCacheDb.Documents
30-
.Where(d => d.SourceId == source.SourceId)
31-
.Include(d => d.Records);
30+
var sourceId = source.SourceId;
31+
var documentsForSource = await documentsCollection.GetAsync(doc => doc.SourceId == sourceId, top: int.MaxValue).ToListAsync();
3232

33-
var deletedFiles = await source.GetDeletedDocumentsAsync(documentsForSource);
34-
foreach (var deletedFile in deletedFiles)
33+
var deletedDocuments = await source.GetDeletedDocumentsAsync(documentsForSource);
34+
foreach (var deletedDocument in deletedDocuments)
3535
{
36-
logger.LogInformation("Removing ingested data for {file}", deletedFile.Id);
37-
await vectorCollection.DeleteBatchAsync(deletedFile.Records.Select(r => r.Id));
38-
ingestionCacheDb.Documents.Remove(deletedFile);
36+
logger.LogInformation("Removing ingested data for {documentId}", deletedDocument.DocumentId);
37+
await DeleteChunksForDocumentAsync(deletedDocument);
38+
await documentsCollection.DeleteAsync(deletedDocument.Key);
3939
}
40-
await ingestionCacheDb.SaveChangesAsync();
4140

42-
var modifiedDocs = await source.GetNewOrModifiedDocumentsAsync(documentsForSource);
43-
foreach (var modifiedDoc in modifiedDocs)
41+
var modifiedDocuments = await source.GetNewOrModifiedDocumentsAsync(documentsForSource);
42+
foreach (var modifiedDocument in modifiedDocuments)
4443
{
45-
logger.LogInformation("Processing {file}", modifiedDoc.Id);
44+
logger.LogInformation("Processing {documentId}", modifiedDocument.DocumentId);
45+
await DeleteChunksForDocumentAsync(modifiedDocument);
4646

47-
if (modifiedDoc.Records.Count > 0)
48-
{
49-
await vectorCollection.DeleteBatchAsync(modifiedDoc.Records.Select(r => r.Id));
50-
}
47+
await documentsCollection.UpsertAsync(modifiedDocument);
5148

52-
var newRecords = await source.CreateRecordsForDocumentAsync(embeddingGenerator, modifiedDoc.Id);
53-
await foreach (var id in vectorCollection.UpsertBatchAsync(newRecords)) { }
49+
var newRecords = await source.CreateChunksForDocumentAsync(embeddingGenerator, modifiedDocument);
50+
await chunksCollection.UpsertAsync(newRecords);
51+
}
5452

55-
modifiedDoc.Records.Clear();
56-
modifiedDoc.Records.AddRange(newRecords.Select(r => new IngestedRecord { Id = r.Key, DocumentId = modifiedDoc.Id }));
53+
logger.LogInformation("Ingestion is up-to-date");
5754

58-
if (ingestionCacheDb.Entry(modifiedDoc).State == EntityState.Detached)
55+
async Task DeleteChunksForDocumentAsync(IngestedDocument document)
56+
{
57+
var documentId = document.DocumentId;
58+
var chunksToDelete = await chunksCollection.GetAsync(record => record.DocumentId == documentId, int.MaxValue).ToListAsync();
59+
if (chunksToDelete.Any())
5960
{
60-
ingestionCacheDb.Documents.Add(modifiedDoc);
61+
await chunksCollection.DeleteAsync(chunksToDelete.Select(r => r.Key));
6162
}
6263
}
63-
64-
await ingestionCacheDb.SaveChangesAsync();
65-
logger.LogInformation("Ingestion is up-to-date");
6664
}
6765
}

0 commit comments

Comments
 (0)