From bc34929190edffeee7112e0c8cc557676bc93d30 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Wed, 4 Jun 2025 16:32:40 -0400 Subject: [PATCH 01/11] Implement BLEU score evaluation for NLP tests --- .../BLEU/BLEUAlgorithm.cs | 205 ++++++++++++++++ .../BLEU/MatchCounter.cs | 50 ++++ .../BLEU/NGram.cs | 97 ++++++++ .../BLEU/RationalNumber.cs | 35 +++ .../BLEU/SmoothingFunction.cs | 71 ++++++ .../BLEUEvaluator.cs | 107 +++++++++ .../BLEUEvaluatorContext.cs | 38 +++ .../Directory.Build.targets | 33 +++ ...rosoft.Extensions.AI.Evaluation.NLP.csproj | 31 +++ .../README.md | 49 ++++ .../SimpleWordTokenizer.cs | 209 +++++++++++++++++ .../Utilities/CollectionBuilderAttribute.cs | 21 ++ .../BLEUAlgorithmicTests.cs | 219 ++++++++++++++++++ .../BLEUEvaluatorTests.cs | 91 ++++++++ .../MatchCounterTests.cs | 73 ++++++ ....Extensions.AI.Evaluation.NLP.Tests.csproj | 13 ++ .../NGramTests.cs | 68 ++++++ .../RationalNumberTests.cs | 55 +++++ .../SimpleTokenizerTests.cs | 91 ++++++++ 19 files changed, 1556 insertions(+) create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/BLEUAlgorithm.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/SmoothingFunction.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Directory.Build.targets create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md create mode 100644 
src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/Microsoft.Extensions.AI.Evaluation.NLP.Tests.csproj create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs create mode 100644 test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/BLEUAlgorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/BLEUAlgorithm.cs new file mode 100644 index 00000000000..6499f5af2fe --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/BLEUAlgorithm.cs @@ -0,0 +1,205 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI.Evaluation.NLP.BLEU;

/// <summary>
/// Sentence-level BLEU (Bilingual Evaluation Understudy) score computation, following the
/// formulation used by NLTK's <c>nltk.translate.bleu_score</c>.
/// </summary>
internal static class BLEUAlgorithm
{
    /// <summary>
    /// Creates the sequence of n-grams of size <paramref name="n"/> over <paramref name="input"/>.
    /// </summary>
    /// <remarks>
    /// Delegates to <see cref="NGram{T}.Create"/> so there is a single n-gram implementation
    /// instead of the duplicated sliding-window loop the two types previously shared.
    /// </remarks>
    internal static IEnumerable<NGram<T>> NGrams<T>(IEnumerable<T> input, int n)
        where T : IEquatable<T>
        => NGram<T>.Create(input, n);

    /// <summary>
    /// Returns the length of the reference closest in length to a hypothesis of
    /// <paramref name="hypLength"/> tokens. Ties are broken in favor of the shorter
    /// reference (matching NLTK). Returns 0 when <paramref name="references"/> is empty.
    /// </summary>
    internal static int ClosestRefLength<T>(IEnumerable<IEnumerable<T>> references, int hypLength)
    {
        int closestRefLength = 0;
        int closest = int.MaxValue;

        foreach (var reference in references)
        {
            int refLength = reference.Count();
            int diff = Math.Abs(refLength - hypLength);

            // Prefer the smaller difference; on a tie, prefer the shorter reference.
            if (diff < closest ||
                (diff == closest && refLength < closestRefLength))
            {
                closest = diff;
                closestRefLength = refLength;
            }
        }

        return closestRefLength;
    }

    /// <summary>
    /// Computes the brevity penalty: 1.0 when the hypothesis is at least as long as the
    /// closest reference, exp(1 - ref/hyp) when it is shorter, and 0.0 for an empty hypothesis.
    /// </summary>
    internal static double BrevityPenalty(int closestRefLength, int hypLength)
    {
        if (hypLength <= 0)
        {
            return 0.0;
        }

        if (closestRefLength <= 0 || hypLength > closestRefLength)
        {
            return 1.0;
        }

        return Math.Exp(1 - ((double)closestRefLength / hypLength));
    }

    /// <summary>
    /// Computes the modified (clipped) n-gram precision of <paramref name="hypothesis"/>
    /// against <paramref name="references"/>: each hypothesis n-gram count is clipped to the
    /// maximum number of times it appears in any single reference.
    /// </summary>
    internal static RationalNumber ModifiedPrecision<T>(IEnumerable<IEnumerable<T>> references, IEnumerable<T> hypothesis, int n = 1)
        where T : IEquatable<T>
    {
        if (n <= 0)
        {
            Throw.ArgumentOutOfRangeException(nameof(n), "N must be greater than zero.");
        }

        if (!references.Any() || !hypothesis.Any())
        {
            // FIX: the original returned new RationalNumber(0, 0), which throws
            // DivideByZeroException in the RationalNumber constructor. 0/1 is the
            // intended "no matches" value.
            return new RationalNumber(0, 1);
        }

        var hypCounts = new MatchCounter<NGram<T>>(NGrams(hypothesis, n));

        // For each n-gram, the maximum number of times it occurs in any single reference.
        Dictionary<NGram<T>, int> maxCounts = [];
        foreach (var rf in references)
        {
            var refCounts = new MatchCounter<NGram<T>>(NGrams(rf, n));
            foreach (var ct in refCounts.Values)
            {
                maxCounts[ct.Key] = maxCounts.TryGetValue(ct.Key, out int val)
                    ? Math.Max(val, ct.Value)
                    : ct.Value;
            }
        }

        // Clip each hypothesis n-gram count by its maximum reference count; an n-gram
        // absent from every reference contributes 0.
        int numerator = 0;
        foreach (var h in hypCounts.Values)
        {
            numerator += maxCounts.TryGetValue(h.Key, out int v) ? Math.Min(h.Value, v) : 0;
        }

        int denominator = Math.Max(1, hypCounts.Sum);

        return new RationalNumber(numerator, denominator);
    }

    /// <summary>Creates <paramref name="n"/> uniform weights that sum to 1.0.</summary>
    internal static double[] EqualWeights(int n)
    {
        if (n <= 0)
        {
            Throw.ArgumentOutOfRangeException(nameof(n), "N must be greater than zero.");
        }

        double[] weights = new double[n];
        for (int i = 0; i < n; i++)
        {
            weights[i] = 1.0 / n;
        }

        return weights;
    }

    /// <summary>Default BLEU-4 weighting: uniform weights over 1- through 4-grams.</summary>
    internal static readonly double[] DefaultBLEUWeights = EqualWeights(4);

    /// <summary>Computes the sentence BLEU score with the default BLEU-4 weights and no smoothing.</summary>
    internal static double SentenceBLEU<T>(IEnumerable<IEnumerable<T>> references, IEnumerable<T> hypothesis)
        where T : IEquatable<T>
        => SentenceBLEU(references, hypothesis, DefaultBLEUWeights);

    /// <summary>
    /// Computes the sentence BLEU score of <paramref name="hypothesis"/> against
    /// <paramref name="references"/> using the given per-order <paramref name="weights"/>
    /// and optional <paramref name="smoothingFunction"/> (defaults to
    /// <see cref="SmoothingFunction.Method0"/>, i.e. no smoothing).
    /// </summary>
    internal static double SentenceBLEU<T>(IEnumerable<IEnumerable<T>> references, IEnumerable<T> hypothesis, double[] weights,
        Func<RationalNumber[], int, double[]>? smoothingFunction = null)
        where T : IEquatable<T>
    {
        if (references == null || !references.Any())
        {
            Throw.ArgumentNullException(nameof(references), "References cannot be null or empty.");
        }

        if (hypothesis == null || !hypothesis.Any())
        {
            Throw.ArgumentNullException(nameof(hypothesis), "Hypothesis cannot be null or empty.");
        }

        if (weights == null || weights.Length == 0)
        {
            Throw.ArgumentNullException(nameof(weights), "Weights cannot be null or empty.");
        }

        // Materialize both sequences once: the loop below enumerates them
        // weights.Length + 1 times, which would re-run lazy tokenizer pipelines.
        var hyp = hypothesis.ToArray();
        var refs = references.Select(r => (IEnumerable<T>)r.ToArray()).ToArray();

        var precisionValues = new RationalNumber[weights.Length];
        for (int i = 0; i < weights.Length; i++)
        {
            RationalNumber prec = ModifiedPrecision(refs, hyp, n: i + 1);

            // If the precision for unigrams (n == 1) is zero, then there can be no
            // higher order matches and the BLEU score is zero.
            if (i == 0 && prec.Numerator == 0)
            {
                return 0.0;
            }

            precisionValues[i] = prec;
        }

        int hypLen = hyp.Length;
        int closestRefLength = ClosestRefLength(refs, hypLen);
        double brevityPenalty = BrevityPenalty(closestRefLength, hypLen);

        smoothingFunction ??= SmoothingFunction.Method0;

        double[] smoothedValues = smoothingFunction(precisionValues, hypLen);

        // Geometric mean of the smoothed precisions, taken in log space.
        double score = 0.0;
        for (int i = 0; i < weights.Length; i++)
        {
            if (smoothedValues[i] > 0)
            {
                score += weights[i] * Math.Log(smoothedValues[i]);
            }
        }

        return brevityPenalty * Math.Exp(score);
    }
}
/// <summary>
/// A multiset that tracks how many times each distinct item has been added.
/// Used to count n-gram occurrences when computing clipped BLEU precisions.
/// </summary>
internal readonly struct MatchCounter<T>
    where T : IEquatable<T>
{
    private readonly Dictionary<T, int> _counts = [];

    /// <summary>Gets each distinct item paired with its occurrence count.</summary>
    public readonly IEnumerable<KeyValuePair<T, int>> Values => _counts;

    /// <summary>Gets the total number of items added (the sum of all counts).</summary>
    public readonly int Sum => _counts.Values.Sum();

    /// <summary>Creates an empty counter.</summary>
    public MatchCounter()
    {
    }

    /// <summary>Creates a counter pre-populated with <paramref name="items"/>.</summary>
    public MatchCounter(IEnumerable<T> items)
    {
        _ = Throw.IfNull(items, nameof(items));
        AddRange(items);
    }

    /// <summary>Increments the count for <paramref name="item"/>.</summary>
    public void Add(T item)
        => _counts[item] = _counts.TryGetValue(item, out int count) ? count + 1 : 1;

    /// <summary>Increments the count for every element of <paramref name="items"/>.</summary>
    public void AddRange(IEnumerable<T> items)
    {
        foreach (T item in items)
        {
            Add(item);
        }
    }

    /// <inheritdoc/>
    public override string ToString() => string.Concat(Values.Select(v => $"{v.Key}: {v.Value}, "));
}

/// <summary>
/// An immutable, fixed-length sequence of tokens with element-wise value equality,
/// suitable for use as a dictionary key when counting n-gram matches.
/// </summary>
[CollectionBuilder(typeof(NGramBuilder), nameof(NGramBuilder.Create))]
internal readonly struct NGram<T> : IEquatable<NGram<T>>, IEnumerable<T>
    where T : IEquatable<T>
{
    /// <summary>
    /// Create a sequence of n-grams from the input sequence.
    /// </summary>
    /// <param name="input">The input sequence of items.</param>
    /// <param name="n">The size of each n-gram.</param>
    internal static IEnumerable<NGram<T>> Create(IEnumerable<T> input, int n)
    {
        if (n <= 0)
        {
            Throw.ArgumentOutOfRangeException(nameof(n), "N must be greater than zero.");
        }

        // Slide a window of size n along the sequence; stop when fewer than n items remain.
        T[] window = input.Take(n).ToArray();
        while (window.Length == n)
        {
            yield return new NGram<T>(window);

            input = input.Skip(1);
            window = input.Take(n).ToArray();
        }
    }

    /// <summary>Copies <paramref name="values"/> into a new n-gram.</summary>
    public NGram(ReadOnlySpan<T> values)
        : this(values.ToArray())
    {
    }

    /// <summary>Wraps <paramref name="values"/> (must contain at least one element).</summary>
    public NGram(params T[] values)
    {
        Values = Throw.IfNull(values, nameof(values));
        _ = Throw.IfLessThan(values.Length, 1, nameof(values));
    }

    /// <summary>Gets the tokens that make up this n-gram.</summary>
    public readonly T[] Values { get; }

    /// <summary>Gets the number of tokens in this n-gram.</summary>
    public int Length => Values.Length;

    /// <summary>Element-wise equality: same length and equal tokens at every position.</summary>
    public bool Equals(NGram<T> other)
    {
        if (Length != other.Length)
        {
            return false;
        }

        for (int i = 0; i < Length; i++)
        {
            if (!Values[i].Equals(other.Values[i]))
            {
                return false;
            }
        }

        return true;
    }

    /// <inheritdoc/>
    public override bool Equals(object? obj) => obj is NGram<T> other && Equals(other);

    /// <summary>Order-sensitive hash folded over every token, consistent with <see cref="Equals(NGram{T})"/>.</summary>
    public override int GetHashCode()
    {
        int hash = 0;
        foreach (T value in Values)
        {
            hash = HashCode.Combine(hash, value.GetHashCode());
        }

        return hash;
    }

    /// <inheritdoc/>
    public IEnumerator<T> GetEnumerator() => ((IEnumerable<T>)Values).GetEnumerator();

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();

    /// <inheritdoc/>
    public override string ToString() => $"[{string.Join(",", Values.Select(v => v.ToString()))}]";
}

/// <summary>Builder target for C# 12 collection expressions on <see cref="NGram{T}"/>.</summary>
internal static class NGramBuilder
{
    public static NGram<T> Create<T>(ReadOnlySpan<T> values)
        where T : IEquatable<T> => new(values);
}
/// <summary>
/// An exact numerator/denominator pair used to carry clipped n-gram precision
/// counts without floating-point loss until the final BLEU computation.
/// </summary>
internal readonly struct RationalNumber : IEquatable<RationalNumber>
{
    /// <summary>Gets the numerator of the fraction.</summary>
    public int Numerator { get; }

    /// <summary>Gets the denominator of the fraction; never zero.</summary>
    public int Denominator { get; }

    /// <summary>
    /// Initializes the ratio <paramref name="numerator"/>/<paramref name="denominator"/>.
    /// </summary>
    /// <exception cref="DivideByZeroException">
    /// Thrown when <paramref name="denominator"/> is zero.
    /// </exception>
    public RationalNumber(int numerator, int denominator)
    {
        Numerator = denominator != 0
            ? numerator
            : throw new DivideByZeroException("Denominator cannot be zero.");
        Denominator = denominator;
    }

    /// <summary>Converts the ratio to its floating-point value.</summary>
    public double ToDouble() => Numerator / (double)Denominator;

    /// <inheritdoc/>
    public override string ToString() => $"{Numerator}/{Denominator}";

    /// <summary>
    /// Structural equality over the parts; 1/2 is NOT equal to 2/4 (fractions
    /// are never reduced here).
    /// </summary>
    public bool Equals(RationalNumber other)
        => Numerator == other.Numerator && Denominator == other.Denominator;

    /// <inheritdoc/>
    public override bool Equals(object? obj) => obj is RationalNumber other && Equals(other);

    /// <inheritdoc/>
    public override int GetHashCode() => HashCode.Combine(Numerator, Denominator);
}
/// <summary>
/// Implementations of smoothing functions for BLEU scores taken from
/// `A Systematic Comparison of Smoothing Techniques for Sentence-Level BLEU`
/// by Chen and Cherry. http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf.
/// </summary>
internal static class SmoothingFunction
{
    /// <summary>
    /// This is the baseline method, which does not apply any smoothing. Zero precisions are
    /// replaced with <see cref="double.Epsilon"/> so downstream logarithms stay finite.
    /// </summary>
    /// <param name="precisions">N precision values to be smoothed.</param>
    /// <param name="hypLen">Number of tokens in the hypothesis.</param>
    /// <returns>Smoothed precision values.</returns>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Style", "IDE0060:Remove unused parameter", Justification = "Matches expected signature of SmoothingFunction")]
    internal static double[] Method0(RationalNumber[] precisions, int hypLen)
    {
        double[] smoothed = new double[precisions.Length];
        for (int i = 0; i < precisions.Length; i++)
        {
            smoothed[i] = precisions[i].Numerator == 0
                ? double.Epsilon
                : precisions[i].ToDouble();
        }

        return smoothed;
    }

    /// <summary>
    /// Smoothing method 4:
    /// Shorter translations may have inflated precision values due to having
    /// smaller denominators; therefore, we give them proportionally
    /// smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
    /// suggests dividing by 1/ln(len(T)), where T is the length of the translation.
    /// </summary>
    /// <param name="precisions">N precision values to be smoothed.</param>
    /// <param name="hypLen">Number of tokens in the hypothesis.</param>
    /// <returns>Smoothed precision values.</returns>
    internal static double[] Method4(RationalNumber[] precisions, int hypLen)
    {
        const double DefaultK = 5.0;

        double[] smoothed = new double[precisions.Length];

        // Counts the zero-precision orders seen so far; each successive zero receives a
        // geometrically smaller smoothed value. Starts at 1 to match NLTK's method4.
        int incvnt = 1;
        for (int i = 0; i < precisions.Length; i++)
        {
            RationalNumber p = precisions[i];
            if (p.Numerator == 0 && hypLen > 1)
            {
                // BUG FIX: the original computed this value into a local and never stored it,
                // leaving smoothed[i] == 0.0 and silently dropping the n-gram order from the
                // score. Per NLTK, the smoothed precision is numerator / p.Denominator.
                double numerator = 1.0 / (System.Math.Pow(2.0, incvnt) * DefaultK / System.Math.Log(hypLen));
                smoothed[i] = numerator / p.Denominator;
                incvnt++;
            }
            else
            {
                smoothed[i] = p.ToDouble();
            }
        }

        return smoothed;
    }
}
/// <summary>
/// An <see cref="IEvaluator"/> that evaluates the quality of a response produced by an AI model by comparing
/// it to a reference response using the BLEU (Bilingual Evaluation Understudy) algorithm.
/// </summary>
/// <remarks>
/// <para>
/// The <see cref="BLEUEvaluator"/> computes the BLEU score of a response ("hypothesis") compared to a reference
/// supplied via <see cref="BLEUEvaluatorContext"/>. The score is returned in a <see cref="NumericMetric"/>
/// with a value between 0.0 and 1.0 where 0.0 represents no match at all and 1.0 indicates a perfect match.
/// By default, the score is interpreted with a pass/fail cutoff of 0.5. So a score of 0.5 or higher is
/// passing and a score below 0.5 is failing.
/// </para>
/// </remarks>
public sealed class BLEUEvaluator : IEvaluator
{
    /// <summary>
    /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
    /// <see cref="BLEUEvaluator"/>.
    /// </summary>
    public static string BLEUMetricName => "BLEU";

    /// <inheritdoc/>
    public IReadOnlyCollection<string> EvaluationMetricNames { get; } = [BLEUMetricName];

    /// <inheritdoc/>
    /// <remarks>
    /// Requires a <see cref="BLEUEvaluatorContext"/> in <paramref name="additionalContext"/>; an
    /// error diagnostic (with no metric value) is produced when it is missing or when the response
    /// text is empty. The evaluation is purely local — <paramref name="chatConfiguration"/> is unused.
    /// </remarks>
    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(modelResponse);

        var metric = new NumericMetric(BLEUMetricName);
        var result = new EvaluationResult(metric);

        if (string.IsNullOrWhiteSpace(modelResponse.Text))
        {
            metric.AddDiagnostics(
                EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty."));

            return new ValueTask<EvaluationResult>(result);
        }

        if (additionalContext?.OfType<BLEUEvaluatorContext>().FirstOrDefault()
                is not BLEUEvaluatorContext context)
        {
            metric.AddDiagnostics(
                EvaluationDiagnostic.Error(
                    $"A value of type '{nameof(BLEUEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."));

            return new ValueTask<EvaluationResult>(result);
        }

        // Tokenize both texts with the same tokenizer so their n-grams are comparable,
        // then score with BLEU-4 (default weights) and Chen & Cherry smoothing method 4.
        var reference = SimpleWordTokenizer.WordTokenize(context.ReferenceText);
        var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text);
        metric.Value = BLEUAlgorithm.SentenceBLEU([reference], hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4);

        metric.AddOrUpdateContext(context);
        metric.Interpretation = InterpretScore(metric);

        return new ValueTask<EvaluationResult>(result);
    }

    // Maps the raw BLEU value onto an EvaluationRating band and applies the pass/fail cutoff.
    private static EvaluationMetricInterpretation InterpretScore(NumericMetric metric)
    {
        // BLEU scores range from 0.0 to 1.0, where:
        // - 0.0 means no match at all,
        // - 1.0 means a perfect match.
        // 0.5 is considered the minimum passing score for BLEU evaluation.
        // Values outside [0.0, 1.0] (and a missing value) are inconclusive.

        EvaluationRating rating = metric.Value switch
        {
            null => EvaluationRating.Inconclusive,
            > 1.0 => EvaluationRating.Inconclusive,
            > 0.8 and <= 1.0 => EvaluationRating.Exceptional,
            > 0.6 and <= 0.8 => EvaluationRating.Good,
            > 0.4 and <= 0.6 => EvaluationRating.Average,
            > 0.2 and <= 0.4 => EvaluationRating.Poor,
            >= 0.0 and <= 0.2 => EvaluationRating.Unacceptable,
            < 0.0 => EvaluationRating.Inconclusive,
            _ => EvaluationRating.Inconclusive,
        };

        const double MinimumPassingScore = 0.5;
        return metric.Value is double value && value < MinimumPassingScore
            ? new EvaluationMetricInterpretation(
                rating,
                failed: true,
                reason: $"{metric.Name} is less than {MinimumPassingScore}.")
            : new EvaluationMetricInterpretation(rating);
    }
}
/// <summary>
/// Contextual information that the <see cref="BLEUEvaluator"/> uses to compute the BLEU score for a response.
/// </summary>
/// <param name="reference">
/// The reference response against which the response that is being evaluated is compared.
/// </param>
/// <remarks>
/// <see cref="BLEUEvaluator"/> measures the BLEU score of a response compared to a reference. BLEU (Bilingual Evaluation Understudy)
/// is a metric used to evaluate the quality of machine-generated text.
/// </remarks>
public sealed class BLEUEvaluatorContext(string reference)
    : EvaluationContext(name: BLEUContext, content: reference)
{
    /// <summary>
    /// Gets the unique <see cref="EvaluationContext.Name"/> that is used for
    /// <see cref="BLEUEvaluatorContext"/>.
    /// </summary>
    public static string BLEUContext => "BLEU Context";

    /// <summary>
    /// Gets the reference response against which the provided chat response will be scored.
    /// </summary>
    /// <remarks>
    /// The <see cref="BLEUEvaluator"/> measures the degree to which the response being evaluated is similar to
    /// the response supplied via <see cref="ReferenceText"/>. The metric will be reported as a BLEU score.
    /// </remarks>
    public string ReferenceText { get; } = reference;
}
+ $(TargetFrameworks);netstandard2.0 + Microsoft.Extensions.AI.Evaluation.NLP + + + + AIEval + preview + true + false + 0 + 0 + + + + + + + + + + + + + + + + diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md new file mode 100644 index 00000000000..c21e2a299ad --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md @@ -0,0 +1,49 @@ +# The Microsoft.Extensions.AI.Evaluation libraries + +`Microsoft.Extensions.AI.Evaluation` is a set of .NET libraries defined in the following NuGet packages that have been designed to work together to support building processes for evaluating the quality of AI software. + +* [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. +* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. +* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. +* [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. 
+* [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. +* [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. + +## Install the packages + +From the command-line: + +```console +dotnet add package Microsoft.Extensions.AI.Evaluation +dotnet add package Microsoft.Extensions.AI.Evaluation.Quality +dotnet add package Microsoft.Extensions.AI.Evaluation.Safety +dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +``` + +Or directly in the C# project file: + +```xml + + + + + + +``` + +You can optionally add the `Microsoft.Extensions.AI.Evaluation.Reporting.Azure` package in either of these places if you need Azure Storage support. + +## Install the command line tool + +```console +dotnet tool install Microsoft.Extensions.AI.Evaluation.Console --create-manifest-if-needed +``` + +## Usage Examples + +For a comprehensive tour of all the functionality, concepts and APIs available in the `Microsoft.Extensions.AI.Evaluation` libraries, check out the [API Usage Examples](https://github.com/dotnet/ai-samples/blob/main/src/microsoft-extensions-ai-evaluation/api/) available in the [dotnet/ai-samples](https://github.com/dotnet/ai-samples) repo. These examples are structured as a collection of unit tests. Each unit test showcases a specific concept or API, and builds on the concepts and APIs showcased in previous unit tests. + + +## Feedback & Contributing + +We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions). 
/// <summary>
/// Tokenizes a string into segments using the common rules established by the NLTK word tokenizer.
/// </summary>
public static class SimpleWordTokenizer
{
    /// <summary>
    /// Tokenizes the input text into individual words based on specific rules for text normalization and segmentation.
    /// </summary>
    /// <remarks>This method applies text normalization steps, such as removing skipped markers, handling line
    /// breaks, and replacing common HTML entities. It also ensures consistent tokenization by inserting spaces around
    /// punctuation, symbols, and certain character patterns. The tokenization rules are inspired by common BLEU algorithms,
    /// such as those used in NLTK, SacreBLEU, and MOSES.</remarks>
    /// <param name="text">The input text to be tokenized. Cannot be <see langword="null"/>.</param>
    /// <returns>An enumerable collection of strings, where each string represents a tokenized word. The collection will be empty
    /// if the input text contains no valid tokens.</returns>
    public static IEnumerable<string> WordTokenize(string text)
    {
        _ = Throw.IfNull(text, nameof(text));

        return WordTokenize(text.AsMemory());
    }

    /// <summary>
    /// Tokenizes the input text into individual words based on specific rules for text normalization and segmentation.
    /// </summary>
    /// <remarks>This method applies text normalization steps, such as removing skipped markers, handling line
    /// breaks, and replacing common HTML entities. It also ensures consistent tokenization by inserting spaces around
    /// punctuation, symbols, and certain character patterns. The tokenization rules are inspired by common BLEU algorithms,
    /// such as those used in NLTK, SacreBLEU, and MOSES.</remarks>
    /// <param name="text">The input text to be tokenized. Cannot be <see langword="null"/>.</param>
    /// <returns>An enumerable collection of strings, where each string represents a tokenized word. The collection will be empty
    /// if the input text contains no valid tokens.</returns>
    public static IEnumerable<string> WordTokenize(ReadOnlyMemory<char> text)
    {
        // Accumulates the characters of the word token currently being built.
        // Each branch below "flushes" (yields and clears) it before emitting a
        // separator token, so token order matches character order.
        StringBuilder sb = new StringBuilder();

        while (true)
        {
            if (text.IsEmpty)
            {
                // End of input: flush any in-progress token and stop.
                if (sb.Length > 0)
                {
                    yield return sb.ToString();
                    _ = sb.Clear();
                }

                yield break;
            }

            var span = text.Span;
            char nextChar = span[0];

            // Skip whitespace as separator
            if (char.IsWhiteSpace(nextChar))
            {
                if (sb.Length > 0)
                {
                    yield return sb.ToString();
                    _ = sb.Clear();
                }

                text = text.Slice(1);
                continue;
            }

            // Join hyphenated words: a "-\n" pair is dropped entirely so the word halves
            // on either side of a line break merge into one token.
            // NOTE(review): only bare '\n' is matched; a "-\r\n" sequence (Windows line
            // endings) is not joined — confirm whether input is normalized upstream.
            if (span[0] == '-' &&
                span.Length > 1 &&
                span[1] == '\n')
            {
                text = text.Slice(2);
                continue;
            }

            // Translate HTML entities: each recognized entity flushes the current token,
            // then is emitted as its own single-character token.
            if (nextChar == '&')
            {
                if (span.StartsWith("&quot;".AsSpan()))
                {
                    if (sb.Length > 0)
                    {
                        yield return sb.ToString();
                        _ = sb.Clear();
                    }

                    text = text.Slice("&quot;".Length);
                    yield return "\"";
                    continue;
                }
                else if (span.StartsWith("&amp;".AsSpan()))
                {
                    if (sb.Length > 0)
                    {
                        yield return sb.ToString();
                        _ = sb.Clear();
                    }

                    text = text.Slice("&amp;".Length);
                    yield return "&";
                    continue;
                }
                else if (span.StartsWith("&lt;".AsSpan()))
                {
                    if (sb.Length > 0)
                    {
                        yield return sb.ToString();
                        _ = sb.Clear();
                    }

                    text = text.Slice("&lt;".Length);
                    yield return "<";
                    continue;
                }
                else if (span.StartsWith("&gt;".AsSpan()))
                {
                    if (sb.Length > 0)
                    {
                        yield return sb.ToString();
                        _ = sb.Clear();
                    }

                    text = text.Slice("&gt;".Length);
                    yield return ">";
                    continue;
                }
                else if (span.StartsWith("&apos;".AsSpan()))
                {
                    if (sb.Length > 0)
                    {
                        yield return sb.ToString();
                        _ = sb.Clear();
                    }

                    text = text.Slice("&apos;".Length);
                    yield return "'";
                    continue;
                }

                // An unrecognized '&' falls through and is treated as punctuation below.
            }

            // Each symbol is a separate token
            if (char.IsSymbol(nextChar))
            {
                if (sb.Length > 0)
                {
                    yield return sb.ToString();
                    _ = sb.Clear();
                }

                yield return nextChar.ToString();
                text = text.Slice(1);
                continue;
            }

            // Return punctuation
            if (char.IsPunctuation(nextChar))
            {
                if (sb.Length > 0)
                {
                    yield return sb.ToString();
                    _ = sb.Clear();
                }

                yield return nextChar.ToString();
                text = text.Slice(1);
                continue;
            }

            // if we have a number, consume it along with any internal punctuation
            // (e.g. decimal points or thousands separators stay inside the token)
            if (char.IsNumber(nextChar))
            {
                // in this case we are still building a token, then the number
                // should be added to the end of it, rather than as a separate number
                if (sb.Length > 0)
                {
                    _ = sb.Append(nextChar);
                    text = text.Slice(1);
                    continue;
                }

                while (!text.IsEmpty && (char.IsNumber(text.Span[0]) || char.IsPunctuation(text.Span[0])))
                {
                    _ = sb.Append(text.Span[0]);
                    text = text.Slice(1);
                }

                yield return sb.ToString();
                _ = sb.Clear();
                continue;
            }

            // Default: letters are upper-cased (case-insensitive matching) and accumulated
            // into the current word token.
            _ = sb.Append(char.ToUpperInvariant(nextChar));
            text = text.Slice(1);
        }

    }
}
+ +#if !NET8_0_OR_GREATER +namespace System.Runtime.CompilerServices; + +[AttributeUsage(AttributeTargets.Class | AttributeTargets.Struct | AttributeTargets.Interface)] +internal sealed class CollectionBuilderAttribute : Attribute +{ + public CollectionBuilderAttribute(Type builderType, string methodName) + { + BuilderType = builderType; + MethodName = methodName; + } + + public Type BuilderType { get; } + public string MethodName { get; } +} + + +#endif diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs new file mode 100644 index 00000000000..015a46d97a1 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs @@ -0,0 +1,219 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using Xunit; +using static Microsoft.Extensions.AI.Evaluation.NLP.BLEU.BLEUAlgorithm; +using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; + +public class BLEUAlgorithmicTests +{ + [Fact] + public void NGramGenerationNoPadding() + { + IEnumerable> result = BLEUAlgorithm.NGrams([1, 2, 3, 4, 5], 1); + List> expected = [[1], [2], [3], [4], [5]]; + Assert.True(result.SequenceEqual(expected)); + + result = BLEUAlgorithm.NGrams([1, 2, 3, 4, 5], 2); + expected = [[1, 2], [2, 3], [3, 4], [4, 5]]; + Assert.True(result.SequenceEqual(expected)); + + result = BLEUAlgorithm.NGrams([1, 2, 3, 4, 5], 3); + expected = [[1, 2, 3], [2, 3, 4], [3, 4, 5]]; + Assert.True(result.SequenceEqual(expected)); + } + + [Fact] + public void ModifiedPrecisionTests() + { + IEnumerable> references = ["the cat is on the mat".Split(' '), "there is a cat on the mat".Split(' ')]; + IEnumerable hypothesis = "the the the the the the 
the".Split(' '); + RationalNumber prec = ModifiedPrecision(references, hypothesis, 1); + Assert.Equal(0.2857, prec.ToDouble(), 4); + + + references = [ + "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '), + "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '), + "It is the practical guide for the army always to heed the directions of the party".Split(' '), + ]; + hypothesis = "of the".Split(' '); + prec = ModifiedPrecision(references, hypothesis, 1); + Assert.Equal(1.0, prec.ToDouble(), 4); + prec = ModifiedPrecision(references, hypothesis, 2); + Assert.Equal(1.0, prec.ToDouble(), 4); + + + references = [ + "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '), + "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '), + "It is the practical guide for the army always to heed the directions of the party".Split(' '), + ]; + IEnumerable hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' '); + IEnumerable hypothesis2 = "It is to insure the troops forever hearing the activity guidebook that party direct".Split(' '); + prec = ModifiedPrecision(references, hypothesis1, 1); + Assert.Equal(0.9444, prec.ToDouble(), 4); + prec = ModifiedPrecision(references, hypothesis2, 1); + Assert.Equal(0.5714, prec.ToDouble(), 4); + prec = ModifiedPrecision(references, hypothesis1, 2); + Assert.Equal(0.5882, prec.ToDouble(), 4); + prec = ModifiedPrecision(references, hypothesis2, 2); + Assert.Equal(0.07692, prec.ToDouble(), 4); + } + + [Fact] + public void TestBrevityPenalty() + { + IEnumerable> references = [ + Enumerable.Repeat("a", 11), + Enumerable.Repeat("a", 8), + ]; + IEnumerable hypothesis = Enumerable.Repeat("a", 7); + int hypLength = hypothesis.Count(); + int closestRefLength = 
ClosestRefLength(references, hypLength); + double brevityPenalty = BrevityPenalty(closestRefLength, hypLength); + Assert.Equal(0.8669, brevityPenalty, 4); + + references = [ + Enumerable.Repeat("a", 11), + Enumerable.Repeat("a", 8), + Enumerable.Repeat("a", 6), + Enumerable.Repeat("a", 7), + ]; + hypothesis = Enumerable.Repeat("a", 7); + hypLength = hypothesis.Count(); + closestRefLength = ClosestRefLength(references, hypLength); + brevityPenalty = BrevityPenalty(closestRefLength, hypLength); + Assert.Equal(1.0, brevityPenalty, 4); + + references = [ + Enumerable.Repeat("a", 28), + Enumerable.Repeat("a", 28), + ]; + hypothesis = Enumerable.Repeat("a", 12); + hypLength = hypothesis.Count(); + closestRefLength = ClosestRefLength(references, hypLength); + brevityPenalty = BrevityPenalty(closestRefLength, hypLength); + Assert.Equal(0.26359, brevityPenalty, 4); + + references = [ + Enumerable.Repeat("a", 13), + Enumerable.Repeat("a", 2), + ]; + hypothesis = Enumerable.Repeat("a", 12); + hypLength = hypothesis.Count(); + closestRefLength = ClosestRefLength(references, hypLength); + brevityPenalty = BrevityPenalty(closestRefLength, hypLength); + Assert.Equal(0.9200, brevityPenalty, 4); + + references = [ + Enumerable.Repeat("a", 13), + Enumerable.Repeat("a", 11), + ]; + hypothesis = Enumerable.Repeat("a", 12); + hypLength = hypothesis.Count(); + closestRefLength = ClosestRefLength(references, hypLength); + brevityPenalty = BrevityPenalty(closestRefLength, hypLength); + Assert.Equal(1.0, brevityPenalty, 4); + + references = [ + Enumerable.Repeat("a", 11), + Enumerable.Repeat("a", 13), + ]; + hypothesis = Enumerable.Repeat("a", 12); + hypLength = hypothesis.Count(); + closestRefLength = ClosestRefLength(references, hypLength); + brevityPenalty = BrevityPenalty(closestRefLength, hypLength); + Assert.Equal(1.0, brevityPenalty, 4); + + } + + [Fact] + public void TestZeroMatches() + { + IEnumerable> references = ["The candidate has no alignment to any of the references".Split(' 
'),]; + IEnumerable hypothesis = "John loves Mary".Split(' '); + + double score = SentenceBLEU(references, hypothesis, EqualWeights(hypothesis.Count())); + Assert.Equal(0.0, score, 4); + } + + [Fact] + public void TestFullMatches() + { + IEnumerable> references = ["John loves Mary".Split(' '),]; + IEnumerable hypothesis = "John loves Mary".Split(' '); + + double score = SentenceBLEU(references, hypothesis, EqualWeights(hypothesis.Count())); + Assert.Equal(1.0, score, 4); + } + + [Fact] + public void TestPartialMatchesHypothesisLongerThanReference() + { + IEnumerable> references = ["John loves Mary".Split(' '),]; + IEnumerable hypothesis = "John loves Mary who loves Mike".Split(' '); + + double score = SentenceBLEU(references, hypothesis); + Assert.Equal(0, score, 4); + } + + [Fact] + public void TestSentenceBLEUExampleA() + { + IEnumerable> references = [ + "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '), + "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '), + "It is the practical guide for the army always to heed the directions of the party".Split(' ') + ]; + IEnumerable hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party".Split(' '); + + double score = SentenceBLEU(references, hypothesis); + Assert.Equal(0.5046, score, 4); + + } + + [Fact] + public void TestSentenceBLEUExampleB() + { + IEnumerable> references = [ + "he was interested in world history because he read the book".Split(' '), + ]; + IEnumerable hypothesis = "he read the book because he was interested in world history".Split(' '); + + double score = SentenceBLEU(references, hypothesis); + Assert.Equal(0.74009, score, 4); + } + + [Fact] + public void TestSentenceBLEUExampleAWithWordTokenizer() + { + IEnumerable> references = [ + SimpleWordTokenizer.WordTokenize("It is a guide to action that ensures that the military will 
forever heed Party commands"), + SimpleWordTokenizer.WordTokenize("It is the guiding principle which guarantees the military forces always being under the command of the Party"), + SimpleWordTokenizer.WordTokenize("It is the practical guide for the army always to heed the directions of the party") + ]; + IEnumerable hypothesis = SimpleWordTokenizer.WordTokenize("It is a guide to action which ensures that the military always obeys the commands of the party"); + + double score = SentenceBLEU(references, hypothesis); + Assert.Equal(0.5046, score, 4); + + } + + [Fact] + public void TestSentenceBLEUExampleBWithWordTokenizer() + { + IEnumerable> references = [ + SimpleWordTokenizer.WordTokenize("he was interested in world history because he read the book"), + ]; + IEnumerable hypothesis = SimpleWordTokenizer.WordTokenize("he read the book because he was interested in world history"); + + double score = SentenceBLEU(references, hypothesis); + Assert.Equal(0.74009, score, 4); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs new file mode 100644 index 00000000000..b32f05e0089 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs @@ -0,0 +1,91 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.Extensions.AI.Evaluation.NLP; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; + +public class BLEUEvaluatorTests +{ + [Fact] + public async Task EvaluateAsync_ReturnsPerfectScoreForIdenticalText() + { + var evaluator = new BLEUEvaluator(); + var context = new BLEUEvaluatorContext("The quick brown fox jumps over the lazy dog."); + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "The quick brown fox jumps over the lazy dog.")); + var result = await evaluator.EvaluateAsync([], response, null, [context]); + var metric = Assert.Single(result.Metrics.Values) as NumericMetric; + Assert.NotNull(metric); + Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); + Assert.Equal(1.0, (double)metric!.Value!, 4); + Assert.NotNull(metric.Interpretation); + Assert.Equal(EvaluationRating.Exceptional, metric.Interpretation.Rating); + Assert.False(metric.Interpretation.Failed); + } + + [Fact] + public async Task EvaluateAsync_ReturnsLowScoreForCompletelyDifferentText() + { + var evaluator = new BLEUEvaluator(); + var context = new BLEUEvaluatorContext("The quick brown fox jumps over the lazy dog."); + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Completely unrelated sentence.")); + var result = await evaluator.EvaluateAsync([], response, null, [context]); + var metric = Assert.Single(result.Metrics.Values) as NumericMetric; + Assert.NotNull(metric); + Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); + Assert.Equal(0.1578, (double)metric!.Value!, 4); + Assert.NotNull(metric.Interpretation); + Assert.Equal(EvaluationRating.Unacceptable, metric.Interpretation.Rating); + Assert.True(metric.Interpretation.Failed); + } + + [Fact] + public async Task EvaluateAsync_ReturnsErrorDiagnosticIfNoContext() + { + var evaluator = new BLEUEvaluator(); + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Some text.")); + 
var result = await evaluator.EvaluateAsync([], response, null, null); + var metric = Assert.Single(result.Metrics.Values) as NumericMetric; + Assert.NotNull(metric); + Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); + Assert.NotNull(metric.Diagnostics); + Assert.Contains(metric.Diagnostics, d => d.Severity == EvaluationDiagnosticSeverity.Error); + } + + [Theory] + [InlineData("the cat is on the mat", + "the the the the the the the", 0.7311)] + [InlineData("It is a guide to action that ensures that the military will forever heed Party commands", + "It is a guide to action which ensures that the military always obeys the commands of the party", 0.4209)] + [InlineData("It is the practical guide for the army always to heed the directions of the party", + "It is to insure the troops forever hearing the activity guidebook that party direct", 0.3694)] + public async Task EvaluateAsync_SampleCases(string reference, string hypothesis, double score) + { + var evaluator = new BLEUEvaluator(); + var context = new BLEUEvaluatorContext(reference); + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, hypothesis)); + var result = await evaluator.EvaluateAsync([], response, null, [context]); + var metric = Assert.Single(result.Metrics.Values) as NumericMetric; + Assert.NotNull(metric); + Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); + Assert.Equal(score, (double)metric!.Value!, 4); + } + + [Fact] + public async Task EvaluateAsync_ReturnsErrorDiagnosticIfEmptyResponse() + { + var evaluator = new BLEUEvaluator(); + var context = new BLEUEvaluatorContext("Reference text."); + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "")); + var result = await evaluator.EvaluateAsync([], response, null, [context]); + var metric = Assert.Single(result.Metrics.Values) as NumericMetric; + Assert.NotNull(metric); + Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); + Assert.NotNull(metric.Diagnostics); + 
Assert.Contains(metric.Diagnostics, d => d.Severity == EvaluationDiagnosticSeverity.Error); + } + +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs new file mode 100644 index 00000000000..fe828e347ea --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs @@ -0,0 +1,73 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; + +public class MatchCounterTests +{ + [Fact] + public void EmptyConstructor_InitializesEmptyCounter() + { + var counter = new MatchCounter(); + Assert.Empty(counter.Values); + Assert.Equal(0, counter.Sum); + } + + [Fact] + public void ConstructorWithItems_CountsCorrectly() + { + var counter = new MatchCounter(new[] { "a", "b", "a", "c", "b", "a" }); + var dict = counter.Values.ToDictionary(kv => kv.Key, kv => kv.Value); + Assert.Equal(3, dict["a"]); + Assert.Equal(2, dict["b"]); + Assert.Equal(1, dict["c"]); + Assert.Equal(6, counter.Sum); + } + + [Fact] + public void Add_AddsSingleItemCorrectly() + { + var counter = new MatchCounter(); + counter.Add(5); + counter.Add(5); + counter.Add(3); + var dict = counter.Values.ToDictionary(kv => kv.Key, kv => kv.Value); + Assert.Equal(2, dict[5]); + Assert.Equal(1, dict[3]); + Assert.Equal(3, counter.Sum); + } + + [Fact] + public void AddRange_AddsMultipleItemsCorrectly() + { + var counter = new MatchCounter(); + counter.AddRange("hello"); + var dict = counter.Values.ToDictionary(kv => kv.Key, kv => kv.Value); + Assert.Equal(1, dict['h']); + Assert.Equal(1, dict['e']); + Assert.Equal(2, dict['l']); + Assert.Equal(1, dict['o']); + Assert.Equal(5, 
counter.Sum); + } + + [Fact] + public void ToString_FormatsCorrectly() + { + var counter = new MatchCounter(new[] { "x", "y", "x" }); + var str = counter.ToString(); + Assert.Contains("x: 2", str); + Assert.Contains("y: 1", str); + } + + [Fact] + public void Constructor_ThrowsOnNull() + { + Assert.Throws(() => new MatchCounter(null)); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/Microsoft.Extensions.AI.Evaluation.NLP.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/Microsoft.Extensions.AI.Evaluation.NLP.Tests.csproj new file mode 100644 index 00000000000..6b485136520 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/Microsoft.Extensions.AI.Evaluation.NLP.Tests.csproj @@ -0,0 +1,13 @@ + + + + Microsoft.Extensions.AI.Evaluation.NLP.Tests + Unit tests for Microsoft.Extensions.AI.Evaluation.NLP. + + + + + + + + diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs new file mode 100644 index 00000000000..ea059ebec21 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs @@ -0,0 +1,68 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; + +public class NGramTests +{ + [Fact] + public void Constructor_StoresValuesAndLength() + { + var ngram = new NGram(1, 2, 3); + Assert.Equal(new[] { 1, 2, 3 }, ngram.Values); + Assert.Equal(3, ngram.Length); + } + + [Fact] + public void Constructor_ThrowsOnNull() + { + Assert.Throws(() => new NGram(null)); + } + + [Fact] + public void Constructor_ThrowsOnEmpty() + { + Assert.Throws(() => new NGram(Array.Empty())); + } + + [Fact] + public void Equals_And_HashCode_WorkCorrectly() + { + var a = new NGram(1, 2, 3); + var b = new NGram(1, 2, 3); + var c = new NGram(3, 2, 1); + Assert.True(a.Equals(b)); + Assert.True(a.Equals((object)b)); + Assert.False(a.Equals(c)); + Assert.NotEqual(a.GetHashCode(), c.GetHashCode()); + } + + [Fact] + public void Enumerator_And_IEnumerable() + { + var ngram = new NGram('a', 'b', 'c'); + var list = ngram.ToList(); + Assert.Equal(new[] { 'a', 'b', 'c' }, list); + } + + [Fact] + public void ToString_FormatsCorrectly() + { + var ngram = new NGram("x", "y"); + Assert.Equal("[x,y]", ngram.ToString()); + } + + [Fact] + public void NGramBuilder_Create_Works() + { + var arr = new[] { 1, 2 }; + var ngram = NGramBuilder.Create(arr); + Assert.Equal(new NGram(1, 2), ngram); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs new file mode 100644 index 00000000000..b40c82734fc --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs @@ -0,0 +1,55 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; + +public class RationalNumberTests +{ + [Fact] + public void Constructor_StoresNumeratorAndDenominator() + { + var r = new RationalNumber(3, 4); + Assert.Equal(3, r.Numerator); + Assert.Equal(4, r.Denominator); + } + + [Fact] + public void Constructor_ThrowsOnZeroDenominator() + { + Assert.Throws(() => new RationalNumber(1, 0)); + } + + [Theory] + [InlineData(1, 2, 0.5)] + [InlineData(-3, 4, -0.75)] + [InlineData(0, 5, 0.0)] + public void ToDouble_ReturnsExpected(int num, int denom, double expected) + { + var r = new RationalNumber(num, denom); + Assert.Equal(expected, r.ToDouble(), 6); + } + + [Fact] + public void ToString_FormatsCorrectly() + { + var r = new RationalNumber(7, 9); + Assert.Equal("7/9", r.ToString()); + } + + [Fact] + public void Equals_And_HashCode_WorkCorrectly() + { + var a = new RationalNumber(2, 3); + var b = new RationalNumber(2, 3); + var c = new RationalNumber(3, 2); + Assert.True(a.Equals(b)); + Assert.True(a.Equals((object)b)); + Assert.False(a.Equals(c)); + Assert.Equal(a.GetHashCode(), b.GetHashCode()); + Assert.NotEqual(a.GetHashCode(), c.GetHashCode()); + } +} diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs new file mode 100644 index 00000000000..bdabc4b1351 --- /dev/null +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs @@ -0,0 +1,91 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Xunit; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; + +public class SimpleTokenizerTests +{ + [Fact] + public void TokenizeText() + { + (string, IEnumerable)[] cases = [ + ("It is a guide to action that ensures that the military will forever heed Party commands.", + ["IT", "IS", "A", "GUIDE", "TO", "ACTION", "THAT", "ENSURES", "THAT", "THE", "MILITARY", "WILL", "FOREVER", "HEED", "PARTY", "COMMANDS", "."]), + ("Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.", + ["GOOD", "MUFFINS", "COST", "$", "3.88", "(", "ROUGHLY", "3,36", "EUROS", ")", "IN", "NEW", "YORK", ".", "PLEASE", "BUY", "ME", "TWO", "OF", "THEM", ".", "THANKS", "."]), + ("", []), + ("Hello, world! How's it going?", ["HELLO", ",", "WORLD", "!", "HOW", "'", "S", "IT", "GOING", "?"]), + (""Quotes" and & symbols < > '", ["\"", "QUOTES", "\"", "AND", "&", "SYMBOLS", "<", ">", "'"]), + ("-\nThis is a test.", ["THIS", "IS", "A", "TEST", "."]), + ]; + + foreach (var (text, expected) in cases) + { + IEnumerable result = SimpleWordTokenizer.WordTokenize(text); + Assert.Equal(expected, result); + } + } + + [Fact] + public void HandlesNullInput_Throws() + { + Assert.Throws(() => SimpleWordTokenizer.WordTokenize((string)null)); + } + + [Theory] + [InlineData(" $41.23 ", new[] { "$", "41.23" })] + [InlineData("word", new[] { "WORD" })] + [InlineData("word1 word2", new[] { "WORD1", "WORD2" })] + [InlineData("word1,word2", new[] { "WORD1", ",", "WORD2" })] + [InlineData("word1.word2", new[] { "WORD1", ".", "WORD2" })] + [InlineData("word1!word2?", new[] { "WORD1", "!", "WORD2", "?" 
})] + [InlineData("word1-word2", new[] { "WORD1", "-", "WORD2" })] + [InlineData("word1\nword2", new[] { "WORD1", "WORD2" })] + [InlineData("word1\tword2", new[] { "WORD1", "WORD2" })] + [InlineData("It is a guide to action that ensures that the military will forever heed Party commands.", + new[] { "IT", "IS", "A", "GUIDE", "TO", "ACTION", "THAT", "ENSURES", "THAT", "THE", "MILITARY", "WILL", "FOREVER", "HEED", "PARTY", "COMMANDS", "." })] + [InlineData("Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.", + new[] { "GOOD", "MUFFINS", "COST", "$", "3.88", "(", "ROUGHLY", "3,36", "EUROS", ")", "IN", "NEW", "YORK", ".", "PLEASE", "BUY", "ME", "TWO", "OF", "THEM", ".", "THANKS", "." })] + [InlineData("", new string[0])] + [InlineData(" This is a test.", new[] { "THIS", "IS", "A", "TEST", "." })] + [InlineData("Hello, world! How's it going?", new[] { "HELLO", ",", "WORLD", "!", "HOW", "'", "S", "IT", "GOING", "?" })] + [InlineData(""Quotes" and & symbols < > '", new[] { "\"", "QUOTES", "\"", "AND", "&", "SYMBOLS", "<", ">", "'" })] + [InlineData("-\nThis is a test.", new[] { "THIS", "IS", "A", "TEST", "." 
})] + public void Tokenize_Cases(string input, string[] expected) + { + var result = SimpleWordTokenizer.WordTokenize(input); + Assert.Equal(expected, result); + } + + [Fact] + public void HandlesMultipleSpacesAndEmptyEntries() + { + var input = " word1 word2 word3 "; + var expected = new[] { "WORD1", "WORD2", "WORD3" }; + var result = SimpleWordTokenizer.WordTokenize(input); + Assert.Equal(expected, result); + } + + [Fact] + public void HandlesUnicodeSymbolsAndPunctuation() + { + var input = "word1 © word2 ™ word3 — word4"; + var expected = new[] { "WORD1", "©", "WORD2", "™", "WORD3", "—", "WORD4" }; + var result = SimpleWordTokenizer.WordTokenize(input); + Assert.Equal(expected, result); + } + + [Fact] + public void HandlesHtmlEntities() + { + var input = ""Hello" & Goodbye <test> '"; + var expected = new[] { "\"", "HELLO", "\"", "&", "GOODBYE", "<", "TEST", ">", "'" }; + var result = SimpleWordTokenizer.WordTokenize(input); + Assert.Equal(expected, result); + } +} From b05ee5b3b8415379197eb69b2fbe403bbab10004 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Tue, 24 Jun 2025 07:27:13 -0400 Subject: [PATCH 02/11] Fix style warnings --- .../BLEU/MatchCounter.cs | 3 ++- .../Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs | 1 + .../BLEU/RationalNumber.cs | 2 +- .../SimpleWordTokenizer.cs | 5 ++--- .../Utilities/CollectionBuilderAttribute.cs | 1 - .../BLEUAlgorithmicTests.cs | 4 +--- .../MatchCounterTests.cs | 6 ------ .../NGramTests.cs | 6 ------ .../SimpleTokenizerTests.cs | 7 ------- 9 files changed, 7 insertions(+), 28 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs index 0f07c61386f..1bbf117b9f3 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs @@ -18,7 +18,8 @@ internal readonly struct MatchCounter public 
readonly int Sum => _counts.Values.Sum(); public MatchCounter() - { } + { + } public MatchCounter(IEnumerable items) { diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs index 898be9b28f5..574919e3e18 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs @@ -79,6 +79,7 @@ public override int GetHashCode() { hashCode = HashCode.Combine(hashCode, value.GetHashCode()); } + return hashCode; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs index 65919ee83b6..87f5d624745 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs @@ -22,7 +22,7 @@ public RationalNumber(int numerator, int denominator) public int Denominator { get; } public double ToDouble() => (double)Numerator / Denominator; - + public override string ToString() => $"{Numerator}/{Denominator}"; public bool Equals(RationalNumber other) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs index 0db4983ed02..829c9b3b654 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs @@ -2,11 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System; -using System.Buffers; using System.Collections.Generic; -using System.Reflection; using System.Text; -using System.Text.RegularExpressions; using Microsoft.Shared.Diagnostics; namespace Microsoft.Extensions.AI.Evaluation.NLP; @@ -81,7 +78,9 @@ public static IEnumerable WordTokenize(ReadOnlyMemory text) span.Length > 1 && span[1] == '\n') { +#pragma warning disable S109 // Magic numbers should not be used text = text.Slice(2); +#pragma warning restore S109 // Magic numbers should not be used continue; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs index 607486de6eb..90e928cd15e 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs @@ -17,5 +17,4 @@ public CollectionBuilderAttribute(Type builderType, string methodName) public string MethodName { get; } } - #endif diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs index 015a46d97a1..7ce95f8ba59 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs @@ -4,9 +4,9 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; using Xunit; using static Microsoft.Extensions.AI.Evaluation.NLP.BLEU.BLEUAlgorithm; -using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; @@ -36,7 +36,6 @@ public void ModifiedPrecisionTests() RationalNumber prec = ModifiedPrecision(references, hypothesis, 1); Assert.Equal(0.2857, prec.ToDouble(), 4); - 
references = [ "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '), "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '), @@ -48,7 +47,6 @@ public void ModifiedPrecisionTests() prec = ModifiedPrecision(references, hypothesis, 2); Assert.Equal(1.0, prec.ToDouble(), 4); - references = [ "It is a guide to action that ensures that the military will forever heed Party commands".Split(' '), "It is the guiding principle which guarantees the military forces always being under the command of the Party".Split(' '), diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs index fe828e347ea..04fa4748765 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs @@ -64,10 +64,4 @@ public void ToString_FormatsCorrectly() Assert.Contains("x: 2", str); Assert.Contains("y: 1", str); } - - [Fact] - public void Constructor_ThrowsOnNull() - { - Assert.Throws(() => new MatchCounter(null)); - } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs index ea059ebec21..889eaa5cc15 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs @@ -19,12 +19,6 @@ public void Constructor_StoresValuesAndLength() Assert.Equal(3, ngram.Length); } - [Fact] - public void Constructor_ThrowsOnNull() - { - Assert.Throws(() => new NGram(null)); - } - [Fact] public void Constructor_ThrowsOnEmpty() { diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs 
b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs index bdabc4b1351..46e5c9d9453 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs @@ -3,7 +3,6 @@ using System; using System.Collections.Generic; -using System.Linq; using Xunit; namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; @@ -31,12 +30,6 @@ public void TokenizeText() } } - [Fact] - public void HandlesNullInput_Throws() - { - Assert.Throws(() => SimpleWordTokenizer.WordTokenize((string)null)); - } - [Theory] [InlineData(" $41.23 ", new[] { "$", "41.23" })] [InlineData("word", new[] { "WORD" })] From d3f45a4b6d2a4c4ea721042a152ad190d7b8a4f7 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Tue, 24 Jun 2025 12:17:52 -0400 Subject: [PATCH 03/11] Support multiple references for a single evaluator --- .../BLEUEvaluator.cs | 6 +-- .../BLEUEvaluatorContext.cs | 38 +++++++++++++++---- .../BLEUEvaluatorTests.cs | 20 ++++++++++ 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index c2c5f60addb..ac082cb5f34 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -18,7 +18,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP; /// /// /// The computes the BLUE score of a response ("hypothesis") compared to a reference -/// . The score is returned in a +/// . The score is returned in a /// with a value between 0.0 and 1.0 where 0.0 represents no match at all and 1.0 indicates a perfect match. /// By default, the score is interpreted with a pass/fail cutoff of 0.5. So a score of 0.5 or higher is /// passing and a score below 0.5 is failing. 
@@ -66,9 +66,9 @@ public ValueTask EvaluateAsync( return new ValueTask(result); } - var reference = SimpleWordTokenizer.WordTokenize(context.ReferenceText); + var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference)); var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text); - metric.Value = BLEUAlgorithm.SentenceBLEU([reference], hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4); + metric.Value = BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4); metric.AddOrUpdateContext(context); metric.Interpretation = InterpretScore(metric); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs index 03a07bcf23d..f98bc497ee9 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs @@ -6,20 +6,19 @@ // We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary // constructor syntax. +using System.Collections.Generic; +using System.Linq; + namespace Microsoft.Extensions.AI.Evaluation.NLP; /// /// Contextual information that the uses to compute the BLEU score for a response. /// -/// -/// The reference response against which the response that is being evaluated is compared. -/// /// /// measures the BLEU score of a response compared to a reference. BLEU (Bilingual Evaluation Understudy) /// is a metric used to evaluate the quality if machine-generated text. 
/// -public sealed class BLEUEvaluatorContext(string reference) - : EvaluationContext(name: BLEUContext, content: reference) +public sealed class BLEUEvaluatorContext : EvaluationContext { /// /// Gets the unique that is used for @@ -32,7 +31,32 @@ public sealed class BLEUEvaluatorContext(string reference) /// /// /// The measures the degree to which the response being evaluated is similar to - /// the response supplied via . The metric will be reported as a BLEU score. + /// the response supplied via . The metric will be reported as a BLEU score. /// - public string ReferenceText { get; } = reference; + public IReadOnlyList References { get; } + + /// + /// Initializes a new instance of the class. + /// + /// + /// The reference responses against which the response that is being evaluated is compared. + /// + public BLEUEvaluatorContext(params string[] references) + : this(references as IEnumerable) + { + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// The reference responses against which the response that is being evaluated is compared. + /// + public BLEUEvaluatorContext(IEnumerable references) + : base( + name: BLEUContext, + contents: [.. references.Select(c => new TextContent(c))]) + { + References = [.. 
references]; + } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs index b32f05e0089..22a02b110c1 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs @@ -74,6 +74,26 @@ public async Task EvaluateAsync_SampleCases(string reference, string hypothesis, Assert.Equal(score, (double)metric!.Value!, 4); } + [Fact] + public async Task EvaluateAsync_MultipleReferences() + { + string[] references = [ + "It is a guide to action that ensures that the military will forever heed Party commands", + "It is the guiding principle which guarantees the military forces always being under the command of the Party", + "It is the practical guide for the army always to heed the directions of the party", + ]; + string hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"; + + var evaluator = new BLEUEvaluator(); + var context = new BLEUEvaluatorContext(references); + var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, hypothesis)); + var result = await evaluator.EvaluateAsync([], response, null, [context]); + var metric = Assert.Single(result.Metrics.Values) as NumericMetric; + Assert.NotNull(metric); + Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); + Assert.Equal(0.5046, (double)metric!.Value!, 4); + } + [Fact] public async Task EvaluateAsync_ReturnsErrorDiagnosticIfEmptyResponse() { From 666093d552e0f5e7bcf38539c4ca2f3b32f2c6d8 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Thu, 26 Jun 2025 07:20:38 -0400 Subject: [PATCH 04/11] Make some suggested updats. 
--- eng/MSBuild/LegacySupport.props | 4 ++ .../CollectionBuilderAttribute.cs | 5 +- src/LegacySupport/CollectionBuilder/README.md | 7 +++ .../BLEUEvaluator.cs | 2 +- .../{BLEU => Common}/BLEUAlgorithm.cs | 25 ++-------- .../{BLEU => Common}/MatchCounter.cs | 2 +- .../{BLEU => Common}/NGram.cs | 46 ++----------------- .../Common/NGramExtensions.cs | 37 +++++++++++++++ .../{BLEU => Common}/RationalNumber.cs | 2 +- .../{BLEU => Common}/SmoothingFunction.cs | 2 +- ...rosoft.Extensions.AI.Evaluation.NLP.csproj | 4 ++ .../BLEUAlgorithmicTests.cs | 12 +++-- .../MatchCounterTests.cs | 2 +- .../NGramTests.cs | 7 ++- .../RationalNumberTests.cs | 2 +- 15 files changed, 77 insertions(+), 82 deletions(-) rename src/{Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities => LegacySupport/CollectionBuilder}/CollectionBuilderAttribute.cs (83%) create mode 100644 src/LegacySupport/CollectionBuilder/README.md rename src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/{BLEU => Common}/BLEUAlgorithm.cs (89%) rename src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/{BLEU => Common}/MatchCounter.cs (95%) rename src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/{BLEU => Common}/NGram.cs (60%) create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs rename src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/{BLEU => Common}/RationalNumber.cs (94%) rename src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/{BLEU => Common}/SmoothingFunction.cs (97%) diff --git a/eng/MSBuild/LegacySupport.props b/eng/MSBuild/LegacySupport.props index 6b110acaaa1..7bda63a6607 100644 --- a/eng/MSBuild/LegacySupport.props +++ b/eng/MSBuild/LegacySupport.props @@ -74,4 +74,8 @@ + + + + diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs b/src/LegacySupport/CollectionBuilder/CollectionBuilderAttribute.cs similarity index 83% rename from 
src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs rename to src/LegacySupport/CollectionBuilder/CollectionBuilderAttribute.cs index 90e928cd15e..569daa70dff 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Utilities/CollectionBuilderAttribute.cs +++ b/src/LegacySupport/CollectionBuilder/CollectionBuilderAttribute.cs @@ -1,7 +1,6 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -#if !NET8_0_OR_GREATER namespace System.Runtime.CompilerServices; [AttributeUsage(AttributeTargets.Class | AttributeTargets.Struct | AttributeTargets.Interface)] @@ -16,5 +15,3 @@ public CollectionBuilderAttribute(Type builderType, string methodName) public Type BuilderType { get; } public string MethodName { get; } } - -#endif diff --git a/src/LegacySupport/CollectionBuilder/README.md b/src/LegacySupport/CollectionBuilder/README.md new file mode 100644 index 00000000000..15e9274d433 --- /dev/null +++ b/src/LegacySupport/CollectionBuilder/README.md @@ -0,0 +1,7 @@ +To use this source in your project, add the following to your `.csproj` file: + +```xml + + true + +``` diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index ac082cb5f34..e4ba47b72a5 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -6,7 +6,7 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; -using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Microsoft.Shared.Diagnostics; namespace Microsoft.Extensions.AI.Evaluation.NLP; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/BLEUAlgorithm.cs 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs similarity index 89% rename from src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/BLEUAlgorithm.cs rename to src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs index 6499f5af2fe..f8399eff071 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/BLEUAlgorithm.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs @@ -6,29 +6,10 @@ using System.Linq; using Microsoft.Shared.Diagnostics; -namespace Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; internal static class BLEUAlgorithm { - internal static IEnumerable> NGrams(IEnumerable input, int n) - where T : IEquatable - { - if (n <= 0) - { - Throw.ArgumentOutOfRangeException(nameof(n), "N must be greater than zero."); - } - - var output = input.Take(n).ToArray(); - - while (output.Length == n) - { - yield return new NGram(output); - - input = input.Skip(1); - output = input.Take(n).ToArray(); - } - } - internal static int ClosestRefLength(IEnumerable> references, int hypLength) { if (!references.Any()) @@ -80,7 +61,7 @@ internal static RationalNumber ModifiedPrecision(IEnumerable return new RationalNumber(0, 0); } - var hyp = NGrams(hypothesis, n).ToArray(); + var hyp = hypothesis.CreateNGrams(n).ToArray(); var hypCounts = new MatchCounter>(hyp); Dictionary, int> maxCounts = []; @@ -89,7 +70,7 @@ internal static RationalNumber ModifiedPrecision(IEnumerable foreach (var rf in references) { - var refGrams = NGrams(rf, n).ToArray(); + IEnumerable> refGrams = rf.CreateNGrams(n); var refCounts = new MatchCounter>(refGrams); foreach (var ct in refCounts.Values) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs similarity index 95% rename from 
src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs rename to src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs index 1bbf117b9f3..98a5152fe08 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/MatchCounter.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs @@ -6,7 +6,7 @@ using System.Linq; using Microsoft.Shared.Diagnostics; -namespace Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; internal readonly struct MatchCounter where T : IEquatable diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGram.cs similarity index 60% rename from src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs rename to src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGram.cs index 574919e3e18..da27655f306 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/NGram.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGram.cs @@ -4,39 +4,18 @@ using System; using System.Collections; using System.Collections.Generic; +using System.Diagnostics; using System.Linq; using System.Runtime.CompilerServices; using Microsoft.Shared.Diagnostics; -namespace Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; +[DebuggerDisplay("{ToString(),nq}")] [CollectionBuilder(typeof(NGramBuilder), nameof(NGramBuilder.Create))] internal readonly struct NGram : IEquatable>, IEnumerable where T : IEquatable { - /// - /// Create a sequence of n-grams from the input sequence. - /// - /// The input sequence of items. - /// The size of each n-gram. 
- internal static IEnumerable> Create(IEnumerable input, int n) - { - if (n <= 0) - { - Throw.ArgumentOutOfRangeException(nameof(n), "N must be greater than zero."); - } - - var output = input.Take(n).ToArray(); - - while (output.Length == n) - { - yield return new NGram(output); - - input = input.Skip(1); - output = input.Take(n).ToArray(); - } - } - public NGram(ReadOnlySpan values) : this(values.ToArray()) { @@ -53,22 +32,7 @@ public NGram(params T[] values) public int Length => Values.Length; public bool Equals(NGram other) - { - if (other.Length != Length) - { - return false; - } - - for (int i = 0; i < Length; i++) - { - if (!Values[i].Equals(other.Values[i])) - { - return false; - } - } - - return true; - } + => Values.SequenceEqual(other.Values); public override bool Equals(object? obj) => obj is NGram other && Equals(other); @@ -88,6 +52,7 @@ public override int GetHashCode() IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); public override string ToString() => $"[{string.Join(",", Values.Select(v => v.ToString()))}]"; + } internal static class NGramBuilder @@ -95,4 +60,3 @@ internal static class NGramBuilder public static NGram Create(ReadOnlySpan values) where T : IEquatable => new(values); } - diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs new file mode 100644 index 00000000000..1c2f8448804 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs @@ -0,0 +1,37 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.Shared.Diagnostics; + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; + +internal static class NGramExtensions +{ + /// + /// Create a sequence of n-grams from the input sequence. 
+ /// + /// The input sequence of items. + /// The size of each n-gram. + internal static IEnumerable> CreateNGrams(this IEnumerable input, int n) + where T : IEquatable + { + if (n <= 0) + { + Throw.ArgumentOutOfRangeException(nameof(n), $"'{nameof(n)}' must be greater than zero."); + } + + T[] output = [.. input.Take(n)]; + + while (output.Length == n) + { + yield return new NGram(output); + + input = input.Skip(1); + output = [.. input.Take(n)]; + } + } + +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs similarity index 94% rename from src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs rename to src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs index 87f5d624745..3ac29ca0dfa 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/RationalNumber.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs @@ -3,7 +3,7 @@ using System; -namespace Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; internal readonly struct RationalNumber : IEquatable { diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/SmoothingFunction.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs similarity index 97% rename from src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/SmoothingFunction.cs rename to src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs index 7746248c36a..01c1d02d853 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEU/SmoothingFunction.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs @@ -1,7 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
-namespace Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; /// /// Implementations of smoothing functions for BLEU scores taken from diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj index 99024c86c4e..784544e9a1c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj @@ -15,6 +15,10 @@ 0 + + true + + diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs index 7ce95f8ba59..461753d4df1 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs @@ -4,9 +4,9 @@ using System; using System.Collections.Generic; using System.Linq; -using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; -using static Microsoft.Extensions.AI.Evaluation.NLP.BLEU.BLEUAlgorithm; +using static Microsoft.Extensions.AI.Evaluation.NLP.Common.BLEUAlgorithm; namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; @@ -15,15 +15,17 @@ public class BLEUAlgorithmicTests [Fact] public void NGramGenerationNoPadding() { - IEnumerable> result = BLEUAlgorithm.NGrams([1, 2, 3, 4, 5], 1); + int[] input = [1, 2, 3, 4, 5]; + + IEnumerable> result = input.CreateNGrams(1); List> expected = [[1], [2], [3], [4], [5]]; Assert.True(result.SequenceEqual(expected)); - result = BLEUAlgorithm.NGrams([1, 2, 3, 4, 5], 2); + result = input.CreateNGrams(2); expected = [[1, 2], [2, 3], [3, 4], [4, 5]]; 
Assert.True(result.SequenceEqual(expected)); - result = BLEUAlgorithm.NGrams([1, 2, 3, 4, 5], 3); + result = input.CreateNGrams(3); expected = [[1, 2, 3], [2, 3, 4], [3, 4, 5]]; Assert.True(result.SequenceEqual(expected)); } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs index 04fa4748765..a20f6281c1a 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs @@ -4,7 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; -using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs index 889eaa5cc15..f6374d2aa79 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs @@ -4,7 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; -using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; @@ -12,7 +12,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; public class NGramTests { [Fact] - public void Constructor_StoresValuesAndLength() + public void Constructor_ValuesAndLength() { var ngram = new NGram(1, 2, 3); Assert.Equal(new[] { 1, 2, 3 }, ngram.Values); @@ -55,8 +55,7 @@ public void ToString_FormatsCorrectly() [Fact] public void NGramBuilder_Create_Works() { - var arr = new[] { 1, 2 }; - var ngram = NGramBuilder.Create(arr); + NGram ngram = [1, 2]; Assert.Equal(new NGram(1, 
2), ngram); } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs index b40c82734fc..ecfb832697f 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs @@ -2,7 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; -using Microsoft.Extensions.AI.Evaluation.NLP.BLEU; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; From e7539a6c7cbd6172946a7fa051acf215861e6031 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Thu, 26 Jun 2025 07:34:43 -0400 Subject: [PATCH 05/11] More review updates --- .../Common/NGram.cs | 13 +++---------- .../Common/NGramExtensions.cs | 6 ++++++ .../Common/RationalNumber.cs | 4 +++- .../Common/SmoothingFunction.cs | 4 +++- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGram.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGram.cs index da27655f306..5fb66461faf 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGram.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGram.cs @@ -11,8 +11,8 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; -[DebuggerDisplay("{ToString(),nq}")] -[CollectionBuilder(typeof(NGramBuilder), nameof(NGramBuilder.Create))] +[DebuggerDisplay("{ToDebugString(),nq}")] +[CollectionBuilder(typeof(NGramExtensions), nameof(NGramExtensions.CreateNGram))] internal readonly struct NGram : IEquatable>, IEnumerable where T : IEquatable { @@ -51,12 +51,5 @@ public override int GetHashCode() IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); - public override string ToString() => $"[{string.Join(",", 
Values.Select(v => v.ToString()))}]"; - -} - -internal static class NGramBuilder -{ - public static NGram Create(ReadOnlySpan values) - where T : IEquatable => new(values); + public string ToDebugString() => $"[{string.Join(",", Values.Select(v => v.ToString()))}]"; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs index 1c2f8448804..26c583f4fb7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs @@ -10,6 +10,12 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; internal static class NGramExtensions { + /// + /// Collection builder method + /// + public static NGram CreateNGram(ReadOnlySpan values) + where T : IEquatable => new(values); + /// /// Create a sequence of n-grams from the input sequence. /// diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs index 3ac29ca0dfa..e14ff662b50 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs @@ -2,9 +2,11 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System; +using System.Diagnostics; namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; +[DebuggerDisplay("{ToDebugString(),nq}")] internal readonly struct RationalNumber : IEquatable { public RationalNumber(int numerator, int denominator) @@ -23,7 +25,7 @@ public RationalNumber(int numerator, int denominator) public double ToDouble() => (double)Numerator / Denominator; - public override string ToString() => $"{Numerator}/{Denominator}"; + public string ToDebugString() => $"{Numerator}/{Denominator}"; public bool Equals(RationalNumber other) => other.Numerator == Numerator && other.Denominator == Denominator; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs index 01c1d02d853..d2b986afc5b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs @@ -1,6 +1,8 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System; + namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; /// @@ -57,7 +59,7 @@ internal static double[] Method4(RationalNumber[] precisions, int hypLen) RationalNumber p = precisions[i]; if (precisions[i].Numerator == 0 && hypLen > 1) { - double numerator = 1 / (System.Math.Pow(2.0, incvnt) * DefaultK / System.Math.Log(hypLen)); + double numerator = 1 / (Math.Pow(2.0, incvnt) * DefaultK / Math.Log(hypLen)); incvnt++; } else From e8a88d8b00943e2fe2e17acb18123236b3978fe2 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Thu, 26 Jun 2025 10:55:22 -0400 Subject: [PATCH 06/11] Feedback updates. 
--- .../BLEUEvaluator.cs | 2 + .../Common/BLEUAlgorithm.cs | 55 +++++++++++-------- .../Common/MatchCounter.cs | 18 ++++-- .../Common/NGramExtensions.cs | 4 +- .../Common/SmoothingFunction.cs | 9 +-- ...rosoft.Extensions.AI.Evaluation.NLP.csproj | 1 + .../SimpleWordTokenizer.cs | 2 + .../BLEUAlgorithmicTests.cs | 27 +++++++++ .../BLEUEvaluatorTests.cs | 8 ++- .../MatchCounterTests.cs | 12 ++-- .../NGramTests.cs | 2 +- .../RationalNumberTests.cs | 2 +- .../SimpleTokenizerTests.cs | 2 + 13 files changed, 99 insertions(+), 45 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index e4ba47b72a5..380f20d0392 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; using System.Linq; using System.Threading; using System.Threading.Tasks; @@ -24,6 +25,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP; /// passing and a score below 0.5 is failing. /// /// +[Experimental("AIEVAL001")] public sealed class BLEUEvaluator : IEvaluator { /// diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs index f8399eff071..ae659a32339 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs @@ -8,6 +8,12 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; +/// +/// Helper methods for calculating the BLEU score. +/// See BLEU on Wikipedia or +/// NLTK implementation +/// for more details. 
+/// internal static class BLEUAlgorithm { internal static int ClosestRefLength(IEnumerable> references, int hypLength) @@ -22,7 +28,7 @@ internal static int ClosestRefLength(IEnumerable> references foreach (var reference in references) { int refLength = reference.Count(); - int diff = System.Math.Abs(refLength - hypLength); + int diff = Math.Abs(refLength - hypLength); if (diff < closest || (diff == closest && refLength < closestRefLength)) { @@ -46,14 +52,14 @@ internal static double BrevityPenalty(int closestRefLength, int hypLength) return 1.0; } - return System.Math.Exp(1 - ((double)closestRefLength / hypLength)); + return Math.Exp(1 - ((double)closestRefLength / hypLength)); } internal static RationalNumber ModifiedPrecision(IEnumerable> references, IEnumerable hypothesis, int n = 1) { if (n <= 0) { - Throw.ArgumentOutOfRangeException(nameof(n), "N must be greater than zero."); + Throw.ArgumentOutOfRangeException(nameof(n), $"`{nameof(n)}` must be greater than zero."); } if (!references.Any() || !hypothesis.Any()) @@ -66,18 +72,16 @@ internal static RationalNumber ModifiedPrecision(IEnumerable Dictionary, int> maxCounts = []; - var matchCounts = new MatchCounter>(); - foreach (var rf in references) { IEnumerable> refGrams = rf.CreateNGrams(n); var refCounts = new MatchCounter>(refGrams); - foreach (var ct in refCounts.Values) + foreach (var ct in refCounts) { if (maxCounts.TryGetValue(ct.Key, out int val)) { - maxCounts[ct.Key] = System.Math.Max(val, ct.Value); + maxCounts[ct.Key] = Math.Max(val, ct.Value); } else { @@ -87,11 +91,11 @@ internal static RationalNumber ModifiedPrecision(IEnumerable } Dictionary, int> clippedCounts = []; - foreach (var h in hypCounts.Values) + foreach (var h in hypCounts) { if (maxCounts.TryGetValue(h.Key, out var v)) { - clippedCounts[h.Key] = System.Math.Min(h.Value, v); + clippedCounts[h.Key] = Math.Min(h.Value, v); } else { @@ -101,16 +105,21 @@ internal static RationalNumber ModifiedPrecision(IEnumerable } int numerator = 
clippedCounts.Values.Sum(); - int denominator = System.Math.Max(1, hypCounts.Sum); + int denominator = Math.Max(1, hypCounts.Sum); return new RationalNumber(numerator, denominator); } + /// + /// Generate an n-sized array of equal weights that sum to 1.0. + /// + /// Number of weights to return. + /// Array of equal sized values that sum to 1.0. internal static double[] EqualWeights(int n) { if (n <= 0) { - Throw.ArgumentOutOfRangeException(nameof(n), "N must be greater than zero."); + Throw.ArgumentOutOfRangeException(nameof(n), $"'{nameof(n)}' must be greater than zero."); } double[] weights = new double[n]; @@ -124,25 +133,27 @@ internal static double[] EqualWeights(int n) internal static readonly double[] DefaultBLEUWeights = EqualWeights(4); - internal static double SentenceBLEU(IEnumerable> references, IEnumerable hypothesis) - => SentenceBLEU(references, hypothesis, DefaultBLEUWeights); - - internal static double SentenceBLEU(IEnumerable> references, IEnumerable hypothesis, double[] weights, - Func? smoothingFunction = null) + internal static double SentenceBLEU(IEnumerable> references, IEnumerable hypothesis, + double[]? weights = null, Func? 
smoothingFunction = null) { if (references == null || !references.Any()) { - Throw.ArgumentNullException(nameof(references), "References cannot be null or empty."); + Throw.ArgumentNullException(nameof(references), $"'{nameof(references)}' cannot be null or empty."); } if (hypothesis == null || !hypothesis.Any()) { - Throw.ArgumentNullException(nameof(hypothesis), "Hypothesis cannot be null or empty."); + Throw.ArgumentNullException(nameof(hypothesis), $"'{nameof(hypothesis)}' cannot be null or empty."); + } + + if (weights is null) + { + weights = DefaultBLEUWeights; } - if (weights == null || !weights.Any()) + if (weights.Length == 0) { - Throw.ArgumentNullException(nameof(weights), "Weights cannot be null or empty."); + Throw.ArgumentNullException(nameof(weights), $"'{nameof(weights)}' cannot be empty."); } var precisionValues = new RationalNumber[weights.Length]; @@ -176,11 +187,11 @@ internal static double SentenceBLEU(IEnumerable> references, { if (smoothedValues[i] > 0) { - score += weights[i] * System.Math.Log(smoothedValues[i]); + score += weights[i] * Math.Log(smoothedValues[i]); } } - return brevityPenalty * System.Math.Exp(score); + return brevityPenalty * Math.Exp(score); } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs index 98a5152fe08..6bba1d55648 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs @@ -2,19 +2,20 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System; +using System.Collections; using System.Collections.Generic; +using System.Diagnostics; using System.Linq; using Microsoft.Shared.Diagnostics; namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; -internal readonly struct MatchCounter +[DebuggerDisplay("{ToDebugString(),nq}")] +internal readonly struct MatchCounter : IEnumerable> where T : IEquatable { private readonly Dictionary _counts = []; - public readonly IEnumerable> Values => _counts; - public readonly int Sum => _counts.Values.Sum(); public MatchCounter() @@ -41,11 +42,20 @@ public void Add(T item) public void AddRange(IEnumerable items) { + if (items == null) + { + return; + } + foreach (var item in items) { Add(item); } } - public override string ToString() => string.Concat(Values.Select(v => $"{v.Key}: {v.Value}, ")); + public string ToDebugString() => string.Concat(_counts.Select(v => $"{v.Key}: {v.Value}, ")); + + public IEnumerator> GetEnumerator() => _counts.GetEnumerator(); + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)_counts).GetEnumerator(); } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs index 26c583f4fb7..c4b3949e95a 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs @@ -10,9 +10,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; internal static class NGramExtensions { - /// - /// Collection builder method - /// + // Collection builder method. 
public static NGram CreateNGram(ReadOnlySpan values) where T : IEquatable => new(values); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs index d2b986afc5b..0e3071f6bdd 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SmoothingFunction.cs @@ -53,14 +53,15 @@ internal static double[] Method4(RationalNumber[] precisions, int hypLen) double[] smoothed = new double[precisions.Length]; - int incvnt = 0; + int inc = 1; for (int i = 0; i < precisions.Length; i++) { RationalNumber p = precisions[i]; - if (precisions[i].Numerator == 0 && hypLen > 1) + if (p.Numerator == 0 && hypLen > 1) { - double numerator = 1 / (Math.Pow(2.0, incvnt) * DefaultK / Math.Log(hypLen)); - incvnt++; + double numerator = 1 / (Math.Pow(2.0, inc) * DefaultK / Math.Log(hypLen)); + smoothed[i] = numerator / p.Denominator; + inc++; } else { diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj index 784544e9a1c..2b1384e166d 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj @@ -16,6 +16,7 @@ + true true diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs index 829c9b3b654..1e2e0ec5a25 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using 
System.Diagnostics.CodeAnalysis; using System.Text; using Microsoft.Shared.Diagnostics; @@ -11,6 +12,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP; /// /// Tokenizes a string into segments using the common rules established by the NLTK word tokenizer. /// +[Experimental("AIEVAL001")] public static class SimpleWordTokenizer { /// diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs index 461753d4df1..21dd806a834 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; using System.Linq; using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; @@ -10,6 +11,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; +[Experimental("AIEVAL001")] public class BLEUAlgorithmicTests { [Fact] @@ -66,6 +68,31 @@ public void ModifiedPrecisionTests() Assert.Equal(0.07692, prec.ToDouble(), 4); } + [Theory] + [InlineData(new int[] { 0, 1, 0, 2 }, 10, new[] { 0.2303, 0.0576 })] + [InlineData(new int[] { 4, 5, 2, 4 }, 10, new[] { 0.8000, 0.5 })] + [InlineData(new int[] { 10, 14, 7, 13, 5, 12, 4, 11 }, 20, new[] { 0.7143, 0.5385, 0.4167, 0.3636 })] + [InlineData(new int[] { 10, 14, 7, 13, 0, 12, 0, 11 }, 20, new[] { 0.7143, 0.5385, 0.02496, 0.01362 })] + public void SmoothingMethod4Tests(int[] num_denom, int hypLen, double[] vals) + { + Assert.Equal(num_denom.Length, vals.Length * 2); + + RationalNumber[] prec = new RationalNumber[vals.Length]; + for (int i = 0; i < num_denom.Length - 1; i += 2) + { + prec[i / 2] = new RationalNumber(num_denom[i], num_denom[i + 1]); + } + + double[] smoothed = SmoothingFunction.Method4(prec, hypLen); + + Assert.Equal(vals.Length, smoothed.Length); + + for (int i = 0; i < 
vals.Length; i++) + { + Assert.Equal(vals[i], smoothed[i], 4); + } + } + [Fact] public void TestBrevityPenalty() { diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs index 22a02b110c1..6288d2bb4ef 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs @@ -8,6 +8,8 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; +#pragma warning disable AIEVAL001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + public class BLEUEvaluatorTests { [Fact] @@ -36,7 +38,7 @@ public async Task EvaluateAsync_ReturnsLowScoreForCompletelyDifferentText() var metric = Assert.Single(result.Metrics.Values) as NumericMetric; Assert.NotNull(metric); Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); - Assert.Equal(0.1578, (double)metric!.Value!, 4); + Assert.Equal(0.0136, (double)metric!.Value!, 4); Assert.NotNull(metric.Interpretation); Assert.Equal(EvaluationRating.Unacceptable, metric.Interpretation.Rating); Assert.True(metric.Interpretation.Failed); @@ -57,11 +59,11 @@ public async Task EvaluateAsync_ReturnsErrorDiagnosticIfNoContext() [Theory] [InlineData("the cat is on the mat", - "the the the the the the the", 0.7311)] + "the the the the the the the", 0.0385)] [InlineData("It is a guide to action that ensures that the military will forever heed Party commands", "It is a guide to action which ensures that the military always obeys the commands of the party", 0.4209)] [InlineData("It is the practical guide for the army always to heed the directions of the party", - "It is to insure the troops forever hearing the activity guidebook that party direct", 0.3694)] + "It is to insure the troops forever hearing the activity guidebook that party 
direct", 0.0471)] public async Task EvaluateAsync_SampleCases(string reference, string hypothesis, double score) { var evaluator = new BLEUEvaluator(); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs index a20f6281c1a..f14e167af44 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs @@ -1,8 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System; -using System.Collections.Generic; using System.Linq; using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; @@ -15,7 +13,7 @@ public class MatchCounterTests public void EmptyConstructor_InitializesEmptyCounter() { var counter = new MatchCounter(); - Assert.Empty(counter.Values); + Assert.Empty(counter); Assert.Equal(0, counter.Sum); } @@ -23,7 +21,7 @@ public void EmptyConstructor_InitializesEmptyCounter() public void ConstructorWithItems_CountsCorrectly() { var counter = new MatchCounter(new[] { "a", "b", "a", "c", "b", "a" }); - var dict = counter.Values.ToDictionary(kv => kv.Key, kv => kv.Value); + var dict = counter.ToDictionary(kv => kv.Key, kv => kv.Value); Assert.Equal(3, dict["a"]); Assert.Equal(2, dict["b"]); Assert.Equal(1, dict["c"]); @@ -37,7 +35,7 @@ public void Add_AddsSingleItemCorrectly() counter.Add(5); counter.Add(5); counter.Add(3); - var dict = counter.Values.ToDictionary(kv => kv.Key, kv => kv.Value); + var dict = counter.ToDictionary(kv => kv.Key, kv => kv.Value); Assert.Equal(2, dict[5]); Assert.Equal(1, dict[3]); Assert.Equal(3, counter.Sum); @@ -48,7 +46,7 @@ public void AddRange_AddsMultipleItemsCorrectly() { var counter = new MatchCounter(); counter.AddRange("hello"); - var dict = counter.Values.ToDictionary(kv => kv.Key, kv => 
kv.Value); + var dict = counter.ToDictionary(kv => kv.Key, kv => kv.Value); Assert.Equal(1, dict['h']); Assert.Equal(1, dict['e']); Assert.Equal(2, dict['l']); @@ -60,7 +58,7 @@ public void AddRange_AddsMultipleItemsCorrectly() public void ToString_FormatsCorrectly() { var counter = new MatchCounter(new[] { "x", "y", "x" }); - var str = counter.ToString(); + var str = counter.ToDebugString(); Assert.Contains("x: 2", str); Assert.Contains("y: 1", str); } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs index f6374d2aa79..4661f0b490c 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs @@ -49,7 +49,7 @@ public void Enumerator_And_IEnumerable() public void ToString_FormatsCorrectly() { var ngram = new NGram("x", "y"); - Assert.Equal("[x,y]", ngram.ToString()); + Assert.Equal("[x,y]", ngram.ToDebugString()); } [Fact] diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs index ecfb832697f..1cd0a36b7af 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs @@ -37,7 +37,7 @@ public void ToDouble_ReturnsExpected(int num, int denom, double expected) public void ToString_FormatsCorrectly() { var r = new RationalNumber(7, 9); - Assert.Equal("7/9", r.ToString()); + Assert.Equal("7/9", r.ToDebugString()); } [Fact] diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs index 46e5c9d9453..712f0c5ff5a 100644 --- 
a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs @@ -7,6 +7,8 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; +#pragma warning disable AIEVAL001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + public class SimpleTokenizerTests { [Fact] From 8b8067fa7218d09b25398b61eecf62eea196a627 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Thu, 26 Jun 2025 11:01:23 -0400 Subject: [PATCH 07/11] Update READMEs --- .../Microsoft.Extensions.AI.Evaluation.Console/README.md | 4 ++++ .../Microsoft.Extensions.AI.Evaluation.NLP/README.md | 4 ++++ .../Microsoft.Extensions.AI.Evaluation.Quality/README.md | 4 ++++ .../README.md | 4 ++++ .../Microsoft.Extensions.AI.Evaluation.Safety/README.md | 4 ++++ src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md index c21e2a299ad..580facd6294 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md @@ -5,6 +5,8 @@ * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. 
* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. +* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural +language processing tasks. Evaluators currently include BLEU score, with more planned. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. 
@@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation dotnet add package Microsoft.Extensions.AI.Evaluation.Quality dotnet add package Microsoft.Extensions.AI.Evaluation.Safety dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +dotnet add package Microsoft.Extensions.AI.Evaluation.NLP ``` Or directly in the C# project file: @@ -28,6 +31,7 @@ Or directly in the C# project file: + ``` diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md index c21e2a299ad..580facd6294 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/README.md @@ -5,6 +5,8 @@ * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. +* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural +language processing tasks. 
Evaluators currently include BLEU score, with more planned. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. @@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation dotnet add package Microsoft.Extensions.AI.Evaluation.Quality dotnet add package Microsoft.Extensions.AI.Evaluation.Safety dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +dotnet add package Microsoft.Extensions.AI.Evaluation.NLP ``` Or directly in the C# project file: @@ -28,6 +31,7 @@ Or directly in the C# project file: + ``` diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md index c21e2a299ad..580facd6294 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md @@ -5,6 +5,8 @@ * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. 
* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. +* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural +language processing tasks. Evaluators currently include BLEU score, with more planned. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. 
@@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation dotnet add package Microsoft.Extensions.AI.Evaluation.Quality dotnet add package Microsoft.Extensions.AI.Evaluation.Safety dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +dotnet add package Microsoft.Extensions.AI.Evaluation.NLP ``` Or directly in the C# project file: @@ -28,6 +31,7 @@ Or directly in the C# project file: + ``` diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md index c21e2a299ad..580facd6294 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md @@ -5,6 +5,8 @@ * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. 
+* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural +language processing tasks. Evaluators currently include BLEU score, with more planned. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. 
@@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation dotnet add package Microsoft.Extensions.AI.Evaluation.Quality dotnet add package Microsoft.Extensions.AI.Evaluation.Safety dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +dotnet add package Microsoft.Extensions.AI.Evaluation.NLP ``` Or directly in the C# project file: @@ -28,6 +31,7 @@ Or directly in the C# project file: + ``` diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md index c042da70deb..e135ed24cfe 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/README.md @@ -5,6 +5,8 @@ * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. +* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural +language processing tasks. 
Evaluators currently include BLEU score, with more planned. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. @@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation dotnet add package Microsoft.Extensions.AI.Evaluation.Quality dotnet add package Microsoft.Extensions.AI.Evaluation.Safety dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +dotnet add package Microsoft.Extensions.AI.Evaluation.NLP ``` Or directly in the C# project file: @@ -28,6 +31,7 @@ Or directly in the C# project file: + ``` diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md b/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md index c21e2a299ad..580facd6294 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/README.md @@ -5,6 +5,8 @@ * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation. 
* [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Retrieval, Equivalence and Groundedness. * [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Foundry Evaluation service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack. +* [`Microsoft.Extensions.AI.Evaluation.NLP`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.NLP) - Contains a set of evaluators that implement common algorithms for evaluating machine translation and natural +language processing tasks. Evaluators currently include BLEU score, with more planned. * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data. * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container. * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data. 
@@ -18,6 +20,7 @@ dotnet add package Microsoft.Extensions.AI.Evaluation dotnet add package Microsoft.Extensions.AI.Evaluation.Quality dotnet add package Microsoft.Extensions.AI.Evaluation.Safety dotnet add package Microsoft.Extensions.AI.Evaluation.Reporting +dotnet add package Microsoft.Extensions.AI.Evaluation.NLP ``` Or directly in the C# project file: @@ -28,6 +31,7 @@ Or directly in the C# project file: + ``` From bcccea89331926e0a202960ad3a8d43afc7ce40d Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Thu, 26 Jun 2025 15:50:07 -0400 Subject: [PATCH 08/11] Make word tokenizer internal. --- .../SimpleWordTokenizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs index 1e2e0ec5a25..ffffe4ba3f0 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs @@ -13,7 +13,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP; /// Tokenizes a string into segments using the common rules established by the NLTK word tokenizer. /// [Experimental("AIEVAL001")] -public static class SimpleWordTokenizer +internal static class SimpleWordTokenizer { /// /// Tokenizes the input text into individual words based on specific rules for text normalization and segmentation. From 47dba703c3efa4c7cd15a15c32dcb29e0de61064 Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Thu, 26 Jun 2025 16:11:17 -0400 Subject: [PATCH 09/11] Feedback updates. 
--- .../BLEUEvaluator.cs | 8 ++++++++ .../Common/BLEUAlgorithm.cs | 12 ++++++------ .../Common/MatchCounter.cs | 2 +- .../Common/NGramExtensions.cs | 2 +- .../Microsoft.Extensions.AI.Evaluation.NLP.csproj | 4 ++++ .../SimpleWordTokenizer.cs | 2 -- .../MatchCounterTests.cs | 8 ++++---- 7 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index 380f20d0392..a45c5768074 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -3,7 +3,9 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Globalization; using System.Linq; using System.Threading; using System.Threading.Tasks; @@ -45,6 +47,8 @@ public ValueTask EvaluateAsync( IEnumerable? additionalContext = null, CancellationToken cancellationToken = default) { + Stopwatch stopwatch = Stopwatch.StartNew(); + _ = Throw.IfNull(modelResponse); var metric = new NumericMetric(BLEUMetricName); @@ -72,7 +76,11 @@ public ValueTask EvaluateAsync( var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text); metric.Value = BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4); + stopwatch.Stop(); + string durationText = $"{stopwatch.Elapsed.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + metric.AddOrUpdateContext(context); + metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.Interpretation = InterpretScore(metric); return new ValueTask(result); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs index ae659a32339..044248bac18 100644 --- 
a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs @@ -24,15 +24,15 @@ internal static int ClosestRefLength(IEnumerable> references } int closestRefLength = 0; - int closest = int.MaxValue; + int smallestDiff = int.MaxValue; foreach (var reference in references) { int refLength = reference.Count(); int diff = Math.Abs(refLength - hypLength); - if (diff < closest || - (diff == closest && refLength < closestRefLength)) + if (diff < smallestDiff || + (diff == smallestDiff && refLength < closestRefLength)) { - closest = diff; + smallestDiff = diff; closestRefLength = refLength; } } @@ -67,7 +67,7 @@ internal static RationalNumber ModifiedPrecision(IEnumerable return new RationalNumber(0, 0); } - var hyp = hypothesis.CreateNGrams(n).ToArray(); + var hyp = hypothesis.CreateNGrams(n); var hypCounts = new MatchCounter>(hyp); Dictionary, int> maxCounts = []; @@ -105,7 +105,7 @@ internal static RationalNumber ModifiedPrecision(IEnumerable } int numerator = clippedCounts.Values.Sum(); - int denominator = Math.Max(1, hypCounts.Sum); + int denominator = Math.Max(1, hypCounts.Sum()); return new RationalNumber(numerator, denominator); } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs index 6bba1d55648..bbca2252057 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/MatchCounter.cs @@ -16,7 +16,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; { private readonly Dictionary _counts = []; - public readonly int Sum => _counts.Values.Sum(); + public readonly int Sum() => _counts.Values.Sum(); public MatchCounter() { diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs index c4b3949e95a..149d3820328 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NGramExtensions.cs @@ -11,7 +11,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; internal static class NGramExtensions { // Collection builder method. - public static NGram CreateNGram(ReadOnlySpan values) + public static NGram CreateNGram(this ReadOnlySpan values) where T : IEquatable => new(values); /// diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj index 2b1384e166d..ed1769bed39 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj @@ -33,4 +33,8 @@ + + + + diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs index ffffe4ba3f0..e4d2d3930a7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs @@ -3,7 +3,6 @@ using System; using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; using System.Text; using Microsoft.Shared.Diagnostics; @@ -12,7 +11,6 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP; /// /// Tokenizes a string into segments using the common rules established by the NLTK word tokenizer. 
/// -[Experimental("AIEVAL001")] internal static class SimpleWordTokenizer { /// diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs index f14e167af44..1534280a103 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs @@ -14,7 +14,7 @@ public void EmptyConstructor_InitializesEmptyCounter() { var counter = new MatchCounter(); Assert.Empty(counter); - Assert.Equal(0, counter.Sum); + Assert.Equal(0, counter.Sum()); } [Fact] @@ -25,7 +25,7 @@ public void ConstructorWithItems_CountsCorrectly() Assert.Equal(3, dict["a"]); Assert.Equal(2, dict["b"]); Assert.Equal(1, dict["c"]); - Assert.Equal(6, counter.Sum); + Assert.Equal(6, counter.Sum()); } [Fact] @@ -38,7 +38,7 @@ public void Add_AddsSingleItemCorrectly() var dict = counter.ToDictionary(kv => kv.Key, kv => kv.Value); Assert.Equal(2, dict[5]); Assert.Equal(1, dict[3]); - Assert.Equal(3, counter.Sum); + Assert.Equal(3, counter.Sum()); } [Fact] @@ -51,7 +51,7 @@ public void AddRange_AddsMultipleItemsCorrectly() Assert.Equal(1, dict['e']); Assert.Equal(2, dict['l']); Assert.Equal(1, dict['o']); - Assert.Equal(5, counter.Sum); + Assert.Equal(5, counter.Sum()); } [Fact] From a0dbe6e6f73c6c2a288e33bd2c5d9ffa122caeee Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Fri, 27 Jun 2025 10:57:43 -0400 Subject: [PATCH 10/11] More tweaks based on feedback --- .../BLEUEvaluator.cs | 67 +++++++------------ .../BLEUEvaluatorContext.cs | 8 +-- .../Common/BLEUAlgorithm.cs | 2 +- .../Common/NLPScoreInterpretation.cs | 36 ++++++++++ .../Common/RationalNumber.cs | 2 + .../{ => Common}/SimpleWordTokenizer.cs | 15 ++++- ...rosoft.Extensions.AI.Evaluation.NLP.csproj | 1 - ...orithmicTests.cs => BLEUAlgorithmTests.cs} | 22 +----- .../BLEUEvaluatorTests.cs | 24 +++---- 
.../MatchCounterTests.cs | 2 +- .../NGramTests.cs | 21 +++++- .../RationalNumberTests.cs | 2 +- .../SimpleTokenizerTests.cs | 32 +++------ 13 files changed, 122 insertions(+), 112 deletions(-) create mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NLPScoreInterpretation.cs rename src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/{ => Common}/SimpleWordTokenizer.cs (96%) rename test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/{BLEUAlgorithmicTests.cs => BLEUAlgorithmTests.cs} (93%) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index a45c5768074..8ce43d48e52 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -1,33 +1,31 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System; using System.Collections.Generic; -using System.Diagnostics; -using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.Linq; using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.AI.Evaluation.NLP.Common; +using Microsoft.Extensions.AI.Evaluation.Utilities; using Microsoft.Shared.Diagnostics; namespace Microsoft.Extensions.AI.Evaluation.NLP; /// /// An that evaluates the quality of a response produced by an AI model by comparing -/// it to a reference response using the BLEU (Bilingual Evaluation Understudy) algorithm. +/// it to a reference response using the BLEU (Bilingual Evaluation Understudy) algorithm. It is often used +/// to evaluate the quality of machine translation or text generation tasks. /// /// /// -/// The computes the BLUE score of a response ("hypothesis") compared to a reference -/// . 
The score is returned in a +/// The computes the BLEU score of a response ("hypothesis") compared to a reference +/// supplied via . The score is returned in a /// with a value between 0.0 and 1.0 where 0.0 represents no match at all and 1.0 indicates a perfect match. /// By default, the score is interpreted with a pass/fail cutoff of 0.5. So a score of 0.5 or higher is /// passing and a score below 0.5 is failing. /// /// -[Experimental("AIEVAL001")] public sealed class BLEUEvaluator : IEvaluator { /// @@ -47,8 +45,6 @@ public ValueTask EvaluateAsync( IEnumerable? additionalContext = null, CancellationToken cancellationToken = default) { - Stopwatch stopwatch = Stopwatch.StartNew(); - _ = Throw.IfNull(modelResponse); var metric = new NumericMetric(BLEUMetricName); @@ -72,46 +68,29 @@ public ValueTask EvaluateAsync( return new ValueTask(result); } - var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference)); - var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text); - metric.Value = BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4); + if (context.References.Count is 0) + { + metric.AddDiagnostics( + EvaluationDiagnostic.Error( + $"Supplied '{nameof(BLEUEvaluatorContext)}' did not contain any '{nameof(BLEUEvaluatorContext.References)}'.")); - stopwatch.Stop(); - string durationText = $"{stopwatch.Elapsed.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + return new ValueTask(result); + } - metric.AddOrUpdateContext(context); + var (score, duration) = TimingHelper.ExecuteWithTiming(() => + { + var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference)); + var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text); + return BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4); + }); + + metric.Value = score; + string durationText 
= $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); - metric.Interpretation = InterpretScore(metric); + metric.AddOrUpdateContext(context); + metric.Interpretation = NLPScoreInterpretation.Interpret(metric); return new ValueTask(result); } - private static EvaluationMetricInterpretation InterpretScore(NumericMetric metric) - { - // BLEU scores range from 0.0 to 1.0, where: - // - 0.0 means no match at all, - // - 1.0 means a perfect match. - // 0.5 is considered the minimum passing score for BLEU evaluation. - - EvaluationRating rating = metric.Value switch - { - null => EvaluationRating.Inconclusive, - > 1.0 => EvaluationRating.Inconclusive, - > 0.8 and <= 1.0 => EvaluationRating.Exceptional, - > 0.6 and <= 0.8 => EvaluationRating.Good, - > 0.4 and <= 0.6 => EvaluationRating.Average, - > 0.2 and <= 0.4 => EvaluationRating.Poor, - >= 0.0 and <= 0.2 => EvaluationRating.Unacceptable, - < 0.0 => EvaluationRating.Inconclusive, - _ => EvaluationRating.Inconclusive, - }; - - const double MinimumPassingScore = 0.5; - return metric.Value is double value && value < MinimumPassingScore - ? new EvaluationMetricInterpretation( - rating, - failed: true, - reason: $"{metric.Name} is less than {MinimumPassingScore}.") - : new EvaluationMetricInterpretation(rating); - } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs index f98bc497ee9..320b20e9116 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluatorContext.cs @@ -16,7 +16,7 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP; /// /// /// measures the BLEU score of a response compared to a reference. 
BLEU (Bilingual Evaluation Understudy) -/// is a metric used to evaluate the quality if machine-generated text. +/// is a metric used to evaluate the quality of machine-generated text. /// public sealed class BLEUEvaluatorContext : EvaluationContext { @@ -24,10 +24,10 @@ public sealed class BLEUEvaluatorContext : EvaluationContext /// Gets the unique that is used for /// . /// - public static string BLEUContext => "BLEU Context"; + public static string BLEUContextName => "BLEU Context"; /// - /// Gets the reference response against which the provided chat response will be scored. + /// Gets the reference responses against which the provided model response will be scored. /// /// /// The measures the degree to which the response being evaluated is similar to @@ -54,7 +54,7 @@ public BLEUEvaluatorContext(params string[] references) /// public BLEUEvaluatorContext(IEnumerable references) : base( - name: BLEUContext, + name: BLEUContextName, contents: [.. references.Select(c => new TextContent(c))]) { References = [.. 
references]; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs index 044248bac18..c7420d0be7a 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/BLEUAlgorithm.cs @@ -64,7 +64,7 @@ internal static RationalNumber ModifiedPrecision(IEnumerable if (!references.Any() || !hypothesis.Any()) { - return new RationalNumber(0, 0); + return RationalNumber.Zero; } var hyp = hypothesis.CreateNGrams(n); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NLPScoreInterpretation.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NLPScoreInterpretation.cs new file mode 100644 index 00000000000..4ef1d08b468 --- /dev/null +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/NLPScoreInterpretation.cs @@ -0,0 +1,36 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; + +internal static class NLPScoreInterpretation +{ + internal static EvaluationMetricInterpretation Interpret(NumericMetric metric) + { + // Many NLP scores range from 0.0 to 1.0, where: + // - 0.0 means no match at all, + // - 1.0 means a perfect match. + // 0.5 is considered the minimum passing score for evaluation. 
+ + EvaluationRating rating = metric.Value switch + { + null => EvaluationRating.Inconclusive, + > 1.0 => EvaluationRating.Inconclusive, + > 0.8 and <= 1.0 => EvaluationRating.Exceptional, + > 0.6 and <= 0.8 => EvaluationRating.Good, + > 0.4 and <= 0.6 => EvaluationRating.Average, + > 0.2 and <= 0.4 => EvaluationRating.Poor, + >= 0.0 and <= 0.2 => EvaluationRating.Unacceptable, + < 0.0 => EvaluationRating.Inconclusive, + _ => EvaluationRating.Inconclusive, + }; + + const double MinimumPassingScore = 0.5; + return metric.Value is double value && value < MinimumPassingScore + ? new EvaluationMetricInterpretation( + rating, + failed: true, + reason: $"{metric.Name} is less than {MinimumPassingScore}.") + : new EvaluationMetricInterpretation(rating); + } +} diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs index e14ff662b50..500b042b17b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/RationalNumber.cs @@ -9,6 +9,8 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; [DebuggerDisplay("{ToDebugString(),nq}")] internal readonly struct RationalNumber : IEquatable { + public static readonly RationalNumber Zero = new(0, 1); + public RationalNumber(int numerator, int denominator) { if (denominator == 0) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs similarity index 96% rename from src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs rename to src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs index e4d2d3930a7..4f4717852bd 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/SimpleWordTokenizer.cs +++ 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Common/SimpleWordTokenizer.cs @@ -6,7 +6,9 @@ using System.Text; using Microsoft.Shared.Diagnostics; -namespace Microsoft.Extensions.AI.Evaluation.NLP; +#pragma warning disable S109 // Magic numbers should not be used + +namespace Microsoft.Extensions.AI.Evaluation.NLP.Common; /// /// Tokenizes a string into segments using the common rules established by the NLTK word tokenizer. @@ -78,9 +80,16 @@ public static IEnumerable WordTokenize(ReadOnlyMemory text) span.Length > 1 && span[1] == '\n') { -#pragma warning disable S109 // Magic numbers should not be used text = text.Slice(2); -#pragma warning restore S109 // Magic numbers should not be used + continue; + } + + if (span[0] == '-' && + span.Length > 2 && + span[1] == '\r' && + span[2] == '\n') + { + text = text.Slice(3); continue; } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj index ed1769bed39..0bab1cf7fb0 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Microsoft.Extensions.AI.Evaluation.NLP.csproj @@ -16,7 +16,6 @@ - true true diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs similarity index 93% rename from test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs rename to test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs index 21dd806a834..1b029dc4a37 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmicTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUAlgorithmTests.cs @@ -3,7 +3,6 @@ using System; using 
System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; using System.Linq; using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; @@ -11,27 +10,8 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; -[Experimental("AIEVAL001")] -public class BLEUAlgorithmicTests +public class BLEUAlgorithmTests { - [Fact] - public void NGramGenerationNoPadding() - { - int[] input = [1, 2, 3, 4, 5]; - - IEnumerable> result = input.CreateNGrams(1); - List> expected = [[1], [2], [3], [4], [5]]; - Assert.True(result.SequenceEqual(expected)); - - result = input.CreateNGrams(2); - expected = [[1, 2], [2, 3], [3, 4], [4, 5]]; - Assert.True(result.SequenceEqual(expected)); - - result = input.CreateNGrams(3); - expected = [[1, 2, 3], [2, 3, 4], [3, 4, 5]]; - Assert.True(result.SequenceEqual(expected)); - } - [Fact] public void ModifiedPrecisionTests() { diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs index 6288d2bb4ef..48fda1357ab 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/BLEUEvaluatorTests.cs @@ -13,12 +13,12 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; public class BLEUEvaluatorTests { [Fact] - public async Task EvaluateAsync_ReturnsPerfectScoreForIdenticalText() + public async Task ReturnsPerfectScoreForIdenticalText() { var evaluator = new BLEUEvaluator(); var context = new BLEUEvaluatorContext("The quick brown fox jumps over the lazy dog."); var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "The quick brown fox jumps over the lazy dog.")); - var result = await evaluator.EvaluateAsync([], response, null, [context]); + var result = await evaluator.EvaluateAsync(response, chatConfiguration: null, [context]); var metric = Assert.Single(result.Metrics.Values) as NumericMetric; 
Assert.NotNull(metric); Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); @@ -29,12 +29,12 @@ public async Task EvaluateAsync_ReturnsPerfectScoreForIdenticalText() } [Fact] - public async Task EvaluateAsync_ReturnsLowScoreForCompletelyDifferentText() + public async Task ReturnsLowScoreForCompletelyDifferentText() { var evaluator = new BLEUEvaluator(); var context = new BLEUEvaluatorContext("The quick brown fox jumps over the lazy dog."); var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Completely unrelated sentence.")); - var result = await evaluator.EvaluateAsync([], response, null, [context]); + var result = await evaluator.EvaluateAsync(response, chatConfiguration: null, [context]); var metric = Assert.Single(result.Metrics.Values) as NumericMetric; Assert.NotNull(metric); Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); @@ -45,11 +45,11 @@ public async Task EvaluateAsync_ReturnsLowScoreForCompletelyDifferentText() } [Fact] - public async Task EvaluateAsync_ReturnsErrorDiagnosticIfNoContext() + public async Task ReturnsErrorDiagnosticIfNoContext() { var evaluator = new BLEUEvaluator(); var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "Some text.")); - var result = await evaluator.EvaluateAsync([], response, null, null); + var result = await evaluator.EvaluateAsync(response, chatConfiguration: null, additionalContext: null); var metric = Assert.Single(result.Metrics.Values) as NumericMetric; Assert.NotNull(metric); Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); @@ -64,12 +64,12 @@ public async Task EvaluateAsync_ReturnsErrorDiagnosticIfNoContext() "It is a guide to action which ensures that the military always obeys the commands of the party", 0.4209)] [InlineData("It is the practical guide for the army always to heed the directions of the party", "It is to insure the troops forever hearing the activity guidebook that party direct", 0.0471)] - public async Task EvaluateAsync_SampleCases(string 
reference, string hypothesis, double score) + public async Task SampleCases(string reference, string hypothesis, double score) { var evaluator = new BLEUEvaluator(); var context = new BLEUEvaluatorContext(reference); var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, hypothesis)); - var result = await evaluator.EvaluateAsync([], response, null, [context]); + var result = await evaluator.EvaluateAsync(response, chatConfiguration: null, [context]); var metric = Assert.Single(result.Metrics.Values) as NumericMetric; Assert.NotNull(metric); Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); @@ -77,7 +77,7 @@ public async Task EvaluateAsync_SampleCases(string reference, string hypothesis, } [Fact] - public async Task EvaluateAsync_MultipleReferences() + public async Task MultipleReferences() { string[] references = [ "It is a guide to action that ensures that the military will forever heed Party commands", @@ -89,7 +89,7 @@ public async Task EvaluateAsync_MultipleReferences() var evaluator = new BLEUEvaluator(); var context = new BLEUEvaluatorContext(references); var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, hypothesis)); - var result = await evaluator.EvaluateAsync([], response, null, [context]); + var result = await evaluator.EvaluateAsync(response, chatConfiguration: null, [context]); var metric = Assert.Single(result.Metrics.Values) as NumericMetric; Assert.NotNull(metric); Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); @@ -97,12 +97,12 @@ public async Task EvaluateAsync_MultipleReferences() } [Fact] - public async Task EvaluateAsync_ReturnsErrorDiagnosticIfEmptyResponse() + public async Task ReturnsErrorDiagnosticIfEmptyResponse() { var evaluator = new BLEUEvaluator(); var context = new BLEUEvaluatorContext("Reference text."); var response = new ChatResponse(new ChatMessage(ChatRole.Assistant, "")); - var result = await evaluator.EvaluateAsync([], response, null, [context]); + var result = await 
evaluator.EvaluateAsync(response, chatConfiguration: null, [context]); var metric = Assert.Single(result.Metrics.Values) as NumericMetric; Assert.NotNull(metric); Assert.Equal(BLEUEvaluator.BLEUMetricName, metric.Name); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs index 1534280a103..9c2a5b68900 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/MatchCounterTests.cs @@ -55,7 +55,7 @@ public void AddRange_AddsMultipleItemsCorrectly() } [Fact] - public void ToString_FormatsCorrectly() + public void ToDebugString_FormatsCorrectly() { var counter = new MatchCounter(new[] { "x", "y", "x" }); var str = counter.ToDebugString(); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs index 4661f0b490c..d782c3c8f88 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/NGramTests.cs @@ -46,7 +46,7 @@ public void Enumerator_And_IEnumerable() } [Fact] - public void ToString_FormatsCorrectly() + public void ToDebugString_FormatsCorrectly() { var ngram = new NGram("x", "y"); Assert.Equal("[x,y]", ngram.ToDebugString()); @@ -58,4 +58,23 @@ public void NGramBuilder_Create_Works() NGram ngram = [1, 2]; Assert.Equal(new NGram(1, 2), ngram); } + + [Fact] + public void NGramGenerationNoPadding() + { + int[] input = [1, 2, 3, 4, 5]; + + IEnumerable> result = input.CreateNGrams(1); + List> expected = [[1], [2], [3], [4], [5]]; + Assert.True(result.SequenceEqual(expected)); + + result = input.CreateNGrams(2); + expected = [[1, 2], [2, 3], [3, 4], [4, 5]]; + Assert.True(result.SequenceEqual(expected)); + + result = input.CreateNGrams(3); + expected = 
[[1, 2, 3], [2, 3, 4], [3, 4, 5]]; + Assert.True(result.SequenceEqual(expected)); + } + } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs index 1cd0a36b7af..8776b97811f 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/RationalNumberTests.cs @@ -34,7 +34,7 @@ public void ToDouble_ReturnsExpected(int num, int denom, double expected) } [Fact] - public void ToString_FormatsCorrectly() + public void ToDebugString_FormatsCorrectly() { var r = new RationalNumber(7, 9); Assert.Equal("7/9", r.ToDebugString()); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs index 712f0c5ff5a..3451a6c38c9 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.NLP.Tests/SimpleTokenizerTests.cs @@ -1,8 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System; -using System.Collections.Generic; +using Microsoft.Extensions.AI.Evaluation.NLP.Common; using Xunit; namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; @@ -11,27 +10,6 @@ namespace Microsoft.Extensions.AI.Evaluation.NLP.Tests; public class SimpleTokenizerTests { - [Fact] - public void TokenizeText() - { - (string, IEnumerable)[] cases = [ - ("It is a guide to action that ensures that the military will forever heed Party commands.", - ["IT", "IS", "A", "GUIDE", "TO", "ACTION", "THAT", "ENSURES", "THAT", "THE", "MILITARY", "WILL", "FOREVER", "HEED", "PARTY", "COMMANDS", "."]), - ("Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. 
Please buy me\ntwo of them.\nThanks.", - ["GOOD", "MUFFINS", "COST", "$", "3.88", "(", "ROUGHLY", "3,36", "EUROS", ")", "IN", "NEW", "YORK", ".", "PLEASE", "BUY", "ME", "TWO", "OF", "THEM", ".", "THANKS", "."]), - ("", []), - ("Hello, world! How's it going?", ["HELLO", ",", "WORLD", "!", "HOW", "'", "S", "IT", "GOING", "?"]), - (""Quotes" and & symbols < > '", ["\"", "QUOTES", "\"", "AND", "&", "SYMBOLS", "<", ">", "'"]), - ("-\nThis is a test.", ["THIS", "IS", "A", "TEST", "."]), - ]; - - foreach (var (text, expected) in cases) - { - IEnumerable result = SimpleWordTokenizer.WordTokenize(text); - Assert.Equal(expected, result); - } - } - [Theory] [InlineData(" $41.23 ", new[] { "$", "41.23" })] [InlineData("word", new[] { "WORD" })] @@ -40,7 +18,15 @@ public void TokenizeText() [InlineData("word1.word2", new[] { "WORD1", ".", "WORD2" })] [InlineData("word1!word2?", new[] { "WORD1", "!", "WORD2", "?" })] [InlineData("word1-word2", new[] { "WORD1", "-", "WORD2" })] + [InlineData("word1 - word2", new[] { "WORD1", "-", "WORD2" })] + [InlineData("word1-\n word2", new[] { "WORD1", "WORD2" })] + [InlineData("word1-\r\n word2", new[] { "WORD1", "WORD2" })] + [InlineData("word1-\r\nword2", new[] { "WORD1WORD2" })] + [InlineData("word1-\nword2", new[] { "WORD1WORD2" })] [InlineData("word1\nword2", new[] { "WORD1", "WORD2" })] + [InlineData("word1 \n word2", new[] { "WORD1", "WORD2" })] + [InlineData("word1\r\nword2", new[] { "WORD1", "WORD2" })] + [InlineData("word1 \r\n word2", new[] { "WORD1", "WORD2" })] [InlineData("word1\tword2", new[] { "WORD1", "WORD2" })] [InlineData("It is a guide to action that ensures that the military will forever heed Party commands.", new[] { "IT", "IS", "A", "GUIDE", "TO", "ACTION", "THAT", "ENSURES", "THAT", "THE", "MILITARY", "WILL", "FOREVER", "HEED", "PARTY", "COMMANDS", "." 
})] From 2ee2ef15bc80f9575a271b8580dcc14f7ff1418b Mon Sep 17 00:00:00 2001 From: Peter Waldschmidt Date: Fri, 27 Jun 2025 11:07:09 -0400 Subject: [PATCH 11/11] Remove version from NLP library --- .../Directory.Build.targets | 33 ------------------- 1 file changed, 33 deletions(-) delete mode 100644 src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Directory.Build.targets diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Directory.Build.targets b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Directory.Build.targets deleted file mode 100644 index 3e3526f1ac8..00000000000 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/Directory.Build.targets +++ /dev/null @@ -1,33 +0,0 @@ - - - - - - - <_ConstantsFilePath>$(IntermediateOutputPath)Constants.g.cs - <_Lines> -// -// This file is auto-generated by MSBuild. -// - -namespace $(RootNamespace)%3B - -internal static class Constants -{ - public const string Version = "$(Version)"%3B -} - - - - - - - - - - - -