diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs index e1419bd630e..f3030ec7cfb 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs @@ -86,7 +86,7 @@ public ValueTask EvaluateAsync( }); metric.Value = score; - string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.AddOrUpdateContext(context); metric.Interpretation = metric.Interpret(); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs index b0806be6d66..e070577c448 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs @@ -77,7 +77,7 @@ public ValueTask EvaluateAsync( }); metric.Value = score; - string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.AddOrUpdateContext(context); metric.Interpretation = metric.Interpret(); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs index 0c9805ee108..60df30879a4 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs @@ -86,7 +86,7 @@ public ValueTask EvaluateAsync( }); metric.Value = score; - string durationText = 
$"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); metric.AddOrUpdateContext(context); metric.Interpretation = metric.Interpret(); diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs index d3012030cec..534f5e300f7 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs @@ -177,7 +177,7 @@ public static void AddOrUpdateChatMetadata( if (duration is not null) { - string durationText = $"{duration.Value.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s"; + string durationText = $"{duration.Value.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s"; metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText); } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj index c08667ff421..6e3332ebca6 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj @@ -28,6 +28,7 @@ + diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs new file mode 100644 index 00000000000..a4f3b75045a --- /dev/null +++ 
b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/NLPEvaluatorTests.cs
@@ -0,0 +1,176 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#pragma warning disable CA2016 // Forward the 'CancellationToken' parameter to methods that take it.
+#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor.
+
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Threading.Tasks;
+using Microsoft.Extensions.AI.Evaluation.NLP;
+using Microsoft.Extensions.AI.Evaluation.Reporting;
+using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
+using Microsoft.TestUtilities;
+using Xunit;
+
+namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests;
+
+/// <summary>
+/// Integration tests that run <see cref="BLEUEvaluator"/>, <see cref="GLEUEvaluator"/> and
+/// <see cref="F1Evaluator"/> together through a disk-based reporting configuration.
+/// </summary>
+/// <remarks>
+/// Every test throws <c>SkipTestException</c> unless <c>Settings.Current.Configured</c> is true.
+/// </remarks>
+[Experimental("AIEVAL001")]
+public class NLPEvaluatorTests
+{
+    // Reference text that every context-based test compares the evaluated response against.
+    private const string ReferenceText = "The quick brown fox jumps over the lazy dog.";
+
+    // Only assigned when the integration test settings are configured; stays null otherwise.
+    private static readonly ReportingConfiguration? _nlpReportingConfiguration;
+
+    static NLPEvaluatorTests()
+    {
+        if (Settings.Current.Configured)
+        {
+            string version = $"Product Version: {Constants.Version}";
+            string date = $"Date: {DateTime.UtcNow:dddd, dd MMMM yyyy}";
+            string projectName = "Project: Integration Tests";
+            string testClass = $"Test Class: {nameof(NLPEvaluatorTests)}";
+            string usesContext = "Feature: Context";
+
+            IEvaluator bleuEvaluator = new BLEUEvaluator();
+            IEvaluator gleuEvaluator = new GLEUEvaluator();
+            IEvaluator f1Evaluator = new F1Evaluator();
+
+            _nlpReportingConfiguration =
+                DiskBasedReportingConfiguration.Create(
+                    storageRootPath: Settings.Current.StorageRootPath,
+                    evaluators: [bleuEvaluator, gleuEvaluator, f1Evaluator],
+                    executionName: Constants.Version,
+                    tags: [version, date, projectName, testClass, usesContext]);
+        }
+    }
+
+    /// <summary>Evaluates a response that is identical to the reference text.</summary>
+    [ConditionalFact]
+    public async Task ExactMatch()
+    {
+        SkipIfNotConfigured();
+
+        await using ScenarioRun scenarioRun =
+            await _nlpReportingConfiguration.CreateScenarioRunAsync(
+                scenarioName: GetScenarioName(nameof(ExactMatch)));
+
+        EvaluationResult result = await EvaluateAgainstReferenceAsync(scenarioRun, ReferenceText);
+
+        AssertNoWarningsAndAllMetricsPresent(result);
+    }
+
+    /// <summary>Evaluates a response that shares most, but not all, words with the reference text.</summary>
+    [ConditionalFact]
+    public async Task PartialMatch()
+    {
+        SkipIfNotConfigured();
+
+        await using ScenarioRun scenarioRun =
+            await _nlpReportingConfiguration.CreateScenarioRunAsync(
+                scenarioName: GetScenarioName(nameof(PartialMatch)));
+
+        var similarText = "The brown fox quickly jumps over a lazy dog.";
+        EvaluationResult result = await EvaluateAgainstReferenceAsync(scenarioRun, similarText);
+
+        AssertNoWarningsAndAllMetricsPresent(result);
+    }
+
+    /// <summary>Evaluates a response that is unrelated to the reference text.</summary>
+    [ConditionalFact]
+    public async Task Unmatched()
+    {
+        SkipIfNotConfigured();
+
+        await using ScenarioRun scenarioRun =
+            await _nlpReportingConfiguration.CreateScenarioRunAsync(
+                scenarioName: GetScenarioName(nameof(Unmatched)));
+
+        EvaluationResult result = await EvaluateAgainstReferenceAsync(scenarioRun, "What is life's meaning?");
+
+        AssertNoWarningsAndAllMetricsPresent(result);
+    }
+
+    /// <summary>
+    /// Evaluates a response without supplying any evaluator context and expects every metric to carry an
+    /// error diagnostic and a null context.
+    /// </summary>
+    [ConditionalFact]
+    public async Task AdditionalContextIsNotPassed()
+    {
+        SkipIfNotConfigured();
+
+        await using ScenarioRun scenarioRun =
+            await _nlpReportingConfiguration.CreateScenarioRunAsync(
+                scenarioName: GetScenarioName(nameof(AdditionalContextIsNotPassed)));
+
+        EvaluationResult result = await scenarioRun.EvaluateAsync("What is the meaning of life?");
+
+        Assert.True(
+            result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)),
+            FormatDiagnostics(result));
+
+        Assert.Equal(3, result.Metrics.Count);
+        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? bleu));
+        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? gleu));
+        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? f1));
+
+        Assert.Null(bleu.Context);
+        Assert.Null(gleu.Context);
+        Assert.Null(f1.Context);
+    }
+
+    // Builds the fully qualified scenario name under which a test's results are reported.
+    private static string GetScenarioName(string testName)
+        => $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{testName}";
+
+    // Evaluates the response with one BLEU, GLEU and F1 context, each built from the shared reference text.
+    private static async Task<EvaluationResult> EvaluateAgainstReferenceAsync(ScenarioRun scenarioRun, string response)
+    {
+        var bleuContext = new BLEUEvaluatorContext(ReferenceText);
+        var gleuContext = new GLEUEvaluatorContext(ReferenceText);
+        var f1Context = new F1EvaluatorContext(ReferenceText);
+
+        return await scenarioRun.EvaluateAsync(response, [bleuContext, gleuContext, f1Context]);
+    }
+
+    // Asserts that no diagnostic of severity Warning or above was reported and that all three metrics exist.
+    private static void AssertNoWarningsAndAllMetricsPresent(EvaluationResult result)
+    {
+        Assert.False(
+            result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
+            FormatDiagnostics(result));
+
+        Assert.Equal(3, result.Metrics.Count);
+        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _));
+        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _));
+        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _));
+    }
+
+    // Joins every diagnostic of every metric into a single assertion failure message.
+    private static string FormatDiagnostics(EvaluationResult result)
+        => string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()));
+
+    // Throws SkipTestException when the integration settings are not configured; otherwise asserts that the
+    // reporting configuration was created, which lets callers dereference it without a null check.
+    [MemberNotNull(nameof(_nlpReportingConfiguration))]
+    private static void SkipIfNotConfigured()
+    {
+        if (!Settings.Current.Configured)
+        {
+            throw new SkipTestException("Test is not configured");
+        }
+
+        Assert.NotNull(_nlpReportingConfiguration);
+    }
+}