From 0c9f66f48429c5f1796276214de5d2e57568d5ca Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 16 Apr 2025 20:24:48 -0700 Subject: [PATCH 1/5] Add a shorter alias for --format --- .../Microsoft.Extensions.AI.Evaluation.Console/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs index bdae87d9d53..8150f3a4d8c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs @@ -90,7 +90,7 @@ private static async Task Main(string[] args) var formatOpt = new Option( - "--format", + ["-f", "--format"], () => ReportCommand.Format.html, "Specify the format for the generated report."); From d42ccecdfa4127ab73321f53f23fc987dda1728d Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 16 Apr 2025 16:51:15 -0700 Subject: [PATCH 2/5] Remove extension methods for adding single diagnostic Since the same can be achieved using the params overload --- .../ChatConversationEvaluator.cs | 6 +- .../RelevanceTruthAndCompletenessEvaluator.cs | 8 +- .../SingleNumericMetricEvaluator.cs | 4 +- .../EvaluationMetricExtensions.cs | 2 +- .../CompositeEvaluator.cs | 2 +- .../EvaluationMetricExtensions.cs | 23 ++---- .../EvaluationResultExtensions.cs | 18 ----- .../ResultsTests.cs | 78 +++++++++---------- .../ResultStoreTester.cs | 2 +- .../ScenarioRunResultTests.cs | 20 ++--- 10 files changed, 68 insertions(+), 95 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs index cbc904277ab..6550454806f 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs +++ 
b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs @@ -49,7 +49,7 @@ public async ValueTask EvaluateAsync( if (string.IsNullOrWhiteSpace(modelResponse.Text)) { - result.AddDiagnosticToAllMetrics( + result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( "Evaluation failed because the model response supplied for evaluation was null or empty.")); @@ -73,7 +73,7 @@ void OnTokenBudgetExceeded() EvaluationDiagnostic.Error( $"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded."); - result.AddDiagnosticToAllMetrics(tokenBudgetExceeded); + result.AddDiagnosticsToAllMetrics(tokenBudgetExceeded); } if (!string.IsNullOrWhiteSpace(SystemPrompt)) @@ -176,7 +176,7 @@ await PerformEvaluationAsync( if (inputTokenLimit > 0 && ignoredMessagesCount > 0) { #pragma warning disable S103 // Lines should not be too long - result.AddDiagnosticToAllMetrics( + result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Warning( $"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens.")); #pragma warning restore S103 diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs index c6c38cf583a..ee586b9b242 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs @@ -145,7 +145,7 @@ await chatConfiguration.ChatClient.GetResponseAsync( if (string.IsNullOrEmpty(evaluationResponseText)) { rating = Rating.Inconclusive; - result.AddDiagnosticToAllMetrics( + result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( "Evaluation failed because the model 
failed to produce a valid evaluation response.")); } @@ -168,7 +168,7 @@ await JsonOutputFixer.RepairJsonAsync( if (string.IsNullOrEmpty(repairedJson)) { rating = Rating.Inconclusive; - result.AddDiagnosticToAllMetrics( + result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( $""" Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.: @@ -183,7 +183,7 @@ await JsonOutputFixer.RepairJsonAsync( catch (JsonException ex) { rating = Rating.Inconclusive; - result.AddDiagnosticToAllMetrics( + result.AddDiagnosticsToAllMetrics( EvaluationDiagnostic.Error( $""" Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.: @@ -281,7 +281,7 @@ void UpdateResult() if (!string.IsNullOrWhiteSpace(rating.Error)) { - result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!)); + result.AddDiagnosticsToAllMetrics(EvaluationDiagnostic.Error(rating.Error!)); } } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs index 6c81250ed1c..d4abf52a9e3 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs @@ -105,7 +105,7 @@ await chatConfiguration.ChatClient.GetResponseAsync( if (string.IsNullOrEmpty(evaluationResponseText)) { - metric.AddDiagnostic( + metric.AddDiagnostics( EvaluationDiagnostic.Error( "Evaluation failed because the model failed to produce a valid evaluation response.")); } @@ -115,7 +115,7 @@ await chatConfiguration.ChatClient.GetResponseAsync( } else { - metric.AddDiagnostic( + metric.AddDiagnostics( EvaluationDiagnostic.Error( $"Failed to parse '{evaluationResponseText!}' as an 
integer score for '{MetricName}'.")); } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs index 20246e3aaa2..8a0ffcbd31b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/EvaluationMetricExtensions.cs @@ -97,6 +97,6 @@ internal static void LogJsonData(this EvaluationMetric metric, string data) internal static void LogJsonData(this EvaluationMetric metric, JsonNode data) { string serializedData = data.ToJsonString(new JsonSerializerOptions { WriteIndented = true }); - metric.AddDiagnostic(EvaluationDiagnostic.Informational(serializedData)); + metric.AddDiagnostics(EvaluationDiagnostic.Informational(serializedData)); } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/CompositeEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/CompositeEvaluator.cs index 7dc544c66c8..6feba92d6c3 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/CompositeEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/CompositeEvaluator.cs @@ -159,7 +159,7 @@ async ValueTask EvaluateAsync(IEvaluator e) foreach (string metricName in e.EvaluationMetricNames) { var metric = new EvaluationMetric(metricName); - metric.AddDiagnostic(EvaluationDiagnostic.Error(message)); + metric.AddDiagnostics(EvaluationDiagnostic.Error(message)); result.Metrics.Add(metric.Name, metric); } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs index 29a22a2bd60..e4980d39f9f 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs @@ -40,20 +40,6 @@ metric.Diagnostics is not null && : 
metric.Diagnostics.Any(predicate)); } - /// - /// Adds the supplied to the supplied 's - /// collection. - /// - /// The . - /// The to be added. - public static void AddDiagnostic(this EvaluationMetric metric, EvaluationDiagnostic diagnostic) - { - _ = Throw.IfNull(metric); - - metric.Diagnostics ??= new List(); - metric.Diagnostics.Add(diagnostic); - } - /// /// Adds the supplied s to the supplied 's /// collection. @@ -65,9 +51,14 @@ public static void AddDiagnostics(this EvaluationMetric metric, IEnumerable(); + + foreach (EvaluationDiagnostic diagnostic in diagnostics) + { + metric.Diagnostics.Add(diagnostic); + } } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs index 5ca59b16584..bbe0f1bc0a2 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs @@ -13,24 +13,6 @@ namespace Microsoft.Extensions.AI.Evaluation; /// public static class EvaluationResultExtensions { - /// - /// Adds the supplied to all s contained in the - /// supplied . - /// - /// - /// The containing the s that are to be altered. - /// - /// The that is to be added. - public static void AddDiagnosticToAllMetrics(this EvaluationResult result, EvaluationDiagnostic diagnostic) - { - _ = Throw.IfNull(result); - - foreach (EvaluationMetric metric in result.Metrics.Values) - { - metric.AddDiagnostic(diagnostic); - } - } - /// /// Adds the supplied to all s contained in the /// supplied . 
diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs index 8ce98133586..1b52ee9d6d5 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/ResultsTests.cs @@ -419,32 +419,32 @@ public async Task ResultWithDiagnosticsOnUninterpretedMetrics() ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator); var metric1 = new BooleanMetric("Metric with all diagnostic severities"); - metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); - metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.AddDiagnostics(EvaluationDiagnostic.Error("Error 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Error("Error 2")); + metric1.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 2")); metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics"); - metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); - metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric2.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 2")); + metric2.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 2")); metric2.Reason = "Reason for metric 2"; var metric3 = new 
EvaluationMetric("Metric with error diagnostics only"); - metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); - metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.AddDiagnostics(EvaluationDiagnostic.Error("Error 1")); + metric3.AddDiagnostics(EvaluationDiagnostic.Error("Error 2")); metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only"); - metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric4.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 2")); metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only"); - metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 1")); metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; @@ -472,32 +472,32 @@ public async Task ResultWithDiagnosticsOnFailingMetrics() ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator); var metric1 = new BooleanMetric("Metric with all diagnostic severities"); - metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); - metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.AddDiagnostics(EvaluationDiagnostic.Error("Error 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Error("Error 2")); + metric1.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + 
metric1.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 2")); metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics"); - metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); - metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric2.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 2")); + metric2.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 2")); metric2.Reason = "Reason for metric 2"; var metric3 = new EvaluationMetric("Metric with error diagnostics only"); - metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); - metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.AddDiagnostics(EvaluationDiagnostic.Error("Error 1")); + metric3.AddDiagnostics(EvaluationDiagnostic.Error("Error 2")); metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only"); - metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric4.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 2")); metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only"); - metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 1")); metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; @@ -531,32 +531,32 @@ public async Task ResultWithDiagnosticsOnPassingMetrics() 
ReportingConfiguration reportingConfiguration = CreateReportingConfiguration(evaluator); var metric1 = new BooleanMetric("Metric with all diagnostic severities", value: true); - metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); - metric1.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); - metric1.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric1.AddDiagnostics(EvaluationDiagnostic.Error("Error 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Error("Error 2")); + metric1.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 1")); + metric1.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 2")); metric1.Reason = "Reason for metric 1"; var metric2 = new BooleanMetric("Metric with warning and informational diagnostics", value: true); - metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric2.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); - metric2.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 2")); + metric2.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric2.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 2")); + metric2.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 2")); metric2.Reason = "Reason for metric 2"; var metric3 = new NumericMetric("Metric with error diagnostics only", value: 5); - metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 1")); - metric3.AddDiagnostic(EvaluationDiagnostic.Error("Error 2")); + metric3.AddDiagnostics(EvaluationDiagnostic.Error("Error 1")); + metric3.AddDiagnostics(EvaluationDiagnostic.Error("Error 2")); metric3.Reason = "Reason for metric 3"; HashSet allowedValues = ["A", "B", "C"]; var metric4 = new StringMetric("Metric with warning diagnostics only", 
value: "A"); - metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 1")); - metric4.AddDiagnostic(EvaluationDiagnostic.Warning("Warning 2")); + metric4.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 1")); + metric4.AddDiagnostics(EvaluationDiagnostic.Warning("Warning 2")); metric4.Reason = "Reason for metric 4"; var metric5 = new NumericMetric("Metric with informational diagnostics only", value: 4); - metric5.AddDiagnostic(EvaluationDiagnostic.Informational("Informational 1")); + metric5.AddDiagnostics(EvaluationDiagnostic.Informational("Informational 1")); metric5.Reason = "Reason for metric 5"; evaluator.TestMetrics = [metric1, metric2, metric3, metric4, metric5]; diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ResultStoreTester.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ResultStoreTester.cs index c2547dea47e..1ce033b3cd7 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ResultStoreTester.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ResultStoreTester.cs @@ -22,7 +22,7 @@ private static ScenarioRunResult CreateTestResult(string scenarioName, string it BooleanMetric booleanMetric = new BooleanMetric("boolean", value: true); NumericMetric numericMetric = new NumericMetric("numeric", value: 3); - numericMetric.AddDiagnostic(EvaluationDiagnostic.Informational("Informational Message")); + numericMetric.AddDiagnostics(EvaluationDiagnostic.Informational("Informational Message")); StringMetric stringMetric = new StringMetric("string", value: "Good"); diff --git a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ScenarioRunResultTests.cs b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ScenarioRunResultTests.cs index d31e966f096..e5aa47e9fc8 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ScenarioRunResultTests.cs +++ 
b/test/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Tests/ScenarioRunResultTests.cs @@ -18,17 +18,17 @@ public class ScenarioRunResultTests public void SerializeScenarioRunResult() { var booleanMetric = new BooleanMetric("boolean", value: true); - booleanMetric.AddDiagnostic(EvaluationDiagnostic.Error("error")); - booleanMetric.AddDiagnostic(EvaluationDiagnostic.Warning("warning")); + booleanMetric.AddDiagnostics(EvaluationDiagnostic.Error("error")); + booleanMetric.AddDiagnostics(EvaluationDiagnostic.Warning("warning")); var numericMetric = new NumericMetric("numeric", value: 3); - numericMetric.AddDiagnostic(EvaluationDiagnostic.Informational("info")); + numericMetric.AddDiagnostics(EvaluationDiagnostic.Informational("info")); var stringMetric = new StringMetric("string", value: "A"); var metricWithNoValue = new EvaluationMetric("none"); - metricWithNoValue.AddDiagnostic(EvaluationDiagnostic.Error("error")); - metricWithNoValue.AddDiagnostic(EvaluationDiagnostic.Informational("info")); + metricWithNoValue.AddDiagnostics(EvaluationDiagnostic.Error("error")); + metricWithNoValue.AddDiagnostics(EvaluationDiagnostic.Informational("info")); var turn1 = new ChatTurnDetails( @@ -82,17 +82,17 @@ public void SerializeScenarioRunResult() public void SerializeDatasetCompact() { var booleanMetric = new BooleanMetric("boolean", value: true); - booleanMetric.AddDiagnostic(EvaluationDiagnostic.Error("error")); - booleanMetric.AddDiagnostic(EvaluationDiagnostic.Warning("warning")); + booleanMetric.AddDiagnostics(EvaluationDiagnostic.Error("error")); + booleanMetric.AddDiagnostics(EvaluationDiagnostic.Warning("warning")); var numericMetric = new NumericMetric("numeric", value: 3); - numericMetric.AddDiagnostic(EvaluationDiagnostic.Informational("info")); + numericMetric.AddDiagnostics(EvaluationDiagnostic.Informational("info")); var stringMetric = new StringMetric("string", value: "A"); var metricWithNoValue = new EvaluationMetric("none"); - 
metricWithNoValue.AddDiagnostic(EvaluationDiagnostic.Error("error")); - metricWithNoValue.AddDiagnostic(EvaluationDiagnostic.Informational("info")); + metricWithNoValue.AddDiagnostics(EvaluationDiagnostic.Error("error")); + metricWithNoValue.AddDiagnostics(EvaluationDiagnostic.Informational("info")); var turn1 = new ChatTurnDetails( From a9d3710fc561f5d7e9fcf29345083a78ad3ccbb6 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 16 Apr 2025 18:42:44 -0700 Subject: [PATCH 3/5] Introduce Context property on EvaluationMetric --- .../TypeScript/components/EvalTypes.d.ts | 3 + .../EvaluationMetric.cs | 14 ++++ .../EvaluationMetricExtensions.cs | 29 ++++++- .../EvaluationResultExtensions.cs | 76 +++++++++++++++++++ 4 files changed, 120 insertions(+), 2 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts index c9accb7a90d..4de977ddc4b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/EvalTypes.d.ts @@ -94,6 +94,9 @@ type BaseEvaluationMetric = { $type: string; name: string; interpretation?: EvaluationMetricInterpretation; + context?: { + [K: string]: AIContent[] + }; diagnostics?: EvaluationDiagnostic[]; metadata: { [K: string]: string diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs index 7ff604347ba..f9256ca125c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetric.cs @@ -44,6 +44,20 @@ public class EvaluationMetric(string name, string? reason = null) public EvaluationMetricInterpretation? 
Interpretation { get; set; } #pragma warning disable CA2227 + /// + /// Gets or sets any contextual information that was considered by the as part of the + /// evaluation that produced the current . + /// + /// + /// Each entry in the returned dictionary has a name (key), and a collection of objects + /// (value). An can use this dictionary to record one or more + /// s that it considered as part of the evaluation that produced this + /// . For example, it can do so by including an entry with a name for the considered + /// as the key, and the objects returned from + /// as the value. + /// + public IDictionary>? Context { get; set; } + // CA2227: Collection properties should be read only. // We disable this warning because we want this type to be fully mutable for serialization purposes and for general // convenience. diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs index e4980d39f9f..f5b6bb4c33b 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs @@ -13,6 +13,31 @@ namespace Microsoft.Extensions.AI.Evaluation; /// public static class EvaluationMetricExtensions { + /// + /// Adds or updates contextual information with the specified and + /// in the supplied 's collection. + /// + /// The . + /// The name for the contextual information to be added or updated. + /// The contextual information to be added or updated. + public static void AddOrUpdateContext(this EvaluationMetric metric, string name, params AIContent[] value) + => metric.AddOrUpdateContext(name, value as IEnumerable); + + /// + /// Adds or updates contextual information with the specified and + /// in the supplied 's collection. + /// + /// The . + /// The name for the contextual information to be added or updated.
+ /// The contextual information to be added or updated. + public static void AddOrUpdateContext(this EvaluationMetric metric, string name, IEnumerable value) + { + _ = Throw.IfNull(metric); + + metric.Context ??= new Dictionary>(); + metric.Context[name] = [.. value]; + } + /// /// Determines if the supplied contains any /// matching the supplied . @@ -73,7 +98,7 @@ public static void AddDiagnostics(this EvaluationMetric metric, params Evaluatio /// /// Adds or updates metadata with the specified and in the - /// supplied 's collection. + /// supplied 's collection. /// /// The . /// The name of the metadata. @@ -87,7 +112,7 @@ public static void AddOrUpdateMetadata(this EvaluationMetric metric, string name } /// - /// Adds or updates the supplied to the supplied 's + /// Adds or updates the supplied in the supplied 's /// collection. /// /// The . diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs index bbe0f1bc0a2..2e737434ea6 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationResultExtensions.cs @@ -13,6 +13,43 @@ namespace Microsoft.Extensions.AI.Evaluation; /// public static class EvaluationResultExtensions { + /// + /// Adds or updates contextual information with the specified and + /// in all s contained in the supplied . + /// + /// + /// The containing the s that are to be altered. + /// + /// The name for the contextual information to be added or updated. + /// The contextual information to be added or updated. + public static void AddOrUpdateContextInAllMetrics( + this EvaluationResult result, + string name, + params AIContent[] value) + => result.AddOrUpdateContextInAllMetrics(name, value as IEnumerable); + + /// + /// Adds or updates contextual information with the specified and + /// in all s contained in the supplied . 
+ /// + /// + /// The containing the s that are to be altered. + /// + /// The name for the contextual information to be added or updated. + /// The contextual information to be added or updated. + public static void AddOrUpdateContextInAllMetrics( + this EvaluationResult result, + string name, + IEnumerable value) + { + _ = Throw.IfNull(result); + + foreach (EvaluationMetric metric in result.Metrics.Values) + { + metric.AddOrUpdateContext(name, value); + } + } + /// /// Adds the supplied to all s contained in the /// supplied . @@ -92,4 +129,43 @@ public static void Interpret( } } } + + /// + /// Adds or updates metadata with the specified and in all + /// s contained in the supplied . + /// + /// + /// The containing the s that are to be altered. + /// + /// The name of the metadata. + /// The value of the metadata. + public static void AddOrUpdateMetadataInAllMetrics(this EvaluationResult result, string name, string value) + { + _ = Throw.IfNull(result); + + foreach (EvaluationMetric metric in result.Metrics.Values) + { + metric.AddOrUpdateMetadata(name, value); + } + } + + /// + /// Adds or updates the supplied in all s contained in + /// the supplied . + /// + /// + /// The containing the s that are to be altered. + /// + /// The metadata to be added or updated. 
+ public static void AddOrUpdateMetadataInAllMetrics( + this EvaluationResult result, + IDictionary metadata) + { + _ = Throw.IfNull(result); + + foreach (EvaluationMetric metric in result.Metrics.Values) + { + metric.AddOrUpdateMetadata(metadata); + } + } } From 223e08754f69682d6c2eb2110236e41dbfb13ddb Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 16 Apr 2025 20:02:11 -0700 Subject: [PATCH 4/5] Update Quality and Safety evaluators that use context to record the context used in the metrics they produce --- .../ChatConversationEvaluator.cs | 3 +- .../EquivalenceEvaluator.cs | 48 ++++++++++++++----- .../GroundednessEvaluator.cs | 38 ++++++++++++++- .../GroundednessProEvaluator.cs | 20 +++++--- .../UngroundedAttributesEvaluator.cs | 20 +++++--- 5 files changed, 101 insertions(+), 28 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs index 6550454806f..63e17ebe1ac 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System; using System.Collections.Generic; using System.Linq; using System.Text; @@ -35,7 +34,7 @@ public abstract class ChatConversationEvaluator : IEvaluator protected virtual string? SystemPrompt => null; /// - public async ValueTask EvaluateAsync( + public virtual async ValueTask EvaluateAsync( IEnumerable messages, ChatResponse modelResponse, ChatConfiguration? 
chatConfiguration = null, diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs index 15b7f8b3f31..4a15f3640ee 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs @@ -49,6 +49,28 @@ public sealed class EquivalenceEvaluator : SingleNumericMetricEvaluator /// protected override bool IgnoresHistory => true; + /// + public override async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + EvaluationResult result = + await base.EvaluateAsync( + messages, + modelResponse, + chatConfiguration, + additionalContext, + cancellationToken).ConfigureAwait(false); + + EquivalenceEvaluatorContext context = GetRelevantContext(additionalContext); + result.AddOrUpdateContextInAllMetrics("Ground Truth", context.GetContents()); + + return result; + } + /// protected override async ValueTask RenderEvaluationPromptAsync( ChatMessage? userRequest, @@ -66,18 +88,8 @@ userRequest is not null ? await RenderAsync(userRequest, cancellationToken).ConfigureAwait(false) : string.Empty; - string groundTruth; - - if (additionalContext?.OfType().FirstOrDefault() - is EquivalenceEvaluatorContext context) - { - groundTruth = context.GroundTruth; - } - else - { - throw new InvalidOperationException( - $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); - } + EquivalenceEvaluatorContext context = GetRelevantContext(additionalContext); + string groundTruth = context.GroundTruth; string prompt = $$""" @@ -149,4 +161,16 @@ alleviating stress and augmenting general mood. 
return prompt; } + + private static EquivalenceEvaluatorContext GetRelevantContext(IEnumerable? additionalContext) + { + if (additionalContext?.OfType().FirstOrDefault() + is EquivalenceEvaluatorContext context) + { + return context; + } + + throw new InvalidOperationException( + $"A value of type '{nameof(EquivalenceEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); + } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs index bd09774150e..c8881864d40 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs @@ -49,6 +49,30 @@ public sealed class GroundednessEvaluator : SingleNumericMetricEvaluator /// protected override bool IgnoresHistory => false; + /// + public override async ValueTask EvaluateAsync( + IEnumerable messages, + ChatResponse modelResponse, + ChatConfiguration? chatConfiguration = null, + IEnumerable? additionalContext = null, + CancellationToken cancellationToken = default) + { + EvaluationResult result = + await base.EvaluateAsync( + messages, + modelResponse, + chatConfiguration, + additionalContext, + cancellationToken).ConfigureAwait(false); + + if (GetRelevantContext(additionalContext) is GroundednessEvaluatorContext context) + { + result.AddOrUpdateContextInAllMetrics("Grounding Context", context.GetContents()); + } + + return result; + } + /// protected override async ValueTask RenderEvaluationPromptAsync( ChatMessage? 
userRequest, @@ -68,8 +92,7 @@ userRequest is not null var builder = new StringBuilder(); - if (additionalContext?.OfType().FirstOrDefault() - is GroundednessEvaluatorContext context) + if (GetRelevantContext(additionalContext) is GroundednessEvaluatorContext context) { _ = builder.Append(context.GroundingContext); _ = builder.AppendLine(); @@ -162,4 +185,15 @@ is not French. return prompt; } + + private static GroundednessEvaluatorContext? GetRelevantContext(IEnumerable? additionalContext) + { + if (additionalContext?.OfType().FirstOrDefault() + is GroundednessEvaluatorContext context) + { + return context; + } + + return null; + } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs index 6af681d751f..c857e263763 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/GroundednessProEvaluator.cs @@ -63,22 +63,30 @@ await EvaluateContentSafetyAsync( contentSafetyServicePayloadFormat: ContentSafetyServicePayloadFormat.QuestionAnswer.ToString(), cancellationToken: cancellationToken).ConfigureAwait(false); + GroundednessProEvaluatorContext context = GetRelevantContext(additionalContext); + result.AddOrUpdateContextInAllMetrics("Grounding Context", context.GetContents()); + return result; } /// protected override IReadOnlyList? FilterAdditionalContext( IEnumerable? additionalContext) + { + GroundednessProEvaluatorContext context = GetRelevantContext(additionalContext); + return [context]; + } + + private static GroundednessProEvaluatorContext GetRelevantContext( + IEnumerable? 
additionalContext) { if (additionalContext?.OfType().FirstOrDefault() is GroundednessProEvaluatorContext context) { - return [context]; - } - else - { - throw new InvalidOperationException( - $"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); + return context; } + + throw new InvalidOperationException( + $"A value of type '{nameof(GroundednessProEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs index 79a5deb4888..656d5280c7c 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/UngroundedAttributesEvaluator.cs @@ -67,22 +67,30 @@ await EvaluateContentSafetyAsync( contentSafetyServicePayloadFormat: ContentSafetyServicePayloadFormat.QueryResponse.ToString(), cancellationToken: cancellationToken).ConfigureAwait(false); + UngroundedAttributesEvaluatorContext context = GetRelevantContext(additionalContext); + result.AddOrUpdateContextInAllMetrics("Grounding Context", context.GetContents()); + return result; } /// protected override IReadOnlyList? FilterAdditionalContext( IEnumerable? additionalContext) + { + UngroundedAttributesEvaluatorContext context = GetRelevantContext(additionalContext); + return [context]; + } + + private static UngroundedAttributesEvaluatorContext GetRelevantContext( + IEnumerable? 
additionalContext) { if (additionalContext?.OfType().FirstOrDefault() is UngroundedAttributesEvaluatorContext context) { - return [context]; - } - else - { - throw new InvalidOperationException( - $"A value of type '{nameof(UngroundedAttributesEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); + return context; } + + throw new InvalidOperationException( + $"A value of type '{nameof(UngroundedAttributesEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."); } } From 38f4bedf1e5a6d7137d12c573e991d8c09e8a415 Mon Sep 17 00:00:00 2001 From: Shyam Namboodiripad Date: Wed, 16 Apr 2025 21:20:15 -0700 Subject: [PATCH 5/5] Update report to display context bubbles under the conversation details for selected metrics --- .../components/ConversationDetails.tsx | 39 ++++++++++++++++++- .../TypeScript/components/ScoreDetail.tsx | 2 +- .../TypeScript/components/Styles.ts | 8 ++++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx index 325f66ee7e2..eddc15b598f 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ConversationDetails.tsx @@ -8,11 +8,13 @@ import ReactMarkdown from "react-markdown"; import { useReportContext } from "./ReportContext"; import { useStyles } from "./Styles"; import { ChatMessageDisplay, isTextContent, isImageContent } from "./Summary"; +import type { MetricType } from "./MetricCard"; -export const ConversationDetails = ({ messages, model, usage }: { +export const ConversationDetails = ({ messages, model, usage, selectedMetric }: { messages: ChatMessageDisplay[]; model?: string; usage?: UsageDetails; + selectedMetric?: 
MetricType | null; }) => { const classes = useStyles(); const [isExpanded, setIsExpanded] = useState(true); @@ -59,7 +61,27 @@ export const ConversationDetails = ({ messages, model, usage }: { return result; }; + const getContextGroups = () => { + if (!selectedMetric || !selectedMetric.context) { + return []; + } + + const contextGroups: { key: string, contents: AIContent[] }[] = []; + + for (const [key, contents] of Object.entries(selectedMetric.context)) { + if (contents && contents.length > 0) { + contextGroups.push({ + key: key.toLowerCase(), + contents: contents + }); + } + } + + return contextGroups; + }; + const messageGroups = groupMessages(); + const contextGroups = getContextGroups(); return (
@@ -79,7 +101,7 @@ export const ConversationDetails = ({ messages, model, usage }: { ); return ( -
+
{group.participantName}
{group.contents.map((content, contentIndex) => ( @@ -91,6 +113,19 @@ export const ConversationDetails = ({ messages, model, usage }: {
); })} + + {contextGroups.map((group, index) => ( +
+
{`supplied evaluation context (${group.key})`}
+
+ {group.contents.map((content, contentIndex) => ( +
+ {renderContent(content)} +
+ ))} +
+
+ ))}
)}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScoreDetail.tsx b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScoreDetail.tsx index 1147cce6669..7d614abea33 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScoreDetail.tsx +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ScoreDetail.tsx @@ -32,7 +32,7 @@ export const ScoreDetail = ({ scenario, scoreSummary }: { scenario: ScenarioRunR onMetricSelect={setSelectedMetric} selectedMetric={selectedMetric} /> {selectedMetric && } - + {scenario.chatDetails && scenario.chatDetails.turnDetails.length > 0 && }
); }; diff --git a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Styles.ts b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Styles.ts index 7ddc980a372..89867593479 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Styles.ts +++ b/src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/Styles.ts @@ -127,6 +127,14 @@ export const useStyles = makeStyles({ backgroundColor: tokens.colorNeutralBackground3, border: '1px solid ' + tokens.colorNeutralStroke2, }, + contextBubble: { + padding: '0.75rem 1rem', + borderRadius: '12px', + overflow: 'hidden', + wordBreak: 'break-word', + backgroundColor: tokens.colorBrandBackground2, + border: '1px solid ' + tokens.colorNeutralStroke2, + }, cacheHitIcon: { color: tokens.colorPaletteGreenForeground1, },