Skip to content

Commit 7f8056f

Browse files
committed
lower passing threshold
1 parent 4e0b8fa commit 7f8056f

File tree

4 files changed

+47
-6
lines changed

4 files changed

+47
-6
lines changed

evals/mcp-eval-basic.config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"passThreshold": 0.7,
23
"server": {
34
"transport": "stdio",
45
"command": "node",

evals/mcp-eval-minimal.config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"passThreshold": 0.7,
23
"server": {
34
"transport": "stdio",
45
"command": "node",

evals/mcp-eval.config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
{
2+
"passThreshold": 0.7,
23
"server": {
34
"transport": "stdio",
45
"command": "node",

evals/run-evals.ts

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ interface TestResult {
4444
}[];
4545
}
4646

47+
/**
 * Shape of the parsed eval config JSON, limited to the fields this script reads.
 * It is applied as a type annotation on the `JSON.parse` result, so it describes
 * the expected shape but nothing is validated at runtime.
 */
interface EvalConfig {
48+
workflows: Array<{ name?: string }>; // the visible code reads only `.length`, for the "Workflows to test" log line
49+
passThreshold?: number; // used only when it is a number and neither the CLI option nor the env var supplies a value
50+
[key: string]: unknown; // other top-level keys (e.g. "server") are accepted but left untyped
51+
}
52+
4753
const program = new Command();
4854

4955
program
@@ -63,6 +69,10 @@ program
6369
.option("-j, --json", "Output results as JSON")
6470
.option("-l, --llm", "Enable LLM judge")
6571
.option("-o, --output <path>", "Save results to file")
72+
.option(
73+
"-p, --pass-threshold <number>",
74+
"Minimum average score (0-1) required to pass. Can also be set via EVAL_PASS_THRESHOLD env var.",
75+
)
6676
.option("-t, --timeout <ms>", "Override timeout in milliseconds")
6777
.action(async (options) => {
6878
try {
@@ -134,7 +144,7 @@ program
134144

135145
// Load config to get workflow count for display
136146
const configContent = await fs.readFile(configPath, "utf-8");
137-
const config = JSON.parse(configContent);
147+
const config: EvalConfig = JSON.parse(configContent);
138148

139149
console.log(chalk.blue(`Running evaluation tests from: ${configPath}`));
140150
console.log(chalk.gray(`Workflows to test: ${config.workflows.length}`));
@@ -179,19 +189,42 @@ program
179189
const allEvaluations = reports.flatMap((r) => r.evaluations);
180190
const duration = Date.now() - startTime;
181191

192+
// Determine pass/fail based on threshold instead of strict all-pass
193+
const avgScore =
194+
allEvaluations.length === 0
195+
? 0
196+
: allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
197+
allEvaluations.length;
198+
199+
const thresholdFromEnv =
200+
(process.env.EVAL_PASS_THRESHOLD || process.env.PASS_THRESHOLD) ?? "";
201+
const thresholdFromCli = options.passThreshold ?? "";
202+
const thresholdFromConfig =
203+
typeof config.passThreshold === "number"
204+
? String(config.passThreshold)
205+
: "";
206+
const threshold = (() => {
207+
const raw = String(
208+
thresholdFromCli || thresholdFromEnv || thresholdFromConfig,
209+
).trim();
210+
const parsed = Number.parseFloat(raw);
211+
if (!Number.isFinite(parsed)) return 0.6; // default lowered threshold
212+
return parsed;
213+
})();
214+
215+
const passed = avgScore >= threshold;
216+
182217
const finalReport: EvaluationReport = {
183218
config: { parallel: true, source: configPath },
184219
evaluations: allEvaluations,
185-
passed: reports.every((r) => r.passed),
220+
passed,
186221
timestamp: new Date(),
187222
};
188223

189224
const finalResult: TestResult = {
190225
config: configPath,
191-
passed: finalReport.passed,
192-
score:
193-
allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
194-
Math.max(1, allEvaluations.length),
226+
passed,
227+
score: avgScore,
195228
duration,
196229
workflows: allEvaluations.map((e) => ({
197230
name: e.workflowName,
@@ -217,6 +250,11 @@ program
217250
`\nTest execution completed in ${(finalResult.duration / 1000).toFixed(2)}s`,
218251
),
219252
);
253+
console.log(
254+
chalk.gray(
255+
`Threshold for pass: ${threshold.toFixed(2)} | Average score: ${finalResult.score.toFixed(3)}`,
256+
),
257+
);
220258
console.log(
221259
chalk[finalResult.passed ? "green" : "red"](
222260
`Overall result: ${finalResult.passed ? "PASSED" : "FAILED"} (${(finalResult.score * 100).toFixed(1)}%)`,

0 commit comments

Comments
 (0)