lower passing threshold

Kylejeong2 · Kylejeong2 · commit 7f8056f9ac65 · 2025-08-20T13:57:51.000-07:00
diff --git a/evals/mcp-eval-basic.config.json b/evals/mcp-eval-basic.config.json
@@ -1,4 +1,5 @@
 {
+  "passThreshold": 0.7,
   "server": {
     "transport": "stdio",
     "command": "node",
diff --git a/evals/mcp-eval-minimal.config.json b/evals/mcp-eval-minimal.config.json
@@ -1,4 +1,5 @@
 {
+  "passThreshold": 0.7,
   "server": {
     "transport": "stdio",
     "command": "node",
diff --git a/evals/mcp-eval.config.json b/evals/mcp-eval.config.json
@@ -1,4 +1,5 @@
 {
+  "passThreshold": 0.7,
   "server": {
     "transport": "stdio",
     "command": "node",
diff --git a/evals/run-evals.ts b/evals/run-evals.ts
@@ -44,6 +44,12 @@ interface TestResult {
   }[];
 }
 
+interface EvalConfig {
+  workflows: Array<{ name?: string }>;
+  passThreshold?: number;
+  [key: string]: unknown;
+}
+
 const program = new Command();
 
 program
@@ -63,6 +69,10 @@ program
   .option("-j, --json", "Output results as JSON")
   .option("-l, --llm", "Enable LLM judge")
   .option("-o, --output <path>", "Save results to file")
+  .option(
+    "-p, --pass-threshold <number>",
+    "Minimum average score (0-1) required to pass. Can also be set via EVAL_PASS_THRESHOLD env var.",
+  )
   .option("-t, --timeout <ms>", "Override timeout in milliseconds")
   .action(async (options) => {
     try {
@@ -134,7 +144,7 @@ program
 
       // Load config to get workflow count for display
       const configContent = await fs.readFile(configPath, "utf-8");
-      const config = JSON.parse(configContent);
+      const config: EvalConfig = JSON.parse(configContent);
 
       console.log(chalk.blue(`Running evaluation tests from: ${configPath}`));
       console.log(chalk.gray(`Workflows to test: ${config.workflows.length}`));
@@ -179,19 +189,42 @@ program
       const allEvaluations = reports.flatMap((r) => r.evaluations);
       const duration = Date.now() - startTime;
 
+      // Determine pass/fail based on threshold instead of strict all-pass
+      const avgScore =
+        allEvaluations.length === 0
+          ? 0
+          : allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
+            allEvaluations.length;
+
+      const thresholdFromEnv =
+        (process.env.EVAL_PASS_THRESHOLD || process.env.PASS_THRESHOLD) ?? "";
+      const thresholdFromCli = options.passThreshold ?? "";
+      const thresholdFromConfig =
+        typeof config.passThreshold === "number"
+          ? String(config.passThreshold)
+          : "";
+      const threshold = (() => {
+        const raw = String(
+          thresholdFromCli || thresholdFromEnv || thresholdFromConfig,
+        ).trim();
+        const parsed = Number.parseFloat(raw);
+        if (!Number.isFinite(parsed)) return 0.6; // default lowered threshold
+        return parsed;
+      })();
+
+      const passed = avgScore >= threshold;
+
       const finalReport: EvaluationReport = {
         config: { parallel: true, source: configPath },
         evaluations: allEvaluations,
-        passed: reports.every((r) => r.passed),
+        passed,
         timestamp: new Date(),
       };
 
       const finalResult: TestResult = {
         config: configPath,
-        passed: finalReport.passed,
-        score:
-          allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
-          Math.max(1, allEvaluations.length),
+        passed,
+        score: avgScore,
         duration,
         workflows: allEvaluations.map((e) => ({
           name: e.workflowName,
@@ -217,6 +250,11 @@ program
             `\nTest execution completed in ${(finalResult.duration / 1000).toFixed(2)}s`,
           ),
         );
+        console.log(
+          chalk.gray(
+            `Threshold for pass: ${threshold.toFixed(2)} | Average score: ${finalResult.score.toFixed(3)}`,
+          ),
+        );
         console.log(
           chalk[finalResult.passed ? "green" : "red"](
             `Overall result: ${finalResult.passed ? "PASSED" : "FAILED"} (${(finalResult.score * 100).toFixed(1)}%)`,

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`{`
	`2`	`+ "passThreshold": 0.7,`
`2`	`3`	`"server": {`
`3`	`4`	`"transport": "stdio",`
`4`	`5`	`"command": "node",`