@@ -44,6 +44,12 @@ interface TestResult {
44
44
} [ ] ;
45
45
}
46
46
47
// Shape this file assumes for the parsed evaluation config JSON.
// The value is assigned straight from JSON.parse, so nothing here is
// validated at runtime; treat the fields as expectations, not guarantees.
+ interface EvalConfig {
48
// Workflows declared in the config. The visible code only reads the array
// length (to print how many workflows will be tested).
+ workflows : Array < { name ?: string } > ;
49
// Optional config-level pass threshold. It is consulted only when it is a
// number, and only after the CLI flag and environment variables.
+ passThreshold ?: number ;
50
// Any other top-level key is permitted and typed as unknown, so callers
// must narrow before using it.
+ [ key : string ] : unknown ;
51
+ }
52
+
47
53
const program = new Command ( ) ;
48
54
49
55
program
@@ -63,6 +69,10 @@ program
63
69
. option ( "-j, --json" , "Output results as JSON" )
64
70
. option ( "-l, --llm" , "Enable LLM judge" )
65
71
. option ( "-o, --output <path>" , "Save results to file" )
72
+ . option (
73
+ "-p, --pass-threshold <number>" ,
74
+ "Minimum average score (0-1) required to pass. Can also be set via EVAL_PASS_THRESHOLD env var." ,
75
+ )
66
76
. option ( "-t, --timeout <ms>" , "Override timeout in milliseconds" )
67
77
. action ( async ( options ) => {
68
78
try {
@@ -134,7 +144,7 @@ program
134
144
135
145
// Load config to get workflow count for display
136
146
const configContent = await fs . readFile ( configPath , "utf-8" ) ;
137
- const config = JSON . parse ( configContent ) ;
147
+ const config : EvalConfig = JSON . parse ( configContent ) ;
138
148
139
149
console . log ( chalk . blue ( `Running evaluation tests from: ${ configPath } ` ) ) ;
140
150
console . log ( chalk . gray ( `Workflows to test: ${ config . workflows . length } ` ) ) ;
@@ -179,19 +189,42 @@ program
179
189
const allEvaluations = reports . flatMap ( ( r ) => r . evaluations ) ;
180
190
const duration = Date . now ( ) - startTime ;
181
191
192
// Determine pass/fail from the average score against a configurable
// threshold instead of requiring every individual report to pass.
// With no evaluations the average is defined as 0, which also avoids a
// division by zero.
const avgScore =
  allEvaluations.length === 0
    ? 0
    : allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
      allEvaluations.length;

// Threshold used when no source supplies a usable value.
const defaultPassThreshold = 0.6;

// Candidate sources in precedence order: CLI flag, environment, config file.
// The config value is only considered when it really is a number, because
// the config object comes from unvalidated JSON.
const thresholdSources: Array<{ source: string; value: unknown }> = [
  { source: "--pass-threshold", value: options.passThreshold },
  { source: "EVAL_PASS_THRESHOLD", value: process.env.EVAL_PASS_THRESHOLD },
  { source: "PASS_THRESHOLD", value: process.env.PASS_THRESHOLD },
  {
    source: "config passThreshold",
    value:
      typeof config.passThreshold === "number"
        ? config.passThreshold
        : undefined,
  },
];

const threshold = (() => {
  for (const { source, value } of thresholdSources) {
    if (value === undefined || value === null) continue;

    const raw = String(value).trim();
    if (raw === "") continue; // unset or blank: try the next source

    const parsed = Number.parseFloat(raw);
    if (!Number.isFinite(parsed)) {
      // A typo in a higher-precedence source must not silently discard a
      // valid lower-precedence one, so report it and keep looking.
      // console.warn writes to stderr, keeping --json output on stdout clean.
      console.warn(
        chalk.yellow(
          `Ignoring invalid pass threshold "${raw}" from ${source}; expected a number between 0 and 1.`,
        ),
      );
      continue;
    }

    if (parsed < 0 || parsed > 1) {
      // Still honoured so existing setups keep their outcome, but flagged:
      // such a value makes the gate effectively always pass or always fail.
      console.warn(
        chalk.yellow(
          `Pass threshold ${parsed} from ${source} is outside the expected 0-1 range.`,
        ),
      );
    }
    return parsed;
  }
  return defaultPassThreshold;
})();

const passed = avgScore >= threshold;

216
+
182
217
const finalReport : EvaluationReport = {
183
218
config : { parallel : true , source : configPath } ,
184
219
evaluations : allEvaluations ,
185
- passed : reports . every ( ( r ) => r . passed ) ,
220
+ passed,
186
221
timestamp : new Date ( ) ,
187
222
} ;
188
223
189
224
const finalResult : TestResult = {
190
225
config : configPath ,
191
- passed : finalReport . passed ,
192
- score :
193
- allEvaluations . reduce ( ( sum , e ) => sum + e . overallScore , 0 ) /
194
- Math . max ( 1 , allEvaluations . length ) ,
226
+ passed,
227
+ score : avgScore ,
195
228
duration,
196
229
workflows : allEvaluations . map ( ( e ) => ( {
197
230
name : e . workflowName ,
@@ -217,6 +250,11 @@ program
217
250
`\nTest execution completed in ${ ( finalResult . duration / 1000 ) . toFixed ( 2 ) } s` ,
218
251
) ,
219
252
) ;
253
+ console . log (
254
+ chalk . gray (
255
+ `Threshold for pass: ${ threshold . toFixed ( 2 ) } | Average score: ${ finalResult . score . toFixed ( 3 ) } ` ,
256
+ ) ,
257
+ ) ;
220
258
console . log (
221
259
chalk [ finalResult . passed ? "green" : "red" ] (
222
260
`Overall result: ${ finalResult . passed ? "PASSED" : "FAILED" } (${ ( finalResult . score * 100 ) . toFixed ( 1 ) } %)` ,
0 commit comments