From cd287642d6b0b8aff9740925e0cf7d7d0bf3b329 Mon Sep 17 00:00:00 2001
From: RafaelGSS
Date: Tue, 22 Jul 2025 19:23:52 -0300
Subject: [PATCH] benchmark: add calibrate-n script

This script should help identify the best N when creating or updating
benchmarks.

Signed-off-by: RafaelGSS
---
 benchmark/calibrate-n.js              | 292 ++++++++++++++++++
 .../writing-and-running-benchmarks.md |  41 +++
 2 files changed, 333 insertions(+)
 create mode 100644 benchmark/calibrate-n.js

diff --git a/benchmark/calibrate-n.js b/benchmark/calibrate-n.js
new file mode 100644
index 00000000000000..16bb512cc50a8c
--- /dev/null
+++ b/benchmark/calibrate-n.js
@@ -0,0 +1,292 @@
'use strict';

const path = require('node:path');
const { fork } = require('node:child_process');
const fs = require('node:fs');
const { styleText } = require('node:util');

const DEFAULT_RUNS = 30; // Number of runs for each n value
const CV_THRESHOLD = 0.05; // 5% coefficient of variation threshold
const MAX_N_INCREASE = 6; // Maximum number of times to increase n (reaches 10**6 with the default start-n of 10)
const INCREASE_FACTOR = 10; // Factor by which to increase n

const args = process.argv.slice(2);
if (args.length === 0) {
  console.log(`
Usage: node calibrate-n.js [options] <benchmark-path>

Options:
  --runs=N           Number of runs for each n value (default: ${DEFAULT_RUNS})
  --cv-threshold=N   Target coefficient of variation threshold (default: ${CV_THRESHOLD})
  --max-increases=N  Maximum number of n increases to try (default: ${MAX_N_INCREASE})
  --start-n=N        Initial n value to start with (default: 10)
  --increase=N       Factor by which to increase n (default: ${INCREASE_FACTOR})

Example:
  node calibrate-n.js buffers/buffer-compare.js
  node calibrate-n.js --runs=10 --cv-threshold=0.02 buffers/buffer-compare.js
  `);
  process.exit(1);
}

// Extract options. Deriving the substring offset from the option prefix itself
// avoids the off-by-one mistakes that hardcoded offsets invite.
let benchmarkPath;
let runs = DEFAULT_RUNS;
let cvThreshold = CV_THRESHOLD;
let maxIncreases = MAX_N_INCREASE;
let startN = 10;
let increaseFactor = INCREASE_FACTOR;

for (const arg of args) {
  if (arg.startsWith('--runs=')) {
    runs = parseInt(arg.substring('--runs='.length), 10);
    if (isNaN(runs)) {
      console.error(`Error: Invalid value for --runs. Using default: ${DEFAULT_RUNS}`);
      runs = DEFAULT_RUNS;
    }
  } else if (arg.startsWith('--cv-threshold=')) {
    cvThreshold = parseFloat(arg.substring('--cv-threshold='.length));
    if (isNaN(cvThreshold)) {
      console.error(`Error: Invalid value for --cv-threshold. Using default: ${CV_THRESHOLD}`);
      cvThreshold = CV_THRESHOLD;
    }
  } else if (arg.startsWith('--max-increases=')) {
    maxIncreases = parseInt(arg.substring('--max-increases='.length), 10);
    if (isNaN(maxIncreases)) {
      console.error(`Error: Invalid value for --max-increases. Using default: ${MAX_N_INCREASE}`);
      maxIncreases = MAX_N_INCREASE;
    }
  } else if (arg.startsWith('--start-n=')) {
    startN = parseInt(arg.substring('--start-n='.length), 10);
    if (isNaN(startN)) {
      console.error('Error: Invalid value for --start-n. Using default: 10');
      startN = 10;
    }
  } else if (arg.startsWith('--increase=')) {
    increaseFactor = parseInt(arg.substring('--increase='.length), 10);
    if (isNaN(increaseFactor)) {
      console.error(`Error: Invalid value for --increase. Using default: ${INCREASE_FACTOR}`);
      increaseFactor = INCREASE_FACTOR;
    }
  } else {
    benchmarkPath = arg;
  }
}

if (!benchmarkPath) {
  console.error('Error: No benchmark path specified');
  process.exit(1);
}

const fullBenchmarkPath = path.resolve(benchmarkPath);
if (!fs.existsSync(fullBenchmarkPath)) {
  console.error(`Error: Benchmark file not found: ${fullBenchmarkPath}`);
  process.exit(1);
}

function calculateStats(values) {
  const mean = values.reduce((sum, val) => sum + val, 0) / values.length;
  const squaredDiffs = values.map((val) => (val - mean) ** 2);
  const variance = squaredDiffs.reduce((sum, val) => sum + val, 0) / values.length;
  const stdDev = Math.sqrt(variance);
  // The coefficient of variation normalizes the spread by the mean, so results
  // are comparable across benchmarks with very different ops/sec rates.
  const cv = stdDev / mean;

  return { mean, stdDev, cv, variance };
}
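// Worked example with made-up rates: for [100, 102, 98] ops/sec, mean = 100,
// variance = (0 + 4 + 4) / 3 ≈ 2.67, stdDev ≈ 1.63, so cv ≈ 0.016 (1.6%),
// comfortably below the default 5% threshold this script treats as stable.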
function runBenchmark(n) {
  return new Promise((resolve, reject) => {
    // Fork the benchmark with an explicit iteration count; the 'ipc' entry in
    // stdio keeps a channel open so the child can send results back.
    const child = fork(
      fullBenchmarkPath,
      [`n=${n}`],
      { stdio: ['inherit', 'pipe', 'inherit', 'ipc'] },
    );

    const results = [];
    child.on('message', (data) => {
      if (data.type === 'report' && data.rate && data.conf) {
        results.push({
          rate: data.rate,
          conf: data.conf,
        });
      }
    });

    child.on('close', (code) => {
      if (code !== 0) {
        reject(new Error(`Benchmark exited with code ${code}`));
      } else {
        resolve(results);
      }
    });
  });
}
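// Assumption about the harness: benchmarks built on benchmark/common.js report
// results over the IPC channel via process.send(), as objects shaped roughly
// like { type: 'report', name, conf, rate, time }. The listener above keeps
// only `rate` and `conf`; a child that never sends a 'report' message simply
// contributes no samples for that run.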
async function main(n = startN) {
  let increaseCount = 0;
  let bestN = n;
  let bestCV = Infinity;
  let bestGroupStats = null;

  console.log(`
--------------------------------------------------------
Benchmark: ${benchmarkPath}
--------------------------------------------------------
What we are trying to find: The optimal number of iterations (n)
that produces consistent benchmark results without wasting time.

How it works:
1. Run the benchmark multiple times with a specific n value
2. Group results by configuration
3. If overall CV is above ${cvThreshold * 100}% or any configuration has CV above 10%, increase n and try again
4. Stop when we have stable results (overall CV < ${cvThreshold * 100}% and all configs CV < 10%) or max increases reached

Configuration:
- Starting n: ${n.toLocaleString()} iterations
- Runs per n value: ${runs}
- Target CV threshold: ${cvThreshold * 100}% (lower CV = more stable results)
- Max increases: ${maxIncreases}
- Increase factor: ${increaseFactor}x`);

  while (increaseCount < maxIncreases) {
    console.log(`\nTesting with n=${n}:`);

    const resultsData = [];
    for (let i = 0; i < runs; i++) {
      // Each run may report multiple results (one per benchmark configuration).
      const results = await runBenchmark(n);
      resultsData.push(...results);
      process.stdout.write('.');
    }
    process.stdout.write('\n');

    const groupedResults = {};
    resultsData.forEach((result) => {
      if (!result || !result.conf) return;

      const confKey = JSON.stringify(result.conf);
      groupedResults[confKey] ||= {
        conf: result.conf,
        rates: [],
      };

      groupedResults[confKey].rates.push(result.rate);
    });

    const groupStats = [];
    for (const [confKey, group] of Object.entries(groupedResults)) {
      console.log(`\nConfiguration: ${JSON.stringify(group.conf)}`);

      const stats = calculateStats(group.rates);
      console.log(`  CV: ${(stats.cv * 100).toFixed(2)}% (lower values mean more stable results)`);

      const isStable = stats.cv <= cvThreshold;
      console.log(`  Stability: ${isStable ?
        styleText(['bold', 'green'], '✓ Stable') :
        styleText(['bold', 'red'], '✗ Unstable')}`);

      groupStats.push({
        confKey,
        stats,
        isStable,
      });
    }

    if (groupStats.length > 0) {
      // Configurations with CV > 10% are considered too unstable no matter how
      // good the overall average looks.
      const tooUnstableConfigs = groupStats.filter((g) => g.stats.cv > 0.10);

      const avgCV = groupStats.reduce((sum, g) => sum + g.stats.cv, 0) / groupStats.length;
      console.log(`\nOverall average CV: ${(avgCV * 100).toFixed(2)}%`);

      const isOverallStable = avgCV < cvThreshold;
      const hasVeryUnstableConfigs = tooUnstableConfigs.length > 0;

      if (isOverallStable && !hasVeryUnstableConfigs) {
        console.log(styleText(['bold', 'green'], `  ✓ Overall CV is below ${cvThreshold * 100}% and no configuration has CV above 10%`));
      } else {
        if (!isOverallStable) {
          console.log(styleText(['bold', 'red'], `  ✗ Overall CV (${(avgCV * 100).toFixed(2)}%) is above ${cvThreshold * 100}%`));
        }
        if (hasVeryUnstableConfigs) {
          console.log(styleText(['bold', 'red'], `  ✗ ${tooUnstableConfigs.length} configuration(s) have CV above 10%`));
        }
      }

      if (avgCV < bestCV || !bestGroupStats) {
        bestN = n;
        bestCV = avgCV;

        bestGroupStats = [];
        for (const group of Object.values(groupedResults)) {
          if (group.rates.length >= 3) {
            const stats = calculateStats(group.rates);
            bestGroupStats.push({
              conf: group.conf,
              stats,
              isStable: stats.cv <= 0.10,
            });
          }
        }
        console.log(`  → New best n: ${n} with average CV: ${(avgCV * 100).toFixed(2)}%`);
      } else {
        console.log(`  → Current best n remains: ${bestN} with average CV: ${(bestCV * 100).toFixed(2)}%`);
      }
    }

    // Check if we've reached acceptable stability:
    // 1. Overall (average) CV must be below the target threshold
    // 2. No configuration may have a CV greater than 10%
    const avgCV = groupStats.length > 0 ?
      groupStats.reduce((sum, g) => sum + g.stats.cv, 0) / groupStats.length : Infinity;
    const hasUnstableConfig = groupStats.some((g) => g.stats.cv > 0.10);
    const isOverallStable = avgCV < cvThreshold;

    if (isOverallStable && !hasUnstableConfig) {
      console.log(`\n✓ Found optimal n=${n} (Overall CV=${(avgCV * 100).toFixed(2)}% < ${cvThreshold * 100}% and no configuration has CV > 10%)`);
      console.log('\nFinal CV for each configuration:');
      groupStats.forEach((g) => {
        console.log(`  ${JSON.stringify(groupedResults[g.confKey].conf)}: ${(g.stats.cv * 100).toFixed(2)}%`);
      });

      return n;
    }

    increaseCount++;
    n *= increaseFactor;
  }

  if (increaseCount >= maxIncreases) {
    const finalAvgCV = bestGroupStats && bestGroupStats.length > 0 ?
      bestGroupStats.reduce((sum, g) => sum + g.stats.cv, 0) / bestGroupStats.length : Infinity;

    console.log(`Maximum number of increases (${maxIncreases}) reached without achieving target stability`);
    console.log(`Best n found: ${bestN} with average CV=${(finalAvgCV * 100).toFixed(2)}%`);
    console.log('\nCV by configuration at best n:');

    if (bestGroupStats) {
      bestGroupStats.forEach((g) => {
        if (g.conf) {
          console.log(`  ${JSON.stringify(g.conf)}: ${(g.stats.cv * 100).toFixed(2)}%`);
          if (g.stats.cv > cvThreshold) {
            console.log(`  ⚠️ This configuration is above the target threshold of ${cvThreshold * 100}%`);
          }
        }
      });
    }
  }

  console.log(`
Recommendation: increase --max-increases to keep testing with larger n values,
relax --cv-threshold to accept the current best result, or investigate whether
specific configurations are driving the instability.`);
  return bestN;
}
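// With the defaults (--start-n=10, --increase=10, --max-increases=6), the loop
// above probes n = 10, 100, 1000, ... up to 1,000,000 unless it exits early on
// a stable result, and each probe forks the benchmark `runs` times (30 by
// default), so calibrating a slow benchmark can take a while.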
main().catch((err) => {
  console.error('Error:', err);
  process.exit(1);
});

diff --git a/doc/contributing/writing-and-running-benchmarks.md b/doc/contributing/writing-and-running-benchmarks.md
index 2664116aefc4b3..a60a06b695f242 100644
--- a/doc/contributing/writing-and-running-benchmarks.md
+++ b/doc/contributing/writing-and-running-benchmarks.md
@@ -9,6 +9,7 @@
* [Benchmark analysis requirements](#benchmark-analysis-requirements)
* [Running benchmarks](#running-benchmarks)
  * [Running individual benchmarks](#running-individual-benchmarks)
  * [Calibrating the number of iterations with calibrate-n.js](#calibrating-the-number-of-iterations-with-calibrate-njs)
  * [Running all benchmarks](#running-all-benchmarks)
  * [Specifying CPU Cores for Benchmarks with run.js](#specifying-cpu-cores-for-benchmarks-with-runjs)
  * [Filtering benchmarks](#filtering-benchmarks)
@@ -142,6 +143,46 @@
buffers/buffer-tostring.js n=10000000 len=1024 arg=true: 3498295.68561504
buffers/buffer-tostring.js n=10000000 len=1024 arg=false: 3783071.1678948295
```

### Calibrating the number of iterations with calibrate-n.js

Before running benchmarks, it's often useful to determine the optimal number of iterations (`n`)
that provides statistically stable results. The `calibrate-n.js` tool helps find this value by
running a benchmark multiple times with increasing `n` values until the coefficient of variation (CV)
falls below a target threshold.

```console
$ node benchmark/calibrate-n.js benchmark/buffers/buffer-compare.js

--------------------------------------------------------
Benchmark: buffers/buffer-compare.js
--------------------------------------------------------
What we are trying to find: The optimal number of iterations (n)
that produces consistent benchmark results without wasting time.

How it works:
1. Run the benchmark multiple times with a specific n value
2. Group results by configuration
3. If overall CV is above 5% or any configuration has CV above 10%, increase n and try again
4. Stop when we have stable results (overall CV < 5% and all configs CV < 10%) or max increases reached

Configuration:
- Starting n: 10 iterations
- Runs per n value: 30
- Target CV threshold: 5% (lower CV = more stable results)
- Max increases: 6
- Increase factor: 10x
```

The tool accepts several options:

* `--runs=N`: Number of runs for each n value (default: 30)
* `--cv-threshold=N`: Target coefficient of variation threshold (default: 0.05)
* `--max-increases=N`: Maximum number of n increases to try (default: 6)
* `--start-n=N`: Initial n value to start with (default: 10)
* `--increase=N`: Factor by which to increase n (default: 10)

Once you've determined a stable `n` value, you can use it when running your benchmarks.
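For example, if calibration settles on `n=1000000` (a hypothetical result; the right value depends on the benchmark and the machine), pass it on the command line like any other benchmark parameter:

```console
$ node benchmark/buffers/buffer-compare.js n=1000000
```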
### Running all benchmarks

Similar to running individual benchmarks, a group of benchmarks can be executed