aggregate_benchmark.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. #!/usr/bin/env python3
  2. """
Aggregate individual run results into benchmark summary statistics.

Reads grading.json files from run directories and produces:
- run_summary with mean, stddev, min, max for each metric
- delta between the first two configurations found
  (e.g. with_skill vs. without_skill)

Usage:
    python aggregate_benchmark.py <benchmark_dir>

Example:
    python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/

The script supports two directory layouts:

    Workspace layout (from skill-creator iterations):
    <benchmark_dir>/
    └── eval-N/
        ├── with_skill/
        │   ├── run-1/grading.json
        │   └── run-2/grading.json
        └── without_skill/
            ├── run-1/grading.json
            └── run-2/grading.json

    Legacy layout (with runs/ subdirectory):
    <benchmark_dir>/
    └── runs/
        └── eval-N/
            ├── with_skill/
            │   └── run-1/grading.json
            └── without_skill/
                └── run-1/grading.json
  29. """
  30. import argparse
  31. import json
  32. import math
  33. import sys
  34. from datetime import datetime, timezone
  35. from pathlib import Path
  36. def calculate_stats(values: list[float]) -> dict:
  37. """Calculate mean, stddev, min, max for a list of values."""
  38. if not values:
  39. return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
  40. n = len(values)
  41. mean = sum(values) / n
  42. if n > 1:
  43. variance = sum((x - mean) ** 2 for x in values) / (n - 1)
  44. stddev = math.sqrt(variance)
  45. else:
  46. stddev = 0.0
  47. return {
  48. "mean": round(mean, 4),
  49. "stddev": round(stddev, 4),
  50. "min": round(min(values), 4),
  51. "max": round(max(values), 4)
  52. }
  53. def load_run_results(benchmark_dir: Path) -> dict:
  54. """
  55. Load all run results from a benchmark directory.
  56. Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
  57. or "new_skill"/"old_skill"), each containing a list of run results.
  58. """
  59. # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
  60. runs_dir = benchmark_dir / "runs"
  61. if runs_dir.exists():
  62. search_dir = runs_dir
  63. elif list(benchmark_dir.glob("eval-*")):
  64. search_dir = benchmark_dir
  65. else:
  66. print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
  67. return {}
  68. results: dict[str, list] = {}
  69. for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
  70. metadata_path = eval_dir / "eval_metadata.json"
  71. if metadata_path.exists():
  72. try:
  73. with open(metadata_path) as mf:
  74. eval_id = json.load(mf).get("eval_id", eval_idx)
  75. except (json.JSONDecodeError, OSError):
  76. eval_id = eval_idx
  77. else:
  78. try:
  79. eval_id = int(eval_dir.name.split("-")[1])
  80. except ValueError:
  81. eval_id = eval_idx
  82. # Discover config directories dynamically rather than hardcoding names
  83. for config_dir in sorted(eval_dir.iterdir()):
  84. if not config_dir.is_dir():
  85. continue
  86. # Skip non-config directories (inputs, outputs, etc.)
  87. if not list(config_dir.glob("run-*")):
  88. continue
  89. config = config_dir.name
  90. if config not in results:
  91. results[config] = []
  92. for run_dir in sorted(config_dir.glob("run-*")):
  93. run_number = int(run_dir.name.split("-")[1])
  94. grading_file = run_dir / "grading.json"
  95. if not grading_file.exists():
  96. print(f"Warning: grading.json not found in {run_dir}")
  97. continue
  98. try:
  99. with open(grading_file) as f:
  100. grading = json.load(f)
  101. except json.JSONDecodeError as e:
  102. print(f"Warning: Invalid JSON in {grading_file}: {e}")
  103. continue
  104. # Extract metrics
  105. result = {
  106. "eval_id": eval_id,
  107. "run_number": run_number,
  108. "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
  109. "passed": grading.get("summary", {}).get("passed", 0),
  110. "failed": grading.get("summary", {}).get("failed", 0),
  111. "total": grading.get("summary", {}).get("total", 0),
  112. }
  113. # Extract timing — check grading.json first, then sibling timing.json
  114. timing = grading.get("timing", {})
  115. result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
  116. timing_file = run_dir / "timing.json"
  117. if result["time_seconds"] == 0.0 and timing_file.exists():
  118. try:
  119. with open(timing_file) as tf:
  120. timing_data = json.load(tf)
  121. result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
  122. result["tokens"] = timing_data.get("total_tokens", 0)
  123. except json.JSONDecodeError:
  124. pass
  125. # Extract metrics if available
  126. metrics = grading.get("execution_metrics", {})
  127. result["tool_calls"] = metrics.get("total_tool_calls", 0)
  128. if not result.get("tokens"):
  129. result["tokens"] = metrics.get("output_chars", 0)
  130. result["errors"] = metrics.get("errors_encountered", 0)
  131. # Extract expectations — viewer requires fields: text, passed, evidence
  132. raw_expectations = grading.get("expectations", [])
  133. for exp in raw_expectations:
  134. if "text" not in exp or "passed" not in exp:
  135. print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
  136. result["expectations"] = raw_expectations
  137. # Extract notes from user_notes_summary
  138. notes_summary = grading.get("user_notes_summary", {})
  139. notes = []
  140. notes.extend(notes_summary.get("uncertainties", []))
  141. notes.extend(notes_summary.get("needs_review", []))
  142. notes.extend(notes_summary.get("workarounds", []))
  143. result["notes"] = notes
  144. results[config].append(result)
  145. return results
  146. def aggregate_results(results: dict) -> dict:
  147. """
  148. Aggregate run results into summary statistics.
  149. Returns run_summary with stats for each configuration and delta.
  150. """
  151. run_summary = {}
  152. configs = list(results.keys())
  153. for config in configs:
  154. runs = results.get(config, [])
  155. if not runs:
  156. run_summary[config] = {
  157. "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
  158. "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
  159. "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
  160. }
  161. continue
  162. pass_rates = [r["pass_rate"] for r in runs]
  163. times = [r["time_seconds"] for r in runs]
  164. tokens = [r.get("tokens", 0) for r in runs]
  165. run_summary[config] = {
  166. "pass_rate": calculate_stats(pass_rates),
  167. "time_seconds": calculate_stats(times),
  168. "tokens": calculate_stats(tokens)
  169. }
  170. # Calculate delta between the first two configs (if two exist)
  171. if len(configs) >= 2:
  172. primary = run_summary.get(configs[0], {})
  173. baseline = run_summary.get(configs[1], {})
  174. else:
  175. primary = run_summary.get(configs[0], {}) if configs else {}
  176. baseline = {}
  177. delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
  178. delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
  179. delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)
  180. run_summary["delta"] = {
  181. "pass_rate": f"{delta_pass_rate:+.2f}",
  182. "time_seconds": f"{delta_time:+.1f}",
  183. "tokens": f"{delta_tokens:+.0f}"
  184. }
  185. return run_summary
  186. def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
  187. """
  188. Generate complete benchmark.json from run results.
  189. """
  190. results = load_run_results(benchmark_dir)
  191. run_summary = aggregate_results(results)
  192. # Build runs array for benchmark.json
  193. runs = []
  194. for config in results:
  195. for result in results[config]:
  196. runs.append({
  197. "eval_id": result["eval_id"],
  198. "configuration": config,
  199. "run_number": result["run_number"],
  200. "result": {
  201. "pass_rate": result["pass_rate"],
  202. "passed": result["passed"],
  203. "failed": result["failed"],
  204. "total": result["total"],
  205. "time_seconds": result["time_seconds"],
  206. "tokens": result.get("tokens", 0),
  207. "tool_calls": result.get("tool_calls", 0),
  208. "errors": result.get("errors", 0)
  209. },
  210. "expectations": result["expectations"],
  211. "notes": result["notes"]
  212. })
  213. # Determine eval IDs from results
  214. eval_ids = sorted(set(
  215. r["eval_id"]
  216. for config in results.values()
  217. for r in config
  218. ))
  219. benchmark = {
  220. "metadata": {
  221. "skill_name": skill_name or "<skill-name>",
  222. "skill_path": skill_path or "<path/to/skill>",
  223. "executor_model": "<model-name>",
  224. "analyzer_model": "<model-name>",
  225. "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
  226. "evals_run": eval_ids,
  227. "runs_per_configuration": 3
  228. },
  229. "runs": runs,
  230. "run_summary": run_summary,
  231. "notes": [] # To be filled by analyzer
  232. }
  233. return benchmark
  234. def generate_markdown(benchmark: dict) -> str:
  235. """Generate human-readable benchmark.md from benchmark data."""
  236. metadata = benchmark["metadata"]
  237. run_summary = benchmark["run_summary"]
  238. # Determine config names (excluding "delta")
  239. configs = [k for k in run_summary if k != "delta"]
  240. config_a = configs[0] if len(configs) >= 1 else "config_a"
  241. config_b = configs[1] if len(configs) >= 2 else "config_b"
  242. label_a = config_a.replace("_", " ").title()
  243. label_b = config_b.replace("_", " ").title()
  244. lines = [
  245. f"# Skill Benchmark: {metadata['skill_name']}",
  246. "",
  247. f"**Model**: {metadata['executor_model']}",
  248. f"**Date**: {metadata['timestamp']}",
  249. f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)",
  250. "",
  251. "## Summary",
  252. "",
  253. f"| Metric | {label_a} | {label_b} | Delta |",
  254. "|--------|------------|---------------|-------|",
  255. ]
  256. a_summary = run_summary.get(config_a, {})
  257. b_summary = run_summary.get(config_b, {})
  258. delta = run_summary.get("delta", {})
  259. # Format pass rate
  260. a_pr = a_summary.get("pass_rate", {})
  261. b_pr = b_summary.get("pass_rate", {})
  262. lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |")
  263. # Format time
  264. a_time = a_summary.get("time_seconds", {})
  265. b_time = b_summary.get("time_seconds", {})
  266. lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |")
  267. # Format tokens
  268. a_tokens = a_summary.get("tokens", {})
  269. b_tokens = b_summary.get("tokens", {})
  270. lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |")
  271. # Notes section
  272. if benchmark.get("notes"):
  273. lines.extend([
  274. "",
  275. "## Notes",
  276. ""
  277. ])
  278. for note in benchmark["notes"]:
  279. lines.append(f"- {note}")
  280. return "\n".join(lines)
  281. def main():
  282. parser = argparse.ArgumentParser(
  283. description="Aggregate benchmark run results into summary statistics"
  284. )
  285. parser.add_argument(
  286. "benchmark_dir",
  287. type=Path,
  288. help="Path to the benchmark directory"
  289. )
  290. parser.add_argument(
  291. "--skill-name",
  292. default="",
  293. help="Name of the skill being benchmarked"
  294. )
  295. parser.add_argument(
  296. "--skill-path",
  297. default="",
  298. help="Path to the skill being benchmarked"
  299. )
  300. parser.add_argument(
  301. "--output", "-o",
  302. type=Path,
  303. help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
  304. )
  305. args = parser.parse_args()
  306. if not args.benchmark_dir.exists():
  307. print(f"Directory not found: {args.benchmark_dir}")
  308. sys.exit(1)
  309. # Generate benchmark
  310. benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)
  311. # Determine output paths
  312. output_json = args.output or (args.benchmark_dir / "benchmark.json")
  313. output_md = output_json.with_suffix(".md")
  314. # Write benchmark.json
  315. with open(output_json, "w") as f:
  316. json.dump(benchmark, f, indent=2)
  317. print(f"Generated: {output_json}")
  318. # Write benchmark.md
  319. markdown = generate_markdown(benchmark)
  320. with open(output_md, "w") as f:
  321. f.write(markdown)
  322. print(f"Generated: {output_md}")
  323. # Print summary
  324. run_summary = benchmark["run_summary"]
  325. configs = [k for k in run_summary if k != "delta"]
  326. delta = run_summary.get("delta", {})
  327. print(f"\nSummary:")
  328. for config in configs:
  329. pr = run_summary[config]["pass_rate"]["mean"]
  330. label = config.replace("_", " ").title()
  331. print(f" {label}: {pr*100:.1f}% pass rate")
  332. print(f" Delta: {delta.get('pass_rate', '—')}")
  333. if __name__ == "__main__":
  334. main()