generate_review.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. #!/usr/bin/env python3
  2. """Generate and serve a review page for eval results.
  3. Reads the workspace directory, discovers runs (directories with outputs/),
  4. embeds all output data into a self-contained HTML page, and serves it via
  5. a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace.
  6. Usage:
  7. python generate_review.py <workspace-path> [--port PORT] [--skill-name NAME]
  8. python generate_review.py <workspace-path> --previous-workspace /path/to/previous/workspace
  9. No dependencies beyond the Python stdlib are required.
  10. """
  11. import argparse
  12. import base64
  13. import json
  14. import mimetypes
  15. import os
  16. import re
  17. import signal
  18. import subprocess
  19. import sys
  20. import time
  21. import webbrowser
  22. from functools import partial
  23. from http.server import HTTPServer, BaseHTTPRequestHandler
  24. from pathlib import Path
  # Bookkeeping files that are left out of a run's output listing.
  METADATA_FILES = {"metrics.json", "transcript.md", "user_notes.md"}

  # File suffixes whose contents are embedded as inline text.
  TEXT_EXTENSIONS = {
      # Prose, data and configuration
      ".txt", ".md", ".json", ".csv", ".yaml", ".yml", ".xml", ".toml",
      # Web
      ".html", ".css", ".js", ".jsx", ".ts", ".tsx",
      # Source code
      ".py", ".sh", ".rb", ".go", ".rs", ".java",
      ".c", ".cpp", ".h", ".hpp", ".sql", ".r",
  }

  # File suffixes whose contents are embedded as inline images.
  IMAGE_EXTENSIONS = {".gif", ".jpeg", ".jpg", ".png", ".svg", ".webp"}
  # Explicit MIME types that take precedence over whatever `mimetypes` guesses.
  MIME_OVERRIDES = {
      ".svg": "image/svg+xml",
      ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
      ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  }


  def get_mime_type(path: Path) -> str:
      """Return the MIME type for *path*, judged by its (case-insensitive) suffix.

      Entries in MIME_OVERRIDES win; otherwise the stdlib ``mimetypes`` guess is
      used, and ``application/octet-stream`` is returned when there is no guess.
      """
      suffix = path.suffix.lower()
      override = MIME_OVERRIDES.get(suffix)
      if override is not None:
          return override

      guessed = mimetypes.guess_type(str(path))[0]
      if guessed:
          return guessed
      return "application/octet-stream"
  def find_runs(workspace: Path) -> list[dict]:
      """Recursively find directories that contain an outputs/ subdirectory.

      Args:
          workspace: Root directory to scan.

      Returns:
          The run dicts collected by ``_find_runs_recursive``, ordered by
          ``eval_id`` and then by run ``id``. Numeric eval ids come first in
          ascending order, then any non-numeric eval ids (compared as strings),
          and finally runs whose ``eval_id`` is missing or ``None``.
      """
      runs: list[dict] = []
      _find_runs_recursive(workspace, workspace, runs)

      def sort_key(run: dict) -> tuple:
          # The run dict carries "eval_id" even when no metadata was found (as
          # None), so a plain `.get(..., default)` never falls back. Comparing
          # None with an int raises TypeError, hence the explicit rank that
          # keeps values of different kinds from ever being compared.
          eval_id = run.get("eval_id")
          if isinstance(eval_id, (int, float)):
              return (0, eval_id, "", run["id"])
          if eval_id is None:
              return (2, 0, "", run["id"])
          return (1, 0, str(eval_id), run["id"])

      runs.sort(key=sort_key)
      return runs
  def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None:
      """Walk *current* depth-first and append every run found to *runs*.

      A directory that has an ``outputs/`` subdirectory is treated as a run: it is
      passed to ``build_run`` and not descended into. Any other directory is
      scanned child by child in sorted order, ignoring a fixed set of names.
      """
      if not current.is_dir():
          return

      if (current / "outputs").is_dir():
          # Run directories are leaves of the search.
          entry = build_run(root, current)
          if entry:
              runs.append(entry)
          return

      ignored = {"node_modules", ".git", "__pycache__", "skill", "inputs"}
      for entry_path in sorted(current.iterdir()):
          if entry_path.name in ignored or not entry_path.is_dir():
              continue
          _find_runs_recursive(root, entry_path, runs)
  67. def build_run(root: Path, run_dir: Path) -> dict | None:
  68. """Build a run dict with prompt, outputs, and grading data."""
  69. prompt = ""
  70. eval_id = None
  71. # Try eval_metadata.json
  72. for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]:
  73. if candidate.exists():
  74. try:
  75. metadata = json.loads(candidate.read_text())
  76. prompt = metadata.get("prompt", "")
  77. eval_id = metadata.get("eval_id")
  78. except (json.JSONDecodeError, OSError):
  79. pass
  80. if prompt:
  81. break
  82. # Fall back to transcript.md
  83. if not prompt:
  84. for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]:
  85. if candidate.exists():
  86. try:
  87. text = candidate.read_text()
  88. match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text)
  89. if match:
  90. prompt = match.group(1).strip()
  91. except OSError:
  92. pass
  93. if prompt:
  94. break
  95. if not prompt:
  96. prompt = "(No prompt found)"
  97. run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")
  98. # Collect output files
  99. outputs_dir = run_dir / "outputs"
  100. output_files: list[dict] = []
  101. if outputs_dir.is_dir():
  102. for f in sorted(outputs_dir.iterdir()):
  103. if f.is_file() and f.name not in METADATA_FILES:
  104. output_files.append(embed_file(f))
  105. # Load grading if present
  106. grading = None
  107. for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]:
  108. if candidate.exists():
  109. try:
  110. grading = json.loads(candidate.read_text())
  111. except (json.JSONDecodeError, OSError):
  112. pass
  113. if grading:
  114. break
  115. return {
  116. "id": run_id,
  117. "prompt": prompt,
  118. "eval_id": eval_id,
  119. "outputs": output_files,
  120. "grading": grading,
  121. }
  def embed_file(path: Path) -> dict:
      """Read *path* and return a JSON-serialisable description of its contents.

      The ``type`` field of the result depends on the file suffix:

      * ``text``   - suffix in TEXT_EXTENSIONS; ``content`` holds the decoded text
        (or an error placeholder if the file cannot be read).
      * ``image``  - suffix in IMAGE_EXTENSIONS; ``mime`` and a base64 ``data_uri``.
      * ``pdf``    - base64 ``data_uri``.
      * ``xlsx``   - raw base64 payload in ``data_b64``.
      * ``binary`` - anything else; ``mime`` and a base64 ``data_uri``.
      * ``error``  - a non-text file that could not be read.
      """
      suffix = path.suffix.lower()
      mime_type = get_mime_type(path)
      name = path.name

      if suffix in TEXT_EXTENSIONS:
          try:
              text = path.read_text(errors="replace")
          except OSError:
              text = "(Error reading file)"
          return {"name": name, "type": "text", "content": text}

      # Every remaining kind carries the file as base64.
      try:
          encoded = base64.b64encode(path.read_bytes()).decode("ascii")
      except OSError:
          return {"name": name, "type": "error", "content": "(Error reading file)"}

      uri = f"data:{mime_type};base64,{encoded}"
      if suffix in IMAGE_EXTENSIONS:
          return {"name": name, "type": "image", "mime": mime_type, "data_uri": uri}
      if suffix == ".pdf":
          return {"name": name, "type": "pdf", "data_uri": uri}
      if suffix == ".xlsx":
          return {"name": name, "type": "xlsx", "data_b64": encoded}
      # Binary / unknown: offered as a base64 download link.
      return {"name": name, "type": "binary", "mime": mime_type, "data_uri": uri}
  def load_previous_iteration(workspace: Path) -> dict[str, dict]:
      """Load previous iteration's feedback and outputs.

      Args:
          workspace: Directory of the previous iteration; its ``feedback.json``
              (if any) and its runs are read.

      Returns:
          A map of run_id -> {"feedback": str, "outputs": list[dict]}. Runs found
          in the workspace come first; run ids that only appear in feedback.json
          follow with an empty ``outputs`` list.

      A missing, unreadable or malformed feedback.json yields no feedback.
      Individual malformed review entries (not an object, no string ``run_id``,
      non-string or blank ``feedback``) are skipped without discarding the rest.
      """
      # Load feedback
      feedback_map: dict[str, str] = {}
      feedback_path = workspace / "feedback.json"
      if feedback_path.exists():
          try:
              data = json.loads(feedback_path.read_bytes())
          except (ValueError, OSError):
              # ValueError covers JSONDecodeError and UnicodeDecodeError.
              data = None
          reviews = data.get("reviews") if isinstance(data, dict) else None
          if isinstance(reviews, list):
              for review in reviews:
                  if not isinstance(review, dict):
                      continue
                  run_id = review.get("run_id")
                  feedback = review.get("feedback")
                  if isinstance(run_id, str) and isinstance(feedback, str) and feedback.strip():
                      feedback_map[run_id] = feedback

      # Load runs (to get outputs)
      result: dict[str, dict] = {}
      for run in find_runs(workspace):
          result[run["id"]] = {
              "feedback": feedback_map.get(run["id"], ""),
              "outputs": run.get("outputs", []),
          }

      # Also add feedback for run_ids that had feedback but no matching run
      for run_id, fb in feedback_map.items():
          if run_id not in result:
              result[run_id] = {"feedback": fb, "outputs": []}

      return result
  213. def generate_html(
  214. runs: list[dict],
  215. skill_name: str,
  216. previous: dict[str, dict] | None = None,
  217. benchmark: dict | None = None,
  218. ) -> str:
  219. """Generate the complete standalone HTML page with embedded data."""
  220. template_path = Path(__file__).parent / "viewer.html"
  221. template = template_path.read_text()
  222. # Build previous_feedback and previous_outputs maps for the template
  223. previous_feedback: dict[str, str] = {}
  224. previous_outputs: dict[str, list[dict]] = {}
  225. if previous:
  226. for run_id, data in previous.items():
  227. if data.get("feedback"):
  228. previous_feedback[run_id] = data["feedback"]
  229. if data.get("outputs"):
  230. previous_outputs[run_id] = data["outputs"]
  231. embedded = {
  232. "skill_name": skill_name,
  233. "runs": runs,
  234. "previous_feedback": previous_feedback,
  235. "previous_outputs": previous_outputs,
  236. }
  237. if benchmark:
  238. embedded["benchmark"] = benchmark
  239. data_json = json.dumps(embedded)
  240. return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")
  241. # ---------------------------------------------------------------------------
  242. # HTTP server (stdlib only, zero dependencies)
  243. # ---------------------------------------------------------------------------
  def _kill_port(port: int) -> None:
      """Send SIGTERM to processes listening on the given TCP port.

      Uses ``lsof`` to find the listeners. If ``lsof`` is not installed a note is
      printed to stderr and nothing is killed; if it times out nothing is killed.
      Processes that vanish or that we are not permitted to signal are skipped.
      When at least one process was signalled, waits briefly so the port can be
      released.

      Args:
          port: TCP port number to free.
      """
      try:
          result = subprocess.run(
              # -sTCP:LISTEN restricts the match to listening sockets. A bare
              # `-i :PORT` also reports processes that merely hold a connection
              # to the port (for example a browser tab on the old server).
              ["lsof", "-t", f"-iTCP:{port}", "-sTCP:LISTEN"],
              capture_output=True, text=True, timeout=5,
          )
      except subprocess.TimeoutExpired:
          return
      except FileNotFoundError:
          print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)
          return

      own_pid = os.getpid()
      signalled = False
      for pid_str in result.stdout.split():
          try:
              pid = int(pid_str)
          except ValueError:
              continue
          if pid == own_pid:
              # Never terminate ourselves.
              continue
          try:
              os.kill(pid, signal.SIGTERM)
          except (ProcessLookupError, PermissionError):
              # Already gone, or owned by another user: nothing we can do.
              continue
          signalled = True

      if signalled:
          # Give the terminated process a moment to release the port.
          time.sleep(0.5)
  263. class ReviewHandler(BaseHTTPRequestHandler):
  264. """Serves the review HTML and handles feedback saves.
  265. Regenerates the HTML on each page load so that refreshing the browser
  266. picks up new eval outputs without restarting the server.
  267. """
  268. def __init__(
  269. self,
  270. workspace: Path,
  271. skill_name: str,
  272. feedback_path: Path,
  273. previous: dict[str, dict],
  274. benchmark_path: Path | None,
  275. *args,
  276. **kwargs,
  277. ):
  278. self.workspace = workspace
  279. self.skill_name = skill_name
  280. self.feedback_path = feedback_path
  281. self.previous = previous
  282. self.benchmark_path = benchmark_path
  283. super().__init__(*args, **kwargs)
  284. def do_GET(self) -> None:
  285. if self.path == "/" or self.path == "/index.html":
  286. # Regenerate HTML on each request (re-scans workspace for new outputs)
  287. runs = find_runs(self.workspace)
  288. benchmark = None
  289. if self.benchmark_path and self.benchmark_path.exists():
  290. try:
  291. benchmark = json.loads(self.benchmark_path.read_text())
  292. except (json.JSONDecodeError, OSError):
  293. pass
  294. html = generate_html(runs, self.skill_name, self.previous, benchmark)
  295. content = html.encode("utf-8")
  296. self.send_response(200)
  297. self.send_header("Content-Type", "text/html; charset=utf-8")
  298. self.send_header("Content-Length", str(len(content)))
  299. self.end_headers()
  300. self.wfile.write(content)
  301. elif self.path == "/api/feedback":
  302. data = b"{}"
  303. if self.feedback_path.exists():
  304. data = self.feedback_path.read_bytes()
  305. self.send_response(200)
  306. self.send_header("Content-Type", "application/json")
  307. self.send_header("Content-Length", str(len(data)))
  308. self.end_headers()
  309. self.wfile.write(data)
  310. else:
  311. self.send_error(404)
  312. def do_POST(self) -> None:
  313. if self.path == "/api/feedback":
  314. length = int(self.headers.get("Content-Length", 0))
  315. body = self.rfile.read(length)
  316. try:
  317. data = json.loads(body)
  318. if not isinstance(data, dict) or "reviews" not in data:
  319. raise ValueError("Expected JSON object with 'reviews' key")
  320. self.feedback_path.write_text(json.dumps(data, indent=2) + "\n")
  321. resp = b'{"ok":true}'
  322. self.send_response(200)
  323. except (json.JSONDecodeError, OSError, ValueError) as e:
  324. resp = json.dumps({"error": str(e)}).encode()
  325. self.send_response(500)
  326. self.send_header("Content-Type", "application/json")
  327. self.send_header("Content-Length", str(len(resp)))
  328. self.end_headers()
  329. self.wfile.write(resp)
  330. else:
  331. self.send_error(404)
  332. def log_message(self, format: str, *args: object) -> None:
  333. # Suppress request logging to keep terminal clean
  334. pass
  def main() -> None:
      """Command-line entry point.

      Parses the arguments, discovers the runs in the workspace and then either
      writes a standalone HTML file (``--static``) or starts a local HTTP server
      on 127.0.0.1 and opens the viewer in the browser. Exits with status 1 when
      the workspace is not a directory or contains no runs.
      """
      parser = argparse.ArgumentParser(description="Generate and serve eval review")
      parser.add_argument("workspace", type=Path, help="Path to workspace directory")
      parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
      parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
      parser.add_argument(
          "--previous-workspace", type=Path, default=None,
          help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
      )
      parser.add_argument(
          "--benchmark", type=Path, default=None,
          help="Path to benchmark.json to show in the Benchmark tab",
      )
      parser.add_argument(
          "--static", "-s", type=Path, default=None,
          help="Write standalone HTML to this path instead of starting a server",
      )
      args = parser.parse_args()

      workspace = args.workspace.resolve()
      if not workspace.is_dir():
          print(f"Error: {workspace} is not a directory", file=sys.stderr)
          sys.exit(1)

      runs = find_runs(workspace)
      if not runs:
          print(f"No runs found in {workspace}", file=sys.stderr)
          sys.exit(1)

      skill_name = args.skill_name or workspace.name.replace("-workspace", "")
      feedback_path = workspace / "feedback.json"

      previous: dict[str, dict] = {}
      if args.previous_workspace:
          previous = load_previous_iteration(args.previous_workspace.resolve())

      benchmark_path = args.benchmark.resolve() if args.benchmark else None

      if args.static:
          # Only the static page needs the benchmark up front; the server
          # re-reads it on every page load.
          benchmark = None
          if benchmark_path and benchmark_path.exists():
              try:
                  benchmark = json.loads(benchmark_path.read_text())
              except (json.JSONDecodeError, OSError):
                  pass
          html = generate_html(runs, skill_name, previous, benchmark)
          args.static.parent.mkdir(parents=True, exist_ok=True)
          # Write UTF-8 explicitly (the served page is also UTF-8) so the result
          # does not depend on the platform's locale encoding.
          args.static.write_text(html, encoding="utf-8")
          print(f"\n Static viewer written to: {args.static}\n")
          sys.exit(0)

      # Kill any existing process on the target port
      port = args.port
      _kill_port(port)

      handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path)
      try:
          server = HTTPServer(("127.0.0.1", port), handler)
      except OSError:
          # Port still in use after kill attempt — find a free one
          server = HTTPServer(("127.0.0.1", 0), handler)
          port = server.server_address[1]

      url = f"http://localhost:{port}"
      print(f"\n Eval Viewer")
      print(f" ─────────────────────────────────")
      print(f" URL: {url}")
      print(f" Workspace: {workspace}")
      print(f" Feedback: {feedback_path}")
      if previous:
          print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
      if benchmark_path:
          print(f" Benchmark: {benchmark_path}")
      print(f"\n Press Ctrl+C to stop.\n")

      webbrowser.open(url)
      try:
          server.serve_forever()
      except KeyboardInterrupt:
          print("\nStopped.")
      finally:
          # Release the listening socket however serve_forever() ends.
          server.server_close()


  if __name__ == "__main__":
      main()