generate_report.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. #!/usr/bin/env python3
  2. """Generate an HTML report from run_loop.py output.
  3. Takes the JSON output from run_loop.py and generates a visual HTML report
  4. showing each description attempt with check/x for each test case.
  5. Distinguishes between train and test queries.
  6. """
  7. import argparse
  8. import html
  9. import json
  10. import sys
  11. from pathlib import Path
  12. def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
  13. """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag."""
  14. history = data.get("history", [])
  15. holdout = data.get("holdout", 0)
  16. title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""
  17. # Get all unique queries from train and test sets, with should_trigger info
  18. train_queries: list[dict] = []
  19. test_queries: list[dict] = []
  20. if history:
  21. for r in history[0].get("train_results", history[0].get("results", [])):
  22. train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
  23. if history[0].get("test_results"):
  24. for r in history[0].get("test_results", []):
  25. test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
  26. refresh_tag = ' <meta http-equiv="refresh" content="5">\n' if auto_refresh else ""
  27. html_parts = ["""<!DOCTYPE html>
  28. <html>
  29. <head>
  30. <meta charset="utf-8">
  31. """ + refresh_tag + """ <title>""" + title_prefix + """Skill Description Optimization</title>
  32. <link rel="preconnect" href="https://fonts.googleapis.com">
  33. <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  34. <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
  35. <style>
  36. body {
  37. font-family: 'Lora', Georgia, serif;
  38. max-width: 100%;
  39. margin: 0 auto;
  40. padding: 20px;
  41. background: #faf9f5;
  42. color: #141413;
  43. }
  44. h1 { font-family: 'Poppins', sans-serif; color: #141413; }
  45. .explainer {
  46. background: white;
  47. padding: 15px;
  48. border-radius: 6px;
  49. margin-bottom: 20px;
  50. border: 1px solid #e8e6dc;
  51. color: #b0aea5;
  52. font-size: 0.875rem;
  53. line-height: 1.6;
  54. }
  55. .summary {
  56. background: white;
  57. padding: 15px;
  58. border-radius: 6px;
  59. margin-bottom: 20px;
  60. border: 1px solid #e8e6dc;
  61. }
  62. .summary p { margin: 5px 0; }
  63. .best { color: #788c5d; font-weight: bold; }
  64. .table-container {
  65. overflow-x: auto;
  66. width: 100%;
  67. }
  68. table {
  69. border-collapse: collapse;
  70. background: white;
  71. border: 1px solid #e8e6dc;
  72. border-radius: 6px;
  73. font-size: 12px;
  74. min-width: 100%;
  75. }
  76. th, td {
  77. padding: 8px;
  78. text-align: left;
  79. border: 1px solid #e8e6dc;
  80. white-space: normal;
  81. word-wrap: break-word;
  82. }
  83. th {
  84. font-family: 'Poppins', sans-serif;
  85. background: #141413;
  86. color: #faf9f5;
  87. font-weight: 500;
  88. }
  89. th.test-col {
  90. background: #6a9bcc;
  91. }
  92. th.query-col { min-width: 200px; }
  93. td.description {
  94. font-family: monospace;
  95. font-size: 11px;
  96. word-wrap: break-word;
  97. max-width: 400px;
  98. }
  99. td.result {
  100. text-align: center;
  101. font-size: 16px;
  102. min-width: 40px;
  103. }
  104. td.test-result {
  105. background: #f0f6fc;
  106. }
  107. .pass { color: #788c5d; }
  108. .fail { color: #c44; }
  109. .rate {
  110. font-size: 9px;
  111. color: #b0aea5;
  112. display: block;
  113. }
  114. tr:hover { background: #faf9f5; }
  115. .score {
  116. display: inline-block;
  117. padding: 2px 6px;
  118. border-radius: 4px;
  119. font-weight: bold;
  120. font-size: 11px;
  121. }
  122. .score-good { background: #eef2e8; color: #788c5d; }
  123. .score-ok { background: #fef3c7; color: #d97706; }
  124. .score-bad { background: #fceaea; color: #c44; }
  125. .train-label { color: #b0aea5; font-size: 10px; }
  126. .test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
  127. .best-row { background: #f5f8f2; }
  128. th.positive-col { border-bottom: 3px solid #788c5d; }
  129. th.negative-col { border-bottom: 3px solid #c44; }
  130. th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
  131. th.test-col.negative-col { border-bottom: 3px solid #c44; }
  132. .legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
  133. .legend-item { display: flex; align-items: center; gap: 6px; }
  134. .legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
  135. .swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
  136. .swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
  137. .swatch-test { background: #6a9bcc; }
  138. .swatch-train { background: #141413; }
  139. </style>
  140. </head>
  141. <body>
  142. <h1>""" + title_prefix + """Skill Description Optimization</h1>
  143. <div class="explainer">
  144. <strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
  145. </div>
  146. """]
  147. # Summary section
  148. best_test_score = data.get('best_test_score')
  149. best_train_score = data.get('best_train_score')
  150. html_parts.append(f"""
  151. <div class="summary">
  152. <p><strong>Original:</strong> {html.escape(data.get('original_description', 'N/A'))}</p>
  153. <p class="best"><strong>Best:</strong> {html.escape(data.get('best_description', 'N/A'))}</p>
  154. <p><strong>Best Score:</strong> {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}</p>
  155. <p><strong>Iterations:</strong> {data.get('iterations_run', 0)} | <strong>Train:</strong> {data.get('train_size', '?')} | <strong>Test:</strong> {data.get('test_size', '?')}</p>
  156. </div>
  157. """)
  158. # Legend
  159. html_parts.append("""
  160. <div class="legend">
  161. <span style="font-weight:600">Query columns:</span>
  162. <span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
  163. <span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
  164. <span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
  165. <span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
  166. </div>
  167. """)
  168. # Table header
  169. html_parts.append("""
  170. <div class="table-container">
  171. <table>
  172. <thead>
  173. <tr>
  174. <th>Iter</th>
  175. <th>Train</th>
  176. <th>Test</th>
  177. <th class="query-col">Description</th>
  178. """)
  179. # Add column headers for train queries
  180. for qinfo in train_queries:
  181. polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
  182. html_parts.append(f' <th class="{polarity}">{html.escape(qinfo["query"])}</th>\n')
  183. # Add column headers for test queries (different color)
  184. for qinfo in test_queries:
  185. polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
  186. html_parts.append(f' <th class="test-col {polarity}">{html.escape(qinfo["query"])}</th>\n')
  187. html_parts.append(""" </tr>
  188. </thead>
  189. <tbody>
  190. """)
  191. # Find best iteration for highlighting
  192. if test_queries:
  193. best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration")
  194. else:
  195. best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration")
  196. # Add rows for each iteration
  197. for h in history:
  198. iteration = h.get("iteration", "?")
  199. train_passed = h.get("train_passed", h.get("passed", 0))
  200. train_total = h.get("train_total", h.get("total", 0))
  201. test_passed = h.get("test_passed")
  202. test_total = h.get("test_total")
  203. description = h.get("description", "")
  204. train_results = h.get("train_results", h.get("results", []))
  205. test_results = h.get("test_results", [])
  206. # Create lookups for results by query
  207. train_by_query = {r["query"]: r for r in train_results}
  208. test_by_query = {r["query"]: r for r in test_results} if test_results else {}
  209. # Compute aggregate correct/total runs across all retries
  210. def aggregate_runs(results: list[dict]) -> tuple[int, int]:
  211. correct = 0
  212. total = 0
  213. for r in results:
  214. runs = r.get("runs", 0)
  215. triggers = r.get("triggers", 0)
  216. total += runs
  217. if r.get("should_trigger", True):
  218. correct += triggers
  219. else:
  220. correct += runs - triggers
  221. return correct, total
  222. train_correct, train_runs = aggregate_runs(train_results)
  223. test_correct, test_runs = aggregate_runs(test_results)
  224. # Determine score classes
  225. def score_class(correct: int, total: int) -> str:
  226. if total > 0:
  227. ratio = correct / total
  228. if ratio >= 0.8:
  229. return "score-good"
  230. elif ratio >= 0.5:
  231. return "score-ok"
  232. return "score-bad"
  233. train_class = score_class(train_correct, train_runs)
  234. test_class = score_class(test_correct, test_runs)
  235. row_class = "best-row" if iteration == best_iter else ""
  236. html_parts.append(f""" <tr class="{row_class}">
  237. <td>{iteration}</td>
  238. <td><span class="score {train_class}">{train_correct}/{train_runs}</span></td>
  239. <td><span class="score {test_class}">{test_correct}/{test_runs}</span></td>
  240. <td class="description">{html.escape(description)}</td>
  241. """)
  242. # Add result for each train query
  243. for qinfo in train_queries:
  244. r = train_by_query.get(qinfo["query"], {})
  245. did_pass = r.get("pass", False)
  246. triggers = r.get("triggers", 0)
  247. runs = r.get("runs", 0)
  248. icon = "✓" if did_pass else "✗"
  249. css_class = "pass" if did_pass else "fail"
  250. html_parts.append(f' <td class="result {css_class}">{icon}<span class="rate">{triggers}/{runs}</span></td>\n')
  251. # Add result for each test query (with different background)
  252. for qinfo in test_queries:
  253. r = test_by_query.get(qinfo["query"], {})
  254. did_pass = r.get("pass", False)
  255. triggers = r.get("triggers", 0)
  256. runs = r.get("runs", 0)
  257. icon = "✓" if did_pass else "✗"
  258. css_class = "pass" if did_pass else "fail"
  259. html_parts.append(f' <td class="result test-result {css_class}">{icon}<span class="rate">{triggers}/{runs}</span></td>\n')
  260. html_parts.append(" </tr>\n")
  261. html_parts.append(""" </tbody>
  262. </table>
  263. </div>
  264. """)
  265. html_parts.append("""
  266. </body>
  267. </html>
  268. """)
  269. return "".join(html_parts)
  270. def main():
  271. parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
  272. parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
  273. parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
  274. parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
  275. args = parser.parse_args()
  276. if args.input == "-":
  277. data = json.load(sys.stdin)
  278. else:
  279. data = json.loads(Path(args.input).read_text())
  280. html_output = generate_html(data, skill_name=args.skill_name)
  281. if args.output:
  282. Path(args.output).write_text(html_output)
  283. print(f"Report written to {args.output}", file=sys.stderr)
  284. else:
  285. print(html_output)
  286. if __name__ == "__main__":
  287. main()