viewer.html 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325
  1. <!DOCTYPE html>
  2. <html lang="en">
  3. <head>
  4. <meta charset="UTF-8">
  5. <meta name="viewport" content="width=device-width, initial-scale=1.0">
  6. <title>Eval Review</title>
  7. <link rel="preconnect" href="https://fonts.googleapis.com">
  8. <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  9. <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
  10. <script src="https://cdn.sheetjs.com/xlsx-0.20.3/package/dist/xlsx.full.min.js" integrity="sha384-EnyY0/GSHQGSxSgMwaIPzSESbqoOLSexfnSMN2AP+39Ckmn92stwABZynq1JyzdT" crossorigin="anonymous"></script>
  11. <style>
  12. :root {
  13. --bg: #faf9f5;
  14. --surface: #ffffff;
  15. --border: #e8e6dc;
  16. --text: #141413;
  17. --text-muted: #b0aea5;
  18. --accent: #d97757;
  19. --accent-hover: #c4613f;
  20. --green: #788c5d;
  21. --green-bg: #eef2e8;
  22. --red: #c44;
  23. --red-bg: #fceaea;
  24. --header-bg: #141413;
  25. --header-text: #faf9f5;
  26. --radius: 6px;
  27. }
  28. * { box-sizing: border-box; margin: 0; padding: 0; }
  29. body {
  30. font-family: 'Lora', Georgia, serif;
  31. background: var(--bg);
  32. color: var(--text);
  33. height: 100vh;
  34. display: flex;
  35. flex-direction: column;
  36. }
  37. /* ---- Header ---- */
  38. .header {
  39. background: var(--header-bg);
  40. color: var(--header-text);
  41. padding: 1rem 2rem;
  42. display: flex;
  43. justify-content: space-between;
  44. align-items: center;
  45. flex-shrink: 0;
  46. }
  47. .header h1 {
  48. font-family: 'Poppins', sans-serif;
  49. font-size: 1.25rem;
  50. font-weight: 600;
  51. }
  52. .header .instructions {
  53. font-size: 0.8rem;
  54. opacity: 0.7;
  55. margin-top: 0.25rem;
  56. }
  57. .header .progress {
  58. font-size: 0.875rem;
  59. opacity: 0.8;
  60. text-align: right;
  61. }
  62. /* ---- Main content ---- */
  63. .main {
  64. flex: 1;
  65. overflow-y: auto;
  66. padding: 1.5rem 2rem;
  67. display: flex;
  68. flex-direction: column;
  69. gap: 1.25rem;
  70. }
  71. /* ---- Sections ---- */
  72. .section {
  73. background: var(--surface);
  74. border: 1px solid var(--border);
  75. border-radius: var(--radius);
  76. flex-shrink: 0;
  77. }
  78. .section-header {
  79. font-family: 'Poppins', sans-serif;
  80. padding: 0.75rem 1rem;
  81. font-size: 0.75rem;
  82. font-weight: 500;
  83. text-transform: uppercase;
  84. letter-spacing: 0.05em;
  85. color: var(--text-muted);
  86. border-bottom: 1px solid var(--border);
  87. background: var(--bg);
  88. }
  89. .section-body {
  90. padding: 1rem;
  91. }
  92. /* ---- Config badge ---- */
  93. .config-badge {
  94. display: inline-block;
  95. padding: 0.2rem 0.625rem;
  96. border-radius: 9999px;
  97. font-family: 'Poppins', sans-serif;
  98. font-size: 0.6875rem;
  99. font-weight: 600;
  100. text-transform: uppercase;
  101. letter-spacing: 0.03em;
  102. margin-left: 0.75rem;
  103. vertical-align: middle;
  104. }
  105. .config-badge.config-primary {
  106. background: rgba(33, 150, 243, 0.12);
  107. color: #1976d2;
  108. }
  109. .config-badge.config-baseline {
  110. background: rgba(255, 193, 7, 0.15);
  111. color: #f57f17;
  112. }
  113. /* ---- Prompt ---- */
  114. .prompt-text {
  115. white-space: pre-wrap;
  116. font-size: 0.9375rem;
  117. line-height: 1.6;
  118. }
  119. /* ---- Outputs ---- */
  120. .output-file {
  121. border: 1px solid var(--border);
  122. border-radius: var(--radius);
  123. overflow: hidden;
  124. }
  125. .output-file + .output-file {
  126. margin-top: 1rem;
  127. }
  128. .output-file-header {
  129. padding: 0.5rem 0.75rem;
  130. font-size: 0.8rem;
  131. font-weight: 600;
  132. color: var(--text-muted);
  133. background: var(--bg);
  134. border-bottom: 1px solid var(--border);
  135. font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
  136. display: flex;
  137. justify-content: space-between;
  138. align-items: center;
  139. }
  140. .output-file-header .dl-btn {
  141. font-size: 0.7rem;
  142. color: var(--accent);
  143. text-decoration: none;
  144. cursor: pointer;
  145. font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
  146. font-weight: 500;
  147. opacity: 0.8;
  148. }
  149. .output-file-header .dl-btn:hover {
  150. opacity: 1;
  151. text-decoration: underline;
  152. }
  153. .output-file-content {
  154. padding: 0.75rem;
  155. overflow-x: auto;
  156. }
  157. .output-file-content pre {
  158. font-size: 0.8125rem;
  159. line-height: 1.5;
  160. white-space: pre-wrap;
  161. word-break: break-word;
  162. font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
  163. }
  164. .output-file-content img {
  165. max-width: 100%;
  166. height: auto;
  167. border-radius: 4px;
  168. }
  169. .output-file-content iframe {
  170. width: 100%;
  171. height: 600px;
  172. border: none;
  173. }
  174. .output-file-content table {
  175. border-collapse: collapse;
  176. font-size: 0.8125rem;
  177. width: 100%;
  178. }
  179. .output-file-content table td,
  180. .output-file-content table th {
  181. border: 1px solid var(--border);
  182. padding: 0.375rem 0.5rem;
  183. text-align: left;
  184. }
  185. .output-file-content table th {
  186. background: var(--bg);
  187. font-weight: 600;
  188. }
  189. .output-file-content .download-link {
  190. display: inline-flex;
  191. align-items: center;
  192. gap: 0.5rem;
  193. padding: 0.5rem 1rem;
  194. background: var(--bg);
  195. border: 1px solid var(--border);
  196. border-radius: 4px;
  197. color: var(--accent);
  198. text-decoration: none;
  199. font-size: 0.875rem;
  200. cursor: pointer;
  201. }
  202. .output-file-content .download-link:hover {
  203. background: var(--border);
  204. }
  205. .empty-state {
  206. color: var(--text-muted);
  207. font-style: italic;
  208. padding: 2rem;
  209. text-align: center;
  210. }
  211. /* ---- Feedback ---- */
  212. .prev-feedback {
  213. background: var(--bg);
  214. border: 1px solid var(--border);
  215. border-radius: 4px;
  216. padding: 0.625rem 0.75rem;
  217. margin-top: 0.75rem;
  218. font-size: 0.8125rem;
  219. color: var(--text-muted);
  220. line-height: 1.5;
  221. }
  222. .prev-feedback-label {
  223. font-size: 0.7rem;
  224. font-weight: 600;
  225. text-transform: uppercase;
  226. letter-spacing: 0.04em;
  227. margin-bottom: 0.25rem;
  228. color: var(--text-muted);
  229. }
  230. .feedback-textarea {
  231. width: 100%;
  232. min-height: 100px;
  233. padding: 0.75rem;
  234. border: 1px solid var(--border);
  235. border-radius: 4px;
  236. font-family: inherit;
  237. font-size: 0.9375rem;
  238. line-height: 1.5;
  239. resize: vertical;
  240. color: var(--text);
  241. }
  242. .feedback-textarea:focus {
  243. outline: none;
  244. border-color: var(--accent);
  245. box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
  246. }
  247. .feedback-status {
  248. font-size: 0.75rem;
  249. color: var(--text-muted);
  250. margin-top: 0.5rem;
  251. min-height: 1.1em;
  252. }
  253. /* ---- Grades (collapsible) ---- */
  254. .grades-toggle {
  255. display: flex;
  256. align-items: center;
  257. cursor: pointer;
  258. user-select: none;
  259. }
  260. .grades-toggle:hover {
  261. color: var(--accent);
  262. }
  263. .grades-toggle .arrow {
  264. margin-right: 0.5rem;
  265. transition: transform 0.15s;
  266. font-size: 0.75rem;
  267. }
  268. .grades-toggle .arrow.open {
  269. transform: rotate(90deg);
  270. }
  271. .grades-content {
  272. display: none;
  273. margin-top: 0.75rem;
  274. }
  275. .grades-content.open {
  276. display: block;
  277. }
  278. .grades-summary {
  279. font-size: 0.875rem;
  280. margin-bottom: 0.75rem;
  281. display: flex;
  282. align-items: center;
  283. gap: 0.5rem;
  284. }
  285. .grade-badge {
  286. display: inline-block;
  287. padding: 0.125rem 0.5rem;
  288. border-radius: 9999px;
  289. font-size: 0.75rem;
  290. font-weight: 600;
  291. }
  292. .grade-pass { background: var(--green-bg); color: var(--green); }
  293. .grade-fail { background: var(--red-bg); color: var(--red); }
  294. .assertion-list {
  295. list-style: none;
  296. }
  297. .assertion-item {
  298. padding: 0.625rem 0;
  299. border-bottom: 1px solid var(--border);
  300. font-size: 0.8125rem;
  301. }
  302. .assertion-item:last-child { border-bottom: none; }
  303. .assertion-status {
  304. font-weight: 600;
  305. margin-right: 0.5rem;
  306. }
  307. .assertion-status.pass { color: var(--green); }
  308. .assertion-status.fail { color: var(--red); }
  309. .assertion-evidence {
  310. color: var(--text-muted);
  311. font-size: 0.75rem;
  312. margin-top: 0.25rem;
  313. padding-left: 1.5rem;
  314. }
  315. /* ---- View tabs ---- */
  316. .view-tabs {
  317. display: flex;
  318. gap: 0;
  319. padding: 0 2rem;
  320. background: var(--bg);
  321. border-bottom: 1px solid var(--border);
  322. flex-shrink: 0;
  323. }
  324. .view-tab {
  325. font-family: 'Poppins', sans-serif;
  326. padding: 0.625rem 1.25rem;
  327. font-size: 0.8125rem;
  328. font-weight: 500;
  329. cursor: pointer;
  330. border: none;
  331. background: none;
  332. color: var(--text-muted);
  333. border-bottom: 2px solid transparent;
  334. transition: all 0.15s;
  335. }
  336. .view-tab:hover { color: var(--text); }
  337. .view-tab.active {
  338. color: var(--accent);
  339. border-bottom-color: var(--accent);
  340. }
  341. .view-panel { display: none; }
  342. .view-panel.active { display: flex; flex-direction: column; flex: 1; overflow: hidden; }
  343. /* ---- Benchmark view ---- */
  344. .benchmark-view {
  345. padding: 1.5rem 2rem;
  346. overflow-y: auto;
  347. flex: 1;
  348. }
  349. .benchmark-table {
  350. border-collapse: collapse;
  351. background: var(--surface);
  352. border: 1px solid var(--border);
  353. border-radius: var(--radius);
  354. font-size: 0.8125rem;
  355. width: 100%;
  356. margin-bottom: 1.5rem;
  357. }
  358. .benchmark-table th, .benchmark-table td {
  359. padding: 0.625rem 0.75rem;
  360. text-align: left;
  361. border: 1px solid var(--border);
  362. }
  363. .benchmark-table th {
  364. font-family: 'Poppins', sans-serif;
  365. background: var(--header-bg);
  366. color: var(--header-text);
  367. font-weight: 500;
  368. font-size: 0.75rem;
  369. text-transform: uppercase;
  370. letter-spacing: 0.04em;
  371. }
  372. .benchmark-table tr:hover { background: var(--bg); }
  373. .benchmark-table tr.benchmark-row-with { background: rgba(33, 150, 243, 0.06); }
  374. .benchmark-table tr.benchmark-row-without { background: rgba(255, 193, 7, 0.06); }
  375. .benchmark-table tr.benchmark-row-with:hover { background: rgba(33, 150, 243, 0.12); }
  376. .benchmark-table tr.benchmark-row-without:hover { background: rgba(255, 193, 7, 0.12); }
  377. .benchmark-table tr.benchmark-row-avg { font-weight: 600; border-top: 2px solid var(--border); }
  378. .benchmark-table tr.benchmark-row-avg.benchmark-row-with { background: rgba(33, 150, 243, 0.12); }
  379. .benchmark-table tr.benchmark-row-avg.benchmark-row-without { background: rgba(255, 193, 7, 0.12); }
  380. .benchmark-delta-positive { color: var(--green); font-weight: 600; }
  381. .benchmark-delta-negative { color: var(--red); font-weight: 600; }
  382. .benchmark-notes {
  383. background: var(--surface);
  384. border: 1px solid var(--border);
  385. border-radius: var(--radius);
  386. padding: 1rem;
  387. }
  388. .benchmark-notes h3 {
  389. font-family: 'Poppins', sans-serif;
  390. font-size: 0.875rem;
  391. margin-bottom: 0.75rem;
  392. }
  393. .benchmark-notes ul {
  394. list-style: disc;
  395. padding-left: 1.25rem;
  396. }
  397. .benchmark-notes li {
  398. font-size: 0.8125rem;
  399. line-height: 1.6;
  400. margin-bottom: 0.375rem;
  401. }
  402. .benchmark-empty {
  403. color: var(--text-muted);
  404. font-style: italic;
  405. text-align: center;
  406. padding: 3rem;
  407. }
  408. /* ---- Navigation ---- */
  409. .nav {
  410. display: flex;
  411. justify-content: space-between;
  412. align-items: center;
  413. padding: 1rem 2rem;
  414. border-top: 1px solid var(--border);
  415. background: var(--surface);
  416. flex-shrink: 0;
  417. }
  418. .nav-btn {
  419. font-family: 'Poppins', sans-serif;
  420. padding: 0.5rem 1.25rem;
  421. border: 1px solid var(--border);
  422. border-radius: var(--radius);
  423. background: var(--surface);
  424. cursor: pointer;
  425. font-size: 0.875rem;
  426. font-weight: 500;
  427. color: var(--text);
  428. transition: all 0.15s;
  429. }
  430. .nav-btn:hover:not(:disabled) {
  431. background: var(--bg);
  432. border-color: var(--text-muted);
  433. }
  434. .nav-btn:disabled {
  435. opacity: 0.4;
  436. cursor: not-allowed;
  437. }
  438. .done-btn {
  439. font-family: 'Poppins', sans-serif;
  440. padding: 0.5rem 1.5rem;
  441. border: 1px solid var(--border);
  442. border-radius: var(--radius);
  443. background: var(--surface);
  444. color: var(--text);
  445. cursor: pointer;
  446. font-size: 0.875rem;
  447. font-weight: 500;
  448. transition: all 0.15s;
  449. }
  450. .done-btn:hover {
  451. background: var(--bg);
  452. border-color: var(--text-muted);
  453. }
  454. .done-btn.ready {
  455. border: none;
  456. background: var(--accent);
  457. color: white;
  458. font-weight: 600;
  459. }
  460. .done-btn.ready:hover {
  461. background: var(--accent-hover);
  462. }
  463. /* ---- Done overlay ---- */
  464. .done-overlay {
  465. display: none;
  466. position: fixed;
  467. inset: 0;
  468. background: rgba(0, 0, 0, 0.5);
  469. z-index: 100;
  470. justify-content: center;
  471. align-items: center;
  472. }
  473. .done-overlay.visible {
  474. display: flex;
  475. }
  476. .done-card {
  477. background: var(--surface);
  478. border-radius: 12px;
  479. padding: 2rem 3rem;
  480. text-align: center;
  481. box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
  482. max-width: 500px;
  483. }
  484. .done-card h2 {
  485. font-size: 1.5rem;
  486. margin-bottom: 0.5rem;
  487. }
  488. .done-card p {
  489. color: var(--text-muted);
  490. margin-bottom: 1.5rem;
  491. line-height: 1.5;
  492. }
  493. .done-card .btn-row {
  494. display: flex;
  495. gap: 0.5rem;
  496. justify-content: center;
  497. }
  498. .done-card button {
  499. padding: 0.5rem 1.25rem;
  500. border: 1px solid var(--border);
  501. border-radius: var(--radius);
  502. background: var(--surface);
  503. cursor: pointer;
  504. font-size: 0.875rem;
  505. }
  506. .done-card button:hover {
  507. background: var(--bg);
  508. }
  509. /* ---- Toast ---- */
  510. .toast {
  511. position: fixed;
  512. bottom: 5rem;
  513. left: 50%;
  514. transform: translateX(-50%);
  515. background: var(--header-bg);
  516. color: var(--header-text);
  517. padding: 0.625rem 1.25rem;
  518. border-radius: var(--radius);
  519. font-size: 0.875rem;
  520. opacity: 0;
  521. transition: opacity 0.3s;
  522. pointer-events: none;
  523. z-index: 200;
  524. }
  525. .toast.visible {
  526. opacity: 1;
  527. }
  528. </style>
  529. </head>
  530. <body>
  531. <div id="app" style="height:100vh; display:flex; flex-direction:column;">
  532. <div class="header">
  533. <div>
  534. <h1>Eval Review: <span id="skill-name"></span></h1>
  535. <div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.</div>
  536. </div>
  537. <div class="progress" id="progress"></div>
  538. </div>
  539. <!-- View tabs (only shown when benchmark data exists) -->
  540. <div class="view-tabs" id="view-tabs" style="display:none;">
  541. <button class="view-tab active" onclick="switchView('outputs')">Outputs</button>
  542. <button class="view-tab" onclick="switchView('benchmark')">Benchmark</button>
  543. </div>
  544. <!-- Outputs panel (qualitative review) -->
  545. <div class="view-panel active" id="panel-outputs">
  546. <div class="main">
  547. <!-- Prompt -->
  548. <div class="section">
  549. <div class="section-header">Prompt <span class="config-badge" id="config-badge" style="display:none;"></span></div>
  550. <div class="section-body">
  551. <div class="prompt-text" id="prompt-text"></div>
  552. </div>
  553. </div>
  554. <!-- Outputs -->
  555. <div class="section">
  556. <div class="section-header">Output</div>
  557. <div class="section-body" id="outputs-body">
  558. <div class="empty-state">No output files found</div>
  559. </div>
  560. </div>
  561. <!-- Previous Output (collapsible) -->
  562. <div class="section" id="prev-outputs-section" style="display:none;">
  563. <div class="section-header">
  564. <div class="grades-toggle" onclick="togglePrevOutputs()">
  565. <span class="arrow" id="prev-outputs-arrow">&#9654;</span>
  566. Previous Output
  567. </div>
  568. </div>
  569. <div class="grades-content" id="prev-outputs-content"></div>
  570. </div>
  571. <!-- Grades (collapsible) -->
  572. <div class="section" id="grades-section" style="display:none;">
  573. <div class="section-header">
  574. <div class="grades-toggle" onclick="toggleGrades()">
  575. <span class="arrow" id="grades-arrow">&#9654;</span>
  576. Formal Grades
  577. </div>
  578. </div>
  579. <div class="grades-content" id="grades-content"></div>
  580. </div>
  581. <!-- Feedback -->
  582. <div class="section">
  583. <div class="section-header">Your Feedback</div>
  584. <div class="section-body">
  585. <textarea
  586. class="feedback-textarea"
  587. id="feedback"
  588. placeholder="What do you think of this output? Any issues, suggestions, or things that look great?"
  589. ></textarea>
  590. <div class="feedback-status" id="feedback-status"></div>
  591. <div class="prev-feedback" id="prev-feedback" style="display:none;">
  592. <div class="prev-feedback-label">Previous feedback</div>
  593. <div id="prev-feedback-text"></div>
  594. </div>
  595. </div>
  596. </div>
  597. </div>
  598. <div class="nav" id="outputs-nav">
  599. <button class="nav-btn" id="prev-btn" onclick="navigate(-1)">&#8592; Previous</button>
  600. <button class="done-btn" id="done-btn" onclick="showDoneDialog()">Submit All Reviews</button>
  601. <button class="nav-btn" id="next-btn" onclick="navigate(1)">Next &#8594;</button>
  602. </div>
  603. </div><!-- end panel-outputs -->
  604. <!-- Benchmark panel (quantitative stats) -->
  605. <div class="view-panel" id="panel-benchmark">
  606. <div class="benchmark-view" id="benchmark-content">
  607. <div class="benchmark-empty">No benchmark data available. Run a benchmark to see quantitative results here.</div>
  608. </div>
  609. </div>
  610. </div>
  611. <!-- Done overlay -->
  612. <div class="done-overlay" id="done-overlay">
  613. <div class="done-card">
  614. <h2>Review Complete</h2>
  615. <p>Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.</p>
  616. <div class="btn-row">
  617. <button onclick="closeDoneDialog()">OK</button>
  618. </div>
  619. </div>
  620. </div>
  621. <!-- Toast -->
  622. <div class="toast" id="toast"></div>
  623. <script>
  624. // ---- Embedded data (injected by generate_review.py) ----
  625. /*__EMBEDDED_DATA__*/
// ---- State ----
// Feedback text typed by the reviewer, keyed by run id. Filled from the
// server in init() and read back by showRun() to populate the textarea.
let feedbackMap = {}; // run_id -> feedback text
// Index into EMBEDDED_DATA.runs of the run currently displayed; showRun()
// assigns it.
let currentIndex = 0;
// Indices of every run that has been displayed so far; showRun() uses its
// size to decide when the "done" button gets the "ready" styling.
let visitedRuns = new Set();
  630. // ---- Init ----
  631. async function init() {
  632. // Load saved feedback from server — but only if this isn't a fresh
  633. // iteration (indicated by previous_feedback being present). When
  634. // previous feedback exists, the feedback.json on disk is stale from
  635. // the prior iteration and should not pre-fill the textareas.
  636. const hasPrevious = Object.keys(EMBEDDED_DATA.previous_feedback || {}).length > 0
  637. || Object.keys(EMBEDDED_DATA.previous_outputs || {}).length > 0;
  638. if (!hasPrevious) {
  639. try {
  640. const resp = await fetch("/api/feedback");
  641. const data = await resp.json();
  642. if (data.reviews) {
  643. for (const r of data.reviews) feedbackMap[r.run_id] = r.feedback;
  644. }
  645. } catch { /* first run, no feedback yet */ }
  646. }
  647. document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;
  648. showRun(0);
  649. // Wire up feedback auto-save
  650. const textarea = document.getElementById("feedback");
  651. let saveTimeout = null;
  652. textarea.addEventListener("input", () => {
  653. clearTimeout(saveTimeout);
  654. document.getElementById("feedback-status").textContent = "";
  655. saveTimeout = setTimeout(() => saveCurrentFeedback(), 800);
  656. });
  657. }
  658. // ---- Navigation ----
  659. function navigate(delta) {
  660. const newIndex = currentIndex + delta;
  661. if (newIndex >= 0 && newIndex < EMBEDDED_DATA.runs.length) {
  662. saveCurrentFeedback();
  663. showRun(newIndex);
  664. }
  665. }
  666. function updateNavButtons() {
  667. document.getElementById("prev-btn").disabled = currentIndex === 0;
  668. document.getElementById("next-btn").disabled =
  669. currentIndex === EMBEDDED_DATA.runs.length - 1;
  670. }
  671. // ---- Show a run ----
  672. function showRun(index) {
  673. currentIndex = index;
  674. const run = EMBEDDED_DATA.runs[index];
  675. // Progress
  676. document.getElementById("progress").textContent =
  677. `${index + 1} of ${EMBEDDED_DATA.runs.length}`;
  678. // Prompt
  679. document.getElementById("prompt-text").textContent = run.prompt;
  680. // Config badge
  681. const badge = document.getElementById("config-badge");
  682. const configMatch = run.id.match(/(with_skill|without_skill|new_skill|old_skill)/);
  683. if (configMatch) {
  684. const config = configMatch[1];
  685. const isBaseline = config === "without_skill" || config === "old_skill";
  686. badge.textContent = config.replace(/_/g, " ");
  687. badge.className = "config-badge " + (isBaseline ? "config-baseline" : "config-primary");
  688. badge.style.display = "inline-block";
  689. } else {
  690. badge.style.display = "none";
  691. }
  692. // Outputs
  693. renderOutputs(run);
  694. // Previous outputs
  695. renderPrevOutputs(run);
  696. // Grades
  697. renderGrades(run);
  698. // Previous feedback
  699. const prevFb = (EMBEDDED_DATA.previous_feedback || {})[run.id];
  700. const prevEl = document.getElementById("prev-feedback");
  701. if (prevFb) {
  702. document.getElementById("prev-feedback-text").textContent = prevFb;
  703. prevEl.style.display = "block";
  704. } else {
  705. prevEl.style.display = "none";
  706. }
  707. // Feedback
  708. document.getElementById("feedback").value = feedbackMap[run.id] || "";
  709. document.getElementById("feedback-status").textContent = "";
  710. updateNavButtons();
  711. // Track visited runs and promote done button when all visited
  712. visitedRuns.add(index);
  713. const doneBtn = document.getElementById("done-btn");
  714. if (visitedRuns.size >= EMBEDDED_DATA.runs.length) {
  715. doneBtn.classList.add("ready");
  716. }
  717. // Scroll main content to top
  718. document.querySelector(".main").scrollTop = 0;
  719. }
  720. // ---- Render outputs ----
  721. function renderOutputs(run) {
  722. const container = document.getElementById("outputs-body");
  723. container.innerHTML = "";
  724. const outputs = run.outputs || [];
  725. if (outputs.length === 0) {
  726. container.innerHTML = '<div class="empty-state">No output files</div>';
  727. return;
  728. }
  729. for (const file of outputs) {
  730. const fileDiv = document.createElement("div");
  731. fileDiv.className = "output-file";
  732. // Always show file header with download link
  733. const header = document.createElement("div");
  734. header.className = "output-file-header";
  735. const nameSpan = document.createElement("span");
  736. nameSpan.textContent = file.name;
  737. header.appendChild(nameSpan);
  738. const dlBtn = document.createElement("a");
  739. dlBtn.className = "dl-btn";
  740. dlBtn.textContent = "Download";
  741. dlBtn.download = file.name;
  742. dlBtn.href = getDownloadUri(file);
  743. header.appendChild(dlBtn);
  744. fileDiv.appendChild(header);
  745. const content = document.createElement("div");
  746. content.className = "output-file-content";
  747. if (file.type === "text") {
  748. const pre = document.createElement("pre");
  749. pre.textContent = file.content;
  750. content.appendChild(pre);
  751. } else if (file.type === "image") {
  752. const img = document.createElement("img");
  753. img.src = file.data_uri;
  754. img.alt = file.name;
  755. content.appendChild(img);
  756. } else if (file.type === "pdf") {
  757. const iframe = document.createElement("iframe");
  758. iframe.src = file.data_uri;
  759. content.appendChild(iframe);
  760. } else if (file.type === "xlsx") {
  761. renderXlsx(content, file.data_b64);
  762. } else if (file.type === "binary") {
  763. const a = document.createElement("a");
  764. a.className = "download-link";
  765. a.href = file.data_uri;
  766. a.download = file.name;
  767. a.textContent = "Download " + file.name;
  768. content.appendChild(a);
  769. } else if (file.type === "error") {
  770. const pre = document.createElement("pre");
  771. pre.textContent = file.content;
  772. pre.style.color = "var(--red)";
  773. content.appendChild(pre);
  774. }
  775. fileDiv.appendChild(content);
  776. container.appendChild(fileDiv);
  777. }
  778. }
  779. // ---- XLSX rendering via SheetJS ----
  780. function renderXlsx(container, b64Data) {
  781. try {
  782. const raw = Uint8Array.from(atob(b64Data), c => c.charCodeAt(0));
  783. const wb = XLSX.read(raw, { type: "array" });
  784. for (let i = 0; i < wb.SheetNames.length; i++) {
  785. const sheetName = wb.SheetNames[i];
  786. const ws = wb.Sheets[sheetName];
  787. if (wb.SheetNames.length > 1) {
  788. const sheetLabel = document.createElement("div");
  789. sheetLabel.style.cssText =
  790. "font-weight:600; font-size:0.8rem; color:#b0aea5; margin-top:0.5rem; margin-bottom:0.25rem;";
  791. sheetLabel.textContent = "Sheet: " + sheetName;
  792. container.appendChild(sheetLabel);
  793. }
  794. const htmlStr = XLSX.utils.sheet_to_html(ws, { editable: false });
  795. const wrapper = document.createElement("div");
  796. wrapper.innerHTML = htmlStr;
  797. container.appendChild(wrapper);
  798. }
  799. } catch (err) {
  800. container.textContent = "Error rendering spreadsheet: " + err.message;
  801. }
  802. }
  803. // ---- Grades ----
  /**
   * Render the collapsible grades panel for one run.
   *
   * Hides the whole section when the run has no `grading` object. Otherwise
   * shows it (collapsed), then writes a summary badge plus one list item per
   * expectation into #grades-content.
   *
   * @param {object} run - Run record; reads `run.grading.summary`
   *   (`pass_rate`, `passed`, `failed`, `total`) and `run.grading.expectations`
   *   (`text`, `passed`, `evidence`).
   */
  function renderGrades(run) {
    const section = document.getElementById("grades-section");
    const content = document.getElementById("grades-content");
    if (!run.grading) {
      section.style.display = "none";
      return;
    }
    const grading = run.grading;
    section.style.display = "block";
    // Reset to collapsed
    content.classList.remove("open");
    document.getElementById("grades-arrow").classList.remove("open");

    const summary = grading.summary || {};
    const expectations = grading.expectations || [];

    // A missing or non-numeric pass rate is "unknown": show "?" on a neutral
    // badge rather than styling it as a failure.
    const rate = summary.pass_rate != null ? Number(summary.pass_rate) : NaN;
    const hasRate = !Number.isNaN(rate);
    const passRate = hasRate ? Math.round(rate * 100) + "%" : "?";
    let badgeClass = "";
    if (hasRate) {
      badgeClass = rate >= 0.8 ? "grade-pass" : rate >= 0.5 ? "" : "grade-fail";
    }

    // Counts come from embedded JSON; escape them like every other data value.
    const count = (value) => escapeHtml(String(value || 0));

    const parts = ['<div style="padding: 1rem;">'];
    // Summary line
    parts.push('<div class="grades-summary">');
    parts.push('<span class="grade-badge ' + badgeClass + '">' + passRate + '</span>');
    parts.push('<span>' + count(summary.passed) + ' passed, ' + count(summary.failed) + ' failed of ' + count(summary.total) + '</span>');
    parts.push('</div>');
    // Assertions list
    parts.push('<ul class="assertion-list">');
    for (const exp of expectations) {
      const statusClass = exp.passed ? "pass" : "fail";
      const statusIcon = exp.passed ? "\u2713" : "\u2717";
      parts.push('<li class="assertion-item">');
      parts.push('<span class="assertion-status ' + statusClass + '">' + statusIcon + '</span>');
      parts.push('<span>' + escapeHtml(exp.text) + '</span>');
      if (exp.evidence) {
        parts.push('<div class="assertion-evidence">' + escapeHtml(exp.evidence) + '</div>');
      }
      parts.push('</li>');
    }
    parts.push('</ul>');
    parts.push('</div>');
    content.innerHTML = parts.join("");
  }
  // Expand or collapse the grades panel; the arrow indicator flips with it.
  function toggleGrades() {
    for (const id of ["grades-content", "grades-arrow"]) {
      document.getElementById(id).classList.toggle("open");
    }
  }
  851. // ---- Previous outputs (collapsible) ----
  /**
   * Fill the collapsible "previous outputs" panel for a run.
   *
   * Looks up EMBEDDED_DATA.previous_outputs[run.id]; hides the section when
   * there is nothing to show, otherwise rebuilds the panel (collapsed) with one
   * card per file: a header (name + download link) and a type-specific preview.
   *
   * @param {object} run - Run record; only `run.id` is read.
   */
  function renderPrevOutputs(run) {
    const section = document.getElementById("prev-outputs-section");
    const content = document.getElementById("prev-outputs-content");
    const previous = (EMBEDDED_DATA.previous_outputs || {})[run.id];
    const hasPrevious = Boolean(previous) && previous.length !== 0;
    if (!hasPrevious) {
      section.style.display = "none";
      return;
    }
    section.style.display = "block";

    // Always start collapsed.
    content.classList.remove("open");
    document.getElementById("prev-outputs-arrow").classList.remove("open");

    // Small factory: element with an optional class name.
    const make = (tag, className) => {
      const el = document.createElement(tag);
      if (className !== undefined) el.className = className;
      return el;
    };

    // Drop whatever was rendered for the previous run.
    content.innerHTML = "";
    const wrapper = make("div");
    wrapper.style.padding = "1rem";

    for (const file of prevOutputs(previous)) {
      const card = make("div", "output-file");

      // Header: file name on the left, download link on the right.
      const header = make("div", "output-file-header");
      const label = make("span");
      label.textContent = file.name;
      header.appendChild(label);
      const download = make("a", "dl-btn");
      download.textContent = "Download";
      download.download = file.name;
      download.href = getDownloadUri(file);
      header.appendChild(download);
      card.appendChild(header);

      // Body: preview chosen by file type; unknown types get an empty body.
      const body = make("div", "output-file-content");
      switch (file.type) {
        case "text": {
          const pre = make("pre");
          pre.textContent = file.content;
          body.appendChild(pre);
          break;
        }
        case "image": {
          const img = make("img");
          img.src = file.data_uri;
          img.alt = file.name;
          body.appendChild(img);
          break;
        }
        case "pdf": {
          const frame = make("iframe");
          frame.src = file.data_uri;
          body.appendChild(frame);
          break;
        }
        case "xlsx":
          renderXlsx(body, file.data_b64);
          break;
        case "binary": {
          const link = make("a", "download-link");
          link.href = file.data_uri;
          link.download = file.name;
          link.textContent = "Download " + file.name;
          body.appendChild(link);
          break;
        }
      }
      card.appendChild(body);
      wrapper.appendChild(card);
    }
    content.appendChild(wrapper);

    // Identity pass-through kept local so the loop header reads naturally.
    function prevOutputs(list) {
      return list;
    }
  }
  // Expand or collapse the previous-outputs panel together with its arrow.
  function togglePrevOutputs() {
    ["prev-outputs-content", "prev-outputs-arrow"].forEach((id) => {
      document.getElementById(id).classList.toggle("open");
    });
  }
  919. // ---- Feedback (saved to server -> feedback.json) ----
  /**
   * Auto-save: store the textarea text for the current run in `feedbackMap`
   * and POST every non-empty review to /api/feedback with status "in_progress".
   *
   * Blank feedback removes the run's entry. The status label reads "Saved"
   * only when the server answered with a 2xx response; a network failure or an
   * HTTP error status falls through to the "Will download on submit" message.
   */
  function saveCurrentFeedback() {
    const run = EMBEDDED_DATA.runs[currentIndex];
    const text = document.getElementById("feedback").value;
    if (text.trim() === "") {
      delete feedbackMap[run.id];
    } else {
      feedbackMap[run.id] = text;
    }

    // Build reviews array from map; one timestamp for the whole save.
    const timestamp = new Date().toISOString();
    const reviews = Object.entries(feedbackMap)
      .filter(([, feedback]) => feedback.trim())
      .map(([run_id, feedback]) => ({ run_id, feedback, timestamp }));

    const setStatus = (message) => {
      document.getElementById("feedback-status").textContent = message;
    };

    fetch("/api/feedback", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ reviews, status: "in_progress" }),
    }).then((response) => {
      // fetch() only rejects on network errors; a 4xx/5xx answer still
      // resolves, so check the status before claiming success.
      if (!response.ok) {
        throw new Error("Feedback save failed with HTTP " + response.status);
      }
      setStatus("Saved");
    }).catch(() => {
      // Static mode or server unavailable — no-op on auto-save,
      // feedback will be downloaded on final submit
      setStatus("Will download on submit");
    });
  }
  947. // ---- Done ----
  /**
   * Final submit: POST the complete review set (status "complete") and show
   * the done overlay.
   *
   * Every run in EMBEDDED_DATA.runs is included, with "" for runs without
   * feedback. If the POST cannot be delivered — network failure or a non-2xx
   * response — the same payload is offered as a feedback.json download so the
   * reviewer's input is not lost.
   */
  function showDoneDialog() {
    // Save current textarea to feedbackMap (but don't POST yet)
    const run = EMBEDDED_DATA.runs[currentIndex];
    const text = document.getElementById("feedback").value;
    if (text.trim() === "") {
      delete feedbackMap[run.id];
    } else {
      feedbackMap[run.id] = text;
    }

    // POST once with status: complete — include ALL runs so the model
    // can distinguish "no feedback" (looks good) from "not reviewed"
    const ts = new Date().toISOString();
    const reviews = EMBEDDED_DATA.runs.map((r) => ({
      run_id: r.id,
      feedback: feedbackMap[r.id] || "",
      timestamp: ts,
    }));
    const payload = JSON.stringify({ reviews, status: "complete" }, null, 2);

    // Fallback used when the server cannot accept the payload.
    const downloadPayload = () => {
      const blob = new Blob([payload], { type: "application/json" });
      const url = URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.href = url;
      a.download = "feedback.json";
      a.click();
      URL.revokeObjectURL(url);
    };

    fetch("/api/feedback", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: payload,
    }).then((response) => {
      // fetch() resolves on 4xx/5xx too; treat those as "not delivered".
      if (!response.ok) {
        throw new Error("Feedback submit failed with HTTP " + response.status);
      }
    }).catch(() => {
      // Server not available (static mode) or rejected the POST — download as file
      downloadPayload();
    }).then(() => {
      document.getElementById("done-overlay").classList.add("visible");
    });
  }
  // Dismiss the done overlay. Saving first re-posts the feedback with status
  // "in_progress", which undoes the "complete" status sent on submit.
  function closeDoneDialog() {
    saveCurrentFeedback();
    const overlay = document.getElementById("done-overlay");
    overlay.classList.remove("visible");
  }
  988. // ---- Toast ----
  /**
   * Show a transient message in the #toast element for 2 seconds.
   *
   * @param {string} message - Text to display.
   */
  function showToast(message) {
    const toast = document.getElementById("toast");
    toast.textContent = message;
    toast.classList.add("visible");
    // Cancel the hide timer of any toast still on screen; otherwise that older
    // timer would fire early and cut this message short.
    clearTimeout(showToast.hideTimer);
    showToast.hideTimer = setTimeout(() => {
      toast.classList.remove("visible");
    }, 2000);
  }
  995. // ---- Keyboard nav ----
  // Arrow keys step between runs: Left/Up goes back, Right/Down goes forward.
  document.addEventListener("keydown", (e) => {
    // Leave browser/OS shortcuts alone (e.g. Alt+Left = history back,
    // Cmd/Ctrl+Arrow = caret/scroll jumps).
    if (e.altKey || e.ctrlKey || e.metaKey) return;

    // Don't capture when typing in a textarea or any other editable control
    const target = e.target;
    const tag = target && target.tagName;
    const isEditable =
      tag === "TEXTAREA" ||
      tag === "INPUT" ||
      tag === "SELECT" ||
      Boolean(target && target.isContentEditable);
    if (isEditable) return;

    if (e.key === "ArrowLeft" || e.key === "ArrowUp") {
      e.preventDefault();
      navigate(-1);
    } else if (e.key === "ArrowRight" || e.key === "ArrowDown") {
      e.preventDefault();
      navigate(1);
    }
  });
  1007. // ---- Util ----
  /**
   * Pick the href for a file's download link.
   *
   * Preference order: a ready-made `data_uri`, then raw base64 wrapped as an
   * octet-stream data URI, then text content percent-encoded into a text/plain
   * data URI. Anything else gets the inert "#" link.
   *
   * @param {object} file - Embedded file record.
   * @returns {string} URI usable as an anchor href.
   */
  function getDownloadUri(file) {
    const { data_uri: readyUri, data_b64: base64 } = file;
    if (readyUri) {
      return readyUri;
    }
    if (base64) {
      return "data:application/octet-stream;base64," + base64;
    }
    return file.type === "text"
      ? "data:text/plain;charset=utf-8," + encodeURIComponent(file.content)
      : "#";
  }
  /**
   * Escape a value for safe inclusion in HTML markup.
   *
   * Escapes &, <, > and also both quote characters, so the result is safe
   * inside a quoted attribute value (e.g. title="…") as well as in element
   * text. null/undefined become "", other values are stringified first.
   *
   * @param {*} text - Value to escape.
   * @returns {string} HTML-escaped string.
   */
  function escapeHtml(text) {
    if (text == null) return "";
    const entities = {
      "&": "&amp;",
      "<": "&lt;",
      ">": "&gt;",
      '"': "&quot;",
      "'": "&#39;",
      // Matches what DOM serialization emits for a non-breaking space.
      "\u00A0": "&nbsp;",
    };
    return String(text).replace(/[&<>"'\u00A0]/g, (ch) => entities[ch]);
  }
  1019. // ---- View switching ----
  /**
   * Activate one view: clear the "active" class from every tab and panel, then
   * set it on the tab whose inline onclick targets `view` and on #panel-<view>.
   *
   * @param {string} view - View key as used in the tab's onclick attribute.
   */
  function switchView(view) {
    const deactivate = (el) => {
      el.classList.remove("active");
    };
    for (const selector of [".view-tab", ".view-panel"]) {
      document.querySelectorAll(selector).forEach(deactivate);
    }
    const tab = document.querySelector(`[onclick="switchView('${view}')"]`);
    tab.classList.add("active");
    const panel = document.getElementById("panel-" + view);
    panel.classList.add("active");
  }
  1026. // ---- Benchmark rendering ----
  /**
   * Render the benchmark view from EMBEDDED_DATA.benchmark into
   * #benchmark-content. Does nothing when no benchmark data is embedded.
   *
   * Output, in order: a header line built from `metadata`, a summary table
   * comparing the first two configurations found in `run_summary` (plus its
   * `delta` entry), a per-eval breakdown built from `runs` (one table of runs
   * with an average row per configuration, and one table of per-assertion
   * results when runs carry `expectations`), and the `notes` list.
   *
   * Every value taken from the embedded data is HTML-escaped before it is
   * concatenated into markup; values placed inside attributes additionally
   * have their quote characters escaped.
   */
  function renderBenchmark() {
    const data = EMBEDDED_DATA.benchmark;
    if (!data) return;

    // Show the tabs
    document.getElementById("view-tabs").style.display = "flex";
    const container = document.getElementById("benchmark-content");
    const summary = data.run_summary || {};
    const metadata = data.metadata || {};
    const notes = data.notes || [];
    const DASH = "—";

    // Escape any value (numbers included) for use as element text.
    const esc = (value) => escapeHtml(String(value));
    // Escape for a quoted attribute value: quotes are handled here so the
    // result is safe even if escapeHtml leaves them untouched.
    const escAttr = (value) => esc(value).replace(/"/g, "&quot;").replace(/'/g, "&#39;");
    // "with_skill" -> "With Skill"
    const humanize = (name) => String(name).replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
    // Green at >= 80 %, red below 50 %, neutral in between.
    const passRateClass = (rate) =>
      rate >= 0.8 ? "benchmark-delta-positive" : rate < 0.5 ? "benchmark-delta-negative" : "";

    // "mean ± stddev"; a dash when the stat is absent or incomplete.
    function fmtStat(stat, pct) {
      if (!stat || typeof stat.mean !== "number" || typeof stat.stddev !== "number") return DASH;
      const suffix = pct ? "%" : "";
      const m = pct ? (stat.mean * 100).toFixed(0) : stat.mean.toFixed(1);
      const s = pct ? (stat.stddev * 100).toFixed(0) : stat.stddev.toFixed(1);
      return m + suffix + " ± " + s + suffix;
    }

    // CSS class from the sign of a delta value.
    function deltaClass(val) {
      if (!val) return "";
      const n = parseFloat(val);
      if (n > 0) return "benchmark-delta-positive";
      if (n < 0) return "benchmark-delta-negative";
      return "";
    }

    // Escaped delta text; only a missing/empty delta becomes a dash, so a
    // genuine numeric 0 is still displayed.
    function fmtDelta(val, suffix) {
      if (val == null || val === "") return DASH;
      return esc(val) + suffix;
    }

    let html = "";

    // Header
    html += "<h2 style='font-family: Poppins, sans-serif; margin-bottom: 0.5rem;'>Benchmark Results</h2>";
    html += "<p style='color: var(--text-muted); font-size: 0.875rem; margin-bottom: 1.25rem;'>";
    if (metadata.skill_name) html += "<strong>" + escapeHtml(metadata.skill_name) + "</strong> &mdash; ";
    if (metadata.timestamp) html += esc(metadata.timestamp) + " &mdash; ";
    if (metadata.evals_run) html += "Evals: " + esc([].concat(metadata.evals_run).join(", ")) + " &mdash; ";
    html += esc(metadata.runs_per_configuration || "?") + " runs per configuration";
    html += "</p>";

    // Summary table
    html += '<table class="benchmark-table">';

    // Discover config names dynamically (everything except "delta")
    const configs = Object.keys(summary).filter((k) => k !== "delta");
    const configA = configs[0] || "config_a";
    const configB = configs[1] || "config_b";
    const statsA = summary[configA] || {};
    const statsB = summary[configB] || {};
    const delta = summary.delta || {};

    const summaryRow = (label, key, pct, deltaSuffix) =>
      "<tr><td><strong>" + label + "</strong></td>" +
      "<td>" + fmtStat(statsA[key], pct) + "</td>" +
      "<td>" + fmtStat(statsB[key], pct) + "</td>" +
      '<td class="' + deltaClass(delta[key]) + '">' + fmtDelta(delta[key], deltaSuffix) + "</td></tr>";

    html += "<thead><tr><th>Metric</th><th>" + esc(humanize(configA)) + "</th><th>" + esc(humanize(configB)) + "</th><th>Delta</th></tr></thead>";
    html += "<tbody>";
    html += summaryRow("Pass Rate", "pass_rate", true, "");
    // Time (only show row if data exists)
    if (statsA.time_seconds || statsB.time_seconds) {
      html += summaryRow("Time (s)", "time_seconds", false, "s");
    }
    // Tokens (only show row if data exists)
    if (statsA.tokens || statsB.tokens) {
      html += summaryRow("Tokens", "tokens", false, "");
    }
    html += "</tbody></table>";

    // Per-eval breakdown (if runs data available)
    const runs = data.runs || [];
    if (runs.length > 0) {
      const evalIds = [...new Set(runs.map((r) => r.eval_id))].sort((x, y) => x - y);
      html += "<h3 style='font-family: Poppins, sans-serif; margin-bottom: 0.75rem;'>Per-Eval Breakdown</h3>";
      // Optional columns appear only if at least one run has data for them.
      const hasTime = runs.some((r) => r.result && r.result.time_seconds != null);
      const hasErrors = runs.some((r) => r.result && r.result.errors > 0);

      for (const evalId of evalIds) {
        const evalRuns = runs.filter((r) => r.eval_id === evalId);
        const evalName = evalRuns[0] && evalRuns[0].eval_name ? evalRuns[0].eval_name : "Eval " + evalId;
        html += "<h4 style='font-family: Poppins, sans-serif; margin: 1rem 0 0.5rem; color: var(--text);'>" + escapeHtml(evalName) + "</h4>";
        html += '<table class="benchmark-table">';
        html += "<thead><tr><th>Config</th><th>Run</th><th>Pass Rate</th>";
        if (hasTime) html += "<th>Time (s)</th>";
        if (hasErrors) html += "<th>Crashes During Execution</th>";
        html += "</tr></thead>";
        html += "<tbody>";

        // Group by config and render with average rows
        const configGroups = [...new Set(evalRuns.map((r) => r.configuration))];
        for (let ci = 0; ci < configGroups.length; ci++) {
          const config = configGroups[ci];
          const configRuns = evalRuns.filter((r) => r.configuration === config);
          if (configRuns.length === 0) continue;
          const rowClass = ci === 0 ? "benchmark-row-with" : "benchmark-row-without";
          const configLabel = esc(humanize(config));

          for (const run of configRuns) {
            const r = run.result || {};
            html += '<tr class="' + rowClass + '">';
            html += "<td>" + configLabel + "</td>";
            html += "<td>" + esc(run.run_number) + "</td>";
            html += '<td class="' + passRateClass(r.pass_rate) + '">' + ((r.pass_rate || 0) * 100).toFixed(0) + "% (" + esc(r.passed || 0) + "/" + esc(r.total || 0) + ")</td>";
            if (hasTime) html += "<td>" + (r.time_seconds != null ? Number(r.time_seconds).toFixed(1) : DASH) + "</td>";
            if (hasErrors) html += "<td>" + esc(r.errors || 0) + "</td>";
            html += "</tr>";
          }

          // Average row
          const rates = configRuns.map((r) => (r.result || {}).pass_rate || 0);
          const avgRate = rates.reduce((sum, rate) => sum + rate, 0) / rates.length;
          html += '<tr class="benchmark-row-avg ' + rowClass + '">';
          html += "<td>" + configLabel + "</td>";
          html += "<td>Avg</td>";
          html += '<td class="' + passRateClass(avgRate) + '">' + (avgRate * 100).toFixed(0) + "%</td>";
          if (hasTime) {
            const times = configRuns.map((r) => (r.result || {}).time_seconds).filter((t) => t != null);
            html += "<td>" + (times.length ? (times.reduce((sum, t) => sum + Number(t), 0) / times.length).toFixed(1) : DASH) + "</td>";
          }
          if (hasErrors) html += "<td></td>";
          html += "</tr>";
        }
        html += "</tbody></table>";

        // Per-assertion detail for this eval
        const runsWithExpectations = {};
        for (const config of configGroups) {
          runsWithExpectations[config] = evalRuns.filter((r) => r.configuration === config && r.expectations && r.expectations.length > 0);
        }
        const hasAnyExpectations = Object.values(runsWithExpectations).some((list) => list.length > 0);
        if (hasAnyExpectations) {
          // Collect all unique assertion texts across all configs, in first-seen order
          const allAssertions = [];
          const seen = new Set();
          for (const config of configGroups) {
            for (const run of runsWithExpectations[config]) {
              for (const exp of (run.expectations || [])) {
                if (!seen.has(exp.text)) {
                  seen.add(exp.text);
                  allAssertions.push(exp.text);
                }
              }
            }
          }

          html += '<table class="benchmark-table" style="margin-top: 0.5rem;">';
          html += "<thead><tr><th>Assertion</th>";
          for (const config of configGroups) {
            html += "<th>" + esc(humanize(config)) + "</th>";
          }
          html += "</tr></thead><tbody>";

          for (const assertionText of allAssertions) {
            html += "<tr><td>" + escapeHtml(assertionText) + "</td>";
            for (const config of configGroups) {
              html += "<td>";
              // One mark per run of this config; the tooltip carries the evidence.
              for (const run of runsWithExpectations[config]) {
                const exp = (run.expectations || []).find((e) => e.text === assertionText);
                if (exp) {
                  const cls = exp.passed ? "benchmark-delta-positive" : "benchmark-delta-negative";
                  const icon = exp.passed ? "\u2713" : "\u2717";
                  html += '<span class="' + cls + '" title="Run ' + escAttr(run.run_number) + ': ' + escAttr(exp.evidence || "") + '">' + icon + "</span> ";
                } else {
                  html += "— ";
                }
              }
              html += "</td>";
            }
            html += "</tr>";
          }
          html += "</tbody></table>";
        }
      }
    }

    // Notes
    if (notes.length > 0) {
      html += '<div class="benchmark-notes">';
      html += "<h3>Analysis Notes</h3>";
      html += "<ul>";
      for (const note of notes) {
        html += "<li>" + escapeHtml(note) + "</li>";
      }
      html += "</ul></div>";
    }

    container.innerHTML = html;
  }
  1204. // ---- Start ----
  // Page start-up. init() is defined earlier in the file (outside this excerpt)
  // — presumably it sets up the review UI; confirm there. renderBenchmark()
  // then fills the benchmark view and returns immediately when
  // EMBEDDED_DATA.benchmark is absent.
  1205. init();
  1206. renderBenchmark();
  1207. </script>
  1208. </body>
  1209. </html>