| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325 |
- <!DOCTYPE html>
- <html lang="en">
- <head>
- <meta charset="UTF-8">
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
- <title>Eval Review</title>
- <link rel="preconnect" href="https://fonts.googleapis.com">
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
- <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
- <script src="https://cdn.sheetjs.com/xlsx-0.20.3/package/dist/xlsx.full.min.js" integrity="sha384-EnyY0/GSHQGSxSgMwaIPzSESbqoOLSexfnSMN2AP+39Ckmn92stwABZynq1JyzdT" crossorigin="anonymous"></script>
- <style>
- :root {
- --bg: #faf9f5;
- --surface: #ffffff;
- --border: #e8e6dc;
- --text: #141413;
- --text-muted: #b0aea5;
- --accent: #d97757;
- --accent-hover: #c4613f;
- --green: #788c5d;
- --green-bg: #eef2e8;
- --red: #c44;
- --red-bg: #fceaea;
- --header-bg: #141413;
- --header-text: #faf9f5;
- --radius: 6px;
- }
- * { box-sizing: border-box; margin: 0; padding: 0; }
- body {
- font-family: 'Lora', Georgia, serif;
- background: var(--bg);
- color: var(--text);
- height: 100vh;
- display: flex;
- flex-direction: column;
- }
- /* ---- Header ---- */
- .header {
- background: var(--header-bg);
- color: var(--header-text);
- padding: 1rem 2rem;
- display: flex;
- justify-content: space-between;
- align-items: center;
- flex-shrink: 0;
- }
- .header h1 {
- font-family: 'Poppins', sans-serif;
- font-size: 1.25rem;
- font-weight: 600;
- }
- .header .instructions {
- font-size: 0.8rem;
- opacity: 0.7;
- margin-top: 0.25rem;
- }
- .header .progress {
- font-size: 0.875rem;
- opacity: 0.8;
- text-align: right;
- }
- /* ---- Main content ---- */
- .main {
- flex: 1;
- overflow-y: auto;
- padding: 1.5rem 2rem;
- display: flex;
- flex-direction: column;
- gap: 1.25rem;
- }
- /* ---- Sections ---- */
- .section {
- background: var(--surface);
- border: 1px solid var(--border);
- border-radius: var(--radius);
- flex-shrink: 0;
- }
- .section-header {
- font-family: 'Poppins', sans-serif;
- padding: 0.75rem 1rem;
- font-size: 0.75rem;
- font-weight: 500;
- text-transform: uppercase;
- letter-spacing: 0.05em;
- color: var(--text-muted);
- border-bottom: 1px solid var(--border);
- background: var(--bg);
- }
- .section-body {
- padding: 1rem;
- }
- /* ---- Config badge ---- */
- .config-badge {
- display: inline-block;
- padding: 0.2rem 0.625rem;
- border-radius: 9999px;
- font-family: 'Poppins', sans-serif;
- font-size: 0.6875rem;
- font-weight: 600;
- text-transform: uppercase;
- letter-spacing: 0.03em;
- margin-left: 0.75rem;
- vertical-align: middle;
- }
- .config-badge.config-primary {
- background: rgba(33, 150, 243, 0.12);
- color: #1976d2;
- }
- .config-badge.config-baseline {
- background: rgba(255, 193, 7, 0.15);
- color: #f57f17;
- }
- /* ---- Prompt ---- */
- .prompt-text {
- white-space: pre-wrap;
- font-size: 0.9375rem;
- line-height: 1.6;
- }
- /* ---- Outputs ---- */
- .output-file {
- border: 1px solid var(--border);
- border-radius: var(--radius);
- overflow: hidden;
- }
- .output-file + .output-file {
- margin-top: 1rem;
- }
- .output-file-header {
- padding: 0.5rem 0.75rem;
- font-size: 0.8rem;
- font-weight: 600;
- color: var(--text-muted);
- background: var(--bg);
- border-bottom: 1px solid var(--border);
- font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
- display: flex;
- justify-content: space-between;
- align-items: center;
- }
- .output-file-header .dl-btn {
- font-size: 0.7rem;
- color: var(--accent);
- text-decoration: none;
- cursor: pointer;
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
- font-weight: 500;
- opacity: 0.8;
- }
- .output-file-header .dl-btn:hover {
- opacity: 1;
- text-decoration: underline;
- }
- .output-file-content {
- padding: 0.75rem;
- overflow-x: auto;
- }
- .output-file-content pre {
- font-size: 0.8125rem;
- line-height: 1.5;
- white-space: pre-wrap;
- word-break: break-word;
- font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
- }
- .output-file-content img {
- max-width: 100%;
- height: auto;
- border-radius: 4px;
- }
- .output-file-content iframe {
- width: 100%;
- height: 600px;
- border: none;
- }
- .output-file-content table {
- border-collapse: collapse;
- font-size: 0.8125rem;
- width: 100%;
- }
- .output-file-content table td,
- .output-file-content table th {
- border: 1px solid var(--border);
- padding: 0.375rem 0.5rem;
- text-align: left;
- }
- .output-file-content table th {
- background: var(--bg);
- font-weight: 600;
- }
- .output-file-content .download-link {
- display: inline-flex;
- align-items: center;
- gap: 0.5rem;
- padding: 0.5rem 1rem;
- background: var(--bg);
- border: 1px solid var(--border);
- border-radius: 4px;
- color: var(--accent);
- text-decoration: none;
- font-size: 0.875rem;
- cursor: pointer;
- }
- .output-file-content .download-link:hover {
- background: var(--border);
- }
- .empty-state {
- color: var(--text-muted);
- font-style: italic;
- padding: 2rem;
- text-align: center;
- }
- /* ---- Feedback ---- */
- .prev-feedback {
- background: var(--bg);
- border: 1px solid var(--border);
- border-radius: 4px;
- padding: 0.625rem 0.75rem;
- margin-top: 0.75rem;
- font-size: 0.8125rem;
- color: var(--text-muted);
- line-height: 1.5;
- }
- .prev-feedback-label {
- font-size: 0.7rem;
- font-weight: 600;
- text-transform: uppercase;
- letter-spacing: 0.04em;
- margin-bottom: 0.25rem;
- color: var(--text-muted);
- }
- .feedback-textarea {
- width: 100%;
- min-height: 100px;
- padding: 0.75rem;
- border: 1px solid var(--border);
- border-radius: 4px;
- font-family: inherit;
- font-size: 0.9375rem;
- line-height: 1.5;
- resize: vertical;
- color: var(--text);
- }
- /* The default outline is replaced by an accent-colored border plus a soft
-    ring in the same accent hue (#d97757), so the focus state matches the theme. */
- .feedback-textarea:focus {
-   outline: none;
-   border-color: var(--accent);
-   box-shadow: 0 0 0 3px rgba(217, 119, 87, 0.15);
- }
- .feedback-status {
- font-size: 0.75rem;
- color: var(--text-muted);
- margin-top: 0.5rem;
- min-height: 1.1em;
- }
- /* ---- Grades (collapsible) ---- */
- .grades-toggle {
- display: flex;
- align-items: center;
- cursor: pointer;
- user-select: none;
- }
- .grades-toggle:hover {
- color: var(--accent);
- }
- .grades-toggle .arrow {
- margin-right: 0.5rem;
- transition: transform 0.15s;
- font-size: 0.75rem;
- }
- .grades-toggle .arrow.open {
- transform: rotate(90deg);
- }
- .grades-content {
- display: none;
- margin-top: 0.75rem;
- }
- .grades-content.open {
- display: block;
- }
- .grades-summary {
- font-size: 0.875rem;
- margin-bottom: 0.75rem;
- display: flex;
- align-items: center;
- gap: 0.5rem;
- }
- .grade-badge {
- display: inline-block;
- padding: 0.125rem 0.5rem;
- border-radius: 9999px;
- font-size: 0.75rem;
- font-weight: 600;
- }
- .grade-pass { background: var(--green-bg); color: var(--green); }
- .grade-fail { background: var(--red-bg); color: var(--red); }
- .assertion-list {
- list-style: none;
- }
- .assertion-item {
- padding: 0.625rem 0;
- border-bottom: 1px solid var(--border);
- font-size: 0.8125rem;
- }
- .assertion-item:last-child { border-bottom: none; }
- .assertion-status {
- font-weight: 600;
- margin-right: 0.5rem;
- }
- .assertion-status.pass { color: var(--green); }
- .assertion-status.fail { color: var(--red); }
- .assertion-evidence {
- color: var(--text-muted);
- font-size: 0.75rem;
- margin-top: 0.25rem;
- padding-left: 1.5rem;
- }
- /* ---- View tabs ---- */
- .view-tabs {
- display: flex;
- gap: 0;
- padding: 0 2rem;
- background: var(--bg);
- border-bottom: 1px solid var(--border);
- flex-shrink: 0;
- }
- .view-tab {
- font-family: 'Poppins', sans-serif;
- padding: 0.625rem 1.25rem;
- font-size: 0.8125rem;
- font-weight: 500;
- cursor: pointer;
- border: none;
- background: none;
- color: var(--text-muted);
- border-bottom: 2px solid transparent;
- transition: all 0.15s;
- }
- .view-tab:hover { color: var(--text); }
- .view-tab.active {
- color: var(--accent);
- border-bottom-color: var(--accent);
- }
- .view-panel { display: none; }
- .view-panel.active { display: flex; flex-direction: column; flex: 1; overflow: hidden; }
- /* ---- Benchmark view ---- */
- .benchmark-view {
- padding: 1.5rem 2rem;
- overflow-y: auto;
- flex: 1;
- }
- .benchmark-table {
- border-collapse: collapse;
- background: var(--surface);
- border: 1px solid var(--border);
- border-radius: var(--radius);
- font-size: 0.8125rem;
- width: 100%;
- margin-bottom: 1.5rem;
- }
- .benchmark-table th, .benchmark-table td {
- padding: 0.625rem 0.75rem;
- text-align: left;
- border: 1px solid var(--border);
- }
- .benchmark-table th {
- font-family: 'Poppins', sans-serif;
- background: var(--header-bg);
- color: var(--header-text);
- font-weight: 500;
- font-size: 0.75rem;
- text-transform: uppercase;
- letter-spacing: 0.04em;
- }
- .benchmark-table tr:hover { background: var(--bg); }
- .benchmark-table tr.benchmark-row-with { background: rgba(33, 150, 243, 0.06); }
- .benchmark-table tr.benchmark-row-without { background: rgba(255, 193, 7, 0.06); }
- .benchmark-table tr.benchmark-row-with:hover { background: rgba(33, 150, 243, 0.12); }
- .benchmark-table tr.benchmark-row-without:hover { background: rgba(255, 193, 7, 0.12); }
- .benchmark-table tr.benchmark-row-avg { font-weight: 600; border-top: 2px solid var(--border); }
- .benchmark-table tr.benchmark-row-avg.benchmark-row-with { background: rgba(33, 150, 243, 0.12); }
- .benchmark-table tr.benchmark-row-avg.benchmark-row-without { background: rgba(255, 193, 7, 0.12); }
- .benchmark-delta-positive { color: var(--green); font-weight: 600; }
- .benchmark-delta-negative { color: var(--red); font-weight: 600; }
- .benchmark-notes {
- background: var(--surface);
- border: 1px solid var(--border);
- border-radius: var(--radius);
- padding: 1rem;
- }
- .benchmark-notes h3 {
- font-family: 'Poppins', sans-serif;
- font-size: 0.875rem;
- margin-bottom: 0.75rem;
- }
- .benchmark-notes ul {
- list-style: disc;
- padding-left: 1.25rem;
- }
- .benchmark-notes li {
- font-size: 0.8125rem;
- line-height: 1.6;
- margin-bottom: 0.375rem;
- }
- .benchmark-empty {
- color: var(--text-muted);
- font-style: italic;
- text-align: center;
- padding: 3rem;
- }
- /* ---- Navigation ---- */
- .nav {
- display: flex;
- justify-content: space-between;
- align-items: center;
- padding: 1rem 2rem;
- border-top: 1px solid var(--border);
- background: var(--surface);
- flex-shrink: 0;
- }
- .nav-btn {
- font-family: 'Poppins', sans-serif;
- padding: 0.5rem 1.25rem;
- border: 1px solid var(--border);
- border-radius: var(--radius);
- background: var(--surface);
- cursor: pointer;
- font-size: 0.875rem;
- font-weight: 500;
- color: var(--text);
- transition: all 0.15s;
- }
- .nav-btn:hover:not(:disabled) {
- background: var(--bg);
- border-color: var(--text-muted);
- }
- .nav-btn:disabled {
- opacity: 0.4;
- cursor: not-allowed;
- }
- .done-btn {
- font-family: 'Poppins', sans-serif;
- padding: 0.5rem 1.5rem;
- border: 1px solid var(--border);
- border-radius: var(--radius);
- background: var(--surface);
- color: var(--text);
- cursor: pointer;
- font-size: 0.875rem;
- font-weight: 500;
- transition: all 0.15s;
- }
- .done-btn:hover {
- background: var(--bg);
- border-color: var(--text-muted);
- }
- .done-btn.ready {
- border: none;
- background: var(--accent);
- color: white;
- font-weight: 600;
- }
- .done-btn.ready:hover {
- background: var(--accent-hover);
- }
- /* ---- Done overlay ---- */
- .done-overlay {
- display: none;
- position: fixed;
- inset: 0;
- background: rgba(0, 0, 0, 0.5);
- z-index: 100;
- justify-content: center;
- align-items: center;
- }
- .done-overlay.visible {
- display: flex;
- }
- .done-card {
- background: var(--surface);
- border-radius: 12px;
- padding: 2rem 3rem;
- text-align: center;
- box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
- max-width: 500px;
- }
- .done-card h2 {
- font-size: 1.5rem;
- margin-bottom: 0.5rem;
- }
- .done-card p {
- color: var(--text-muted);
- margin-bottom: 1.5rem;
- line-height: 1.5;
- }
- .done-card .btn-row {
- display: flex;
- gap: 0.5rem;
- justify-content: center;
- }
- .done-card button {
- padding: 0.5rem 1.25rem;
- border: 1px solid var(--border);
- border-radius: var(--radius);
- background: var(--surface);
- cursor: pointer;
- font-size: 0.875rem;
- }
- .done-card button:hover {
- background: var(--bg);
- }
- /* ---- Toast ---- */
- .toast {
- position: fixed;
- bottom: 5rem;
- left: 50%;
- transform: translateX(-50%);
- background: var(--header-bg);
- color: var(--header-text);
- padding: 0.625rem 1.25rem;
- border-radius: var(--radius);
- font-size: 0.875rem;
- opacity: 0;
- transition: opacity 0.3s;
- pointer-events: none;
- z-index: 200;
- }
- .toast.visible {
- opacity: 1;
- }
- </style>
- </head>
- <body>
- <div id="app" style="height:100vh; display:flex; flex-direction:column;">
- <div class="header">
-   <div>
-     <h1>Eval Review: <span id="skill-name"></span></h1>
-     <!-- Keep this text in sync with the "Submit All Reviews" button: feedback is
-          submitted to the server (or downloaded as feedback.json), not copied. -->
-     <div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, click Submit All Reviews, then return to Claude Code.</div>
-   </div>
-   <div class="progress" id="progress"></div>
- </div>
- <!-- View tabs (only shown when benchmark data exists) -->
- <div class="view-tabs" id="view-tabs" style="display:none;">
- <button class="view-tab active" onclick="switchView('outputs')">Outputs</button>
- <button class="view-tab" onclick="switchView('benchmark')">Benchmark</button>
- </div>
- <!-- Outputs panel (qualitative review) -->
- <div class="view-panel active" id="panel-outputs">
- <div class="main">
- <!-- Prompt -->
- <div class="section">
- <div class="section-header">Prompt <span class="config-badge" id="config-badge" style="display:none;"></span></div>
- <div class="section-body">
- <div class="prompt-text" id="prompt-text"></div>
- </div>
- </div>
- <!-- Outputs -->
- <div class="section">
- <div class="section-header">Output</div>
- <div class="section-body" id="outputs-body">
- <div class="empty-state">No output files found</div>
- </div>
- </div>
- <!-- Previous Output (collapsible) -->
- <div class="section" id="prev-outputs-section" style="display:none;">
- <div class="section-header">
- <div class="grades-toggle" onclick="togglePrevOutputs()">
- <span class="arrow" id="prev-outputs-arrow">▶</span>
- Previous Output
- </div>
- </div>
- <div class="grades-content" id="prev-outputs-content"></div>
- </div>
- <!-- Grades (collapsible) -->
- <div class="section" id="grades-section" style="display:none;">
- <div class="section-header">
- <div class="grades-toggle" onclick="toggleGrades()">
- <span class="arrow" id="grades-arrow">▶</span>
- Formal Grades
- </div>
- </div>
- <div class="grades-content" id="grades-content"></div>
- </div>
- <!-- Feedback -->
- <div class="section">
- <div class="section-header">Your Feedback</div>
- <div class="section-body">
- <textarea
- class="feedback-textarea"
- id="feedback"
- placeholder="What do you think of this output? Any issues, suggestions, or things that look great?"
- ></textarea>
- <div class="feedback-status" id="feedback-status"></div>
- <div class="prev-feedback" id="prev-feedback" style="display:none;">
- <div class="prev-feedback-label">Previous feedback</div>
- <div id="prev-feedback-text"></div>
- </div>
- </div>
- </div>
- </div>
- <div class="nav" id="outputs-nav">
- <button class="nav-btn" id="prev-btn" onclick="navigate(-1)">← Previous</button>
- <button class="done-btn" id="done-btn" onclick="showDoneDialog()">Submit All Reviews</button>
- <button class="nav-btn" id="next-btn" onclick="navigate(1)">Next →</button>
- </div>
- </div><!-- end panel-outputs -->
- <!-- Benchmark panel (quantitative stats) -->
- <div class="view-panel" id="panel-benchmark">
- <div class="benchmark-view" id="benchmark-content">
- <div class="benchmark-empty">No benchmark data available. Run a benchmark to see quantitative results here.</div>
- </div>
- </div>
- </div>
- <!-- Done overlay -->
- <div class="done-overlay" id="done-overlay">
- <div class="done-card">
- <h2>Review Complete</h2>
- <p>Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.</p>
- <div class="btn-row">
- <button onclick="closeDoneDialog()">OK</button>
- </div>
- </div>
- </div>
- <!-- Toast -->
- <div class="toast" id="toast"></div>
- <script>
- // ---- Embedded data (injected by generate_review.py) ----
- /*__EMBEDDED_DATA__*/
- // ---- State ----
- // Draft feedback for the current session, keyed by run id. An entry is
- // deleted when its textarea is saved empty (see saveCurrentFeedback).
- let feedbackMap = {}; // run_id -> feedback text
- // Index into EMBEDDED_DATA.runs of the run currently on screen; written by showRun().
- let currentIndex = 0;
- // Indices of runs that have been shown at least once. showRun() adds the
- // "ready" class to the done button once this covers every run.
- let visitedRuns = new Set();
- // ---- Init ----
- async function init() {
- // Load saved feedback from server — but only if this isn't a fresh
- // iteration (indicated by previous_feedback being present). When
- // previous feedback exists, the feedback.json on disk is stale from
- // the prior iteration and should not pre-fill the textareas.
- const hasPrevious = Object.keys(EMBEDDED_DATA.previous_feedback || {}).length > 0
- || Object.keys(EMBEDDED_DATA.previous_outputs || {}).length > 0;
- if (!hasPrevious) {
- try {
- const resp = await fetch("/api/feedback");
- const data = await resp.json();
- if (data.reviews) {
- for (const r of data.reviews) feedbackMap[r.run_id] = r.feedback;
- }
- } catch { /* first run, no feedback yet */ }
- }
- document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;
- showRun(0);
- // Wire up feedback auto-save
- const textarea = document.getElementById("feedback");
- let saveTimeout = null;
- textarea.addEventListener("input", () => {
- clearTimeout(saveTimeout);
- document.getElementById("feedback-status").textContent = "";
- saveTimeout = setTimeout(() => saveCurrentFeedback(), 800);
- });
- }
- // ---- Navigation ----
- function navigate(delta) {
- const newIndex = currentIndex + delta;
- if (newIndex >= 0 && newIndex < EMBEDDED_DATA.runs.length) {
- saveCurrentFeedback();
- showRun(newIndex);
- }
- }
- function updateNavButtons() {
- document.getElementById("prev-btn").disabled = currentIndex === 0;
- document.getElementById("next-btn").disabled =
- currentIndex === EMBEDDED_DATA.runs.length - 1;
- }
- // ---- Show a run ----
- function showRun(index) {
- currentIndex = index;
- const run = EMBEDDED_DATA.runs[index];
- // Progress
- document.getElementById("progress").textContent =
- `${index + 1} of ${EMBEDDED_DATA.runs.length}`;
- // Prompt
- document.getElementById("prompt-text").textContent = run.prompt;
- // Config badge
- const badge = document.getElementById("config-badge");
- const configMatch = run.id.match(/(with_skill|without_skill|new_skill|old_skill)/);
- if (configMatch) {
- const config = configMatch[1];
- const isBaseline = config === "without_skill" || config === "old_skill";
- badge.textContent = config.replace(/_/g, " ");
- badge.className = "config-badge " + (isBaseline ? "config-baseline" : "config-primary");
- badge.style.display = "inline-block";
- } else {
- badge.style.display = "none";
- }
- // Outputs
- renderOutputs(run);
- // Previous outputs
- renderPrevOutputs(run);
- // Grades
- renderGrades(run);
- // Previous feedback
- const prevFb = (EMBEDDED_DATA.previous_feedback || {})[run.id];
- const prevEl = document.getElementById("prev-feedback");
- if (prevFb) {
- document.getElementById("prev-feedback-text").textContent = prevFb;
- prevEl.style.display = "block";
- } else {
- prevEl.style.display = "none";
- }
- // Feedback
- document.getElementById("feedback").value = feedbackMap[run.id] || "";
- document.getElementById("feedback-status").textContent = "";
- updateNavButtons();
- // Track visited runs and promote done button when all visited
- visitedRuns.add(index);
- const doneBtn = document.getElementById("done-btn");
- if (visitedRuns.size >= EMBEDDED_DATA.runs.length) {
- doneBtn.classList.add("ready");
- }
- // Scroll main content to top
- document.querySelector(".main").scrollTop = 0;
- }
- // ---- Render outputs ----
- function renderOutputs(run) {
- const container = document.getElementById("outputs-body");
- container.innerHTML = "";
- const outputs = run.outputs || [];
- if (outputs.length === 0) {
- container.innerHTML = '<div class="empty-state">No output files</div>';
- return;
- }
- for (const file of outputs) {
- const fileDiv = document.createElement("div");
- fileDiv.className = "output-file";
- // Always show file header with download link
- const header = document.createElement("div");
- header.className = "output-file-header";
- const nameSpan = document.createElement("span");
- nameSpan.textContent = file.name;
- header.appendChild(nameSpan);
- const dlBtn = document.createElement("a");
- dlBtn.className = "dl-btn";
- dlBtn.textContent = "Download";
- dlBtn.download = file.name;
- dlBtn.href = getDownloadUri(file);
- header.appendChild(dlBtn);
- fileDiv.appendChild(header);
- const content = document.createElement("div");
- content.className = "output-file-content";
- if (file.type === "text") {
- const pre = document.createElement("pre");
- pre.textContent = file.content;
- content.appendChild(pre);
- } else if (file.type === "image") {
- const img = document.createElement("img");
- img.src = file.data_uri;
- img.alt = file.name;
- content.appendChild(img);
- } else if (file.type === "pdf") {
- const iframe = document.createElement("iframe");
- iframe.src = file.data_uri;
- content.appendChild(iframe);
- } else if (file.type === "xlsx") {
- renderXlsx(content, file.data_b64);
- } else if (file.type === "binary") {
- const a = document.createElement("a");
- a.className = "download-link";
- a.href = file.data_uri;
- a.download = file.name;
- a.textContent = "Download " + file.name;
- content.appendChild(a);
- } else if (file.type === "error") {
- const pre = document.createElement("pre");
- pre.textContent = file.content;
- pre.style.color = "var(--red)";
- content.appendChild(pre);
- }
- fileDiv.appendChild(content);
- container.appendChild(fileDiv);
- }
- }
- // ---- XLSX rendering via SheetJS ----
- function renderXlsx(container, b64Data) {
- try {
- const raw = Uint8Array.from(atob(b64Data), c => c.charCodeAt(0));
- const wb = XLSX.read(raw, { type: "array" });
- for (let i = 0; i < wb.SheetNames.length; i++) {
- const sheetName = wb.SheetNames[i];
- const ws = wb.Sheets[sheetName];
- if (wb.SheetNames.length > 1) {
- const sheetLabel = document.createElement("div");
- sheetLabel.style.cssText =
- "font-weight:600; font-size:0.8rem; color:#b0aea5; margin-top:0.5rem; margin-bottom:0.25rem;";
- sheetLabel.textContent = "Sheet: " + sheetName;
- container.appendChild(sheetLabel);
- }
- const htmlStr = XLSX.utils.sheet_to_html(ws, { editable: false });
- const wrapper = document.createElement("div");
- wrapper.innerHTML = htmlStr;
- container.appendChild(wrapper);
- }
- } catch (err) {
- container.textContent = "Error rendering spreadsheet: " + err.message;
- }
- }
- // ---- Grades ----
- function renderGrades(run) {
- const section = document.getElementById("grades-section");
- const content = document.getElementById("grades-content");
- if (!run.grading) {
- section.style.display = "none";
- return;
- }
- const grading = run.grading;
- section.style.display = "block";
- // Reset to collapsed
- content.classList.remove("open");
- document.getElementById("grades-arrow").classList.remove("open");
- const summary = grading.summary || {};
- const expectations = grading.expectations || [];
- let html = '<div style="padding: 1rem;">';
- // Summary line
- const passRate = summary.pass_rate != null
- ? Math.round(summary.pass_rate * 100) + "%"
- : "?";
- const badgeClass = summary.pass_rate >= 0.8 ? "grade-pass" : summary.pass_rate >= 0.5 ? "" : "grade-fail";
- html += '<div class="grades-summary">';
- html += '<span class="grade-badge ' + badgeClass + '">' + passRate + '</span>';
- html += '<span>' + (summary.passed || 0) + ' passed, ' + (summary.failed || 0) + ' failed of ' + (summary.total || 0) + '</span>';
- html += '</div>';
- // Assertions list
- html += '<ul class="assertion-list">';
- for (const exp of expectations) {
- const statusClass = exp.passed ? "pass" : "fail";
- const statusIcon = exp.passed ? "\u2713" : "\u2717";
- html += '<li class="assertion-item">';
- html += '<span class="assertion-status ' + statusClass + '">' + statusIcon + '</span>';
- html += '<span>' + escapeHtml(exp.text) + '</span>';
- if (exp.evidence) {
- html += '<div class="assertion-evidence">' + escapeHtml(exp.evidence) + '</div>';
- }
- html += '</li>';
- }
- html += '</ul>';
- html += '</div>';
- content.innerHTML = html;
- }
- function toggleGrades() {
- const content = document.getElementById("grades-content");
- const arrow = document.getElementById("grades-arrow");
- content.classList.toggle("open");
- arrow.classList.toggle("open");
- }
- // ---- Previous outputs (collapsible) ----
- function renderPrevOutputs(run) {
- const section = document.getElementById("prev-outputs-section");
- const content = document.getElementById("prev-outputs-content");
- const prevOutputs = (EMBEDDED_DATA.previous_outputs || {})[run.id];
- if (!prevOutputs || prevOutputs.length === 0) {
- section.style.display = "none";
- return;
- }
- section.style.display = "block";
- // Reset to collapsed
- content.classList.remove("open");
- document.getElementById("prev-outputs-arrow").classList.remove("open");
- // Render the files into the content area
- content.innerHTML = "";
- const wrapper = document.createElement("div");
- wrapper.style.padding = "1rem";
- for (const file of prevOutputs) {
- const fileDiv = document.createElement("div");
- fileDiv.className = "output-file";
- const header = document.createElement("div");
- header.className = "output-file-header";
- const nameSpan = document.createElement("span");
- nameSpan.textContent = file.name;
- header.appendChild(nameSpan);
- const dlBtn = document.createElement("a");
- dlBtn.className = "dl-btn";
- dlBtn.textContent = "Download";
- dlBtn.download = file.name;
- dlBtn.href = getDownloadUri(file);
- header.appendChild(dlBtn);
- fileDiv.appendChild(header);
- const fc = document.createElement("div");
- fc.className = "output-file-content";
- if (file.type === "text") {
- const pre = document.createElement("pre");
- pre.textContent = file.content;
- fc.appendChild(pre);
- } else if (file.type === "image") {
- const img = document.createElement("img");
- img.src = file.data_uri;
- img.alt = file.name;
- fc.appendChild(img);
- } else if (file.type === "pdf") {
- const iframe = document.createElement("iframe");
- iframe.src = file.data_uri;
- fc.appendChild(iframe);
- } else if (file.type === "xlsx") {
- renderXlsx(fc, file.data_b64);
- } else if (file.type === "binary") {
- const a = document.createElement("a");
- a.className = "download-link";
- a.href = file.data_uri;
- a.download = file.name;
- a.textContent = "Download " + file.name;
- fc.appendChild(a);
- }
- fileDiv.appendChild(fc);
- wrapper.appendChild(fileDiv);
- }
- content.appendChild(wrapper);
- }
- function togglePrevOutputs() {
- const content = document.getElementById("prev-outputs-content");
- const arrow = document.getElementById("prev-outputs-arrow");
- content.classList.toggle("open");
- arrow.classList.toggle("open");
- }
- // ---- Feedback (saved to server -> feedback.json) ----
/**
 * Auto-save the feedback typed for the currently displayed run.
 *
 * Stores the textarea content in `feedbackMap` under the current run id
 * (blank text removes the entry), then POSTs every non-blank entry to
 * /api/feedback with status "in_progress".
 *
 * The status line reads "Saved" only when the server answers with a 2xx
 * response. A network failure or a non-2xx response (for example a static
 * file server rejecting the POST) shows "Will download on submit" instead.
 */
function saveCurrentFeedback() {
  const run = EMBEDDED_DATA.runs[currentIndex];
  const text = document.getElementById("feedback").value;
  if (text.trim() === "") {
    delete feedbackMap[run.id];
  } else {
    feedbackMap[run.id] = text;
  }

  // Build the reviews array from the map; one timestamp for the whole save.
  const timestamp = new Date().toISOString();
  const reviews = Object.entries(feedbackMap)
    .filter(([, feedback]) => feedback.trim())
    .map(([run_id, feedback]) => ({ run_id, feedback, timestamp }));

  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ reviews, status: "in_progress" }),
  })
    .then((response) => {
      // fetch() resolves for HTTP errors too, so an explicit check is needed
      // before claiming the feedback was stored.
      if (!response.ok) {
        throw new Error("Feedback save failed: HTTP " + response.status);
      }
      document.getElementById("feedback-status").textContent = "Saved";
    })
    .catch(() => {
      // Static mode or server unavailable — no-op on auto-save,
      // feedback will be downloaded on final submit
      document.getElementById("feedback-status").textContent = "Will download on submit";
    });
}
- // ---- Done ----
/**
 * Submit the final review and show the "done" overlay.
 *
 * Stores the current textarea content in `feedbackMap`, then POSTs one
 * review entry per run in EMBEDDED_DATA.runs (empty feedback included) with
 * status "complete", so a consumer can tell "no feedback" apart from
 * "not reviewed".
 *
 * If the request fails or the server answers with a non-2xx status, the
 * same payload is offered as a `feedback.json` download so the feedback is
 * not lost. The overlay is shown once either path has finished.
 */
function showDoneDialog() {
  // Save current textarea to feedbackMap (but don't POST yet)
  const run = EMBEDDED_DATA.runs[currentIndex];
  const text = document.getElementById("feedback").value;
  if (text.trim() === "") {
    delete feedbackMap[run.id];
  } else {
    feedbackMap[run.id] = text;
  }

  // Include ALL runs, with "" for runs that have no feedback.
  const ts = new Date().toISOString();
  const reviews = EMBEDDED_DATA.runs.map((r) => ({
    run_id: r.id,
    feedback: feedbackMap[r.id] || "",
    timestamp: ts,
  }));
  const payload = JSON.stringify({ reviews, status: "complete" }, null, 2);

  fetch("/api/feedback", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
  })
    .then((response) => {
      // fetch() resolves on HTTP errors; treat them like an unavailable
      // server so the download fallback below still runs.
      if (!response.ok) {
        throw new Error("Feedback submit failed: HTTP " + response.status);
      }
    })
    .catch(() => {
      // Server not available (static mode) — download as file
      const blob = new Blob([payload], { type: "application/json" });
      const url = URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.href = url;
      a.download = "feedback.json";
      a.click();
      // Revoke on a later task so the browser has started the download
      // before the blob URL becomes invalid.
      setTimeout(() => URL.revokeObjectURL(url), 0);
    })
    .then(() => {
      document.getElementById("done-overlay").classList.add("visible");
    });
}
/**
 * Dismiss the "done" overlay and resume reviewing.
 *
 * Re-runs the regular auto-save first, which posts the feedback with status
 * "in_progress" again, then hides the overlay.
 */
function closeDoneDialog() {
  saveCurrentFeedback();

  const overlay = document.getElementById("done-overlay");
  overlay.classList.remove("visible");
}
- // ---- Toast ----
/**
 * Show a transient toast message for two seconds.
 *
 * A pending hide timer from an earlier call is cancelled first, so a toast
 * shown shortly after another one still stays visible for its full duration
 * instead of being hidden by the older timer.
 *
 * @param {string} message - Text to display in the toast.
 */
function showToast(message) {
  const toast = document.getElementById("toast");
  toast.textContent = message;
  toast.classList.add("visible");

  // The timer handle lives on the function object so no extra global is needed.
  clearTimeout(showToast.hideTimer);
  showToast.hideTimer = setTimeout(() => {
    showToast.hideTimer = undefined;
    toast.classList.remove("visible");
  }, 2000);
}
- // ---- Keyboard nav ----
// Arrow keys move between runs: Left/Up goes back, Right/Down goes forward.
document.addEventListener("keydown", (e) => {
  // Leave modified arrows to the browser/OS (e.g. Alt+Left/Right is history
  // navigation, Shift+Arrow extends a selection) instead of swallowing them.
  if (e.altKey || e.ctrlKey || e.metaKey || e.shiftKey) return;

  // Don't capture when typing in an editable control
  const target = e.target;
  const tag = target && target.tagName;
  if (tag === "TEXTAREA" || tag === "INPUT" || tag === "SELECT") return;
  if (target && target.isContentEditable) return;

  switch (e.key) {
    case "ArrowLeft":
    case "ArrowUp":
      e.preventDefault();
      navigate(-1);
      break;
    case "ArrowRight":
    case "ArrowDown":
      e.preventDefault();
      navigate(1);
      break;
    default:
      break;
  }
});
- // ---- Util ----
/**
 * Pick the URI used by a file's download link.
 *
 * Preference order: an existing `data_uri`, then a base64 payload wrapped as
 * an octet-stream data URI, then (for text files) the URI-encoded content.
 * Falls back to "#" when none of these is available.
 *
 * @param {object} file - File descriptor with optional `data_uri`,
 *   `data_b64`, `type` and `content` fields.
 * @returns {string} URI suitable for an anchor's `href`.
 */
function getDownloadUri(file) {
  let uri = "#";
  if (file.data_uri) {
    uri = file.data_uri;
  } else if (file.data_b64) {
    uri = "data:application/octet-stream;base64," + file.data_b64;
  } else if (file.type === "text") {
    uri = "data:text/plain;charset=utf-8," + encodeURIComponent(file.content);
  }
  return uri;
}
/**
 * Escape text for safe insertion into HTML markup.
 *
 * Escapes `&`, `<`, `>` and both quote characters, so the result is safe in
 * element content and inside quoted attribute values. (Serialising a text
 * node through `innerHTML` leaves quotes untouched, which breaks markup such
 * as `title="..."`.)
 *
 * @param {*} text - Value to escape; null/undefined yield an empty string,
 *   anything else is converted with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
  if (text == null) return "";
  const replacements = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return String(text).replace(/[&<>"']/g, (ch) => replacements[ch]);
}
- // ---- View switching ----
/**
 * Activate one view (tab plus panel) and deactivate all others.
 *
 * The tab is located through its inline `onclick="switchView('<view>')"`
 * attribute and the panel through the id `panel-<view>`.
 *
 * @param {string} view - View name as used in the tab's onclick attribute
 *   and the panel id suffix.
 */
function switchView(view) {
  const deactivate = (el) => el.classList.remove("active");
  document.querySelectorAll(".view-tab").forEach(deactivate);
  document.querySelectorAll(".view-panel").forEach(deactivate);

  const tab = document.querySelector(`[onclick="switchView('${view}')"]`);
  tab.classList.add("active");

  const panel = document.getElementById("panel-" + view);
  panel.classList.add("active");
}
- // ---- Benchmark rendering ----
/**
 * Render the benchmark panel from EMBEDDED_DATA.benchmark.
 *
 * Does nothing when no benchmark data is embedded. Otherwise it reveals the
 * view tabs and fills #benchmark-content with:
 *   1. a header line built from `metadata`,
 *   2. a summary table comparing the first two configurations found in
 *      `run_summary` (every key except "delta") plus the delta column,
 *   3. per-eval tables with one row per run and an average row per
 *      configuration, followed by a per-assertion pass/fail grid when runs
 *      carry `expectations`,
 *   4. the analysis notes list.
 *
 * Every data-derived value is HTML-escaped before it is concatenated into
 * the markup, and attribute values are additionally quote-escaped, because
 * the result is assigned to innerHTML. Missing or non-numeric statistics are
 * rendered as "—" instead of throwing.
 */
function renderBenchmark() {
  const data = EMBEDDED_DATA.benchmark;
  if (!data) return;

  // Show the tabs
  document.getElementById("view-tabs").style.display = "flex";

  const container = document.getElementById("benchmark-content");
  const summary = data.run_summary || {};
  const metadata = data.metadata || {};
  const notes = data.notes || [];
  const runs = data.runs || [];

  const POSITIVE = "benchmark-delta-positive";
  const NEGATIVE = "benchmark-delta-negative";

  // Escape a value for element content; null/undefined render as nothing.
  const esc = (value) => (value == null ? "" : escapeHtml(String(value)));
  // Escape a value for a double-quoted attribute. The extra replace keeps
  // this safe even if escapeHtml leaves double quotes untouched.
  const escAttr = (value) => esc(value).replace(/"/g, "&quot;");
  // "with_skill" -> "With Skill"
  const toLabel = (name) =>
    String(name).replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
  // Colour class for a pass rate: green from 80%, red below 50%.
  const passRateClass = (rate) => (rate >= 0.8 ? POSITIVE : rate < 0.5 ? NEGATIVE : "");
  const fmtSeconds = (value) => (typeof value === "number" ? value.toFixed(1) : "—");
  const sum = (values) => values.reduce((acc, v) => acc + v, 0);

  // Format a { mean, stddev } statistic as "mean ± stddev".
  function fmtStat(stat, pct) {
    if (!stat || typeof stat.mean !== "number") return "—";
    const suffix = pct ? "%" : "";
    const fmt = (v) => (pct ? (v * 100).toFixed(0) : v.toFixed(1));
    const mean = fmt(stat.mean) + suffix;
    if (typeof stat.stddev !== "number") return mean;
    return mean + " ± " + fmt(stat.stddev) + suffix;
  }

  // Colour class for a delta value based on its sign.
  function deltaClass(val) {
    if (!val) return "";
    const n = parseFloat(val);
    if (n > 0) return POSITIVE;
    if (n < 0) return NEGATIVE;
    return "";
  }

  const parts = [];

  // Header
  parts.push("<h2 style='font-family: Poppins, sans-serif; margin-bottom: 0.5rem;'>Benchmark Results</h2>");
  parts.push("<p style='color: var(--text-muted); font-size: 0.875rem; margin-bottom: 1.25rem;'>");
  if (metadata.skill_name) parts.push("<strong>" + esc(metadata.skill_name) + "</strong> — ");
  if (metadata.timestamp) parts.push(esc(metadata.timestamp) + " — ");
  if (metadata.evals_run) parts.push("Evals: " + esc([].concat(metadata.evals_run).join(", ")) + " — ");
  parts.push(esc(metadata.runs_per_configuration || "?") + " runs per configuration");
  parts.push("</p>");

  // Summary table
  parts.push('<table class="benchmark-table">');

  // Discover config names dynamically (everything except "delta")
  const configs = Object.keys(summary).filter((k) => k !== "delta");
  const configA = configs[0] || "config_a";
  const configB = configs[1] || "config_b";
  const a = summary[configA] || {};
  const b = summary[configB] || {};
  const delta = summary.delta || {};

  parts.push(
    "<thead><tr><th>Metric</th><th>" + esc(toLabel(configA)) + "</th><th>" +
      esc(toLabel(configB)) + "</th><th>Delta</th></tr></thead>"
  );
  parts.push("<tbody>");

  parts.push("<tr><td><strong>Pass Rate</strong></td>");
  parts.push("<td>" + fmtStat(a.pass_rate, true) + "</td>");
  parts.push("<td>" + fmtStat(b.pass_rate, true) + "</td>");
  parts.push('<td class="' + deltaClass(delta.pass_rate) + '">' + esc(delta.pass_rate || "—") + "</td></tr>");

  // Time (only show row if data exists)
  if (a.time_seconds || b.time_seconds) {
    parts.push("<tr><td><strong>Time (s)</strong></td>");
    parts.push("<td>" + fmtStat(a.time_seconds, false) + "</td>");
    parts.push("<td>" + fmtStat(b.time_seconds, false) + "</td>");
    parts.push(
      '<td class="' + deltaClass(delta.time_seconds) + '">' +
        (delta.time_seconds ? esc(delta.time_seconds) + "s" : "—") + "</td></tr>"
    );
  }

  // Tokens (only show row if data exists)
  if (a.tokens || b.tokens) {
    parts.push("<tr><td><strong>Tokens</strong></td>");
    parts.push("<td>" + fmtStat(a.tokens, false) + "</td>");
    parts.push("<td>" + fmtStat(b.tokens, false) + "</td>");
    parts.push('<td class="' + deltaClass(delta.tokens) + '">' + esc(delta.tokens || "—") + "</td></tr>");
  }

  parts.push("</tbody></table>");

  // Per-eval breakdown (if runs data available)
  if (runs.length > 0) {
    const evalIds = [...new Set(runs.map((r) => r.eval_id))].sort((x, y) => x - y);
    parts.push("<h3 style='font-family: Poppins, sans-serif; margin-bottom: 0.75rem;'>Per-Eval Breakdown</h3>");

    // Optional columns are decided once so every eval table has the same shape.
    const hasTime = runs.some((r) => r.result && r.result.time_seconds != null);
    const hasErrors = runs.some((r) => r.result && r.result.errors > 0);

    for (const evalId of evalIds) {
      const evalRuns = runs.filter((r) => r.eval_id === evalId);
      const evalName = evalRuns[0] && evalRuns[0].eval_name ? evalRuns[0].eval_name : "Eval " + evalId;

      parts.push(
        "<h4 style='font-family: Poppins, sans-serif; margin: 1rem 0 0.5rem; color: var(--text);'>" +
          esc(evalName) + "</h4>"
      );
      parts.push('<table class="benchmark-table">');
      parts.push("<thead><tr><th>Config</th><th>Run</th><th>Pass Rate</th>");
      if (hasTime) parts.push("<th>Time (s)</th>");
      if (hasErrors) parts.push("<th>Crashes During Execution</th>");
      parts.push("</tr></thead>");
      parts.push("<tbody>");

      // Group by config and render with average rows
      const configGroups = [...new Set(evalRuns.map((r) => r.configuration))];
      configGroups.forEach((config, ci) => {
        const configRuns = evalRuns.filter((r) => r.configuration === config);
        if (configRuns.length === 0) return;

        // The first configuration seen for this eval gets the "with" styling.
        const rowClass = ci === 0 ? "benchmark-row-with" : "benchmark-row-without";
        const configLabel = esc(toLabel(config));

        for (const run of configRuns) {
          const r = run.result || {};
          parts.push('<tr class="' + rowClass + '">');
          parts.push("<td>" + configLabel + "</td>");
          parts.push("<td>" + esc(run.run_number) + "</td>");
          parts.push(
            '<td class="' + passRateClass(r.pass_rate) + '">' +
              ((r.pass_rate || 0) * 100).toFixed(0) + "% (" +
              esc(r.passed || 0) + "/" + esc(r.total || 0) + ")</td>"
          );
          if (hasTime) parts.push("<td>" + fmtSeconds(r.time_seconds) + "</td>");
          if (hasErrors) parts.push("<td>" + esc(r.errors || 0) + "</td>");
          parts.push("</tr>");
        }

        // Average row
        const rates = configRuns.map((r) => (r.result || {}).pass_rate || 0);
        const avgRate = sum(rates) / rates.length;
        parts.push('<tr class="benchmark-row-avg ' + rowClass + '">');
        parts.push("<td>" + configLabel + "</td>");
        parts.push("<td>Avg</td>");
        parts.push('<td class="' + passRateClass(avgRate) + '">' + (avgRate * 100).toFixed(0) + "%</td>");
        if (hasTime) {
          const times = configRuns
            .map((r) => (r.result || {}).time_seconds)
            .filter((t) => typeof t === "number");
          parts.push("<td>" + (times.length ? (sum(times) / times.length).toFixed(1) : "—") + "</td>");
        }
        if (hasErrors) parts.push("<td></td>");
        parts.push("</tr>");
      });
      parts.push("</tbody></table>");

      // Per-assertion detail for this eval
      const runsWithExpectations = {};
      for (const config of configGroups) {
        runsWithExpectations[config] = evalRuns.filter(
          (r) => r.configuration === config && r.expectations && r.expectations.length > 0
        );
      }
      const hasAnyExpectations = Object.values(runsWithExpectations).some((group) => group.length > 0);
      if (!hasAnyExpectations) continue;

      // Collect all unique assertion texts across all configs, in first-seen order
      const seen = new Set();
      for (const config of configGroups) {
        for (const run of runsWithExpectations[config]) {
          for (const exp of run.expectations || []) {
            seen.add(exp.text);
          }
        }
      }
      const allAssertions = [...seen];

      parts.push('<table class="benchmark-table" style="margin-top: 0.5rem;">');
      parts.push("<thead><tr><th>Assertion</th>");
      for (const config of configGroups) {
        parts.push("<th>" + esc(toLabel(config)) + "</th>");
      }
      parts.push("</tr></thead><tbody>");

      for (const assertionText of allAssertions) {
        parts.push("<tr><td>" + esc(assertionText) + "</td>");
        for (const config of configGroups) {
          parts.push("<td>");
          // One mark per run of this configuration, in run order.
          for (const run of runsWithExpectations[config]) {
            const exp = (run.expectations || []).find((e) => e.text === assertionText);
            if (exp) {
              const cls = exp.passed ? POSITIVE : NEGATIVE;
              const icon = exp.passed ? "\u2713" : "\u2717";
              const title = "Run " + run.run_number + ": " + (exp.evidence || "");
              parts.push('<span class="' + cls + '" title="' + escAttr(title) + '">' + icon + "</span> ");
            } else {
              parts.push("— ");
            }
          }
          parts.push("</td>");
        }
        parts.push("</tr>");
      }
      parts.push("</tbody></table>");
    }
  }

  // Notes
  if (notes.length > 0) {
    parts.push('<div class="benchmark-notes">');
    parts.push("<h3>Analysis Notes</h3>");
    parts.push("<ul>");
    for (const note of notes) {
      parts.push("<li>" + esc(note) + "</li>");
    }
    parts.push("</ul></div>");
  }

  container.innerHTML = parts.join("");
}
- // ---- Start ----
- init();
- renderBenchmark();
- </script>
- </body>
- </html>
|