diff --git a/packages/interfacectl-cli/dist/commands/generation-session.d.ts b/packages/interfacectl-cli/dist/commands/generation-session.d.ts index 7d0149d..78911c5 100644 --- a/packages/interfacectl-cli/dist/commands/generation-session.d.ts +++ b/packages/interfacectl-cli/dist/commands/generation-session.d.ts @@ -54,7 +54,20 @@ export interface SummarizeGenerationBenchmarkCommandOptions { comparisonPaths?: string; suggestionPaths?: string; outDir?: string; + runPath?: string; } +export interface ReplayGenerationBenchmarkCommandOptions { + specPath?: string; + tool?: string; + outDir?: string; + cohortId?: string; + sourceRunPath?: string; + requestedModelLabel?: string; + resolvedModelId?: string; + baseUrl?: string; + fingerprint?: string; +} +export declare function runReplayGenerationBenchmarkCommand(options: ReplayGenerationBenchmarkCommandOptions): Promise; export declare function runInitGenerationSessionCommand(options: InitGenerationSessionCommandOptions): Promise; export declare function runPrepareGenerationHandoffCommand(options: PrepareGenerationHandoffCommandOptions): Promise; export declare function runRecordGenerationAttemptCommand(options: RecordGenerationAttemptCommandOptions): Promise; diff --git a/packages/interfacectl-cli/dist/commands/generation-session.d.ts.map b/packages/interfacectl-cli/dist/commands/generation-session.d.ts.map index 77aaf71..5257f5b 100644 --- a/packages/interfacectl-cli/dist/commands/generation-session.d.ts.map +++ b/packages/interfacectl-cli/dist/commands/generation-session.d.ts.map @@ -1 +1 @@ -{"version":3,"file":"generation-session.d.ts","sourceRoot":"","sources":["../../src/commands/generation-session.ts"],"names":[],"mappings":"AA8BA,MAAM,WAAW,mCAAmC;IAClD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,sCAAsC;IACrD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,qCAAqC;IACpD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,sCAAsC;IACrD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC;IAChC,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,qCAAqC;IACpD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC;IAChC,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,wCAAwC;IACvD,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,uCAAuC;IACtD,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,mCAAmC;IAClD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,4CAA4C;IAC3D,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,0CAA0C;IACzD,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAqhED,wBAAsB,+BAA+B,CACnD,OAAO,EAAE,mCAAmC,GAC3C,OAAO,CAAC,MAAM,CAAC,CA8EjB;AAED,wBAAsB,kCAAkC,CACtD,OAAO,EAAE,sCAAsC,GAC9C,OAAO,CAAC,MAAM,CAAC,CA6DjB;AAED,wBAAsB,iCAAiC,CACrD,OAAO,EAAE,qCAAqC,GAC7C,OAAO,CAAC,MAAM,CAAC,CA+FjB;AAED,wBAAsB,kCAAkC,CACtD,OAAO,EAAE,sCAAsC,GAC9C,OAAO,CAAC,MAAM,CAAC,CA0GjB;AAED,wBAAsB,iCAAiC,CACrD,OAAO,EAAE,qCAAqC,GAC7C,OAAO,CAAC,MAAM,CAAC,CAuEjB;AAED,wBAAsB,oCAAoC,CACxD,OAAO,EAAE,wCAAwC,GAChD,OAAO,CAAC,MAAM,CAAC,CAmBjB;AAED,wBAAsB,mCAAmC,CACvD,OAAO,EAAE,uCAAuC,GAC/C,OAAO,CAAC,MAAM,CAAC,CA4CjB;AAED,wBAAsB,+BAA+B,CACnD,OAAO,EAAE,mCAAmC,GAC3C,OAAO,CAAC,MAAM,CAAC,CAuCjB;AAED,wBAAsB,wCAAwC,CAC5D,OAAO,EAAE,4CAA4C,GACpD,OAAO,CAAC,MAAM,CAAC,CA8EjB;AAED,wBAAsB,sCAAsC,CAC1D,OAAO,EAAE,0CAA0C,GAClD,OAAO,CAAC,MAAM,CAAC,CAuIjB"} \ No newline at end of file +{"version":3,"file":"generation-session.d.ts","sourceRoot":"","sources":["../../src/commands/generation-session.ts"],"names":[],"mappings":"AAuCA,MAAM,WAAW,mCAAmC;IAClD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,sCAAsC;IACrD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,qCAAqC;IACpD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,sCAAsC;IACrD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC;IAChC,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,qCAAqC;IACpD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC;IAChC,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,wCAAwC;IACvD,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,uCAAuC;IACtD,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,mCAAmC;IAClD,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,4CAA4C;IAC3D,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,0CAA0C;IACzD,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,uCAAuC;IACtD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAw+ED,wBAAsB,mCAAmC,CACvD,OAAO,EAAE,uCAAuC,GAC/C,OAAO,CAAC,MAAM,CAAC,CAgFjB;AAED,wBAAsB,+BAA+B,CACnD,OAAO,EAAE,mCAAmC,GAC3C,OAAO,CAAC,MAAM,CAAC,CA8EjB;AAED,wBAAsB,kCAAkC,CACtD,OAAO,EAAE,sCAAsC,GAC9C,OAAO,CAAC,MAAM,CAAC,CA6DjB;AAED,wBAAsB,iCAAiC,CACrD,OAAO,EAAE,qCAAqC,GAC7C,OAAO,CAAC,MAAM,CAAC,CA+FjB;AAED,wBAAsB,kCAAkC,CACtD,OAAO,EAAE,sCAAsC,GAC9C,OAAO,CAAC,MAAM,CAAC,CA0GjB;AAED,wBAAsB,iCAAiC,CACrD,OAAO,EAAE,qCAAqC,GAC7C,OAAO,CAAC,MAAM,CAAC,CAuEjB;AAED,wBAAsB,oCAAoC,CACxD,OAAO,EAAE,wCAAwC,GAChD,OAAO,CAAC,MAAM,CAAC,CAmBjB;AAED,wBAAsB,mCAAmC,CACvD,OAAO,EAAE,uCAAuC,GAC/C,OAAO,CAAC,MAAM,CAAC,CA4CjB;AAED,wBAAsB,+BAA+B,CACnD,OAAO,EAAE,mCAAmC,GAC3C,OAAO,CAAC,MAAM,CAAC,CAuCjB;AAED,wBAAsB,wCAAwC,CAC5D,OAAO,EAAE,4CAA4C,GACpD,OAAO,CAAC,MAAM,CAAC,CA8EjB;AAED,wBAAsB,sCAAsC,CAC1D,OAAO,EAAE,0CAA0C,GAClD,OAAO,CAAC,MAAM,CAAC,CA8QjB"} \ No newline at end of file diff --git a/packages/interfacectl-cli/dist/commands/generation-session.js b/packages/interfacectl-cli/dist/commands/generation-session.js index 1fdd7ae..6916605 100644 --- a/packages/interfacectl-cli/dist/commands/generation-session.js +++ b/packages/interfacectl-cli/dist/commands/generation-session.js @@ -9,16 +9,20 @@ import { emitContractRunArtifact, } from "../utils/run-artifacts.js"; import { writeDeterministicJsonSync } from "../utils/deterministic-json.js"; const VALID_TOOLS = new Set(["codex", "cursor", "local-llm"]); const VALID_GRADES = new Set(["strong", "partial", "weak"]); -const VALID_GUIDANCE_STRATEGIES = new Set(["prompt-summary", "json-primary", "unguided"]); +const VALID_GUIDANCE_STRATEGIES = new Set(["prompt-summary", "baseline-primary", "json-primary", "unguided"]); const VALID_REVIEW_STATUSES = new Set(["accepted", "rejected"]); const VALID_SUGGESTION_STATUSES = new Set(["proposed", "accepted", "rejected"]); const VALID_SUCCESS_RULES = new Set(["pass", "pass-or-reviewed-warn"]); +const VALID_EVALUATION_MODES = new Set(["zero-shot", "iterative"]); +const VALID_PLATFORM_TARGETS = new Set(["web", "ios", "android"]); +const VALID_CONSUMER_TYPES = new Set(["web-browser", "desktop-shell", "ios-native", "android-native"]); const ASSESSMENT_DIMENSIONS = [ "structure", "components", "boundary", "visual", "responsiveness", + "platformFit", ]; class SessionInputError extends Error { code; @@ -79,10 +83,31 @@ function ensureGuidanceStrategy(guidanceStrategy) { const normalized = typeof guidanceStrategy === "string" ? guidanceStrategy.trim().toLowerCase() : "prompt-summary"; const mapped = normalized === "prepared" ? "prompt-summary" : normalized; if (!VALID_GUIDANCE_STRATEGIES.has(mapped)) { - throw new SessionInputError(`Invalid guidance strategy "${guidanceStrategy ?? ""}". Expected prompt-summary|json-primary|unguided.`); + throw new SessionInputError(`Invalid guidance strategy "${guidanceStrategy ?? ""}". Expected prompt-summary|baseline-primary|json-primary|unguided.`); } return mapped; } +function ensureEvaluationMode(value) { + const normalized = typeof value === "string" ? value.trim().toLowerCase() : "zero-shot"; + if (!VALID_EVALUATION_MODES.has(normalized)) { + throw new SessionInputError(`Invalid evaluation mode "${value ?? ""}". Expected zero-shot|iterative.`); + } + return normalized; +} +function ensurePlatformTarget(value, label) { + const normalized = typeof value === "string" ? value.trim().toLowerCase() : ""; + if (!VALID_PLATFORM_TARGETS.has(normalized)) { + throw new SessionInputError(`Invalid ${label} "${String(value ?? "")}". Expected web|ios|android.`); + } + return normalized; +} +function ensureConsumerType(value, label) { + const normalized = typeof value === "string" ? value.trim().toLowerCase() : ""; + if (!VALID_CONSUMER_TYPES.has(normalized)) { + throw new SessionInputError(`Invalid ${label} "${String(value ?? "")}". Expected web-browser|desktop-shell|ios-native|android-native.`); + } + return normalized; +} function buildDefaultSessionId() { return new Date().toISOString().replace(/[-:]/g, "").replace(/\.\d{3}Z$/, "Z"); } @@ -121,7 +146,9 @@ function normalizeAssessment(payload, filePath, options = {}) { const structureFallback = payload.structure; const grade = (key) => { let value = payload[key]; - if (value === undefined && options.allowLegacyMissing && (key === "components" || key === "boundary")) { + if (value === undefined + && options.allowLegacyMissing + && (key === "components" || key === "boundary" || key === "platformFit")) { value = structureFallback; } if (!VALID_GRADES.has(value)) { @@ -175,6 +202,7 @@ function normalizeAssessment(payload, filePath, options = {}) { boundary: grade("boundary"), visual: grade("visual"), responsiveness: grade("responsiveness"), + platformFit: grade("platformFit"), notes, ...(touchedFiles && touchedFiles.length > 0 ? { touchedFiles } : {}), ...(heuristics ? { heuristics } : {}), @@ -457,6 +485,22 @@ function averageNullable(values) { } return Math.round((filtered.reduce((sum, value) => sum + value, 0) / filtered.length) * 1000) / 1000; } +function readOptionalTrimmedText(filePath) { + if (!filePath || !fs.existsSync(filePath)) { + return null; + } + return fs.readFileSync(filePath, "utf8").trim(); +} +function appendArtifactLines(lines, title, artifacts) { + const filtered = artifacts.filter(([, filePath]) => Boolean(filePath)); + if (filtered.length === 0) { + return; + } + lines.push("", title); + for (const [label, filePath] of filtered) { + lines.push(`- ${label}: ${filePath}`); + } +} function renderSummaryMarkdown(summary) { const lines = [ "# Generation Session Summary", @@ -496,6 +540,7 @@ function renderSummaryMarkdown(summary) { lines.push(`- boundary: ${summary.latestAssessment?.boundary ?? "n/a"}`); lines.push(`- visual: ${summary.latestAssessment?.visual ?? "n/a"}`); lines.push(`- responsiveness: ${summary.latestAssessment?.responsiveness ?? "n/a"}`); + lines.push(`- platform fit: ${summary.latestAssessment?.platformFit ?? "n/a"}`); lines.push(`- notes: ${summary.latestAssessment?.notes ?? "n/a"}`); if (summary.latestAssessment?.touchedFiles?.length) { lines.push(`- touched files: ${summary.latestAssessment.touchedFiles.join(", ")}`); @@ -785,11 +830,24 @@ function renderSuggestionsMarkdown(artifact) { } return `${lines.join("\n")}\n`; } -function renderBenchmarkReportMarkdown(report) { +function renderBenchmarkReportMarkdown(report, run) { const lines = [ "# Generation Benchmark Report", "", `Generated at: ${report.generatedAt}`, + ...(report.run + ? [ + `Cohort: ${report.run.cohortId}`, + `Evaluation mode: ${report.run.evaluationMode}`, + `Tool: ${report.run.tool}`, + `Model label: ${report.run.model.requestedModelLabel ?? "not recorded"}`, + `Resolved model id: ${report.run.model.resolvedModelId ?? "not recorded"}`, + `Base URL: ${report.run.model.baseUrl ?? "not recorded"}`, + `Fingerprint: ${report.run.model.fingerprint ?? "not recorded"}`, + `Source spec: ${report.run.sourceSpecPath}`, + `Source run: ${report.run.sourceRunPath ?? "none"}`, + ] + : []), `Surfaces: ${report.overall.surfaceCount}`, `Surfaces meeting goal: ${report.overall.surfacesMeetingGoal}`, `Candidate fewer first-attempt blocking findings: ${report.overall.guidedFewerFirstAttemptBlockingFindings}`, @@ -797,8 +855,13 @@ function renderBenchmarkReportMarkdown(report) { "", "## Comparisons", ]; - for (const comparison of report.comparisons) { - lines.push(`- ${comparison.surfaceId}: baseline=${comparison.baselineGuidanceStrategy}, candidate=${comparison.guidedGuidanceStrategy}, meetsGoal=${comparison.meetsGoal}, improved dimensions=${comparison.guidedRubricBetterDimensions.join(", ") || "none"}`); + if (report.comparisons.length === 0) { + lines.push("- none"); + } + else { + for (const comparison of report.comparisons) { + lines.push(`- ${comparison.surfaceId}: baseline=${comparison.baselineGuidanceStrategy}, candidate=${comparison.guidedGuidanceStrategy}, platform=${comparison.platformTarget ?? "unknown"}, consumer=${comparison.consumerType ?? "unknown"}, model=${comparison.modelLabel ?? "unknown"}, meetsGoal=${comparison.meetsGoal}, improved dimensions=${comparison.guidedRubricBetterDimensions.join(", ") || "none"}`); + } } lines.push("", "## Suggestion decisions"); for (const suggestion of report.suggestions) { @@ -811,6 +874,65 @@ function renderBenchmarkReportMarkdown(report) { lines.push(`- lower touched files per resolved finding: ${report.overall.heuristics.lowerTouchedFilesPerResolvedFinding}`); lines.push(`- lower repeated finding carryover count: ${report.overall.heuristics.lowerRepeatedFindingCarryoverCount}`); lines.push(`- lower reruns to acceptable outcome: ${report.overall.heuristics.lowerRerunsToAcceptableOutcome}`); + if (report.breakdowns) { + const renderBreakdownBlock = (title, entries) => { + lines.push("", title); + const keys = Object.keys(entries).sort((left, right) => left.localeCompare(right)); + if (keys.length === 0) { + lines.push("- none"); + return; + } + for (const key of keys) { + const entry = entries[key]; + lines.push(`- ${key}: comparisons=${entry.comparisonCount}, surfaces=${entry.surfaceCount}, meetsGoal=${entry.surfacesMeetingGoal}, fewerBlocking=${entry.guidedFewerFirstAttemptBlockingFindings}, acceptableNoLater=${entry.guidedReachedAcceptableNoLater}`); + } + }; + renderBreakdownBlock("## By Platform Target", report.breakdowns.byPlatformTarget); + renderBreakdownBlock("## By Consumer Type", report.breakdowns.byConsumerType); + renderBreakdownBlock("## By Model", report.breakdowns.byModelLabel); + } + if (run) { + lines.push("", "## Zero-Shot Evidence"); + for (const fixture of run.fixtures) { + lines.push("", `### ${fixture.surfaceId}`, `- fixture: ${fixture.fixtureId}`, `- platform target: ${fixture.platformTarget}`, `- consumer type: ${fixture.consumerType}`, `- capture preset: ${fixture.capturePreset}`, `- brief path: ${fixture.brief.path}`, `- brief sha256: ${fixture.brief.sha256}`); + const briefText = readOptionalTrimmedText(fixture.brief.path); + if (briefText) { + lines.push("", "#### Benchmark Brief", "", "```md", briefText, "```"); + } + appendArtifactLines(lines, "#### Contract Artifacts", [ + ["source contract", fixture.paths?.sourceContractPath], + ["source AST", fixture.paths?.sourceAstPath], + ["bundle root", fixture.paths?.bundleRoot], + ["compiled contract", fixture.paths?.compiledContractPath], + ["effective AST", fixture.paths?.effectiveAstPath], + ]); + appendArtifactLines(lines, "#### Prompt And Input Artifacts", [ + ["prepared input", fixture.paths?.preparedInputPath], + ["accepted suggestions", fixture.paths?.acceptedSuggestionsPath], + ["designer notes", fixture.paths?.designerNotesPath], + ["baseline validate", fixture.paths?.baselineValidatePath], + ]); + if (fixture.comparisons.length > 0) { + lines.push("", "#### Fixture Comparisons"); + for (const comparison of fixture.comparisons) { + lines.push(`- ${comparison.baselineGuidanceStrategy} vs ${comparison.guidedGuidanceStrategy}: ${comparison.comparisonPath}`); + } + } + if (fixture.sessions.length > 0) { + lines.push("", "#### Session Evidence"); + for (const session of fixture.sessions) { + const summary = fs.existsSync(session.summaryPath) + ? readJsonFile(session.summaryPath, "generation benchmark session summary") + : null; + lines.push("", `##### ${session.guidanceStrategy}`, `- session id: ${session.sessionId}`, `- session dir: ${session.sessionDir}`, `- latest status: ${asString(summary?.latestStatus) ?? "not recorded"}`, `- latest outcome: ${asString(summary?.latestOutcome) ?? "not recorded"}`, `- error: ${asString(summary?.errorMessage) ?? "none"}`, `- summary path: ${session.summaryPath}`, `- guidance handoff: ${session.guidanceHandoffPath}`, `- agent input: ${session.agentInputPath}`, `- preview: ${session.previewPath ?? "not captured"}`); + const agentInput = readOptionalTrimmedText(session.agentInputPath); + if (agentInput) { + lines.push("", "```txt", agentInput, "```"); + } + } + } + } + } return `${lines.join("\n")}\n`; } function freezeBriefFile(sessionDir, briefFile) { @@ -900,6 +1022,35 @@ function buildPreparedPromptSummary(preparedPayload) { `Top repair priorities: ${topRepairs.join(", ") || "none"}`, ].join("\n"); } +function buildBaselinePrimarySummary(preparedPayload) { + const surface = asRecord(preparedPayload.surface); + const contract = asRecord(preparedPayload.contract); + const constraints = asRecord(preparedPayload.constraints); + const generation = asRecord(preparedPayload.generation); + const layout = asRecord(generation.layout); + const guidance = asRecord(generation.guidance); + const boundaryRules = Array.isArray(guidance.boundaryRules) + ? guidance.boundaryRules.filter((entry) => isRecord(entry)) + : []; + const sections = Array.isArray(preparedPayload.sections) + ? preparedPayload.sections.filter((entry) => isRecord(entry)) + : []; + const repairMap = extractRepairEntries(preparedPayload.repairMap); + const color = asRecord(constraints.color); + const motion = asRecord(constraints.motion); + return [ + `Surface: ${asString(surface.id) ?? "unknown"} (${asString(surface.type) ?? "unspecified"})`, + `Contract: ${asString(contract.id) ?? "unknown"} v${asString(contract.version) ?? "0.0.0"}`, + `Required sections: ${sections.map((entry) => asString(entry.id) ?? "").filter(Boolean).join(", ") || "none recorded"}`, + `Boundary rules: ${boundaryRules.map((entry) => asString(entry.id) ?? "").filter(Boolean).join(", ") || "none recorded"}`, + `Max content width: ${typeof layout.maxContentWidth === "number" ? `${layout.maxContentWidth}px` : "unspecified"}`, + `Allowed colors: ${asStringArray(color.allowedValues).join(", ") || "none recorded"}`, + `Motion durations: ${Array.isArray(motion.allowedDurationsMs) + ? motion.allowedDurationsMs.map((value) => `${String(value)}ms`).join(", ") + : "none recorded"}`, + `Top repair codes: ${repairMap.slice(0, 5).map((entry) => asString(entry.code) ?? "").filter(Boolean).join(", ") || "none"}`, + ].join("\n"); +} function selectRelevantComponents(preparedPayload) { const sections = Array.isArray(preparedPayload.sections) ? preparedPayload.sections.filter((entry) => isRecord(entry)) @@ -1064,6 +1215,12 @@ function buildGuidanceHandoff(session, paths, guidanceStrategy, options = {}) { preparedGuidanceSummary: buildPreparedPromptSummary(preparedPayload), } : null, + baselinePrimary: guidanceStrategy === "baseline-primary" + ? { + effectiveContractSummary: summarizeContractForSurface(session.contractPath, session.surfaceId), + baselineContractSummary: buildBaselinePrimarySummary(preparedPayload), + } + : null, jsonPrimary: guidanceStrategy === "json-primary" ? { surface: asRecord(preparedPayload.surface), @@ -1398,6 +1555,200 @@ function normalizeSuggestionReviewFile(filePath) { }; }); } +function buildDefaultBenchmarkCohortId() { + return new Date().toISOString().replace(/[-:.TZ]/g, "").slice(0, 14); +} +function normalizeBenchmarkComparisonPairs(value, label) { + if (!Array.isArray(value) || value.length === 0) { + throw new SessionInputError(`${label} must be a non-empty array.`); + } + return value.map((entry, index) => { + const record = asRecord(entry); + return { + baselineGuidanceStrategy: ensureGuidanceStrategy(asString(record.baselineGuidanceStrategy) ?? (() => { + throw new SessionInputError(`${label}[${index}].baselineGuidanceStrategy is required.`); + })()), + guidedGuidanceStrategy: ensureGuidanceStrategy(asString(record.guidedGuidanceStrategy) ?? (() => { + throw new SessionInputError(`${label}[${index}].guidedGuidanceStrategy is required.`); + })()), + }; + }); +} +function loadGenerationBenchmarkSpec(specPath) { + const resolvedPath = path.resolve(specPath); + const payload = readJsonFile(resolvedPath, "generation benchmark spec"); + const fixturesValue = payload.fixtures; + if (!Array.isArray(fixturesValue) || fixturesValue.length === 0) { + throw new SessionInputError(`Benchmark spec must include a non-empty fixtures array: ${resolvedPath}.`); + } + const guidanceStrategies = asStringArray(payload.guidanceStrategies).map((entry) => ensureGuidanceStrategy(entry)); + if (guidanceStrategies.length < 2) { + throw new SessionInputError(`Benchmark spec must freeze at least two guidance strategies: ${resolvedPath}.`); + } + const comparisonPairs = normalizeBenchmarkComparisonPairs(payload.comparisonPairs, "comparisonPairs"); + const attemptBudget = Number(payload.attemptBudget); + if (!Number.isInteger(attemptBudget) || attemptBudget < 1) { + throw new SessionInputError(`Benchmark spec attemptBudget must be a positive integer: ${resolvedPath}.`); + } + return { + schemaVersion: 1, + specId: asString(payload.specId) ?? path.basename(resolvedPath, path.extname(resolvedPath)), + generatedAt: asString(payload.generatedAt) ?? new Date().toISOString(), + evaluationMode: ensureEvaluationMode(asString(payload.evaluationMode) ?? "zero-shot"), + attemptBudget, + guidanceStrategies, + comparisonPairs, + ...(asString(payload.suiteId) ? { suiteId: asString(payload.suiteId) ?? undefined } : {}), + ...(asString(payload.suiteName) ? { suiteName: asString(payload.suiteName) ?? undefined } : {}), + fixtures: fixturesValue.map((entry, index) => { + const record = asRecord(entry); + const brief = asRecord(record.brief); + const pathsRecord = record.paths !== undefined ? asRecord(record.paths) : null; + const fixtureComparisonPairs = record.comparisonPairs !== undefined + ? normalizeBenchmarkComparisonPairs(record.comparisonPairs, `fixtures[${index}].comparisonPairs`) + : comparisonPairs; + return { + fixtureId: asString(record.fixtureId) ?? (() => { + throw new SessionInputError(`fixtures[${index}].fixtureId is required in ${resolvedPath}.`); + })(), + surfaceId: asString(record.surfaceId) ?? (() => { + throw new SessionInputError(`fixtures[${index}].surfaceId is required in ${resolvedPath}.`); + })(), + brief: { + path: asString(brief.path) ?? (() => { + throw new SessionInputError(`fixtures[${index}].brief.path is required in ${resolvedPath}.`); + })(), + sha256: asString(brief.sha256) ?? (() => { + throw new SessionInputError(`fixtures[${index}].brief.sha256 is required in ${resolvedPath}.`); + })(), + }, + platformTarget: ensurePlatformTarget(record.platformTarget, `fixtures[${index}].platformTarget`), + consumerType: ensureConsumerType(record.consumerType, `fixtures[${index}].consumerType`), + capturePreset: asString(record.capturePreset) ?? "web-browser", + comparisonPairs: fixtureComparisonPairs, + ...(pathsRecord + ? { + paths: { + ...(asString(pathsRecord.fixtureDir) ? { fixtureDir: asString(pathsRecord.fixtureDir) ?? undefined } : {}), + ...(asString(pathsRecord.sourceContractPath) + ? { sourceContractPath: asString(pathsRecord.sourceContractPath) ?? undefined } + : {}), + ...(asString(pathsRecord.sourceAstPath) + ? { sourceAstPath: asString(pathsRecord.sourceAstPath) ?? undefined } + : {}), + ...(asString(pathsRecord.bundleRoot) + ? { bundleRoot: asString(pathsRecord.bundleRoot) ?? undefined } + : {}), + ...(asString(pathsRecord.compiledContractPath) + ? { compiledContractPath: asString(pathsRecord.compiledContractPath) ?? undefined } + : {}), + ...(asString(pathsRecord.effectiveAstPath) + ? { effectiveAstPath: asString(pathsRecord.effectiveAstPath) ?? undefined } + : {}), + ...(asString(pathsRecord.preparedInputPath) + ? { preparedInputPath: asString(pathsRecord.preparedInputPath) ?? undefined } + : {}), + ...(asString(pathsRecord.acceptedSuggestionsPath) + ? { acceptedSuggestionsPath: asString(pathsRecord.acceptedSuggestionsPath) ?? undefined } + : {}), + ...(asString(pathsRecord.designerNotesPath) + ? { designerNotesPath: asString(pathsRecord.designerNotesPath) ?? undefined } + : {}), + ...(asString(pathsRecord.baselineValidatePath) + ? { baselineValidatePath: asString(pathsRecord.baselineValidatePath) ?? undefined } + : {}), + }, + } + : {}), + }; + }), + }; +} +function loadGenerationBenchmarkRun(runPath) { + return readJsonFile(path.resolve(runPath), "generation benchmark run"); +} +function buildBreakdownSummary(entries) { + return { + comparisonCount: entries.length, + surfaceCount: new Set(entries.map((entry) => entry.surfaceId)).size, + surfacesMeetingGoal: entries.filter((entry) => entry.meetsGoal).length, + guidedFewerFirstAttemptBlockingFindings: entries.filter((entry) => entry.guidedFewerFirstAttemptBlockingFindings).length, + guidedReachedAcceptableNoLater: entries.filter((entry) => entry.guidedReachedAcceptableNoLater).length, + }; +} +export async function runReplayGenerationBenchmarkCommand(options) { + try { + if (!options.specPath) { + throw new SessionInputError("--spec is required."); + } + if (!options.outDir) { + throw new SessionInputError("--out-dir is required."); + } + const tool = ensureSessionTool(options.tool); + const specPath = path.resolve(options.specPath); + const spec = loadGenerationBenchmarkSpec(specPath); + const benchmarkDir = path.resolve(options.outDir); + const cohortId = options.cohortId?.trim() || buildDefaultBenchmarkCohortId(); + const runPath = path.join(benchmarkDir, "run.json"); + const copiedSpecPath = path.join(benchmarkDir, "spec.json"); + const sourceRunPath = options.sourceRunPath ? path.resolve(options.sourceRunPath) : null; + fs.mkdirSync(benchmarkDir, { recursive: true }); + if (path.resolve(specPath) !== path.resolve(copiedSpecPath)) { + fs.copyFileSync(specPath, copiedSpecPath); + } + const run = { + schemaVersion: 1, + cohortId, + generatedAt: new Date().toISOString(), + evaluationMode: spec.evaluationMode, + tool, + sourceSpecPath: specPath, + sourceRunPath, + attemptBudget: spec.attemptBudget, + guidanceStrategies: [...spec.guidanceStrategies], + comparisonPairs: spec.comparisonPairs.map((pair) => ({ ...pair })), + model: { + requestedModelLabel: options.requestedModelLabel?.trim() || null, + resolvedModelId: options.resolvedModelId?.trim() || null, + baseUrl: options.baseUrl?.trim() || null, + fingerprint: options.fingerprint?.trim() || null, + }, + ...(spec.suiteId ? { suiteId: spec.suiteId } : {}), + ...(spec.suiteName ? { suiteName: spec.suiteName } : {}), + paths: { + benchmarkDir, + specPath: copiedSpecPath, + runPath, + reportJsonPath: null, + reportMarkdownPath: null, + }, + fixtures: spec.fixtures.map((fixture) => ({ + ...fixture, + sessions: [], + comparisons: [], + })), + }; + writeDeterministicJsonSync(runPath, run); + process.stdout.write(`${JSON.stringify({ + ok: true, + run, + paths: { + specPath: copiedSpecPath, + runPath, + benchmarkDir, + }, + }, null, 2)}\n`); + return 0; + } + catch (error) { + if (error instanceof SessionInputError || error instanceof AdapterInputError) { + writeError(error, error.code); + return 10; + } + writeError(error instanceof Error ? error : new Error(String(error)), "generation-session.internal"); + return 1; + } +} export async function runInitGenerationSessionCommand(options) { try { if (!options.bundleRoot) { @@ -1925,8 +2276,12 @@ export async function runReviewContractDeltaSuggestionsCommand(options) { } export async function runSummarizeGenerationBenchmarkCommand(options) { try { + const run = options.runPath ? loadGenerationBenchmarkRun(options.runPath) : null; const comparisonPaths = parseCsvPaths(options.comparisonPaths); - if (comparisonPaths.length === 0) { + if (comparisonPaths.length === 0 && run) { + comparisonPaths.push(...run.fixtures.flatMap((fixture) => fixture.comparisons.map((comparison) => path.resolve(comparison.comparisonPath)))); + } + if (comparisonPaths.length === 0 && !run) { throw new SessionInputError("--comparisons must include at least one comparison artifact path."); } const suggestionPaths = parseCsvPaths(options.suggestionPaths); @@ -1938,21 +2293,57 @@ export async function runSummarizeGenerationBenchmarkCommand(options) { path: suggestionPath, value: readJsonFile(suggestionPath, "contract delta suggestions artifact"), })); + const fixtureMetadataByComparisonPath = new Map(); + if (run) { + for (const fixture of run.fixtures) { + for (const comparison of fixture.comparisons) { + fixtureMetadataByComparisonPath.set(path.resolve(comparison.comparisonPath), { + platformTarget: fixture.platformTarget, + consumerType: fixture.consumerType, + }); + } + } + } const report = { - schemaVersion: 2, + schemaVersion: 3, generatedAt: new Date().toISOString(), - comparisons: comparisons.map(({ path: comparisonPath, value }) => ({ - surfaceId: value.surfaceId, - tool: value.tool, - comparisonPath, - meetsGoal: value.checks.meetsGoal, - baselineGuidanceStrategy: value.baseline.guidanceStrategy, - guidedGuidanceStrategy: value.guided.guidanceStrategy, - guidedFewerFirstAttemptBlockingFindings: value.checks.guidedFewerFirstAttemptBlockingFindings, - guidedReachedAcceptableNoLater: value.checks.guidedReachedAcceptableNoLater, - guidedRubricBetterDimensions: value.checks.guidedRubricBetterDimensions, - heuristics: value.heuristics.delta, - })), + ...(run + ? { + run: { + cohortId: run.cohortId, + evaluationMode: run.evaluationMode, + tool: run.tool, + sourceSpecPath: run.sourceSpecPath, + sourceRunPath: run.sourceRunPath, + guidanceStrategies: [...run.guidanceStrategies], + attemptBudget: run.attemptBudget, + model: { + requestedModelLabel: run.model.requestedModelLabel, + resolvedModelId: run.model.resolvedModelId, + baseUrl: run.model.baseUrl, + fingerprint: run.model.fingerprint, + }, + }, + } + : {}), + comparisons: comparisons.map(({ path: comparisonPath, value }) => { + const comparisonMetadata = fixtureMetadataByComparisonPath.get(path.resolve(comparisonPath)); + return { + surfaceId: value.surfaceId, + tool: value.tool, + comparisonPath, + meetsGoal: value.checks.meetsGoal, + baselineGuidanceStrategy: value.baseline.guidanceStrategy, + guidedGuidanceStrategy: value.guided.guidanceStrategy, + ...(comparisonMetadata ? { platformTarget: comparisonMetadata.platformTarget } : {}), + ...(comparisonMetadata ? { consumerType: comparisonMetadata.consumerType } : {}), + ...(run ? { modelLabel: run.model.requestedModelLabel ?? run.model.resolvedModelId ?? "unknown" } : {}), + guidedFewerFirstAttemptBlockingFindings: value.checks.guidedFewerFirstAttemptBlockingFindings, + guidedReachedAcceptableNoLater: value.checks.guidedReachedAcceptableNoLater, + guidedRubricBetterDimensions: value.checks.guidedRubricBetterDimensions, + heuristics: value.heuristics.delta, + }; + }), suggestions: suggestions.map(({ path: suggestionsPath, value }) => ({ surfaceId: value.surfaceId, sessionId: value.sessionId, @@ -1962,7 +2353,7 @@ export async function runSummarizeGenerationBenchmarkCommand(options) { rejectedCount: value.suggestions.filter((entry) => entry.status === "rejected").length, })), overall: { - surfaceCount: comparisons.length, + surfaceCount: comparisons.length > 0 ? comparisons.length : (run?.fixtures.length ?? 0), surfacesMeetingGoal: comparisons.filter(({ value }) => value.checks.meetsGoal).length, guidedFewerFirstAttemptBlockingFindings: comparisons.filter(({ value }) => value.checks.guidedFewerFirstAttemptBlockingFindings).length, guidedReachedAcceptableNoLater: comparisons.filter(({ value }) => value.checks.guidedReachedAcceptableNoLater).length, @@ -1986,6 +2377,72 @@ export async function runSummarizeGenerationBenchmarkCommand(options) { }, }, }, + ...(run + ? { + breakdowns: { + byPlatformTarget: Object.fromEntries([...new Set(run.fixtures.map((fixture) => fixture.platformTarget))] + .sort((left, right) => left.localeCompare(right)) + .map((platformTarget) => [ + platformTarget, + buildBreakdownSummary(comparisons + .map(({ path: comparisonPath, value }) => ({ + ...value, + __comparisonPath: comparisonPath, + })) + .filter((entry) => fixtureMetadataByComparisonPath.get(path.resolve(entry.__comparisonPath))?.platformTarget === platformTarget) + .map((entry) => ({ + surfaceId: entry.surfaceId, + tool: entry.tool, + comparisonPath: entry.__comparisonPath, + meetsGoal: entry.checks.meetsGoal, + baselineGuidanceStrategy: entry.baseline.guidanceStrategy, + guidedGuidanceStrategy: entry.guided.guidanceStrategy, + guidedFewerFirstAttemptBlockingFindings: entry.checks.guidedFewerFirstAttemptBlockingFindings, + guidedReachedAcceptableNoLater: entry.checks.guidedReachedAcceptableNoLater, + guidedRubricBetterDimensions: entry.checks.guidedRubricBetterDimensions, + heuristics: entry.heuristics.delta, + }))), + ])), + byConsumerType: Object.fromEntries([...new Set(run.fixtures.map((fixture) => fixture.consumerType))] + .sort((left, right) => left.localeCompare(right)) + .map((consumerType) => [ + consumerType, + buildBreakdownSummary(comparisons + .map(({ path: comparisonPath, value }) => ({ + ...value, + __comparisonPath: comparisonPath, + })) + .filter((entry) => fixtureMetadataByComparisonPath.get(path.resolve(entry.__comparisonPath))?.consumerType === consumerType) + .map((entry) => ({ + surfaceId: entry.surfaceId, + tool: entry.tool, + comparisonPath: entry.__comparisonPath, + meetsGoal: entry.checks.meetsGoal, + baselineGuidanceStrategy: entry.baseline.guidanceStrategy, + guidedGuidanceStrategy: entry.guided.guidanceStrategy, + guidedFewerFirstAttemptBlockingFindings: entry.checks.guidedFewerFirstAttemptBlockingFindings, + guidedReachedAcceptableNoLater: entry.checks.guidedReachedAcceptableNoLater, + guidedRubricBetterDimensions: entry.checks.guidedRubricBetterDimensions, + heuristics: entry.heuristics.delta, + }))), + ])), + byModelLabel: { + [run.model.requestedModelLabel ?? run.model.resolvedModelId ?? "unknown"]: buildBreakdownSummary(comparisons.map(({ path: comparisonPath, value }) => ({ + surfaceId: value.surfaceId, + tool: value.tool, + comparisonPath, + meetsGoal: value.checks.meetsGoal, + baselineGuidanceStrategy: value.baseline.guidanceStrategy, + guidedGuidanceStrategy: value.guided.guidanceStrategy, + guidedFewerFirstAttemptBlockingFindings: value.checks.guidedFewerFirstAttemptBlockingFindings, + guidedReachedAcceptableNoLater: value.checks.guidedReachedAcceptableNoLater, + guidedRubricBetterDimensions: value.checks.guidedRubricBetterDimensions, + heuristics: value.heuristics.delta, + }))), + }, + }, + } + : {}), }; const outDir = options.outDir ? path.resolve(options.outDir) @@ -1994,7 +2451,17 @@ export async function runSummarizeGenerationBenchmarkCommand(options) { const markdownPath = path.join(outDir, "benchmark-report.md"); writeDeterministicJsonSync(jsonPath, report); fs.mkdirSync(path.dirname(markdownPath), { recursive: true }); - fs.writeFileSync(markdownPath, renderBenchmarkReportMarkdown(report), "utf8"); + fs.writeFileSync(markdownPath, renderBenchmarkReportMarkdown(report, run), "utf8"); + if (run && options.runPath) { + writeDeterministicJsonSync(path.resolve(options.runPath), { + ...run, + paths: { + ...run.paths, + reportJsonPath: jsonPath, + reportMarkdownPath: markdownPath, + }, + }); + } process.stdout.write(`${JSON.stringify({ ok: true, report, diff --git a/packages/interfacectl-cli/dist/index.js b/packages/interfacectl-cli/dist/index.js index 7de1261..ad21b44 100755 --- a/packages/interfacectl-cli/dist/index.js +++ b/packages/interfacectl-cli/dist/index.js @@ -15,7 +15,7 @@ import { runPrepareRuntimeCommand } from "./commands/prepare-runtime.js"; import { runValidateGenerationCommand } from "./commands/validate-generation.js"; import { runServeGenerationAdapterCommand } from "./commands/serve-generation-adapter.js"; import { runEmitRunArtifactCommand } from "./commands/emit-run-artifact.js"; -import { runCaptureGenerationPreviewCommand, runCompareGenerationSessionsCommand, runInitGenerationSessionCommand, runPrepareGenerationHandoffCommand, runRecordGenerationAttemptCommand, runReviewContractDeltaSuggestionsCommand, runReviewGenerationAttemptCommand, runSuggestContractDeltasCommand, runSummarizeGenerationSessionCommand, runSummarizeGenerationBenchmarkCommand, } from "./commands/generation-session.js"; +import { runCaptureGenerationPreviewCommand, runCompareGenerationSessionsCommand, runInitGenerationSessionCommand, runPrepareGenerationHandoffCommand, runReplayGenerationBenchmarkCommand, runRecordGenerationAttemptCommand, runReviewContractDeltaSuggestionsCommand, runReviewGenerationAttemptCommand, runSuggestContractDeltasCommand, runSummarizeGenerationSessionCommand, runSummarizeGenerationBenchmarkCommand, } from "./commands/generation-session.js"; import { runInitCommand } from "./commands/init.js"; import { runAnalyzeCommand } from "./commands/analyze.js"; import { runAuthCaptureCommand, runAuthClearCommand, runAuthListCommandWithOptions, runAuthTestCommand, } from "./commands/auth.js"; @@ -251,7 +251,7 @@ program .requiredOption("--surface ", "Surface identifier") .requiredOption("--workspace-root ", "Workspace root for emitted run artifacts") .option("--tool ", "Generation tool identifier (codex|cursor|local-llm)") - .option("--guidance-strategy ", "Session guidance strategy (prompt-summary|json-primary|unguided)") + .option("--guidance-strategy ", "Session guidance strategy (prompt-summary|baseline-primary|json-primary|unguided)") .option("--guidance-mode ", "Legacy alias for --guidance-strategy (prepared|unguided)") .option("--brief-file ", "Optional implementation brief file to freeze into the session") .option("--session ", "Optional session identifier") @@ -273,7 +273,7 @@ program .command("prepare-generation-handoff") .description("Build one canonical strategy-aware guidance handoff artifact for a tracked generation session") .requiredOption("--session-dir ", "Path to the generation session directory") - .option("--guidance-strategy ", "Optional guidance strategy override (prompt-summary|json-primary|unguided)") + .option("--guidance-strategy ", "Optional guidance strategy override (prompt-summary|baseline-primary|json-primary|unguided)") .option("--accepted-suggestions ", "Optional accepted suggestions JSON file") .option("--designer-notes ", "Optional designer notes JSON file") .option("--finding-codes ", "Optional comma-separated finding codes to match against repair guidance") @@ -378,16 +378,43 @@ program program .command("summarize-generation-benchmark") .description("Aggregate one or more comparison and suggestion artifacts into a benchmark report") - .requiredOption("--comparisons ", "Comma-separated generation session comparison JSON paths") + .option("--comparisons ", "Comma-separated generation session comparison JSON paths") .option("--suggestions ", "Comma-separated contract delta suggestion JSON paths") + .option("--run-path ", "Optional benchmark run manifest to enrich the benchmark report") .option("--out-dir ", "Output directory for the benchmark report") .action(async (options) => { process.exitCode = await runSummarizeGenerationBenchmarkCommand({ comparisonPaths: options.comparisons, suggestionPaths: options.suggestions, + runPath: options.runPath, outDir: options.outDir, }); }); +program + .command("replay-generation-benchmark") + .description("Freeze a benchmark spec into a new replayable benchmark run manifest") + .requiredOption("--spec ", "Path to the benchmark spec JSON file") + .requiredOption("--tool ", "Generation tool identifier (codex|cursor|local-llm)") + .requiredOption("--out-dir ", "Output directory for the replay run") + .option("--cohort-id ", "Optional cohort id override") + .option("--source-run ", "Optional source benchmark run manifest") + .option("--requested-model-label