From fdbe02b2ebbeac48cc1b4125522077c37d21a891 Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 02:25:13 +0100 Subject: [PATCH 1/9] feat(benchmarks): Add Claude UI benchmark harness Add a local Claude Code benchmark harness for measuring UI task runs against XcodeBuildMCP and optional local tool surfaces. The harness now records observed baselines, writes per-run artifacts, supports private local suites, and reports completion separately from benchmark metrics. Keep vendor/private suites out of tracked source by discovering ignored local benchmark suites from a generic local directory. Refresh the committed first-party baselines from the latest canonical benchmark runs. Co-Authored-By: OpenAI Codex --- .gitignore | 1 + benchmarks/claude-ui/README.md | 162 +++-- .../claude-ui/parse_claude_conversation.py | 26 +- benchmarks/claude-ui/suites/contacts.yml | 26 +- benchmarks/claude-ui/suites/reminders.yml | 24 +- benchmarks/claude-ui/suites/weather.yml | 20 +- .../__tests__/claude-ui-benchmark.test.ts | 165 +++-- .../__tests__/claude-ui-tool-config.test.ts | 685 ++++++++++++++++++ .../__tests__/first-run-preflight.test.ts | 108 ++- .../__tests__/preflight-commands.test.ts | 36 + .../claude-ui/__tests__/render.test.ts | 124 ++-- .../simulator-existing-lifecycle.test.ts | 89 +++ .../__tests__/simulator-lifecycle.test.ts | 31 +- src/benchmarks/claude-ui/claude-invocation.ts | 105 +++ src/benchmarks/claude-ui/compare.ts | 145 ++-- src/benchmarks/claude-ui/config.ts | 162 +++-- src/benchmarks/claude-ui/constants.ts | 13 + .../claude-ui/first-run-preflight.ts | 10 +- src/benchmarks/claude-ui/harness.ts | 524 +++++++++----- src/benchmarks/claude-ui/mcp-config.ts | 140 ++++ .../claude-ui/preflight-commands.ts | 136 ++++ src/benchmarks/claude-ui/render.ts | 154 ++-- .../claude-ui/simulator-deletion.ts | 82 +++ .../claude-ui/simulator-lifecycle.ts | 305 +++++--- src/benchmarks/claude-ui/transcript.ts | 256 ++++++- src/benchmarks/claude-ui/types.ts | 90 ++- 26 files changed, 2789 insertions(+), 830 deletions(-) create mode 100644 src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts create mode 100644 src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts create mode 100644 src/benchmarks/claude-ui/__tests__/simulator-existing-lifecycle.test.ts create mode 100644 src/benchmarks/claude-ui/claude-invocation.ts create mode 100644 src/benchmarks/claude-ui/constants.ts create mode 100644 src/benchmarks/claude-ui/mcp-config.ts create mode 100644 src/benchmarks/claude-ui/preflight-commands.ts create mode 100644 src/benchmarks/claude-ui/simulator-deletion.ts diff --git a/.gitignore b/.gitignore index fce89463..ba0a68ac 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,4 @@ DerivedData /repros /.xcodebuildmcp /out.nosync +/benchmarks/claude-ui/local/ diff --git a/benchmarks/claude-ui/README.md b/benchmarks/claude-ui/README.md index 3723cc05..ebbfdbe1 100644 --- a/benchmarks/claude-ui/README.md +++ b/benchmarks/claude-ui/README.md @@ -1,19 +1,19 @@ # Claude UI benchmark harness -Local/manual harness for running Claude Code against the development XcodeBuildMCP MCP server and auditing UI automation behavior. +Local/manual harness for running Claude Code against configurable tool surfaces and auditing UI automation behavior. The default suite configuration targets the development XcodeBuildMCP MCP server. The harness: - reads a suite YAML file from `benchmarks/claude-ui/suites/` - reads the referenced prompt Markdown file from disk and feeds it to `claude -p` - creates, boots, waits for, and opens a fresh temporary simulator before Claude launches for each suite run by default -- writes an isolated per-run MCP workspace config with the suite defaults and temporary `simulatorId` -- generates a Claude MCP config pointing at `node build/cli.js mcp` with `XCODEBUILDMCP_CWD` set to that isolated workspace +- writes an isolated per-run MCP workspace config with the suite defaults and temporary `simulatorId` when MCP is enabled +- generates a Claude MCP config pointing at `node build/cli.js mcp` with `XCODEBUILDMCP_CWD` set to that isolated workspace when MCP is enabled - optionally preflights configured first-run prompts before Claude launches, outside the measured run - deletes the temporary simulator at the end of the suite, best effort, using only the ID created by the harness - writes artifacts under `out.nosync/claude-benchmarks///` - runs the bundled `parse_claude_conversation.py` parser against Claude's stream JSONL -- audits tool counts, MCP calls, UI automation calls, wall clock, failures/stumbles, and expected tool sequence drift +- audits tool counts, configured tracked tool calls, UI automation calls, wall clock, stumbles, and observed tool sequence differences - prints a structured per-suite report and (for `--all`) an aggregate summary - optionally prints machine-readable JSON with `--json` - can render an existing `result.json` or artifact directory with `--from-result` without rerunning Claude @@ -35,7 +35,11 @@ Shortcut: npm run bench:claude-ui -- --suite weather ``` -Run every suite YAML: +CLI-backed suites should use benchmark-local skills rather than global Claude/agent skills. Keep private/local CLI suites and assets under `benchmarks/claude-ui/local/`; that directory is ignored by git. For local CLI skills, point the suite at the local directory with `claude.pluginDirs`. + +Do not rely on skills installed in `~/.claude`, `~/.agents`, or the repository root; the benchmark run can isolate Claude's working directory so only the suite's configured tool skill is visible. + +Run every committed suite YAML plus any private local suites under `benchmarks/claude-ui/local/suites/`: ```bash npm run bench:claude-ui -- --all @@ -74,36 +78,92 @@ firstRunPromptDismissals: timeoutSeconds: 12 baseline: totalToolCalls: 19 + trackedToolCalls: 18 mcpToolCalls: 18 uiAutomationCalls: 16 wallClockSeconds: 125 tools: snapshot_ui: 1 tap: 9 -allowedVariance: - totalToolCalls: 2 - mcpToolCalls: 2 - uiAutomationCalls: 2 - wallClockSeconds: 45 - toolCalls: 2 -expectedToolSequence: +baselineToolSequence: - session_show_defaults - build_run_sim - snapshot_ui -sequence: - mode: warn failurePatterns: - STALE_ELEMENT_REF - SNAPSHOT_MISSING - WAIT_TIMEOUT +ignoredFailurePatterns: + - element_disabled ``` -Variance is an upper bound: lower tool counts or faster runs are accepted, while values above `baseline + allowedVariance` fail. Defaults are `totalToolCalls: 0`, `mcpToolCalls: 0`, `uiAutomationCalls: 0`, `toolCalls: 0`, and `wallClockSeconds: 30`. +Baselines are recorded comparison data, not completion gates. Metric rows show the current observed value, the recorded baseline value, and the delta. Use baselines to compare against the best or representative successful run recorded so far, not as made-up correctness thresholds. + +When recording new baselines, only write baseline values from clean successful runs. A clean run means Claude completed the task, Claude exited with status `0`, the parser exited with status `0`, there were no parse errors, and there were no unexpected terminal/process/task failures. Old-baseline metric drift does not disqualify a run. -Tool sequence drift is warning-only by default (`sequence.mode: warn`) because real Claude runs can choose equally valid UI paths. Use `sequence.mode: fail` only for suites where exact MCP call order is part of the contract. +Retry a suite up to three total attempts when trying to establish a baseline. If no attempt produces a clean successful run, remove that suite's `baseline:` block rather than leaving stale recorded data behind. Report `no clean baseline after 3 attempts / no baseline recorded` in your notes instead. + +Tool sequence differences are reported as observed comparison data because real Claude runs can choose equally valid UI paths. Sequence differences do not affect task/process completion status. `sessionDefaults` are written to a harness-owned config at `/mcp-workspace/.xcodebuildmcp/config.yaml`. The generated Claude MCP config sets `XCODEBUILDMCP_CWD` to `/mcp-workspace`, so the dev MCP server reads only the benchmark config instead of any repo or example-project `.xcodebuildmcp/config.yaml`. Unknown keys fail fast. Relative path defaults such as `projectPath`, `workspacePath`, and `derivedDataPath` are resolved against the suite `workingDirectory` before being written because the MCP server cwd is the isolated workspace. +## Configuring Claude and tracked tools + +Suites can override the Claude invocation without changing harness code. Omit this block for the default XcodeBuildMCP MCP behavior. + +```yaml +claude: + useMcpServer: false + tools: + - Bash + allowedTools: + - Bash(vendorcli *) + - Bash(xcodebuild *) + appendSystemPrompt: | + Use the simulator from `$CLAUDE_UI_BENCHMARK_SIMULATOR_ID`. + You may use the configured local CLI and xcodebuild directly. + pluginDirs: + - benchmarks/claude-ui/local/skills/vendor-cli + isolatedWorkingDirectory: true + extraArgs: + - --setting-sources + - project,local + - --model + - sonnet +toolAnalysis: + matchers: + - kind: bashCommand + commandPrefix: vendorcli ui screen + shortName: vendorcli.screen + uiAutomation: true + - kind: bashCommand + commandPrefix: vendorcli ui tap + shortName: vendorcli.tap + uiAutomation: true + - kind: bashCommand + commandPrefix: xcodebuild + shortName: xcodebuild +``` + +`claude.useMcpServer: false` writes an empty per-run MCP config and passes it with `--strict-mcp-config`, so project/user MCP servers cannot leak into CLI-only benchmark runs. The harness still prepares the simulator lifecycle and exports `CLAUDE_UI_BENCHMARK_SIMULATOR_ID`, `CLAUDE_UI_BENCHMARK_RUN_DIR`, and `CLAUDE_UI_BENCHMARK_WORKING_DIRECTORY` to Claude. `appendSystemPrompt` also supports `{simulatorId}`, `{runDirectory}`, and `{workingDirectory}` placeholders. + +`claude.pluginDirs` is passed to Claude as one `--plugin-dir` argument per configured path, resolved from the repository root. Use this for suite-specific local/private CLI skills. `claude.isolatedWorkingDirectory: true` runs Claude from the per-run artifact directory instead of the suite working directory, which prevents repository/project skills from being discovered implicitly. When using an isolated working directory, include absolute `{workingDirectory}` paths in prompts for build commands or project files. + +`toolAnalysis.matchers` defines what the analyzer treats as benchmark-relevant. `namePrefix` matchers track MCP-style tool names and can strip the prefix or final `__` segment. `bashCommand` matchers track Claude `Bash` tool calls whose `command` starts with the configured prefix. `uiAutomation: true` marks a tracked command as UI automation; xcodebuild commands can be tracked without counting as UI automation. + +`ignoredFailurePatterns` removes configured, known non-terminal tool-result errors from observed stumbles and from `failurePatterns` matches. Keep these patterns narrow and suite-specific. This is useful for CLI tools that return nonzero for exploratory probes while the agent can recover and still complete the user task. + +Use `preflightCommands` when a CLI tool needs host setup outside Claude's measured run, such as starting its companion app or validating local health: + +```yaml +preflightCommands: + - open -a LocalBenchTool + - sleep 3 + - vendorcli status + - vendorcli ui inspect --udid "$CLAUDE_UI_BENCHMARK_SIMULATOR_ID" + - vendorcli ui home --udid "$CLAUDE_UI_BENCHMARK_SIMULATOR_ID" +``` + ## Temporary simulator lifecycle By default, each suite creates a fresh simulator before Claude launches. The harness uses `sessionDefaults.simulatorName` as the `simctl create` device type name, captures the returned simulator ID, boots that simulator, waits for `simctl bootstatus -b`, opens Simulator.app to that device, applies a short UI-readiness delay, and writes the simulator ID as `sessionDefaults.simulatorId` in the isolated MCP workspace config. This makes Claude and the dev MCP server target a visible, booted, isolated simulator instead of reusing a previous run's state or spending benchmark calls on simulator boot/open setup. @@ -113,7 +173,7 @@ Simulator setup is deliberately outside the benchmark measurement boundary. The Config contract: - Omit `temporarySimulator` for the default behavior: create and later delete a temporary simulator. -- Set `temporarySimulator: false` to opt out and use the suite/project defaults as-is. +- Set `temporarySimulator: false` with `sessionDefaults.simulatorName` to resolve, boot, open, and export an existing simulator by name without deleting it. - Set `sessionDefaults.simulatorId` to use an existing simulator. In this case the harness does not create or delete a simulator. - Do not set both `temporarySimulator: true` and `sessionDefaults.simulatorId`; the harness fails fast because deleting a user-provided simulator would be unsafe. @@ -125,45 +185,45 @@ Lifecycle details are written to `simulator-lifecycle.log`, including the `creat ## Terminal report -Each suite renders as a structured report with a status banner, aligned metric and tool tables, a failures/stumbles section (only when non-zero), and a sequence diff. When run with `--all`, an aggregate summary follows the per-suite reports. +Each suite renders as a structured report with a task-completion banner, aligned metric and tool tables, a stumbles section, and observed sequence differences. Baseline metric and sequence differences are observational. When run with `--all`, an aggregate summary follows the per-suite reports. ### Single suite ```text ──────────────────────────────────────────────────────────────────────── -PASS weather 1m 38.6s +COMPLETED weather 1m 38.6s suite benchmarks/claude-ui/suites/weather.yml artifacts out.nosync/claude-benchmarks/weather/20260522T214044Z exit claude=0 parser=0 Metrics - METRIC ACTUAL BASELINE VARIANCE DELTA STATUS - totalToolCalls 13 19 +2 −6 PASS - mcpToolCalls 12 18 +2 −6 PASS - uiAutomationCalls 10 16 +2 −6 PASS - wallClockSeconds 98.62 125.00 +45.00 −26.38 PASS - -Tool calls (baseline-tracked) - TOOL ACTUAL BASELINE DELTA STATUS - session_show_defaults 1 1 0 PASS - build_run_sim 1 1 0 PASS - snapshot_ui 1 1 0 PASS - tap 6 9 −3 PASS - batch 1 1 0 PASS - -PASS failures/stumbles: 0 + METRIC ACTUAL BASELINE DELTA + totalToolCalls 13 19 −6 + mcpToolCalls 12 18 −6 + uiAutomationCalls 10 16 −6 + wallClockSeconds 98.62 125.00 −26.38 + +Tool calls (baseline-observed) + TOOL ACTUAL BASELINE DELTA + session_show_defaults 1 1 0 + build_run_sim 1 1 0 + snapshot_ui 1 1 0 + tap 6 9 −3 + batch 1 1 0 + +OBSERVED stumbles: 0 ``` -### Sequence drift +### Sequence differences -When the tool sequence drifts, the report includes unified-diff style hunks with expected/actual index columns. Drift is warning-only by default, so the overall status stays `WARN` rather than `FAIL`: +When the tool sequence differs from the recorded sequence, the report includes unified-diff style hunks with baseline/actual index columns: ```text -WARN tool sequence (warn): drift: 4 missing, 0 additional - @@ expected[8..15] actual[8..11] @@ - 8 8 tap - 9 9 tap - 10 − tap +OBSERVED tool sequence: 4 missing from baseline, 0 additional + @@ baseline[8..15] actual[8..11] @@ + 8 8 tap + 9 9 tap + 10 − tap 11 10 swipe 12 11 tap 13 − swipe @@ -171,15 +231,15 @@ WARN tool sequence (warn): drift: 4 missing, 0 additional 15 − tap ``` -`−` lines are expected calls Claude skipped; `+` lines are calls Claude made that were not expected. Dim lines are surrounding context. +`−` lines are baseline calls Claude skipped; `+` lines are calls Claude made beyond the baseline sequence. Dim lines are surrounding context. -### Failures and inspect hints +### Stumbles and inspect hints -When `failures/stumbles` is non-zero the report lists the first few tool failures and pattern matches, and surfaces an `Inspect` block with the relevant artifact paths: +When `stumbles` is non-zero the report lists the first few tool errors and pattern matches, and surfaces an `Inspect` block with the relevant artifact paths: ```text -FAIL failures/stumbles: 1 - • tool failures: 1 +INCOMPLETE stumbles: 1 + • tool errors: 1 boot_sim @ line 9: Boot failed: device not found Inspect @@ -197,17 +257,17 @@ After `--all` (or multi-result `--from-result`) the harness appends: ════════════════════════════════════════════════════════════════════════ Claude UI Benchmarks · Summary ════════════════════════════════════════════════════════════════════════ - Suites: 3 total · 2 passed · 1 failed · 2 sequence warnings + Suites: 3 total · 2 completed · 1 incomplete Duration: total 4m 49.8s · slowest reminders (1m 39.8s) Artifacts: out.nosync/claude-benchmarks/ - ! WARN weather 1m 38.6s sequence warn: 4m/0a - ✗ FAIL reminders 1m 39.8s 1 stumble · sequence warn: 7m/4a - ! WARN contacts 1m 31.4s sequence warn: 2m/2a + ✓ COMPLETED weather 1m 38.6s sequence delta: 4m/0a + ! INCOMPLETE reminders 1m 39.8s 1 stumble · sequence delta: 7m/4a + ✓ COMPLETED contacts 1m 31.4s sequence delta: 2m/2a ════════════════════════════════════════════════════════════════════════ ``` -`Nm/Ka` denotes "N missing / K additional" calls vs. `expectedToolSequence`. +`Nm/Ka` denotes "N missing / K additional" calls vs. `baselineToolSequence`. The renderer auto-detects TTY and adds ANSI color when stdout is a terminal and `NO_COLOR` is unset. Plain-text output (e.g. when piping to a file or under `NO_COLOR=1`) carries the same information without color codes. diff --git a/benchmarks/claude-ui/parse_claude_conversation.py b/benchmarks/claude-ui/parse_claude_conversation.py index e5ed27d3..a62dd64a 100755 --- a/benchmarks/claude-ui/parse_claude_conversation.py +++ b/benchmarks/claude-ui/parse_claude_conversation.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 """Parse a Claude Code NDJSON session log into per-turn files. -Filters to: user prompts, assistant text replies, and XcodeBuildMCP tool +Filters to: user prompts, assistant text replies, and configured tool calls/results. Strips screenshot image blobs to a short placeholder. Usage: parse_claude_conversation.py [output_dir] \\ - [--tool-prefix=mcp__xcodebuildmcp] + [--tool-prefix=mcp__xcodebuildmcp] [--tool-name=Bash] """ from __future__ import annotations @@ -101,10 +101,14 @@ def extract_user_text(entry: dict) -> str: return "\n\n".join(parts) -def parse(path: Path, out_dir: Path, tool_prefix: str) -> bool: +def matches_tool_name(name: str, tool_prefixes: list[str], tool_names: set[str]) -> bool: + return name in tool_names or any(name.startswith(prefix) for prefix in tool_prefixes) + + +def parse(path: Path, out_dir: Path, tool_prefixes: list[str], tool_names: set[str]) -> bool: out_dir.mkdir(parents=True, exist_ok=True) - # Track tool_use_ids that target our prefix so we keep matching results. + # Track tool_use_ids that target configured tools so we keep matching results. tracked_ids: set[str] = set() tool_name_by_id: dict[str, str] = {} counter = 0 @@ -185,7 +189,7 @@ def next_path(kind: str, label: str | None = None) -> Path: ) elif btype == "tool_use": name = block.get("name", "") - if not name.startswith(tool_prefix): + if not matches_tool_name(name, tool_prefixes, tool_names): continue tool_id = block.get("id", "") tracked_ids.add(tool_id) @@ -220,9 +224,16 @@ def main() -> int: ) ap.add_argument( "--tool-prefix", - default="mcp__xcodebuildmcp", + action="append", + default=None, help="Only include tool calls whose name starts with this prefix", ) + ap.add_argument( + "--tool-name", + action="append", + default=[], + help="Also include tool calls whose name exactly matches this value", + ) args = ap.parse_args() if not args.jsonl.is_file(): @@ -230,7 +241,8 @@ def main() -> int: return 1 out = args.output or args.jsonl.with_name(f"{args.jsonl.stem}_conversation") - return 0 if parse(args.jsonl, out, args.tool_prefix) else 1 + tool_prefixes = args.tool_prefix or ["mcp__xcodebuildmcp"] + return 0 if parse(args.jsonl, out, tool_prefixes, set(args.tool_name)) else 1 if __name__ == "__main__": diff --git a/benchmarks/claude-ui/suites/contacts.yml b/benchmarks/claude-ui/suites/contacts.yml index 77d26b80..d314fc53 100644 --- a/benchmarks/claude-ui/suites/contacts.yml +++ b/benchmarks/claude-ui/suites/contacts.yml @@ -9,38 +9,38 @@ firstRunPromptDismissals: - Continue - Not Now - OK - timeoutSeconds: 8 + timeoutSeconds: 30 baseline: - totalToolCalls: 14 - mcpToolCalls: 13 - uiAutomationCalls: 11 - wallClockSeconds: 97 + totalToolCalls: 19 + trackedToolCalls: 18 + mcpToolCalls: 18 + uiAutomationCalls: 16 + wallClockSeconds: 102.94 tools: session_show_defaults: 1 launch_app_sim: 1 - snapshot_ui: 1 + snapshot_ui: 6 tap: 5 type_text: 5 -allowedVariance: - totalToolCalls: 3 - mcpToolCalls: 3 - uiAutomationCalls: 3 - wallClockSeconds: 45 - toolCalls: 2 -expectedToolSequence: +baselineToolSequence: - session_show_defaults - launch_app_sim - snapshot_ui - tap + - snapshot_ui - tap - type_text + - snapshot_ui - type_text - type_text - tap + - snapshot_ui - type_text - tap + - snapshot_ui - type_text - tap + - snapshot_ui failurePatterns: - STALE_ELEMENT_REF - SNAPSHOT_MISSING diff --git a/benchmarks/claude-ui/suites/reminders.yml b/benchmarks/claude-ui/suites/reminders.yml index fb437575..208913e8 100644 --- a/benchmarks/claude-ui/suites/reminders.yml +++ b/benchmarks/claude-ui/suites/reminders.yml @@ -8,31 +8,29 @@ firstRunPromptDismissals: labels: - Continue - Not Now - timeoutSeconds: 12 + timeoutSeconds: 30 baseline: - totalToolCalls: 15 - mcpToolCalls: 14 - uiAutomationCalls: 12 - wallClockSeconds: 85 + totalToolCalls: 17 + trackedToolCalls: 16 + mcpToolCalls: 16 + uiAutomationCalls: 14 + wallClockSeconds: 92.79 tools: session_show_defaults: 1 launch_app_sim: 1 snapshot_ui: 1 - tap: 4 + tap: 5 + wait_for_ui: 1 type_text: 4 key_press: 2 batch: 1 -allowedVariance: - totalToolCalls: 5 - mcpToolCalls: 5 - uiAutomationCalls: 5 - wallClockSeconds: 60 - toolCalls: 3 -expectedToolSequence: +baselineToolSequence: - session_show_defaults - launch_app_sim - snapshot_ui - tap + - wait_for_ui + - tap - type_text - tap - tap diff --git a/benchmarks/claude-ui/suites/weather.yml b/benchmarks/claude-ui/suites/weather.yml index 1b03b170..e66b7af7 100644 --- a/benchmarks/claude-ui/suites/weather.yml +++ b/benchmarks/claude-ui/suites/weather.yml @@ -6,25 +6,20 @@ sessionDefaults: scheme: Weather simulatorName: iPhone 17 Pro Max baseline: - totalToolCalls: 13 - mcpToolCalls: 12 - uiAutomationCalls: 10 - wallClockSeconds: 98 + totalToolCalls: 14 + trackedToolCalls: 13 + mcpToolCalls: 13 + uiAutomationCalls: 11 + wallClockSeconds: 100.03 tools: session_show_defaults: 1 build_run_sim: 1 - snapshot_ui: 1 + snapshot_ui: 2 tap: 6 batch: 1 type_text: 1 swipe: 1 -allowedVariance: - totalToolCalls: 2 - mcpToolCalls: 2 - uiAutomationCalls: 2 - wallClockSeconds: 45 - toolCalls: 2 -expectedToolSequence: +baselineToolSequence: - session_show_defaults - build_run_sim - snapshot_ui @@ -36,6 +31,7 @@ expectedToolSequence: - tap - tap - swipe + - snapshot_ui - tap failurePatterns: - STALE_ELEMENT_REF diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts index 67a3ef85..2a0244d1 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts @@ -1,11 +1,15 @@ -import { spawn } from 'node:child_process'; -import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { compareBenchmark, diffToolSequence } from '../compare.ts'; import { readConfig } from '../config.ts'; -import { requireSuitePaths, resolveParserPath } from '../harness.ts'; +import { + listSuitePaths, + requireSuitePaths, + resolveParserPath, + resolveSuitePath, +} from '../harness.ts'; import { analyzeClaudeJsonl } from '../transcript.ts'; import type { BenchmarkConfig, BenchmarkRunMetadata } from '../types.ts'; @@ -16,28 +20,6 @@ function line(value: unknown): string { return JSON.stringify(value); } -function runParserScript(args: string[]): Promise<{ - exitCode: number | null; - stdout: string; - stderr: string; -}> { - return new Promise((resolve, reject) => { - const child = spawn('python3', args, { stdio: ['ignore', 'pipe', 'pipe'] }); - const stdout: Buffer[] = []; - const stderr: Buffer[] = []; - child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk)); - child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk)); - child.on('error', reject); - child.on('close', (exitCode) => { - resolve({ - exitCode, - stdout: Buffer.concat(stdout).toString('utf8'), - stderr: Buffer.concat(stderr).toString('utf8'), - }); - }); - }); -} - function runMetadata( wallClockSeconds: number, claudeExitCode = 0, @@ -106,27 +88,17 @@ describe('Claude UI benchmark harness', () => { ); }); - it('returns a non-zero parser exit when JSONL lines are malformed', async () => { - const dir = await mkdtemp(path.join(tmpdir(), 'claude-ui-parser-')); + it('discovers local private suites when present', async () => { + const localSuitesDir = path.join(repoRoot, 'benchmarks/claude-ui/local/suites'); + const suiteName = `unit-private-suite-${process.pid}`; + const suitePath = path.join(localSuitesDir, `${suiteName}.yml`); + await mkdir(localSuitesDir, { recursive: true }); + await writeFile(suitePath, `name: ${suiteName}\nprompt: ../prompts/weather.md\n`, 'utf8'); try { - const jsonlPath = path.join(dir, 'claude.jsonl'); - const outputPath = path.join(dir, 'parsed'); - await writeFile( - jsonlPath, - `${line({ type: 'assistant', message: { content: [{ type: 'text', text: 'ok' }] } })}\n{broken\n`, - 'utf8', - ); - - const result = await runParserScript([ - path.join(repoRoot, 'benchmarks/claude-ui/parse_claude_conversation.py'), - jsonlPath, - outputPath, - ]); - - expect(result.exitCode).toBe(1); - expect(result.stderr).toContain('warn: skipping line 2'); + await expect(resolveSuitePath(suiteName)).resolves.toBe(suitePath); + await expect(listSuitePaths()).resolves.toContain(suitePath); } finally { - await rm(dir, { recursive: true, force: true }); + await rm(suitePath, { force: true }); } }); }); @@ -246,11 +218,57 @@ describe('Claude UI benchmark analysis', () => { const audit = analyzeClaudeJsonl(transcript, { mcpToolPrefix: toolPrefix, - failurePatterns: ['stale element ref'], + failurePatterns: ['WAIT_TIMEOUT'], }); expect(audit.failures).toHaveLength(1); expect(audit.patternFailures).toHaveLength(1); + + const result = compareBenchmark( + { name: 'weather', prompt: 'prompt.md' }, + audit, + runMetadata(10), + ); + + expect(result.completion.issueCount).toBe(1); + }); + + it('counts parser failures once when malformed JSONL also records parse errors', () => { + const audit = analyzeClaudeJsonl('{broken\n', { mcpToolPrefix: toolPrefix }); + + const result = compareBenchmark( + { name: 'weather', prompt: 'prompt.md' }, + audit, + runMetadata(10, 0, 1), + ); + + expect(audit.parseErrors).toHaveLength(1); + expect(result.completion.issueCount).toBe(1); + expect(result.completed).toBe(false); + }); + + it('rejects removed old suite config keys', () => { + expect(() => + readConfig( + { + name: 'weather', + prompt: 'prompt.md', + expectedToolSequence: ['snapshot_ui'], + }, + 'weather.yml', + ), + ).toThrow('weather.yml.expectedToolSequence: renamed to baselineToolSequence'); + + expect(() => + readConfig( + { + name: 'weather', + prompt: 'prompt.md', + allowedVariance: { totalToolCalls: 2 }, + }, + 'weather.yml', + ), + ).toThrow('weather.yml.allowedVariance: removed; baselines are observed data only'); }); it('rejects malformed failure pattern regexes when loading config', () => { @@ -322,7 +340,7 @@ describe('Claude UI benchmark analysis', () => { expect(config.sessionDefaults?.env).toEqual({ FEATURE_FLAG: '1' }); }); - it('warns by default when tool sequences drift', () => { + it('reports observed metric and tool sequence deltas without affecting completion', () => { const config: BenchmarkConfig = { name: 'weather', prompt: 'prompt.md', @@ -332,13 +350,7 @@ describe('Claude UI benchmark analysis', () => { uiAutomationCalls: 2, wallClockSeconds: 120, }, - allowedVariance: { - totalToolCalls: 1, - mcpToolCalls: 0, - uiAutomationCalls: 0, - wallClockSeconds: 30, - }, - expectedToolSequence: ['session_show_defaults', 'snapshot_ui', 'tap'], + baselineToolSequence: ['session_show_defaults', 'snapshot_ui', 'tap'], }; const audit = analyzeClaudeJsonl( [ @@ -365,16 +377,22 @@ describe('Claude UI benchmark analysis', () => { const result = compareBenchmark(config, audit, runMetadata(145)); - expect(result.metrics.find((item) => item.name === 'totalToolCalls')?.pass).toBe(true); - expect(result.metrics.find((item) => item.name === 'mcpToolCalls')?.pass).toBe(false); - expect(result.sequence.mode).toBe('warn'); + expect(result.metrics.find((item) => item.name === 'totalToolCalls')).toEqual({ + name: 'totalToolCalls', + actual: 5, + baseline: 4, + }); + expect(result.metrics.find((item) => item.name === 'mcpToolCalls')).toEqual({ + name: 'mcpToolCalls', + actual: 4, + baseline: 3, + }); expect(result.sequence.matched).toBe(false); - expect(result.sequence.pass).toBe(true); expect(result.sequence.additional).toEqual(['screenshot']); - expect(result.pass).toBe(false); + expect(result.completed).toBe(true); }); - it('preserves default allowed variance when config only overrides some keys', () => { + it('reports actual and baseline metrics without variance classification', () => { const config: BenchmarkConfig = readConfig( { name: 'weather', @@ -383,9 +401,6 @@ describe('Claude UI benchmark analysis', () => { totalToolCalls: 3, wallClockSeconds: 120, }, - allowedVariance: { - wallClockSeconds: 30, - }, }, 'weather.yml', ); @@ -411,26 +426,21 @@ describe('Claude UI benchmark analysis', () => { { name: 'totalToolCalls', actual: 3, - expected: 3, - allowedVariance: 0, - pass: true, + baseline: 3, }, { name: 'wallClockSeconds', actual: 145, - expected: 120, - allowedVariance: 30, - pass: true, + baseline: 120, }, ]); }); - it('fails on tool sequence drift when strict mode is enabled', () => { + it('reports tool sequence deltas without affecting completion', () => { const config: BenchmarkConfig = { name: 'weather', prompt: 'prompt.md', - expectedToolSequence: ['session_show_defaults', 'snapshot_ui', 'tap'], - sequence: { mode: 'fail' }, + baselineToolSequence: ['session_show_defaults', 'snapshot_ui', 'tap'], }; const audit = analyzeClaudeJsonl( [ @@ -456,13 +466,12 @@ describe('Claude UI benchmark analysis', () => { const result = compareBenchmark(config, audit, runMetadata(10)); - expect(result.sequence.mode).toBe('fail'); expect(result.sequence.matched).toBe(false); - expect(result.sequence.pass).toBe(false); - expect(result.pass).toBe(false); + expect(result.sequence.additional).toEqual(['screenshot']); + expect(result.completed).toBe(true); }); - it('fails the benchmark when the external parser fails', () => { + it('marks the benchmark incomplete when the external parser exits non-zero', () => { const config: BenchmarkConfig = { name: 'weather', prompt: 'prompt.md', @@ -471,9 +480,9 @@ describe('Claude UI benchmark analysis', () => { const result = compareBenchmark(config, audit, runMetadata(10, 0, 1)); - expect(result.failureMetric.pass).toBe(false); - expect(result.failureMetric.count).toBe(1); - expect(result.pass).toBe(false); + expect(result.completion.completed).toBe(false); + expect(result.completion.issueCount).toBe(1); + expect(result.completed).toBe(false); }); it('returns no sequence hunks when expected and actual match', () => { diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts new file mode 100644 index 00000000..605ce457 --- /dev/null +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts @@ -0,0 +1,685 @@ +import { spawn } from 'node:child_process'; +import { mkdtemp, readdir, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { buildClaudeArgs } from '../claude-invocation.ts'; +import { compareBenchmark } from '../compare.ts'; +import { readConfig } from '../config.ts'; +import { analyzeClaudeJsonl } from '../transcript.ts'; +import type { BenchmarkArtifacts, BenchmarkRunMetadata } from '../types.ts'; + +const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../../../..'); + +function line(value: unknown): string { + return JSON.stringify(value); +} + +function artifacts(runDirectory: string): BenchmarkArtifacts { + return { + runDirectory, + promptPath: path.join(runDirectory, 'prompt.md'), + mcpConfigPath: path.join(runDirectory, 'mcp-config.json'), + mcpWorkspaceDirectory: path.join(runDirectory, 'mcp-workspace'), + mcpWorkspaceConfigPath: path.join(runDirectory, 'mcp-workspace/.xcodebuildmcp/config.yaml'), + claudeJsonlPath: path.join(runDirectory, 'claude.jsonl'), + claudeStderrPath: path.join(runDirectory, 'claude.stderr'), + claudeCommandLogPath: path.join(runDirectory, 'claude-command.log'), + simulatorLifecycleLogPath: path.join(runDirectory, 'simulator-lifecycle.log'), + parsedDirectory: path.join(runDirectory, 'parsed'), + parseLogPath: path.join(runDirectory, 'parse.log'), + resultJsonPath: path.join(runDirectory, 'result.json'), + }; +} + +function runMetadata(wallClockSeconds: number): BenchmarkRunMetadata { + return { + suitePath: '/tmp/vendor-cli.yml', + wallClockSeconds, + claudeExitCode: 0, + parserExitCode: 0, + artifacts: artifacts('/tmp/run'), + }; +} + +function runParserScript(args: string[]): Promise<{ + exitCode: number | null; + stdout: string; + stderr: string; +}> { + return new Promise((resolve, reject) => { + const child = spawn('python3', args, { stdio: ['ignore', 'pipe', 'pipe'] }); + const stdout: Buffer[] = []; + const stderr: Buffer[] = []; + child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk)); + child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk)); + child.on('error', reject); + child.on('close', (exitCode) => { + resolve({ + exitCode, + stdout: Buffer.concat(stdout).toString('utf8'), + stderr: Buffer.concat(stderr).toString('utf8'), + }); + }); + }); +} + +describe('Claude UI benchmark tool configuration', () => { + it('loads Claude invocation and tool analysis from suite config', () => { + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + preflightCommands: ['open -a LocalBenchTool', 'vendorcli status'], + failurePatternTargets: ['commands'], + ignoredFailurePatterns: ['element_disabled'], + claude: { + useMcpServer: false, + permissionMode: 'default', + maxClaudeSeconds: 600, + tools: ['Bash'], + allowedTools: ['Bash(vendorcli *)', 'Bash(xcrun *)'], + appendSystemPrompt: 'Target simulator: {simulatorId}', + extraArgs: ['--model', 'sonnet'], + pluginDirs: ['benchmarks/claude-ui/local/skills/vendor-cli'], + skillDirs: ['benchmarks/claude-ui/local/skills/vendor-cli'], + activateSkill: 'vendor-cli', + isolatedWorkingDirectory: true, + }, + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli ui screen', + shortName: 'vendorcli.screen', + uiAutomation: true, + }, + ], + }, + }, + 'vendor-cli.yml', + ); + + expect(config.claude).toEqual({ + useMcpServer: false, + permissionMode: 'default', + maxClaudeSeconds: 600, + tools: ['Bash'], + allowedTools: ['Bash(vendorcli *)', 'Bash(xcrun *)'], + appendSystemPrompt: 'Target simulator: {simulatorId}', + extraArgs: ['--model', 'sonnet'], + pluginDirs: ['benchmarks/claude-ui/local/skills/vendor-cli'], + skillDirs: ['benchmarks/claude-ui/local/skills/vendor-cli'], + activateSkill: 'vendor-cli', + isolatedWorkingDirectory: true, + }); + expect(config.preflightCommands).toEqual(['open -a LocalBenchTool', 'vendorcli status']); + expect(config.failurePatternTargets).toEqual(['commands']); + expect(config.ignoredFailurePatterns).toEqual(['element_disabled']); + expect(config.toolAnalysis?.matchers).toEqual([ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli ui screen', + shortName: 'vendorcli.screen', + uiAutomation: true, + }, + ]); + }); + + it('builds Claude args without MCP wiring when configured for CLI tools', () => { + const runArtifacts = artifacts('/tmp/run'); + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + claude: { + useMcpServer: false, + permissionMode: 'default', + tools: ['Bash'], + allowedTools: ['Bash(vendorcli *)'], + appendSystemPrompt: 'Use simulator {simulatorId} from {workingDirectory}', + extraArgs: ['--setting-sources', 'project,local'], + }, + }, + 'vendor-cli.yml', + ); + + expect( + buildClaudeArgs({ + config, + artifacts: runArtifacts, + workingDirectory: '/workspace', + pluginDirs: ['/repo/benchmarks/claude-ui/local/skills/vendor-cli'], + simulatorId: 'SIM-123', + }), + ).toEqual([ + '-p', + '--verbose', + '--output-format', + 'stream-json', + '--disable-slash-commands', + '--mcp-config', + '/tmp/run/mcp-config.json', + '--strict-mcp-config', + '--tools', + 'Bash', + '--allowedTools', + 'Bash(vendorcli *)', + '--append-system-prompt', + 'Use simulator SIM-123 from /workspace', + '--plugin-dir', + '/repo/benchmarks/claude-ui/local/skills/vendor-cli', + '--setting-sources', + 'project,local', + ]); + }); + + it('tracks configured Bash command tools separately from total Claude tools', () => { + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + baseline: { + totalToolCalls: 3, + trackedToolCalls: 2, + uiAutomationCalls: 1, + tools: { + 'vendorcli.screen': 1, + 'xcodebuild.build': 1, + }, + }, + baselineToolSequence: ['vendorcli.screen', 'xcodebuild.build'], + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli ui screen', + shortName: 'vendorcli.screen', + uiAutomation: true, + }, + { + kind: 'bashCommand', + commandPrefix: 'xcodebuild', + shortName: 'xcodebuild.build', + }, + ], + }, + }, + 'vendor-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'vendorcli ui screen --simulator SIM-123 --json' }, + }, + { type: 'tool_use', id: 'tool-2', name: 'Read', input: { file_path: 'README.md' } }, + { + type: 'tool_use', + id: 'tool-3', + name: 'Bash', + input: { command: 'xcodebuild -scheme App build' }, + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis }); + const result = compareBenchmark(config, audit, runMetadata(10)); + + expect(audit.totalToolCalls).toBe(3); + expect(audit.trackedToolCalls).toBe(2); + expect(audit.mcpToolCalls).toBe(0); + expect(audit.uiAutomationCalls).toBe(1); + expect(result.sequence.actual).toEqual(['vendorcli.screen', 'xcodebuild.build']); + expect(result.completed).toBe(true); + }); + + it('reports metric deltas when actual tool counts differ from the recorded baseline', () => { + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + baseline: { + trackedToolCalls: 10, + tools: { + 'vendorcli.screen': 8, + }, + }, + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli ui screen', + shortName: 'vendorcli.screen', + uiAutomation: true, + }, + ], + }, + }, + 'vendor-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'vendorcli ui screen --json' }, + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis }); + const result = compareBenchmark(config, audit, runMetadata(10)); + + expect(result.completed).toBe(true); + expect(result.metrics).toEqual([ + { + name: 'trackedToolCalls', + actual: 1, + baseline: 10, + }, + { + name: 'tool:vendorcli.screen', + actual: 1, + baseline: 8, + }, + ]); + }); + + it('uses the most specific Bash matcher once per command offset', () => { + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli', + shortName: 'vendorcli.other', + }, + { + kind: 'bashCommand', + commandPrefix: 'vendorcli ui screen', + shortName: 'vendorcli.screen', + uiAutomation: true, + }, + ], + }, + }, + 'vendor-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'vendorcli ui screen --json && vendorcli --help' }, + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis }); + + expect(audit.trackedToolCallsByName).toEqual({ + 'vendorcli.screen': 1, + 'vendorcli.other': 1, + }); + expect(audit.trackedSequence.map((call) => call.shortName)).toEqual([ + 'vendorcli.screen', + 'vendorcli.other', + ]); + }); + + it('ignores configured non-terminal tool failures', () => { + const config = readConfig( + { + name: 'private CLI weather', + prompt: 'weather.md', + failurePatterns: ['WAIT_TIMEOUT'], + ignoredFailurePatterns: ['wait_timeout', 'element_disabled'], + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'privatecli wait', + shortName: 'privatecli.wait', + uiAutomation: true, + }, + { + kind: 'bashCommand', + commandPrefix: 'privatecli tap', + shortName: 'privatecli.tap', + uiAutomation: true, + }, + ], + }, + }, + 'private-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'privatecli wait element --label Weather --timeout 1' }, + }, + { + type: 'tool_use', + id: 'tool-2', + name: 'Bash', + input: { command: 'privatecli tap --label Settings' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [ + { + type: 'tool_result', + tool_use_id: 'tool-1', + is_error: true, + content: 'Exit code 1\n{"error":{"code":"wait_timeout"}}', + }, + { + type: 'tool_result', + tool_use_id: 'tool-2', + is_error: true, + content: 'Exit code 1\n{"error":{"code":"element_disabled"}}', + }, + ], + }, + }), + line({ type: 'result', is_error: false, result: 'done' }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { + toolAnalysis: config.toolAnalysis, + failurePatterns: config.failurePatterns, + ignoredFailurePatterns: config.ignoredFailurePatterns, + }); + const result = compareBenchmark(config, audit, runMetadata(10)); + + expect(audit.failures).toEqual([]); + expect(audit.patternFailures).toEqual([]); + expect(result.completed).toBe(true); + }); + + it('matches failure patterns against commands and tool results without treating final prose as a new failure', () => { + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + failurePatterns: ['idb', 'SNAPSHOT_MISSING'], + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli ui screen', + shortName: 'vendorcli.screen', + uiAutomation: true, + }, + ], + }, + }, + 'vendor-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'which idb' }, + }, + { + type: 'tool_use', + id: 'tool-2', + name: 'Bash', + input: { command: 'vendorcli ui screen --json' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [ + { + type: 'tool_result', + tool_use_id: 'tool-2', + is_error: true, + content: 'Exit code 1\nSNAPSHOT_MISSING', + }, + ], + }, + }), + line({ + type: 'result', + is_error: false, + result: 'I tried idb earlier and saw SNAPSHOT_MISSING, then stopped.', + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { + toolAnalysis: config.toolAnalysis, + failurePatterns: config.failurePatterns, + }); + + expect(audit.patternFailures).toEqual([ + { pattern: 'idb', line: 1, excerpt: 'which idb' }, + { pattern: 'SNAPSHOT_MISSING', line: 2, excerpt: 'Exit code 1\nSNAPSHOT_MISSING' }, + ]); + }); + + it('can restrict failure pattern matching to commands', () => { + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + failurePatterns: ['xcodebuildmcp'], + failurePatternTargets: ['commands'], + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli', + shortName: 'vendorcli.other', + }, + ], + }, + }, + 'vendor-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'vendorcli run --json' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [ + { + type: 'tool_result', + tool_use_id: 'tool-1', + content: 'Workspace: /Volumes/Developer/XcodeBuildMCP/example_projects/Weather', + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { + toolAnalysis: config.toolAnalysis, + failurePatterns: config.failurePatterns, + failurePatternTargets: config.failurePatternTargets, + }); + + expect(audit.patternFailures).toEqual([]); + }); + + it('records tool failures as benchmark stumbles', () => { + const config = readConfig( + { + name: 'private CLI weather', + prompt: 'weather.md', + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'privatecli', + shortName: 'privatecli.other', + }, + ], + }, + }, + 'private-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'privatecli --version' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [ + { + type: 'tool_result', + tool_use_id: 'tool-1', + is_error: true, + content: "Exit code 64\nError: Unknown option '--version'", + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis }); + const result = compareBenchmark(config, audit, runMetadata(600)); + + expect(result.completed).toBe(true); + expect(result.completion).toEqual({ + completed: true, + issueCount: 1, + }); + }); + + it('marks the benchmark incomplete when Claude exits non-zero', () => { + const config = readConfig( + { + name: 'private CLI weather', + prompt: 'weather.md', + }, + 'private-cli.yml', + ); + const audit = analyzeClaudeJsonl('', {}); + const result = compareBenchmark(config, audit, { + ...runMetadata(600), + claudeExitCode: 143, + }); + + expect(result.completed).toBe(false); + expect(result.completion).toEqual({ + completed: false, + issueCount: 1, + }); + }); + + it('lets the parser include configured non-MCP tool names', async () => { + const dir = await mkdtemp(path.join(tmpdir(), 'claude-ui-parser-')); + try { + const jsonlPath = path.join(dir, 'claude.jsonl'); + const outputPath = path.join(dir, 'parsed'); + await writeFile( + jsonlPath, + [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'vendorcli ui screen --json' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { content: [{ type: 'tool_result', tool_use_id: 'tool-1', content: 'ok' }] }, + }), + ].join('\n'), + 'utf8', + ); + + const result = await runParserScript([ + path.join(repoRoot, 'benchmarks/claude-ui/parse_claude_conversation.py'), + jsonlPath, + outputPath, + '--tool-prefix=mcp__xcodebuildmcp', + '--tool-name=Bash', + ]); + + expect(result.exitCode).toBe(0); + expect(await readdir(outputPath)).toEqual([ + '0001_tool_call_Bash.md', + '0002_tool_result_Bash.md', + ]); + expect(await readFile(path.join(outputPath, '0001_tool_call_Bash.md'), 'utf8')).toContain( + 'vendorcli ui screen --json', + ); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/src/benchmarks/claude-ui/__tests__/first-run-preflight.test.ts b/src/benchmarks/claude-ui/__tests__/first-run-preflight.test.ts index e3ef76ac..529bb357 100644 --- a/src/benchmarks/claude-ui/__tests__/first-run-preflight.test.ts +++ b/src/benchmarks/claude-ui/__tests__/first-run-preflight.test.ts @@ -55,7 +55,7 @@ describe('Claude UI first-run prompt preflight', () => { if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') { return { exitCode: 0, - stdout: describeOutputs.shift() ?? emptyDescribeUi, + stdout: describeOutputs.shift() ?? loadedDescribeUi, stderr: '', durationSeconds: 0.01, }; @@ -100,6 +100,7 @@ describe('Claude UI first-run prompt preflight', () => { 'TEMP-SIM-123', ], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], ]); expect(events).toEqual([ @@ -178,6 +179,7 @@ describe('Claude UI first-run prompt preflight', () => { 'TEMP-SIM-123', ], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], ]); const log = await readFile(logPath, 'utf8'); @@ -185,6 +187,58 @@ describe('Claude UI first-run prompt preflight', () => { expect(log).toContain('Dismissing first-run prompt label: Not Now'); }); + it('keeps polling when initial app UI appears before first-run prompts', async () => { + const logPath = await tempLogPath(); + const commands: LifecycleCommandOptions[] = []; + const describeOutputs = [ + loadedDescribeUi, + describeUiWithLabel('Continue'), + loadedDescribeUi, + loadedDescribeUi, + ]; + const executor: LifecycleCommandExecutor = async (opts) => { + commands.push(opts); + if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') { + return { + exitCode: 0, + stdout: describeOutputs.shift() ?? loadedDescribeUi, + stderr: '', + durationSeconds: 0.01, + }; + } + return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 }; + }; + + await dismissFirstRunPrompts({ + config: config({ firstRunPromptDismissals: { labels: ['Continue'], timeoutSeconds: 5 } }), + simulatorId: 'TEMP-SIM-123', + cwd: '/repo', + logPath, + executor, + axePath: '/mock/axe', + timing: { now: () => 1_000, sleep: async () => {} }, + }); + + expect(commands.map((item) => [item.command, ...item.args])).toEqual([ + ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + [ + '/mock/axe', + 'tap', + '--label', + 'Continue', + '--element-type', + 'Button', + '--udid', + 'TEMP-SIM-123', + ], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], + ]); + }); + it('retries transient describe-ui failures before dismissing prompts', async () => { const logPath = await tempLogPath(); const commands: LifecycleCommandOptions[] = []; @@ -197,7 +251,7 @@ describe('Claude UI first-run prompt preflight', () => { const executor: LifecycleCommandExecutor = async (opts) => { commands.push(opts); if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') { - const result = describeResults.shift() ?? { exitCode: 0, stdout: emptyDescribeUi }; + const result = describeResults.shift() ?? { exitCode: 0, stdout: loadedDescribeUi }; return { ...result, stderr: '', durationSeconds: 0.01 }; } return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 }; @@ -235,6 +289,7 @@ describe('Claude UI first-run prompt preflight', () => { 'TEMP-SIM-123', ], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], ]); const log = await readFile(logPath, 'utf8'); @@ -254,7 +309,7 @@ describe('Claude UI first-run prompt preflight', () => { commands.push(opts); if (opts.command === 'xcrun' && opts.args[1] === 'launch') now += 9_000; if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') { - const result = describeResults.shift() ?? { exitCode: 0, stdout: emptyDescribeUi }; + const result = describeResults.shift() ?? { exitCode: 0, stdout: loadedDescribeUi }; return { ...result, stderr: '', durationSeconds: 0.01 }; } return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 }; @@ -279,6 +334,7 @@ describe('Claude UI first-run prompt preflight', () => { ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], ]); const log = await readFile(logPath, 'utf8'); @@ -308,7 +364,7 @@ describe('Claude UI first-run prompt preflight', () => { timing: { now: () => { nowCalls += 1; - return nowCalls <= 2 ? 1_000 : 7_000; + return nowCalls <= 3 ? 1_000 : 7_000; }, sleep: async () => {}, }, @@ -317,6 +373,7 @@ describe('Claude UI first-run prompt preflight', () => { expect(commands.map((item) => [item.command, ...item.args])).toEqual([ ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], ]); const log = await readFile(logPath, 'utf8'); @@ -359,6 +416,7 @@ describe('Claude UI first-run prompt preflight', () => { ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], ]); }); @@ -400,6 +458,47 @@ describe('Claude UI first-run prompt preflight', () => { ]); }); + it('logs terminate failures after successful preflight', async () => { + const logPath = await tempLogPath(); + const commands: LifecycleCommandOptions[] = []; + const describeOutputs = [loadedDescribeUi, loadedDescribeUi]; + const executor: LifecycleCommandExecutor = async (opts) => { + commands.push(opts); + if (opts.command === '/mock/axe' && opts.args[0] === 'describe-ui') { + return { + exitCode: 0, + stdout: describeOutputs.shift() ?? loadedDescribeUi, + stderr: '', + durationSeconds: 0.01, + }; + } + if (opts.command === 'xcrun' && opts.args[1] === 'terminate') { + return { exitCode: 1, stdout: '', stderr: 'not running', durationSeconds: 0.01 }; + } + return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 }; + }; + + await dismissFirstRunPrompts({ + config: config({ firstRunPromptDismissals: { labels: ['Continue'], timeoutSeconds: 5 } }), + simulatorId: 'TEMP-SIM-123', + cwd: '/repo', + logPath, + executor, + axePath: '/mock/axe', + timing: { now: () => 1_000, sleep: async () => {} }, + }); + + expect(commands.map((item) => [item.command, ...item.args])).toEqual([ + ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], + ]); + const log = await readFile(logPath, 'utf8'); + expect(log).toContain('First-run prompt preflight terminate failed'); + expect(log).toContain('First-run prompt preflight: complete'); + }); + it('retries malformed describe-ui output as transiently unavailable', async () => { const logPath = await tempLogPath(); const commands: LifecycleCommandOptions[] = []; @@ -436,6 +535,7 @@ describe('Claude UI first-run prompt preflight', () => { ['xcrun', 'simctl', 'launch', 'TEMP-SIM-123', 'com.apple.reminders'], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], + ['/mock/axe', 'describe-ui', '--udid', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'terminate', 'TEMP-SIM-123', 'com.apple.reminders'], ]); const log = await readFile(logPath, 'utf8'); diff --git a/src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts b/src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts new file mode 100644 index 00000000..8236ff26 --- /dev/null +++ b/src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts @@ -0,0 +1,36 @@ +import { preflightCommandsWithFocusResign } from '../preflight-commands.ts'; + +describe('Claude UI benchmark preflight commands', () => { + it('resigns focus to the target Simulator after launching RocketSim.app', () => { + expect( + preflightCommandsWithFocusResign({ + commands: ['killall -9 RocketSim || true', 'sleep 2', 'open -gja RocketSim', 'sleep 10'], + simulatorId: 'SIM-123', + }), + ).toEqual([ + 'killall -9 RocketSim || true', + 'sleep 2', + 'open -gja RocketSim', + "open -a Simulator --args -CurrentDeviceUDID 'SIM-123'", + 'sleep 10', + ]); + }); + + it('keeps preflight commands unchanged when no target simulator is resolved', () => { + const commands = ['open -gja RocketSim']; + + expect(preflightCommandsWithFocusResign({ commands })).toBe(commands); + }); + + it('shell-quotes simulator IDs used by the focus command', () => { + expect( + preflightCommandsWithFocusResign({ + commands: ['open -a RocketSim.app'], + simulatorId: "SIM'123", + }), + ).toEqual([ + 'open -a RocketSim.app', + "open -a Simulator --args -CurrentDeviceUDID 'SIM'\"'\"'123'", + ]); + }); +}); diff --git a/src/benchmarks/claude-ui/__tests__/render.test.ts b/src/benchmarks/claude-ui/__tests__/render.test.ts index bc7d9486..0c66c9ce 100644 --- a/src/benchmarks/claude-ui/__tests__/render.test.ts +++ b/src/benchmarks/claude-ui/__tests__/render.test.ts @@ -5,38 +5,30 @@ function baseResult(overrides: Partial = {}): BenchmarkResult { const runDirectory = '/repo/out.nosync/claude-benchmarks/weather/20260101T000000Z'; return { name: 'weather', - pass: true, + completed: true, metrics: [ { name: 'totalToolCalls', actual: 13, - expected: 19, - allowedVariance: 2, - pass: true, + baseline: 19, }, { name: 'mcpToolCalls', actual: 12, - expected: 18, - allowedVariance: 2, - pass: true, + baseline: 18, }, { name: 'wallClockSeconds', actual: 98.62, - expected: 125, - allowedVariance: 45, - pass: true, + baseline: 125, }, - { name: 'tool:tap', actual: 6, expected: 9, allowedVariance: 2, pass: true }, - { name: 'tool:snapshot_ui', actual: 1, expected: 1, allowedVariance: 2, pass: true }, + { name: 'tool:tap', actual: 6, baseline: 9 }, + { name: 'tool:snapshot_ui', actual: 1, baseline: 1 }, ], - failureMetric: { pass: true, count: 0 }, + completion: { completed: true, issueCount: 0 }, sequence: { - mode: 'warn', - pass: true, matched: true, - expected: ['snapshot_ui', 'tap'], + baseline: ['snapshot_ui', 'tap'], actual: ['snapshot_ui', 'tap'], diff: [], missing: [], @@ -47,10 +39,13 @@ function baseResult(overrides: Partial = {}): BenchmarkResult { parseErrors: [], totalToolCalls: 13, totalToolCallsByName: {}, + trackedToolCalls: 12, + trackedToolCallsByName: {}, mcpToolCalls: 12, mcpToolCallsByName: {}, uiAutomationCalls: 10, uiAutomationCallsByName: {}, + trackedSequence: [], mcpSequence: [], failures: [], patternFailures: [], @@ -80,22 +75,23 @@ function baseResult(overrides: Partial = {}): BenchmarkResult { } describe('renderSuiteReport', () => { - it('renders a passing suite with no sequence drift', () => { + it('renders a completed suite with no sequence delta', () => { const output = renderSuiteReport(baseResult(), { color: false, width: 80, cwd: '/repo' }); - expect(output).toContain('PASS weather'); + expect(output).toContain('COMPLETED weather'); expect(output).toContain('Metrics'); expect(output).toContain('totalToolCalls'); - expect(output).toContain('Tool calls (baseline-tracked)'); - expect(output).toContain('PASS failures/stumbles: 0'); + expect(output).toContain('METRIC ACTUAL BASELINE DELTA'); + expect(output).toContain('Tool calls (baseline-observed)'); + expect(output).toContain('OBSERVED stumbles: 0'); expect(output).not.toContain('Inspect'); - expect(output).not.toContain('@@ expected'); + expect(output).not.toContain('@@ baseline'); }); it('renders failure detail and inspect hints when failures present', () => { const result = baseResult({ - pass: false, - failureMetric: { pass: false, count: 2 }, + completed: false, + completion: { completed: false, issueCount: 2 }, audit: { ...baseResult().audit, failures: [ @@ -118,9 +114,9 @@ describe('renderSuiteReport', () => { const output = renderSuiteReport(result, { color: false, width: 80, cwd: '/repo' }); - expect(output).toContain('FAIL weather'); - expect(output).toContain('FAIL failures/stumbles: 2'); - expect(output).toContain('tool failures: 1'); + expect(output).toContain('INCOMPLETE weather'); + expect(output).toContain('INCOMPLETE stumbles: 2'); + expect(output).toContain('tool errors: 1'); expect(output).toContain('boot_sim @ line 9: Boot failed'); expect(output).toContain('pattern matches: 1'); expect(output).toContain('STALE_ELEMENT_REF @ line 22'); @@ -128,10 +124,34 @@ describe('renderSuiteReport', () => { expect(output).toContain('transcript out.nosync/claude-benchmarks/weather'); }); - it('renders null process exit codes as failures', () => { + it('renders inspect hints for completed suites with stumbles', () => { const result = baseResult({ - pass: false, - failureMetric: { pass: false, count: 2 }, + completion: { completed: true, issueCount: 1 }, + audit: { + ...baseResult().audit, + failures: [ + { + shortName: 'screen', + fullName: 'Bash', + line: 7, + message: 'temporary probe error', + }, + ], + }, + }); + + const output = renderSuiteReport(result, { color: false, width: 80, cwd: '/repo' }); + + expect(output).toContain('COMPLETED weather'); + expect(output).toContain('OBSERVED stumbles: 1'); + expect(output).toContain('Inspect'); + expect(output).toContain('transcript out.nosync/claude-benchmarks/weather'); + }); + + it('renders null process exit codes as incomplete', () => { + const result = baseResult({ + completed: false, + completion: { completed: false, issueCount: 2 }, run: { ...baseResult().run, claudeExitCode: null, @@ -145,13 +165,11 @@ describe('renderSuiteReport', () => { expect(output).toContain('parser exit code: null'); }); - it('renders sequence drift hunks with marker columns', () => { + it('renders observed sequence delta hunks with marker columns', () => { const result = baseResult({ sequence: { - mode: 'warn', - pass: true, matched: false, - expected: ['session_show_defaults', 'snapshot_ui', 'tap'], + baseline: ['session_show_defaults', 'snapshot_ui', 'tap'], actual: ['session_show_defaults', 'snapshot_ui', 'screenshot', 'tap'], diff: [ { @@ -159,14 +177,14 @@ describe('renderSuiteReport', () => { { kind: 'context', tool: 'snapshot_ui', - expectedIndex: 1, + baselineIndex: 1, actualIndex: 1, }, { kind: 'additional', tool: 'screenshot', actualIndex: 2 }, { kind: 'context', tool: 'tap', - expectedIndex: 2, + baselineIndex: 2, actualIndex: 3, }, ], @@ -179,8 +197,8 @@ describe('renderSuiteReport', () => { const output = renderSuiteReport(result, { color: false, width: 80, cwd: '/repo' }); - expect(output).toContain('WARN tool sequence (warn): drift: 0 missing, 1 additional'); - expect(output).toContain('@@ expected[1..2] actual[1..3] @@'); + expect(output).toContain('OBSERVED tool sequence: 0 missing from baseline, 1 additional'); + expect(output).toContain('@@ baseline[1..2] actual[1..3] @@'); expect(output).toContain('+ screenshot'); }); @@ -198,7 +216,7 @@ describe('renderSuiteReport', () => { ...baseResult().run, temporarySimulator: { simulatorId: 'TEMP-SIM-123', - name: 'XcodeBuildMCP Claude UI weather 20260101T000000Z', + name: 'Claude UI weather 20260101T000000Z', lifecycleLogPath: '/repo/out.nosync/claude-benchmarks/weather/20260101T000000Z/simulator-lifecycle.log', setupDurationSeconds: 23.4, @@ -217,9 +235,9 @@ describe('renderSuiteReport', () => { }); describe('renderAggregate', () => { - it('summarizes pass/fail/warn counts and lists each suite', () => { - const pass = baseResult(); - const warn = baseResult({ + it('summarizes completion counts and lists each suite', () => { + const completed = baseResult(); + const sequenceDelta = baseResult({ name: 'contacts', sequence: { ...baseResult().sequence, @@ -236,17 +254,15 @@ describe('renderAggregate', () => { }, }, }); - const fail = baseResult({ + const incomplete = baseResult({ name: 'reminders', - pass: false, - failureMetric: { pass: false, count: 1 }, + completed: false, + completion: { completed: false, issueCount: 1 }, metrics: [ { name: 'mcpToolCalls', actual: 30, - expected: 18, - allowedVariance: 2, - pass: false, + baseline: 18, }, ], sequence: { @@ -265,21 +281,21 @@ describe('renderAggregate', () => { }, }); - const output = renderAggregate([pass, warn, fail], { + const output = renderAggregate([completed, sequenceDelta, incomplete], { color: false, width: 80, cwd: '/repo', }); expect(output).toContain('Claude UI Benchmarks · Summary'); - expect(output).toContain('Suites: 3 total · 2 passed · 1 failed · 1 sequence warnings'); + expect(output).toContain('Suites: 3 total · 2 completed · 1 incomplete'); expect(output).toContain('total '); expect(output).toContain('slowest reminders (2m 25.0s)'); expect(output).toContain('Artifacts: out.nosync/claude-benchmarks/'); - expect(output).toContain('PASS weather'); - expect(output).toContain('WARN contacts'); - expect(output).toContain('FAIL reminders'); - expect(output).toContain('sequence warn: 2m/1a'); - expect(output).toContain('metrics: mcpToolCalls'); + expect(output).toContain('COMPLETED weather'); + expect(output).toContain('COMPLETED contacts'); + expect(output).toContain('INCOMPLETE reminders'); + expect(output).toContain('sequence delta: 2m/1a'); + expect(output).not.toContain('metric warning'); }); }); diff --git a/src/benchmarks/claude-ui/__tests__/simulator-existing-lifecycle.test.ts b/src/benchmarks/claude-ui/__tests__/simulator-existing-lifecycle.test.ts new file mode 100644 index 00000000..b4e0b805 --- /dev/null +++ b/src/benchmarks/claude-ui/__tests__/simulator-existing-lifecycle.test.ts @@ -0,0 +1,89 @@ +import { + prepareTemporarySimulator, + type LifecycleCommandExecutor, + type LifecycleCommandOptions, +} from '../simulator-lifecycle.ts'; +import type { BenchmarkConfig } from '../types.ts'; + +function config(overrides: Partial = {}): BenchmarkConfig { + return { + name: 'weather', + prompt: '../prompts/weather.md', + sessionDefaults: { + simulatorName: 'iPhone 17 Pro Max', + bundleId: 'com.example.App', + }, + ...overrides, + }; +} + +function inMemoryLifecycleLog() { + const messages: string[] = []; + return { + messages, + writer: async (_logPath: string, message: string) => { + messages.push(message); + }, + }; +} + +describe('Claude UI existing simulator lifecycle', () => { + it('resolves, boots, and opens an existing simulator by name', async () => { + const logPath = '/tmp/simulator-lifecycle.log'; + const log = inMemoryLifecycleLog(); + const commands: LifecycleCommandOptions[] = []; + const events: string[] = []; + const executor: LifecycleCommandExecutor = async (opts) => { + commands.push(opts); + if (opts.args[1] === 'list') { + return { + exitCode: 0, + stdout: JSON.stringify({ + devices: { + 'com.apple.CoreSimulator.SimRuntime.iOS-26-0': [ + { name: 'iPhone 17 Pro Max', udid: 'EXISTING-SIM-123', isAvailable: true }, + ], + }, + }), + stderr: '', + durationSeconds: 0.01, + }; + } + return { exitCode: 0, stdout: '', stderr: '', durationSeconds: 0.01 }; + }; + + const simulator = await prepareTemporarySimulator({ + config: config({ temporarySimulator: false }), + suiteSlug: 'weather', + timestamp: '20260522T120000Z', + cwd: '/repo', + logPath, + executor, + logWriter: log.writer, + onEvent: (message) => events.push(message), + readinessDelayMs: 0, + }); + + expect(simulator).toEqual({ + createdByHarness: false, + simulatorId: 'EXISTING-SIM-123', + name: 'iPhone 17 Pro Max', + logPath, + }); + expect(commands.map((item) => [item.command, ...item.args])).toEqual([ + ['xcrun', 'simctl', 'list', 'devices', 'available', '--json'], + ['xcrun', 'simctl', 'boot', 'EXISTING-SIM-123'], + ['xcrun', 'simctl', 'bootstatus', 'EXISTING-SIM-123', '-b'], + ['open', '-a', 'Simulator', '--args', '-CurrentDeviceUDID', 'EXISTING-SIM-123'], + ]); + expect(events).toEqual([ + 'resolving simulator iPhone 17 Pro Max', + 'using simulator EXISTING-SIM-123', + 'booting simulator EXISTING-SIM-123', + 'waiting for simulator EXISTING-SIM-123 bootstatus', + 'opening Simulator.app for EXISTING-SIM-123', + 'simulator ready EXISTING-SIM-123', + ]); + expect(log.messages.join('\n')).toContain('Existing simulator ready: EXISTING-SIM-123'); + }); +}); diff --git a/src/benchmarks/claude-ui/__tests__/simulator-lifecycle.test.ts b/src/benchmarks/claude-ui/__tests__/simulator-lifecycle.test.ts index 2ecc9ec5..7465c761 100644 --- a/src/benchmarks/claude-ui/__tests__/simulator-lifecycle.test.ts +++ b/src/benchmarks/claude-ui/__tests__/simulator-lifecycle.test.ts @@ -8,9 +8,9 @@ import { requireFirstRunPreflightSimulatorId, resolveBenchmarkSimulatorId, writeMcpConfig, -} from '../harness.ts'; +} from '../mcp-config.ts'; +import { deleteTemporarySimulator } from '../simulator-deletion.ts'; import { - deleteTemporarySimulator, prepareTemporarySimulator, resolveTemporarySimulatorPlan, type CreatedTemporarySimulator, @@ -128,6 +128,7 @@ describe('Claude UI temporary simulator lifecycle', () => { enabled: false, reason: 'temporarySimulator is false', existingSimulatorId: undefined, + existingSimulatorName: undefined, }); }); @@ -178,19 +179,13 @@ describe('Claude UI temporary simulator lifecycle', () => { }); expect(simulator?.simulatorId).toBe('TEMP-SIM-123'); expect(commands.map((item) => [item.command, ...item.args])).toEqual([ - [ - 'xcrun', - 'simctl', - 'create', - 'XcodeBuildMCP Claude UI weather 20260522T120000Z', - 'iPhone 17 Pro Max', - ], + ['xcrun', 'simctl', 'create', 'Claude UI weather 20260522T120000Z', 'iPhone 17 Pro Max'], ['xcrun', 'simctl', 'boot', 'TEMP-SIM-123'], ['xcrun', 'simctl', 'bootstatus', 'TEMP-SIM-123', '-b'], ['open', '-a', 'Simulator', '--args', '-CurrentDeviceUDID', 'TEMP-SIM-123'], ]); expect(events).toEqual([ - 'creating simulator XcodeBuildMCP Claude UI weather 20260522T120000Z', + 'creating simulator Claude UI weather 20260522T120000Z', 'booting simulator TEMP-SIM-123', 'waiting for simulator TEMP-SIM-123 bootstatus', 'opening Simulator.app for TEMP-SIM-123', @@ -278,13 +273,7 @@ describe('Claude UI temporary simulator lifecycle', () => { ).rejects.toThrow('temporary simulator did not reach bootstatus'); expect(commands.map((item) => [item.command, ...item.args])).toEqual([ - [ - 'xcrun', - 'simctl', - 'create', - 'XcodeBuildMCP Claude UI weather 20260522T120000Z', - 'iPhone 17 Pro Max', - ], + ['xcrun', 'simctl', 'create', 'Claude UI weather 20260522T120000Z', 'iPhone 17 Pro Max'], ['xcrun', 'simctl', 'boot', 'TEMP-SIM-SETUP-FAIL'], ['xcrun', 'simctl', 'bootstatus', 'TEMP-SIM-SETUP-FAIL', '-b'], ['xcrun', 'simctl', 'delete', 'TEMP-SIM-SETUP-FAIL'], @@ -300,7 +289,7 @@ describe('Claude UI temporary simulator lifecycle', () => { const simulator: CreatedTemporarySimulator = { createdByHarness: true, simulatorId: 'TEMP-SIM-DELETE-FAIL', - name: 'XcodeBuildMCP Claude UI weather 20260522T120000Z', + name: 'Claude UI weather 20260522T120000Z', deviceTypeName: 'iPhone 17 Pro Max', logPath, }; @@ -332,7 +321,7 @@ describe('Claude UI temporary simulator lifecycle', () => { const simulator: CreatedTemporarySimulator = { createdByHarness: true, simulatorId: 'TEMP-SIM-LOG-FAIL', - name: 'XcodeBuildMCP Claude UI weather 20260522T120000Z', + name: 'Claude UI weather 20260522T120000Z', deviceTypeName: 'iPhone 17 Pro Max', logPath, }; @@ -374,7 +363,7 @@ describe('Claude UI temporary simulator lifecycle', () => { temporarySimulator: { createdByHarness: true, simulatorId: 'TEMP-SIM-123', - name: 'XcodeBuildMCP Claude UI weather 20260522T120000Z', + name: 'Claude UI weather 20260522T120000Z', deviceTypeName: 'iPhone 17 Pro Max', logPath: path.join(directory, 'simulator-lifecycle.log'), }, @@ -451,7 +440,7 @@ describe('Claude UI temporary simulator lifecycle', () => { temporarySimulator: { createdByHarness: true, simulatorId: 'TEMP-SIM-123', - name: 'XcodeBuildMCP Claude UI weather 20260522T120000Z', + name: 'Claude UI weather 20260522T120000Z', deviceTypeName: 'iPhone 17 Pro Max', logPath: path.join(directory, 'simulator-lifecycle.log'), }, diff --git a/src/benchmarks/claude-ui/claude-invocation.ts b/src/benchmarks/claude-ui/claude-invocation.ts new file mode 100644 index 00000000..8d2473fe --- /dev/null +++ b/src/benchmarks/claude-ui/claude-invocation.ts @@ -0,0 +1,105 @@ +import { mcpToolPrefix } from './constants.ts'; +import type { + BenchmarkArtifacts, + BenchmarkConfig, + ClaudeInvocationConfig, + ToolAnalysisConfig, +} from './types.ts'; + +export function usesMcpServer(config: BenchmarkConfig): boolean { + return config.claude?.useMcpServer ?? true; +} + +function renderTemplate(value: string, variables: Record): string { + return value.replaceAll(/{([A-Za-z0-9_]+)}/g, (match, key: string) => variables[key] ?? match); +} + +function templateVariables(opts: { + runDirectory: string; + workingDirectory: string; + simulatorId?: string; +}): Record { + return { + runDirectory: opts.runDirectory, + workingDirectory: opts.workingDirectory, + simulatorId: opts.simulatorId ?? 'suite/default', + }; +} + +export function buildClaudeArgs(opts: { + config: BenchmarkConfig; + artifacts: BenchmarkArtifacts; + workingDirectory: string; + pluginDirs?: string[]; + simulatorId?: string; + resumeSessionId?: string; + sessionId?: string; +}): string[] { + const claudeConfig: ClaudeInvocationConfig = opts.config.claude ?? {}; + const useMcpServer = usesMcpServer(opts.config); + const allowedTools = claudeConfig.allowedTools ?? (useMcpServer ? [`${mcpToolPrefix}*`] : []); + const args = ['-p', '--verbose', '--output-format', 'stream-json']; + if (opts.sessionId) { + args.push('--session-id', opts.sessionId); + } + if (opts.resumeSessionId) { + args.push('--resume', opts.resumeSessionId); + } + args.push('--disable-slash-commands'); + const permissionMode = claudeConfig.permissionMode ?? 'bypassPermissions'; + if (permissionMode !== 'default') { + args.push('--permission-mode', permissionMode); + } + + args.push('--mcp-config', opts.artifacts.mcpConfigPath, '--strict-mcp-config'); + if (claudeConfig.tools && claudeConfig.tools.length > 0) { + args.push('--tools', claudeConfig.tools.join(',')); + } + if (allowedTools.length > 0) { + args.push('--allowedTools', allowedTools.join(',')); + } + if (claudeConfig.appendSystemPrompt) { + args.push( + '--append-system-prompt', + renderTemplate( + claudeConfig.appendSystemPrompt, + templateVariables({ + runDirectory: opts.artifacts.runDirectory, + workingDirectory: opts.workingDirectory, + simulatorId: opts.simulatorId, + }), + ), + ); + } + for (const pluginDir of opts.pluginDirs ?? []) { + args.push('--plugin-dir', pluginDir); + } + args.push(...(claudeConfig.extraArgs ?? [])); + return args; +} + +export function benchmarkContextEnv(opts: { + runDirectory: string; + workingDirectory: string; + simulatorId?: string; +}): NodeJS.ProcessEnv { + return Object.fromEntries( + Object.entries({ + CLAUDE_UI_BENCHMARK_RUN_DIR: opts.runDirectory, + CLAUDE_UI_BENCHMARK_WORKING_DIRECTORY: opts.workingDirectory, + CLAUDE_UI_BENCHMARK_SIMULATOR_ID: opts.simulatorId, + }).filter((entry): entry is [string, string] => typeof entry[1] === 'string'), + ); +} + +export function parserToolArgs(toolAnalysis: ToolAnalysisConfig | undefined): string[] { + if (!toolAnalysis) return [`--tool-prefix=${mcpToolPrefix}`]; + const args: string[] = []; + const toolNames = new Set(); + for (const matcher of toolAnalysis.matchers) { + if (matcher.kind === 'namePrefix') args.push(`--tool-prefix=${matcher.prefix}`); + if (matcher.kind === 'bashCommand') toolNames.add('Bash'); + } + for (const toolName of toolNames) args.push(`--tool-name=${toolName}`); + return args; +} diff --git a/src/benchmarks/claude-ui/compare.ts b/src/benchmarks/claude-ui/compare.ts index 8a316a7c..de406bbe 100644 --- a/src/benchmarks/claude-ui/compare.ts +++ b/src/benchmarks/claude-ui/compare.ts @@ -7,40 +7,24 @@ import type { SequenceDiffLine, TranscriptAudit, } from './types.ts'; -import { DEFAULT_ALLOWED_VARIANCE } from './types.ts'; - -function expectedWithinUpperVariance( - actual: number, - expected: number, - allowedVariance: number, -): boolean { - return actual <= expected + allowedVariance; -} -function metric( - name: string, - actual: number, - expected: number, - allowedVariance: number, -): MetricResult { +function metric(name: string, actual: number, baseline: number): MetricResult { return { name, actual, - expected, - allowedVariance, - pass: expectedWithinUpperVariance(actual, expected, allowedVariance), + baseline, }; } -function lcsMatrix(expected: string[], actual: string[]): number[][] { - const matrix = Array.from({ length: expected.length + 1 }, () => +function lcsMatrix(baseline: string[], actual: string[]): number[][] { + const matrix = Array.from({ length: baseline.length + 1 }, () => Array.from({ length: actual.length + 1 }, () => 0), ); - for (let i = expected.length - 1; i >= 0; i -= 1) { + for (let i = baseline.length - 1; i >= 0; i -= 1) { for (let j = actual.length - 1; j >= 0; j -= 1) { matrix[i]![j] = - expected[i] === actual[j] + baseline[i] === actual[j] ? matrix[i + 1]![j + 1]! + 1 : Math.max(matrix[i + 1]![j]!, matrix[i]![j + 1]!); } @@ -49,19 +33,19 @@ function lcsMatrix(expected: string[], actual: string[]): number[][] { return matrix; } -function rawSequenceDiff(expected: string[], actual: string[]): SequenceDiffLine[] { - const matrix = lcsMatrix(expected, actual); +function rawSequenceDiff(baseline: string[], actual: string[]): SequenceDiffLine[] { + const matrix = lcsMatrix(baseline, actual); const lines: SequenceDiffLine[] = []; let i = 0; let j = 0; - while (i < expected.length && j < actual.length) { - if (expected[i] === actual[j]) { - lines.push({ kind: 'context', tool: expected[i]!, expectedIndex: i, actualIndex: j }); + while (i < baseline.length && j < actual.length) { + if (baseline[i] === actual[j]) { + lines.push({ kind: 'context', tool: baseline[i]!, baselineIndex: i, actualIndex: j }); i += 1; j += 1; } else if (matrix[i + 1]![j]! >= matrix[i]![j + 1]!) { - lines.push({ kind: 'missing', tool: expected[i]!, expectedIndex: i }); + lines.push({ kind: 'missing', tool: baseline[i]!, baselineIndex: i }); i += 1; } else { lines.push({ kind: 'additional', tool: actual[j]!, actualIndex: j }); @@ -69,8 +53,8 @@ function rawSequenceDiff(expected: string[], actual: string[]): SequenceDiffLine } } - while (i < expected.length) { - lines.push({ kind: 'missing', tool: expected[i]!, expectedIndex: i }); + while (i < baseline.length) { + lines.push({ kind: 'missing', tool: baseline[i]!, baselineIndex: i }); i += 1; } @@ -83,11 +67,11 @@ function rawSequenceDiff(expected: string[], actual: string[]): SequenceDiffLine } export function diffToolSequence( - expected: string[], + baseline: string[], actual: string[], contextSize = 2, ): SequenceDiffHunk[] { - const raw = rawSequenceDiff(expected, actual); + const raw = rawSequenceDiff(baseline, actual); const changedIndexes = raw .map((line, index) => (line.kind === 'context' ? -1 : index)) .filter((index) => index >= 0); @@ -115,95 +99,86 @@ function buildMetrics( run: BenchmarkRunMetadata, ): MetricResult[] { const baseline = config.baseline ?? {}; - const variance = { ...DEFAULT_ALLOWED_VARIANCE, ...(config.allowedVariance ?? {}) }; const metrics: MetricResult[] = []; if (baseline.totalToolCalls !== undefined) { - metrics.push( - metric( - 'totalToolCalls', - audit.totalToolCalls, - baseline.totalToolCalls, - variance.totalToolCalls, - ), - ); + metrics.push(metric('totalToolCalls', audit.totalToolCalls, baseline.totalToolCalls)); } if (baseline.mcpToolCalls !== undefined) { - metrics.push( - metric('mcpToolCalls', audit.mcpToolCalls, baseline.mcpToolCalls, variance.mcpToolCalls), - ); + metrics.push(metric('mcpToolCalls', audit.mcpToolCalls, baseline.mcpToolCalls)); + } + if (baseline.trackedToolCalls !== undefined) { + metrics.push(metric('trackedToolCalls', audit.trackedToolCalls, baseline.trackedToolCalls)); } if (baseline.uiAutomationCalls !== undefined) { - metrics.push( - metric( - 'uiAutomationCalls', - audit.uiAutomationCalls, - baseline.uiAutomationCalls, - variance.uiAutomationCalls, - ), - ); + metrics.push(metric('uiAutomationCalls', audit.uiAutomationCalls, baseline.uiAutomationCalls)); } if (baseline.wallClockSeconds !== undefined) { - metrics.push( - metric( - 'wallClockSeconds', - run.wallClockSeconds, - baseline.wallClockSeconds, - variance.wallClockSeconds, - ), - ); + metrics.push(metric('wallClockSeconds', run.wallClockSeconds, baseline.wallClockSeconds)); } - for (const [tool, expected] of Object.entries(baseline.tools ?? {})) { - metrics.push( - metric(`tool:${tool}`, audit.mcpToolCallsByName[tool] ?? 0, expected, variance.toolCalls), - ); + for (const [tool, recorded] of Object.entries(baseline.tools ?? {})) { + metrics.push(metric(`tool:${tool}`, audit.trackedToolCallsByName[tool] ?? 0, recorded)); } return metrics; } +function isTerminalClaudeFailure(failure: TranscriptAudit['failures'][number]): boolean { + return ( + failure.id === undefined && failure.fullName === undefined && failure.shortName === undefined + ); +} + +function processCompleted(run: BenchmarkRunMetadata, audit: TranscriptAudit): boolean { + if (audit.parseErrors.length > 0) return false; + if (run.claudeExitCode !== 0) return false; + if (run.parserExitCode !== 0) return false; + return !audit.failures.some(isTerminalClaudeFailure); +} + +function countCompletionIssues(audit: TranscriptAudit, run: BenchmarkRunMetadata): number { + const failureLines = new Set(audit.failures.map((failure) => failure.line)); + const uniquePatternFailures = audit.patternFailures.filter( + (failure) => !failureLines.has(failure.line), + ).length; + let count = audit.parseErrors.length + audit.failures.length + uniquePatternFailures; + if (run.claudeExitCode !== 0) count += 1; + if (run.parserExitCode !== 0 && audit.parseErrors.length === 0) count += 1; + return count; +} + export function compareBenchmark( config: BenchmarkConfig, audit: TranscriptAudit, run: BenchmarkRunMetadata, ): BenchmarkResult { const metrics = buildMetrics(audit, config, run); - const expected = config.expectedToolSequence ?? []; - const actual = audit.mcpSequence.map((call) => call.shortName); - const diff = expected.length > 0 ? diffToolSequence(expected, actual) : []; + const baselineSequence = config.baselineToolSequence ?? []; + const actual = audit.trackedSequence.map((call) => call.shortName); + const diff = baselineSequence.length > 0 ? diffToolSequence(baselineSequence, actual) : []; const missing = diff.flatMap((hunk) => hunk.lines.filter((line) => line.kind === 'missing').map((line) => line.tool), ); const additional = diff.flatMap((hunk) => hunk.lines.filter((line) => line.kind === 'additional').map((line) => line.tool), ); - const failureCount = - audit.parseErrors.length + - audit.failures.length + - audit.patternFailures.length + - (run.claudeExitCode === 0 ? 0 : 1) + - (run.parserExitCode === 0 || audit.parseErrors.length > 0 ? 0 : 1); - const sequenceMode = config.sequence?.mode ?? 'warn'; + const failureCount = countCompletionIssues(audit, run); const sequenceMatched = - expected.length === 0 || (missing.length === 0 && additional.length === 0); - const sequencePass = sequenceMatched || sequenceMode === 'warn'; - const failurePass = failureCount === 0; - const pass = metrics.every((item) => item.pass) && sequencePass && failurePass; + baselineSequence.length === 0 || (missing.length === 0 && additional.length === 0); + const completed = processCompleted(run, audit); return { name: config.name, - pass, + completed, metrics, - failureMetric: { - pass: failurePass, - count: failureCount, + completion: { + completed, + issueCount: failureCount, }, sequence: { - mode: sequenceMode, - pass: sequencePass, matched: sequenceMatched, - expected, + baseline: baselineSequence, actual, diff, missing, diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts index 5521ce05..8fc3f228 100644 --- a/src/benchmarks/claude-ui/config.ts +++ b/src/benchmarks/claude-ui/config.ts @@ -3,7 +3,14 @@ import { parse as parseYaml } from 'yaml'; import * as z from 'zod'; import { sessionDefaultsSchema } from '../../utils/session-defaults-schema.ts'; import type { SessionDefaults } from '../../utils/session-store.ts'; -import type { AllowedVariance, BenchmarkConfig, SequenceMode } from './types.ts'; +import type { + BenchmarkConfig, + ClaudeInvocationConfig, + FailurePatternTarget, + ToolAnalysisConfig, + ToolMatcher, + ToolMatcherShortName, +} from './types.ts'; export const sessionDefaultEnvNames: Record = { workspacePath: 'XCODEBUILDMCP_WORKSPACE_PATH', @@ -72,19 +79,6 @@ function readOptionalBoolean( return raw; } -function readSequenceMode(raw: unknown, source: string): SequenceMode { - if (raw === 'warn' || raw === 'fail') return raw; - throw new Error(`${source}: expected 'warn' or 'fail'`); -} - -function readSequenceConfig(raw: unknown, source: string): BenchmarkConfig['sequence'] { - if (raw === undefined) return undefined; - if (!isRecord(raw)) throw new Error(`${source}: expected object`); - return { - mode: raw.mode === undefined ? undefined : readSequenceMode(raw.mode, `${source}.mode`), - }; -} - function readOptionalNumber( value: Record, key: string, @@ -107,43 +101,108 @@ function readNumberMap(value: unknown, source: string): Record | ); } -function readAllowedVariance(raw: unknown, source: string): Partial | undefined { +function readClaudeInvocationConfig( + raw: unknown, + source: string, +): ClaudeInvocationConfig | undefined { if (raw === undefined) return undefined; if (!isRecord(raw)) throw new Error(`${source}: expected object`); + const permissionMode = readOptionalString(raw, 'permissionMode', source); + if ( + permissionMode !== undefined && + permissionMode !== 'default' && + permissionMode !== 'bypassPermissions' + ) { + throw new Error(`${source}.permissionMode: expected 'default' or 'bypassPermissions'`); + } + return { + useMcpServer: readOptionalBoolean(raw, 'useMcpServer', source), + permissionMode, + tools: readOptionalStringArray(raw, 'tools', source), + allowedTools: readOptionalStringArray(raw, 'allowedTools', source), + appendSystemPrompt: readOptionalString(raw, 'appendSystemPrompt', source), + extraArgs: readOptionalStringArray(raw, 'extraArgs', source), + pluginDirs: readOptionalStringArray(raw, 'pluginDirs', source), + skillDirs: readOptionalStringArray(raw, 'skillDirs', source), + activateSkill: readOptionalString(raw, 'activateSkill', source), + isolatedWorkingDirectory: readOptionalBoolean(raw, 'isolatedWorkingDirectory', source), + maxClaudeSeconds: readOptionalNumber(raw, 'maxClaudeSeconds', source), + }; +} - const variance: Partial = {}; - const totalToolCalls = readOptionalNumber(raw, 'totalToolCalls', source); - if (totalToolCalls !== undefined) variance.totalToolCalls = totalToolCalls; - const mcpToolCalls = readOptionalNumber(raw, 'mcpToolCalls', source); - if (mcpToolCalls !== undefined) variance.mcpToolCalls = mcpToolCalls; - const uiAutomationCalls = readOptionalNumber(raw, 'uiAutomationCalls', source); - if (uiAutomationCalls !== undefined) variance.uiAutomationCalls = uiAutomationCalls; - const wallClockSeconds = readOptionalNumber(raw, 'wallClockSeconds', source); - if (wallClockSeconds !== undefined) variance.wallClockSeconds = wallClockSeconds; - const toolCalls = readOptionalNumber(raw, 'toolCalls', source); - if (toolCalls !== undefined) variance.toolCalls = toolCalls; - return variance; -} - -function readFailurePatterns(raw: unknown, source: string): string[] | undefined { - const patterns = readOptionalStringArray( - raw as Record, - 'failurePatterns', - source, - ); +function readShortNameMode(raw: unknown, source: string): ToolMatcherShortName | undefined { + if (raw === undefined) return undefined; + if (raw === 'afterLastDoubleUnderscore' || raw === 'afterPrefix' || raw === 'full') { + return raw; + } + throw new Error(`${source}: expected 'afterLastDoubleUnderscore', 'afterPrefix', or 'full'`); +} + +function readToolMatcher(raw: unknown, source: string): ToolMatcher { + if (!isRecord(raw)) throw new Error(`${source}: expected object`); + const kind = readString(raw, 'kind', source); + if (kind === 'namePrefix') { + return { + kind, + prefix: readString(raw, 'prefix', source), + shortName: readShortNameMode(raw.shortName, `${source}.shortName`), + uiAutomationNames: readOptionalStringArray(raw, 'uiAutomationNames', source), + }; + } + if (kind === 'bashCommand') { + return { + kind, + commandPrefix: readString(raw, 'commandPrefix', source), + shortName: readString(raw, 'shortName', source), + uiAutomation: readOptionalBoolean(raw, 'uiAutomation', source), + }; + } + throw new Error(`${source}.kind: expected 'namePrefix' or 'bashCommand'`); +} + +function readToolAnalysisConfig(raw: unknown, source: string): ToolAnalysisConfig | undefined { + if (raw === undefined) return undefined; + if (!isRecord(raw)) throw new Error(`${source}: expected object`); + const matchers = raw.matchers; + if (!Array.isArray(matchers)) throw new Error(`${source}.matchers: expected array`); + return { + matchers: matchers.map((matcher, index) => + readToolMatcher(matcher, `${source}.matchers[${index}]`), + ), + }; +} + +function readRegexPatterns( + raw: Record, + key: string, + source: string, +): string[] | undefined { + const patterns = readOptionalStringArray(raw, key, source); for (const [index, pattern] of (patterns ?? []).entries()) { try { new RegExp(pattern, 'i'); } catch (error) { const message = error instanceof Error ? error.message : String(error); - throw new Error( - `${source}.failurePatterns[${index}]: invalid regular expression: ${message}`, - ); + throw new Error(`${source}.${key}[${index}]: invalid regular expression: ${message}`); } } return patterns; } +function readFailurePatternTarget(raw: unknown, source: string): FailurePatternTarget { + if (raw === 'commands' || raw === 'toolResults') return raw; + throw new Error(`${source}: expected 'commands' or 'toolResults'`); +} + +function readFailurePatternTargets( + raw: unknown, + source: string, +): FailurePatternTarget[] | undefined { + if (raw === undefined) return undefined; + if (!Array.isArray(raw)) throw new Error(`${source}: expected array`); + return raw.map((target, index) => readFailurePatternTarget(target, `${source}[${index}]`)); +} + function readFirstRunPromptDismissals( raw: unknown, source: string, @@ -177,16 +236,33 @@ function formatZodIssues(error: z.ZodError): string { .join('\n'); } +function rejectRemovedConfigKeys(raw: Record, source: string): void { + const removedKeys: Record = { + allowedVariance: 'removed; baselines are observed data only', + expectedFailures: 'removed; benchmark stumbles are observed data', + expectedToolSequence: 'renamed to baselineToolSequence', + }; + for (const [key, message] of Object.entries(removedKeys)) { + if (raw[key] !== undefined) throw new Error(`${source}.${key}: ${message}`); + } +} + export function readConfig(raw: unknown, source: string): BenchmarkConfig { if (!isRecord(raw)) throw new Error(`${source}: expected YAML object`); + rejectRemovedConfigKeys(raw, source); const config: BenchmarkConfig = { name: readString(raw, 'name', source), prompt: readString(raw, 'prompt', source), workingDirectory: readOptionalString(raw, 'workingDirectory', source), - expectedToolSequence: readOptionalStringArray(raw, 'expectedToolSequence', source), - sequence: readSequenceConfig(raw.sequence, `${source}.sequence`), - failurePatterns: readFailurePatterns(raw, source), + baselineToolSequence: readOptionalStringArray(raw, 'baselineToolSequence', source), + failurePatterns: readRegexPatterns(raw, 'failurePatterns', source), + failurePatternTargets: readFailurePatternTargets( + raw.failurePatternTargets, + `${source}.failurePatternTargets`, + ), + ignoredFailurePatterns: readRegexPatterns(raw, 'ignoredFailurePatterns', source), temporarySimulator: readOptionalBoolean(raw, 'temporarySimulator', source), + preflightCommands: readOptionalStringArray(raw, 'preflightCommands', source), firstRunPromptDismissals: readFirstRunPromptDismissals( raw.firstRunPromptDismissals, `${source}.firstRunPromptDismissals`, @@ -199,12 +275,12 @@ export function readConfig(raw: unknown, source: string): BenchmarkConfig { } config.sessionDefaults = validateSessionDefaults(raw.sessionDefaults); } - config.allowedVariance = readAllowedVariance(raw.allowedVariance, `${source}.allowedVariance`); if (raw.baseline !== undefined) { if (!isRecord(raw.baseline)) throw new Error(`${source}.baseline: expected object`); config.baseline = { totalToolCalls: readOptionalNumber(raw.baseline, 'totalToolCalls', `${source}.baseline`), + trackedToolCalls: readOptionalNumber(raw.baseline, 'trackedToolCalls', `${source}.baseline`), mcpToolCalls: readOptionalNumber(raw.baseline, 'mcpToolCalls', `${source}.baseline`), uiAutomationCalls: readOptionalNumber( raw.baseline, @@ -215,6 +291,8 @@ export function readConfig(raw: unknown, source: string): BenchmarkConfig { tools: readNumberMap(raw.baseline.tools, `${source}.baseline.tools`), }; } + config.claude = readClaudeInvocationConfig(raw.claude, `${source}.claude`); + config.toolAnalysis = readToolAnalysisConfig(raw.toolAnalysis, `${source}.toolAnalysis`); return config; } diff --git a/src/benchmarks/claude-ui/constants.ts b/src/benchmarks/claude-ui/constants.ts new file mode 100644 index 00000000..466ebf8f --- /dev/null +++ b/src/benchmarks/claude-ui/constants.ts @@ -0,0 +1,13 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +export const sourceDir = path.dirname(fileURLToPath(import.meta.url)); +export const repoRoot = path.resolve(sourceDir, '../../..'); +export const suitesDir = path.join(repoRoot, 'benchmarks/claude-ui/suites'); +export const localSuitesDir = path.join(repoRoot, 'benchmarks/claude-ui/local/suites'); +export const bundledParserPath = path.join( + repoRoot, + 'benchmarks/claude-ui/parse_claude_conversation.py', +); +export const serverName = 'xcodebuildmcp-dev'; +export const mcpToolPrefix = `mcp__${serverName}__`; diff --git a/src/benchmarks/claude-ui/first-run-preflight.ts b/src/benchmarks/claude-ui/first-run-preflight.ts index 92f08b8a..663c4b35 100644 --- a/src/benchmarks/claude-ui/first-run-preflight.ts +++ b/src/benchmarks/claude-ui/first-run-preflight.ts @@ -202,10 +202,10 @@ export async function dismissFirstRunPrompts(opts: { ); } - let preflightSucceeded = false; try { const deadline = timing.now() + timeoutMs; let promptsDismissed = false; + let consecutiveReadySnapshots = 0; while (timing.now() < deadline) { const search = await findFirstRunPromptLabel({ simulatorId: opts.simulatorId, @@ -218,6 +218,7 @@ export async function dismissFirstRunPrompts(opts: { }); if (search.status === 'unavailable') { + consecutiveReadySnapshots = 0; await appendLifecycleLog( opts.logPath, `First-run prompt preflight: UI unavailable; retrying (exit ${search.exitCode})`, @@ -227,7 +228,8 @@ export async function dismissFirstRunPrompts(opts: { } if (search.status === 'not-found') { - if (search.hasElements) { + consecutiveReadySnapshots = search.hasElements ? consecutiveReadySnapshots + 1 : 0; + if (consecutiveReadySnapshots >= 2) { promptsDismissed = true; break; } @@ -235,6 +237,7 @@ export async function dismissFirstRunPrompts(opts: { continue; } + consecutiveReadySnapshots = 0; const { label } = search; opts.onEvent?.(`dismissing first-run prompt '${label}'`); await appendLifecycleLog(opts.logPath, `Dismissing first-run prompt label: ${label}`); @@ -258,7 +261,6 @@ export async function dismissFirstRunPrompts(opts: { `${opts.config.name}: timed out during first-run prompt preflight; see ${opts.logPath}`, ); } - preflightSucceeded = true; } finally { await terminatePreflightApp({ config: opts.config, @@ -267,7 +269,7 @@ export async function dismissFirstRunPrompts(opts: { cwd: opts.cwd, logPath: opts.logPath, executor, - suppressFailure: !preflightSucceeded, + suppressFailure: true, }); } await appendLifecycleLog(opts.logPath, 'First-run prompt preflight: complete'); diff --git a/src/benchmarks/claude-ui/harness.ts b/src/benchmarks/claude-ui/harness.ts index 8ed19071..0100928a 100644 --- a/src/benchmarks/claude-ui/harness.ts +++ b/src/benchmarks/claude-ui/harness.ts @@ -1,23 +1,43 @@ import { spawn } from 'node:child_process'; +import { randomUUID } from 'node:crypto'; import { createWriteStream } from 'node:fs'; -import { access, mkdir, readdir, readFile, stat, writeFile } from 'node:fs/promises'; +import { access, cp, mkdir, readdir, readFile, stat, writeFile } from 'node:fs/promises'; import path from 'node:path'; +import { tmpdir } from 'node:os'; import { finished } from 'node:stream/promises'; -import { fileURLToPath } from 'node:url'; -import { stringify as stringifyYaml } from 'yaml'; import yargs from 'yargs/yargs'; import { hideBin } from 'yargs/helpers'; +import { + benchmarkContextEnv, + buildClaudeArgs, + parserToolArgs, + usesMcpServer, +} from './claude-invocation.ts'; import { compareBenchmark } from './compare.ts'; -import { loadSuite, sessionDefaultEnvNames, validateSessionDefaults } from './config.ts'; +import { loadSuite } from './config.ts'; +import { + bundledParserPath, + localSuitesDir, + mcpToolPrefix, + repoRoot, + suitesDir, +} from './constants.ts'; import { dismissFirstRunPrompts } from './first-run-preflight.ts'; +import { + claudeBenchmarkEnv, + requireFirstRunPreflightSimulatorId, + writeClaudeMcpConfig, +} from './mcp-config.ts'; +import { runPreflightCommands } from './preflight-commands.ts'; import { createProgressReporter, type ProgressReporter } from './progress.ts'; import { renderAggregate, renderSuiteReport } from './render.ts'; +import { deleteTemporarySimulator } from './simulator-deletion.ts'; import { - deleteTemporarySimulator, prepareTemporarySimulator, resolveTemporarySimulatorPlan, type CreatedTemporarySimulator, type LifecycleCommandExecutor, + type PreparedSimulator, } from './simulator-lifecycle.ts'; import { analyzeClaudeJsonl } from './transcript.ts'; import type { @@ -26,23 +46,28 @@ import type { BenchmarkResult, BenchmarkRunMetadata, TemporarySimulatorRunMetadata, + ToolAnalysisConfig, } from './types.ts'; -import type { SessionDefaults } from '../../utils/session-store.ts'; -const sourceDir = path.dirname(fileURLToPath(import.meta.url)); -const repoRoot = path.resolve(sourceDir, '../../..'); -const suitesDir = path.join(repoRoot, 'benchmarks/claude-ui/suites'); -const bundledParserPath = path.join(repoRoot, 'benchmarks/claude-ui/parse_claude_conversation.py'); const parserEnvName = 'CLAUDE_UI_BENCHMARK_PARSER'; -const serverName = 'xcodebuildmcp-dev'; -const mcpToolPrefix = `mcp__${serverName}__`; -const sessionDefaultEnvNameSet = new Set(Object.values(sessionDefaultEnvNames)); interface CommandResult { exitCode: number | null; durationSeconds: number; } +interface StreamJsonResult { + type?: unknown; + is_error?: unknown; +} +async function fileExists(filePath: string): Promise { + try { + await access(filePath); + return true; + } catch { + return false; + } +} -export function resolveSuitePath(suite: string): string { +export async function resolveSuitePath(suite: string): Promise { if ( path.isAbsolute(suite) || suite.includes(path.sep) || @@ -51,19 +76,47 @@ export function resolveSuitePath(suite: string): string { ) { return path.resolve(suite); } - return path.join(suitesDir, `${suite}.yml`); + + const candidates = [ + path.join(suitesDir, `${suite}.yml`), + path.join(localSuitesDir, `${suite}.yml`), + ]; + const matches = []; + for (const candidate of candidates) { + if (await fileExists(candidate)) matches.push(candidate); + } + if (matches.length === 1) return matches[0]!; + if (matches.length > 1) { + throw new Error( + `suite name '${suite}' matches multiple suite files; pass an explicit path:\n${matches.join('\n')}`, + ); + } + return candidates[0]!; } -export async function listSuitePaths(): Promise { - const entries = await readdir(suitesDir, { withFileTypes: true }); +async function listYamlFiles(directory: string, required: boolean): Promise { + let entries; + try { + entries = await readdir(directory, { withFileTypes: true }); + } catch (error) { + if (!required) return []; + throw error; + } return entries .filter( (entry) => entry.isFile() && (entry.name.endsWith('.yml') || entry.name.endsWith('.yaml')), ) - .map((entry) => path.join(suitesDir, entry.name)) + .map((entry) => path.join(directory, entry.name)) .sort(); } +export async function listSuitePaths(): Promise { + return [ + ...(await listYamlFiles(suitesDir, true)), + ...(await listYamlFiles(localSuitesDir, false)), + ]; +} + export function requireSuitePaths(suitePaths: string[]): string[] { if (suitePaths.length === 0) { throw new Error('no suite files found in benchmarks/claude-ui/suites'); @@ -91,129 +144,61 @@ function resolveFrom(baseDir: string, filePath: string): string { return path.isAbsolute(filePath) ? filePath : path.resolve(baseDir, filePath); } -export async function resolveParserPath(parserPath: string | undefined): Promise { - const configured = parserPath ?? process.env[parserEnvName] ?? bundledParserPath; - const resolved = path.resolve(configured); - try { - await access(resolved); - } catch { - throw new Error(`Claude UI benchmark parser does not exist: ${resolved}`); +async function installProjectSkills(opts: { + skillDirs: string[] | undefined; + claudeWorkingDirectory: string; +}): Promise { + if (!opts.skillDirs || opts.skillDirs.length === 0) return []; + const projectSkillsDirectory = path.join(opts.claudeWorkingDirectory, '.claude', 'skills'); + await mkdir(projectSkillsDirectory, { recursive: true }); + const installed: string[] = []; + for (const skillDir of opts.skillDirs) { + const skillName = path.basename(skillDir); + const target = path.join(projectSkillsDirectory, skillName); + await cp(skillDir, target, { recursive: true, force: true }); + installed.push(target); } - return resolved; + return installed; } -function sessionDefaultsWithTemporarySimulator( - config: BenchmarkConfig, - temporarySimulator: CreatedTemporarySimulator | undefined, -): SessionDefaults | undefined { - if (!temporarySimulator) return config.sessionDefaults; - const defaults = { ...config.sessionDefaults }; - delete defaults.simulatorName; - return { - ...defaults, - simulatorId: temporarySimulator.simulatorId, - }; -} - -const sessionDefaultPathKeys = new Set(['workspacePath', 'projectPath', 'derivedDataPath']); - -function shouldResolveSessionDefaultPath(key: string, value: string): boolean { - if (!sessionDefaultPathKeys.has(key)) return false; - if (path.isAbsolute(value) || value.startsWith('~')) return false; - return !/^[A-Za-z][A-Za-z0-9+.-]*:/.test(value); -} - -function isolatedSessionDefaults( - config: BenchmarkConfig, - workingDirectory: string, - temporarySimulator: CreatedTemporarySimulator | undefined, -): SessionDefaults | undefined { - const defaults = validateSessionDefaults( - sessionDefaultsWithTemporarySimulator(config, temporarySimulator), - ); - if (!defaults) return undefined; - - const resolved = { ...defaults }; - for (const [key, value] of Object.entries(defaults)) { - if (typeof value === 'string' && shouldResolveSessionDefaultPath(key, value)) { - if (key === 'workspacePath' || key === 'projectPath' || key === 'derivedDataPath') { - resolved[key] = path.resolve(workingDirectory, value); - } - } - } - return resolved; -} - -export function resolveBenchmarkSimulatorId( - config: BenchmarkConfig, - temporarySimulator: CreatedTemporarySimulator | undefined, -): string | undefined { - return ( - temporarySimulator?.simulatorId ?? - (typeof config.sessionDefaults?.simulatorId === 'string' - ? config.sessionDefaults.simulatorId - : undefined) +async function readActivatedSkillPrompt(opts: { + skillName: string; + installedSkillDirs: string[]; +}): Promise<{ prompt: string; skillPath: string }> { + const installedSkillDir = opts.installedSkillDirs.find( + (skillDir) => path.basename(skillDir) === opts.skillName, ); -} - -export function requireFirstRunPreflightSimulatorId( - config: BenchmarkConfig, - temporarySimulator: CreatedTemporarySimulator | undefined, -): string | undefined { - const simulatorId = resolveBenchmarkSimulatorId(config, temporarySimulator); - if (config.firstRunPromptDismissals && !simulatorId) { + if (!installedSkillDir) { throw new Error( - 'firstRunPromptDismissals requires a temporary simulator or sessionDefaults.simulatorId', + `claude.activateSkill '${opts.skillName}' was not installed from claude.skillDirs`, ); } - return simulatorId; -} - -export async function writeMcpConfig(opts: { - config: BenchmarkConfig; - mcpConfigPath: string; - mcpWorkspaceDirectory: string; - mcpWorkspaceConfigPath: string; - workingDirectory: string; - temporarySimulator?: CreatedTemporarySimulator; -}): Promise { - const sessionDefaults = isolatedSessionDefaults( - opts.config, - opts.workingDirectory, - opts.temporarySimulator, - ); - const isolatedConfig = { - schemaVersion: 1, - enabledWorkflows: ['simulator', 'ui-automation'], - debug: true, - sentryDisabled: true, - sessionDefaults: sessionDefaults ?? {}, - }; - const mcpConfig = { - mcpServers: { - [serverName]: { - type: 'stdio', - command: 'node', - args: [path.join(repoRoot, 'build/cli.js'), 'mcp'], - env: { - XCODEBUILDMCP_DEBUG: 'true', - XCODEBUILDMCP_SENTRY_DISABLED: 'true', - XCODEBUILDMCP_CWD: opts.mcpWorkspaceDirectory, - }, - }, - }, + const skillPath = path.join(installedSkillDir, 'SKILL.md'); + const skillBody = await readFile(skillPath, 'utf8'); + return { + skillPath, + prompt: [ + `Load this Claude Code skill for the benchmark session: ${opts.skillName}.`, + 'Use these instructions as the active skill context for subsequent turns.', + 'Do not begin the benchmark task yet; only acknowledge that the skill context is loaded.', + '', + ``, + skillBody, + '', + '', + ].join('\n'), }; - - await mkdir(path.dirname(opts.mcpWorkspaceConfigPath), { recursive: true }); - await writeFile(opts.mcpWorkspaceConfigPath, stringifyYaml(isolatedConfig), 'utf8'); - await writeFile(opts.mcpConfigPath, `${JSON.stringify(mcpConfig, null, 2)}\n`, 'utf8'); } -export function claudeBenchmarkEnv(source: NodeJS.ProcessEnv = process.env): NodeJS.ProcessEnv { - const env = { ...source }; - for (const name of sessionDefaultEnvNameSet) delete env[name]; - delete env.XCODEBUILDMCP_CWD; - return env; +export async function resolveParserPath(parserPath: string | undefined): Promise { + const configured = parserPath ?? process.env[parserEnvName] ?? bundledParserPath; + const resolved = path.resolve(configured); + try { + await access(resolved); + } catch { + throw new Error(`Claude UI benchmark parser does not exist: ${resolved}`); + } + return resolved; } function runCommand(opts: { @@ -224,24 +209,131 @@ function runCommand(opts: { stdoutPath: string; stderrPath: string; env?: NodeJS.ProcessEnv; + terminalJsonResultGraceMs?: number; + timeoutMs?: number; }): Promise { return new Promise((resolve, reject) => { const stdout = createWriteStream(opts.stdoutPath); const stderr = createWriteStream(opts.stderrPath); const started = process.hrtime.bigint(); + let stdoutBuffer = ''; + let terminalResultExitCode: number | undefined; + let terminalResultTimer: NodeJS.Timeout | undefined; + let timeoutTimer: NodeJS.Timeout | undefined; + let hardKillTimer: NodeJS.Timeout | undefined; + let timedOut = false; + let settled = false; const child = spawn(opts.command, opts.args, { cwd: opts.cwd, env: opts.env ?? process.env, stdio: ['pipe', 'pipe', 'pipe'], + detached: opts.terminalJsonResultGraceMs !== undefined, }); - child.stdout.pipe(stdout); - child.stderr.pipe(stderr); - child.on('error', reject); + const clearTerminalResultTimer = (): void => { + if (terminalResultTimer) clearTimeout(terminalResultTimer); + terminalResultTimer = undefined; + }; + + const clearTimeoutTimer = (): void => { + if (timeoutTimer) clearTimeout(timeoutTimer); + timeoutTimer = undefined; + }; + + const clearHardKillTimer = (): void => { + if (hardKillTimer) clearTimeout(hardKillTimer); + hardKillTimer = undefined; + }; + + const killChild = (signal: NodeJS.Signals): void => { + if (child.exitCode !== null || child.killed || child.pid === undefined) return; + try { + process.kill(-child.pid, signal); + } catch { + try { + child.kill(signal); + } catch { + // Ignore termination races; the close handler will resolve once stdio closes. + } + } + }; + + const terminateChild = (): void => { + if (child.exitCode !== null || child.killed || child.pid === undefined) return; + killChild('SIGTERM'); + if (hardKillTimer !== undefined) return; + hardKillTimer = setTimeout(() => { + killChild('SIGKILL'); + }, 5_000); + hardKillTimer.unref(); + }; + + const recordTerminalResult = (result: StreamJsonResult): void => { + if (terminalResultExitCode !== undefined || opts.terminalJsonResultGraceMs === undefined) + return; + terminalResultExitCode = result.is_error === true ? 1 : 0; + terminalResultTimer = setTimeout(terminateChild, opts.terminalJsonResultGraceMs); + terminalResultTimer.unref(); + }; + + if (opts.timeoutMs !== undefined) { + timeoutTimer = setTimeout(() => { + timedOut = true; + terminateChild(); + }, opts.timeoutMs); + timeoutTimer.unref(); + } + + const scanStdoutForTerminalResult = (chunk: Buffer): void => { + if (opts.terminalJsonResultGraceMs === undefined || terminalResultExitCode !== undefined) + return; + stdoutBuffer += chunk.toString('utf8'); + const lines = stdoutBuffer.split('\n'); + stdoutBuffer = lines.pop() ?? ''; + for (const line of lines) { + if (line.trim().length === 0) continue; + try { + const record = JSON.parse(line) as StreamJsonResult; + if (record.type === 'result') recordTerminalResult(record); + } catch { + // Claude stream-json records are newline-delimited JSON. Ignore non-JSON fragments. + } + } + }; + + child.stdout.on('data', (chunk: Buffer) => { + stdout.write(chunk); + scanStdoutForTerminalResult(chunk); + }); + child.stderr.on('data', (chunk: Buffer) => { + stderr.write(chunk); + }); + child.on('error', (error) => { + if (settled) return; + settled = true; + clearTerminalResultTimer(); + clearTimeoutTimer(); + clearHardKillTimer(); + stdout.destroy(); + stderr.destroy(); + reject(error); + }); child.on('close', (exitCode) => { + if (settled) return; + settled = true; + clearTerminalResultTimer(); + clearTimeoutTimer(); + clearHardKillTimer(); const durationSeconds = Number(process.hrtime.bigint() - started) / 1_000_000_000; + stdout.end(); + stderr.end(); Promise.all([finished(stdout), finished(stderr)]) - .then(() => resolve({ exitCode, durationSeconds })) + .then(() => + resolve({ + exitCode: timedOut ? 143 : (exitCode ?? terminalResultExitCode ?? null), + durationSeconds, + }), + ) .catch(reject); }); @@ -253,9 +345,15 @@ function runCommand(opts: { }); } +function claudeTaskTimeoutMs(config: BenchmarkConfig): number | undefined { + if (config.claude?.maxClaudeSeconds !== undefined) return config.claude.maxClaudeSeconds * 1000; + return undefined; +} + async function runParser( artifacts: BenchmarkArtifacts, parserPath: string, + toolAnalysis: ToolAnalysisConfig | undefined, ): Promise { const result = await runCommand({ command: 'python3', @@ -263,7 +361,7 @@ async function runParser( parserPath, artifacts.claudeJsonlPath, artifacts.parsedDirectory, - `--tool-prefix=${mcpToolPrefix}`, + ...parserToolArgs(toolAnalysis), ], cwd: repoRoot, stdoutPath: artifacts.parseLogPath, @@ -283,20 +381,19 @@ function normalizeStoredResult(result: BenchmarkResult): BenchmarkResult { ); } - const mode = result.sequence.mode ?? 'warn'; const matched = result.sequence.matched ?? (result.sequence.missing.length === 0 && result.sequence.additional.length === 0); - const sequencePass = matched || mode === 'warn'; + + if (typeof result.completed !== 'boolean' || !result.completion) { + throw new Error('unsupported result.json: expected completed and completion fields'); + } return { ...result, - pass: result.metrics.every((item) => item.pass) && result.failureMetric.pass && sequencePass, sequence: { ...result.sequence, - mode, matched, - pass: sequencePass, }, }; } @@ -311,7 +408,6 @@ async function readStoredResult( const raw = JSON.parse(await readFile(resultPath, 'utf8')) as BenchmarkResult | BenchmarkResult[]; return Array.isArray(raw) ? raw.map(normalizeStoredResult) : normalizeStoredResult(raw); } - function temporarySimulatorMetadata( temporarySimulator: CreatedTemporarySimulator | undefined, setupDurationSeconds: number, @@ -374,7 +470,7 @@ export async function runSuite( resultJsonPath: path.join(runDirectory, 'result.json'), }; - let temporarySimulator: CreatedTemporarySimulator | undefined; + let temporarySimulator: PreparedSimulator | undefined; let temporarySimulatorRun: TemporarySimulatorRunMetadata | undefined; let result: BenchmarkResult | undefined; @@ -401,10 +497,10 @@ export async function runSuite( const simulatorSetupDurationSeconds = Number(process.hrtime.bigint() - simulatorSetupStarted) / 1_000_000_000; temporarySimulatorRun = temporarySimulatorMetadata( - temporarySimulator, + temporarySimulator?.createdByHarness === true ? temporarySimulator : undefined, simulatorSetupDurationSeconds, ); - if (temporarySimulator) { + if (temporarySimulator?.createdByHarness === true) { progress?.event(`simulator setup took ${simulatorSetupDurationSeconds.toFixed(2)}s`); } @@ -420,16 +516,46 @@ export async function runSuite( }); } - const suiteDirectory = path.dirname(suitePath); - const promptPath = resolveFrom(suiteDirectory, config.prompt); const workingDirectory = config.workingDirectory ? resolveFrom(repoRoot, config.workingDirectory) : repoRoot; + const claudeWorkingDirectory = config.claude?.isolatedWorkingDirectory + ? path.join(tmpdir(), 'xcodebuildmcp-claude-ui-cwd', slug, runTimestamp) + : workingDirectory; + if (config.claude?.isolatedWorkingDirectory) { + await mkdir(claudeWorkingDirectory, { recursive: true }); + } + if (config.claude?.skillDirs && config.claude.isolatedWorkingDirectory !== true) { + throw new Error(`${config.name}: claude.skillDirs requires claude.isolatedWorkingDirectory`); + } + const installedSkillDirs = await installProjectSkills({ + skillDirs: config.claude?.skillDirs?.map((skillDir) => resolveFrom(repoRoot, skillDir)), + claudeWorkingDirectory, + }); + const contextEnv = benchmarkContextEnv({ + runDirectory, + workingDirectory, + simulatorId: effectiveSimulatorId, + }); + const benchmarkEnv = claudeBenchmarkEnv(process.env, contextEnv); + await runPreflightCommands({ + commands: config.preflightCommands, + cwd: workingDirectory, + env: benchmarkEnv, + logPath: artifacts.simulatorLifecycleLogPath, + simulatorId: effectiveSimulatorId, + onEvent: (message) => progress?.event(message), + }); + + const suiteDirectory = path.dirname(suitePath); + const promptPath = resolveFrom(suiteDirectory, config.prompt); const prompt = await readFile(promptPath, 'utf8'); await writeFile(artifacts.promptPath, prompt, 'utf8'); - await writeMcpConfig({ + const useMcpServer = usesMcpServer(config); + await writeClaudeMcpConfig({ config, + enabled: useMcpServer, mcpConfigPath: artifacts.mcpConfigPath, mcpWorkspaceDirectory: artifacts.mcpWorkspaceDirectory, mcpWorkspaceConfigPath: artifacts.mcpWorkspaceConfigPath, @@ -437,34 +563,89 @@ export async function runSuite( temporarySimulator, }); - const claudeArgs = [ - '-p', - '--verbose', - '--output-format', - 'stream-json', - '--mcp-config', - artifacts.mcpConfigPath, - '--strict-mcp-config', - '--permission-mode', - 'bypassPermissions', - '--allowedTools', - `${mcpToolPrefix}*`, - ]; + const claudeSessionId = config.claude?.activateSkill ? randomUUID() : undefined; + const baseClaudeArgs = { + config, + artifacts, + workingDirectory, + pluginDirs: config.claude?.pluginDirs?.map((pluginDir) => resolveFrom(repoRoot, pluginDir)), + simulatorId: effectiveSimulatorId, + }; + const claudeArgs = buildClaudeArgs({ + ...baseClaudeArgs, + resumeSessionId: claudeSessionId, + }); await writeFile( artifacts.claudeCommandLogPath, - `Run dir: ${runDirectory}\nCommand: claude ${claudeArgs.join(' ')} < ${artifacts.promptPath} > ${artifacts.claudeJsonlPath} 2> ${artifacts.claudeStderrPath}\nWorking directory: ${workingDirectory}\nMCP workspace: ${artifacts.mcpWorkspaceDirectory}\nMCP workspace config: ${artifacts.mcpWorkspaceConfigPath}\nSimulator lifecycle log: ${artifacts.simulatorLifecycleLogPath}\nSimulator ID: ${effectiveSimulatorId ?? 'suite/default'}\nStarted: ${new Date().toISOString()}\n`, + `Run dir: ${runDirectory}\nCommand: claude ${claudeArgs.join(' ')} < ${artifacts.promptPath} > ${artifacts.claudeJsonlPath} 2> ${artifacts.claudeStderrPath}\nWorking directory: ${claudeWorkingDirectory}\nBenchmark working directory: ${workingDirectory}\nMCP server enabled: ${String(useMcpServer)}\nMCP workspace: ${useMcpServer ? artifacts.mcpWorkspaceDirectory : 'disabled'}\nMCP workspace config: ${useMcpServer ? artifacts.mcpWorkspaceConfigPath : 'disabled'}\nSimulator lifecycle log: ${artifacts.simulatorLifecycleLogPath}\nSimulator ID: ${effectiveSimulatorId ?? 'suite/default'}\nClaude session ID: ${claudeSessionId ?? 'new task session'}\nStarted: ${new Date().toISOString()}\n`, 'utf8', ); + if (installedSkillDirs.length > 0) { + await writeFile( + artifacts.claudeCommandLogPath, + `Installed project skills:\n${installedSkillDirs.map((skillDir) => `- ${skillDir}`).join('\n')}\n`, + { flag: 'a' }, + ); + } + + if (config.claude?.activateSkill) { + const activationArgs = buildClaudeArgs({ + ...baseClaudeArgs, + sessionId: claudeSessionId, + }); + const activationStdoutPath = path.join(runDirectory, 'claude-skill-activation.jsonl'); + const activationStderrPath = path.join(runDirectory, 'claude-skill-activation.stderr'); + const activationPromptPath = path.join(runDirectory, 'claude-skill-activation.md'); + const activationPrompt = await readActivatedSkillPrompt({ + skillName: config.claude.activateSkill, + installedSkillDirs, + }); + await writeFile(activationPromptPath, activationPrompt.prompt, 'utf8'); + await writeFile( + artifacts.claudeCommandLogPath, + [ + `Skill activation source: ${activationPrompt.skillPath}`, + `Skill activation prompt: ${activationPromptPath}`, + `Skill activation command: claude ${activationArgs.join(' ')} < ${activationPromptPath} > ${activationStdoutPath} 2> ${activationStderrPath}`, + '', + ].join('\n'), + { flag: 'a' }, + ); + progress?.event(`loading skill ${config.claude.activateSkill}`); + const activation = await runCommand({ + command: 'claude', + args: activationArgs, + cwd: claudeWorkingDirectory, + stdin: activationPrompt.prompt, + stdoutPath: activationStdoutPath, + stderrPath: activationStderrPath, + env: benchmarkEnv, + terminalJsonResultGraceMs: 5_000, + timeoutMs: 60_000, + }); + await writeFile( + artifacts.claudeCommandLogPath, + `Skill activation finished: ${new Date().toISOString()}\nSkill activation exit status: ${activation.exitCode}\nSkill activation wall clock seconds: ${activation.durationSeconds.toFixed(2)}\n`, + { flag: 'a' }, + ); + if (activation.exitCode !== 0) { + throw new Error( + `${config.name}: skill activation /${config.claude.activateSkill} failed with exit ${activation.exitCode}`, + ); + } + } progress?.event('launching claude'); const claude = await runCommand({ command: 'claude', args: claudeArgs, - cwd: workingDirectory, + cwd: claudeWorkingDirectory, stdin: prompt, stdoutPath: artifacts.claudeJsonlPath, stderrPath: artifacts.claudeStderrPath, - env: claudeBenchmarkEnv(), + env: benchmarkEnv, + terminalJsonResultGraceMs: 5_000, + timeoutMs: claudeTaskTimeoutMs(config), }); progress?.event( `claude finished in ${claude.durationSeconds.toFixed(2)}s (exit ${claude.exitCode ?? 'null'})`, @@ -477,14 +658,17 @@ export async function runSuite( ); progress?.event('parsing transcript'); - const parserExitCode = await runParser(artifacts, parserPath); + const parserExitCode = await runParser(artifacts, parserPath, config.toolAnalysis); progress?.event(`parser finished (exit ${parserExitCode ?? 'null'})`); progress?.event('evaluating result'); const jsonl = await readFile(artifacts.claudeJsonlPath, 'utf8'); const audit = analyzeClaudeJsonl(jsonl, { mcpToolPrefix, + toolAnalysis: config.toolAnalysis, failurePatterns: config.failurePatterns, + failurePatternTargets: config.failurePatternTargets, + ignoredFailurePatterns: config.ignoredFailurePatterns, }); const run: BenchmarkRunMetadata = { suitePath, @@ -496,7 +680,7 @@ export async function runSuite( }; result = compareBenchmark(config, audit, run); } finally { - if (temporarySimulator) { + if (temporarySimulator?.createdByHarness === true) { progress?.event(`cleaning up simulator ${temporarySimulator.simulatorId}`); try { const deletion = await deleteTemporarySimulator(temporarySimulator, { @@ -564,7 +748,7 @@ export async function main(argv = hideBin(process.argv)): Promise { for (const item of results) process.stdout.write(renderSuiteReport(item)); if (results.length > 1) process.stdout.write(`\n${renderAggregate(results)}`); } - return results.every((item) => item.pass) ? 0 : 1; + return results.every((item) => item.completed) ? 0 : 1; } if ((args.all && args.suite) || (!args.all && !args.suite)) { @@ -572,7 +756,7 @@ export async function main(argv = hideBin(process.argv)): Promise { } const suitePaths = requireSuitePaths( - args.all ? await listSuitePaths() : [resolveSuitePath(args.suite as string)], + args.all ? await listSuitePaths() : [await resolveSuitePath(args.suite as string)], ); const progress = createProgressReporter({ enabled: !args.json }); const results: BenchmarkResult[] = []; @@ -585,7 +769,7 @@ export async function main(argv = hideBin(process.argv)): Promise { ); const item = await runSuite(suitePath, { progress, parserPath: args.parser }); results.push(item); - progress.event(`suite ${item.pass ? 'passed' : 'failed'}`); + progress.event(`suite ${item.completed ? 'completed' : 'incomplete'}`); if (!args.json) process.stdout.write(renderSuiteReport(item)); } @@ -595,5 +779,5 @@ export async function main(argv = hideBin(process.argv)): Promise { process.stdout.write(`\n${renderAggregate(results)}`); } - return results.every((item) => item.pass) ? 0 : 1; + return results.every((item) => item.completed) ? 0 : 1; } diff --git a/src/benchmarks/claude-ui/mcp-config.ts b/src/benchmarks/claude-ui/mcp-config.ts new file mode 100644 index 00000000..75274e0d --- /dev/null +++ b/src/benchmarks/claude-ui/mcp-config.ts @@ -0,0 +1,140 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { stringify as stringifyYaml } from 'yaml'; +import { sessionDefaultEnvNames, validateSessionDefaults } from './config.ts'; +import { repoRoot, serverName } from './constants.ts'; +import type { PreparedSimulator } from './simulator-lifecycle.ts'; +import type { BenchmarkConfig } from './types.ts'; +import type { SessionDefaults } from '../../utils/session-store.ts'; + +const sessionDefaultEnvNameSet = new Set(Object.values(sessionDefaultEnvNames)); + +function sessionDefaultsWithTemporarySimulator( + config: BenchmarkConfig, + temporarySimulator: PreparedSimulator | undefined, +): SessionDefaults | undefined { + if (!temporarySimulator) return config.sessionDefaults; + const defaults = { ...config.sessionDefaults }; + delete defaults.simulatorName; + return { + ...defaults, + simulatorId: temporarySimulator.simulatorId, + }; +} + +const sessionDefaultPathKeys = new Set(['workspacePath', 'projectPath', 'derivedDataPath']); + +function shouldResolveSessionDefaultPath(key: string, value: string): boolean { + if (!sessionDefaultPathKeys.has(key)) return false; + if (path.isAbsolute(value) || value.startsWith('~')) return false; + return !/^[A-Za-z][A-Za-z0-9+.-]*:/.test(value); +} + +function isolatedSessionDefaults( + config: BenchmarkConfig, + workingDirectory: string, + temporarySimulator: PreparedSimulator | undefined, +): SessionDefaults | undefined { + const defaults = validateSessionDefaults( + sessionDefaultsWithTemporarySimulator(config, temporarySimulator), + ); + if (!defaults) return undefined; + + const resolved = { ...defaults }; + for (const [key, value] of Object.entries(defaults)) { + if (typeof value === 'string' && shouldResolveSessionDefaultPath(key, value)) { + if (key === 'workspacePath' || key === 'projectPath' || key === 'derivedDataPath') { + resolved[key] = path.resolve(workingDirectory, value); + } + } + } + return resolved; +} + +export function resolveBenchmarkSimulatorId( + config: BenchmarkConfig, + temporarySimulator: PreparedSimulator | undefined, +): string | undefined { + return ( + temporarySimulator?.simulatorId ?? + (typeof config.sessionDefaults?.simulatorId === 'string' + ? config.sessionDefaults.simulatorId + : undefined) + ); +} + +export function requireFirstRunPreflightSimulatorId( + config: BenchmarkConfig, + temporarySimulator: PreparedSimulator | undefined, +): string | undefined { + const simulatorId = resolveBenchmarkSimulatorId(config, temporarySimulator); + if (config.firstRunPromptDismissals && !simulatorId) { + throw new Error( + 'firstRunPromptDismissals requires a temporary simulator or sessionDefaults.simulatorId', + ); + } + return simulatorId; +} + +export async function writeMcpConfig(opts: { + config: BenchmarkConfig; + mcpConfigPath: string; + mcpWorkspaceDirectory: string; + mcpWorkspaceConfigPath: string; + workingDirectory: string; + temporarySimulator?: PreparedSimulator; +}): Promise { + const sessionDefaults = isolatedSessionDefaults( + opts.config, + opts.workingDirectory, + opts.temporarySimulator, + ); + const isolatedConfig = { + schemaVersion: 1, + enabledWorkflows: ['simulator', 'ui-automation'], + debug: true, + sentryDisabled: true, + sessionDefaults: sessionDefaults ?? {}, + }; + const mcpConfig = { + mcpServers: { + [serverName]: { + type: 'stdio', + command: 'node', + args: [path.join(repoRoot, 'build/cli.js'), 'mcp'], + env: { + XCODEBUILDMCP_DEBUG: 'true', + XCODEBUILDMCP_SENTRY_DISABLED: 'true', + XCODEBUILDMCP_CWD: opts.mcpWorkspaceDirectory, + }, + }, + }, + }; + + await mkdir(path.dirname(opts.mcpWorkspaceConfigPath), { recursive: true }); + await writeFile(opts.mcpWorkspaceConfigPath, stringifyYaml(isolatedConfig), 'utf8'); + await writeFile(opts.mcpConfigPath, `${JSON.stringify(mcpConfig, null, 2)}\n`, 'utf8'); +} + +export async function writeEmptyMcpConfig(mcpConfigPath: string): Promise { + await writeFile(mcpConfigPath, `${JSON.stringify({ mcpServers: {} }, null, 2)}\n`, 'utf8'); +} + +export async function writeClaudeMcpConfig( + opts: Parameters[0] & { + enabled: boolean; + }, +): Promise { + if (!opts.enabled) return writeEmptyMcpConfig(opts.mcpConfigPath); + return writeMcpConfig(opts); +} + +export function claudeBenchmarkEnv( + source: NodeJS.ProcessEnv = process.env, + additions: NodeJS.ProcessEnv = {}, +): NodeJS.ProcessEnv { + const env = { ...source, ...additions }; + for (const name of sessionDefaultEnvNameSet) delete env[name]; + delete env.XCODEBUILDMCP_CWD; + return env; +} diff --git a/src/benchmarks/claude-ui/preflight-commands.ts b/src/benchmarks/claude-ui/preflight-commands.ts new file mode 100644 index 00000000..06f067f8 --- /dev/null +++ b/src/benchmarks/claude-ui/preflight-commands.ts @@ -0,0 +1,136 @@ +import { spawn } from 'node:child_process'; +import { writeFile } from 'node:fs/promises'; + +interface CapturedCommandResult { + exitCode: number | null; + durationSeconds: number; + stdout: string; + stderr: string; + timedOut: boolean; +} + +const defaultPreflightTimeoutMs = 30_000; +const forceKillDelayMs = 2_000; + +function shellSingleQuote(value: string): string { + return `'${value.replace(/'/g, `'"'"'`)}'`; +} + +function isRocketSimAppLaunchCommand(command: string): boolean { + return /^\s*open\s+.*(?:^|\s)RocketSim(?:\.app)?\s*$/.test(command); +} + +export function preflightCommandsWithFocusResign(opts: { + commands: string[] | undefined; + simulatorId?: string; +}): string[] { + const commands = opts.commands ?? []; + if (!opts.simulatorId) return commands; + + const focusSimulatorCommand = `open -a Simulator --args -CurrentDeviceUDID ${shellSingleQuote(opts.simulatorId)}`; + return commands.flatMap((command) => + isRocketSimAppLaunchCommand(command) ? [command, focusSimulatorCommand] : [command], + ); +} + +function runShellCommand(opts: { + command: string; + cwd: string; + env?: NodeJS.ProcessEnv; + timeoutMs?: number; +}): Promise { + return new Promise((resolve, reject) => { + const started = process.hrtime.bigint(); + const child = spawn('/bin/zsh', ['-lc', opts.command], { + cwd: opts.cwd, + env: opts.env ?? process.env, + stdio: ['ignore', 'pipe', 'pipe'], + detached: true, + }); + const stdout: Buffer[] = []; + const stderr: Buffer[] = []; + let timedOut = false; + let timeoutTimer: NodeJS.Timeout | undefined; + let forceKillTimer: NodeJS.Timeout | undefined; + const clearTimers = () => { + if (timeoutTimer) clearTimeout(timeoutTimer); + if (forceKillTimer) clearTimeout(forceKillTimer); + }; + const signalChild = (signal: NodeJS.Signals) => { + if (child.exitCode !== null || child.killed || child.pid === undefined) return; + try { + process.kill(-child.pid, signal); + } catch { + try { + child.kill(signal); + } catch { + // The process may have exited between the liveness check and signal delivery. + } + } + }; + timeoutTimer = setTimeout(() => { + timedOut = true; + signalChild('SIGTERM'); + forceKillTimer = setTimeout(() => signalChild('SIGKILL'), forceKillDelayMs); + forceKillTimer.unref(); + }, opts.timeoutMs ?? defaultPreflightTimeoutMs); + timeoutTimer.unref(); + child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk)); + child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk)); + child.on('error', (error) => { + clearTimers(); + reject(error); + }); + child.on('close', (exitCode) => { + clearTimers(); + const durationSeconds = Number(process.hrtime.bigint() - started) / 1_000_000_000; + resolve({ + exitCode: timedOut ? 143 : exitCode, + durationSeconds, + stdout: Buffer.concat(stdout).toString('utf8'), + stderr: Buffer.concat(stderr).toString('utf8'), + timedOut, + }); + }); + }); +} + +export async function runPreflightCommands(opts: { + commands: string[] | undefined; + cwd: string; + env: NodeJS.ProcessEnv; + logPath: string; + simulatorId?: string; + onEvent?: (message: string) => void; +}): Promise { + const commands = preflightCommandsWithFocusResign({ + commands: opts.commands, + simulatorId: opts.simulatorId, + }); + for (const [index, command] of commands.entries()) { + opts.onEvent?.(`preflight command ${index + 1}/${commands.length}`); + const result = await runShellCommand({ command, cwd: opts.cwd, env: opts.env }); + await writeFile( + opts.logPath, + [ + `\n$ ${command}`, + `exit=${result.exitCode} duration=${result.durationSeconds.toFixed(2)}s`, + result.timedOut + ? `timed out after ${(defaultPreflightTimeoutMs / 1000).toFixed(0)}s` + : undefined, + result.stdout ? `stdout:\n${result.stdout}` : undefined, + result.stderr ? `stderr:\n${result.stderr}` : undefined, + ] + .filter((line): line is string => line !== undefined) + .join('\n'), + { flag: 'a' }, + ); + if (result.exitCode !== 0) { + throw new Error( + result.timedOut + ? `preflight command timed out after ${(defaultPreflightTimeoutMs / 1000).toFixed(0)}s: ${command}` + : `preflight command failed (${result.exitCode}): ${command}`, + ); + } + } +} diff --git a/src/benchmarks/claude-ui/render.ts b/src/benchmarks/claude-ui/render.ts index 864cd1ae..d02d9162 100644 --- a/src/benchmarks/claude-ui/render.ts +++ b/src/benchmarks/claude-ui/render.ts @@ -39,17 +39,19 @@ function colorize(opts: ResolvedOptions, code: string, text: string): string { return opts.color ? `${code}${text}${ANSI.reset}` : text; } -function statusLabel(status: 'PASS' | 'FAIL' | 'WARN', opts: ResolvedOptions): string { - if (status === 'PASS') return colorize(opts, ANSI.green, 'PASS'); - if (status === 'FAIL') return colorize(opts, ANSI.red, 'FAIL'); - return colorize(opts, ANSI.yellow, 'WARN'); +function statusLabel( + status: 'COMPLETED' | 'INCOMPLETE' | 'OBSERVED', + opts: ResolvedOptions, +): string { + if (status === 'COMPLETED') return colorize(opts, ANSI.green, 'COMPLETED'); + if (status === 'INCOMPLETE') return colorize(opts, ANSI.red, 'INCOMPLETE'); + return colorize(opts, ANSI.dim, 'OBSERVED'); } -function statusGlyph(status: 'PASS' | 'FAIL' | 'WARN', opts: ResolvedOptions): string { - const glyph = status === 'PASS' ? '✓' : status === 'FAIL' ? '✗' : '!'; - if (status === 'PASS') return colorize(opts, ANSI.green, glyph); - if (status === 'FAIL') return colorize(opts, ANSI.red, glyph); - return colorize(opts, ANSI.yellow, glyph); +function statusGlyph(status: 'COMPLETED' | 'INCOMPLETE', opts: ResolvedOptions): string { + const glyph = status === 'COMPLETED' ? '✓' : '!'; + if (status === 'COMPLETED') return colorize(opts, ANSI.green, glyph); + return colorize(opts, ANSI.red, glyph); } function rule(ch: string, width: number): string { @@ -71,10 +73,8 @@ function suiteBanner(result: BenchmarkResult, opts: ResolvedOptions): string { return `${rule('─', opts.width)}\n${left}${' '.repeat(padWidth)}${right}`; } -function overallStatus(result: BenchmarkResult): 'PASS' | 'FAIL' | 'WARN' { - if (!result.pass) return 'FAIL'; - if (!result.sequence.matched) return 'WARN'; - return 'PASS'; +function overallStatus(result: BenchmarkResult): 'COMPLETED' | 'INCOMPLETE' { + return result.completed ? 'COMPLETED' : 'INCOMPLETE'; } function visibleLength(text: string): number { @@ -100,8 +100,8 @@ function formatNumber(value: number, isWallClock: boolean): string { return value.toFixed(2); } -function formatDelta(actual: number, expected: number, isWallClock: boolean): string { - const delta = actual - expected; +function formatDelta(actual: number, baseline: number, isWallClock: boolean): string { + const delta = actual - baseline; const sign = delta > 0 ? '+' : delta < 0 ? '−' : ' '; const magnitude = Math.abs(delta); return `${sign}${isWallClock ? magnitude.toFixed(2) : magnitude.toString()}`; @@ -121,10 +121,7 @@ interface MetricRow { name: string; actual: string; baseline: string; - variance: string; delta: string; - status: 'PASS' | 'FAIL'; - isWallClock: boolean; } function metricToRow(metric: MetricResult): MetricRow { @@ -133,11 +130,8 @@ function metricToRow(metric: MetricResult): MetricRow { return { name: isTool ? metric.name.slice('tool:'.length) : metric.name, actual: formatNumber(metric.actual, isWallClock), - baseline: formatNumber(metric.expected, isWallClock), - variance: `+${formatNumber(metric.allowedVariance, isWallClock)}`, - delta: formatDelta(metric.actual, metric.expected, isWallClock), - status: metric.pass ? 'PASS' : 'FAIL', - isWallClock, + baseline: formatNumber(metric.baseline, isWallClock), + delta: formatDelta(metric.actual, metric.baseline, isWallClock), }; } @@ -172,38 +166,25 @@ function renderMetricsSection(result: BenchmarkResult, opts: ResolvedOptions): s lines.push('', colorize(opts, ANSI.bold, 'Metrics')); const rows = headline .map(metricToRow) - .map((row) => [ - row.name, - row.actual, - row.baseline, - row.variance, - row.delta, - row.status === 'PASS' ? statusLabel('PASS', opts) : statusLabel('FAIL', opts), - ]); + .map((row) => [row.name, row.actual, row.baseline, row.delta]); const table = renderTable( - ['METRIC', 'ACTUAL', 'BASELINE', 'VARIANCE', 'DELTA', 'STATUS'], + ['METRIC', 'ACTUAL', 'BASELINE', 'DELTA'], rows, - ['left', 'right', 'right', 'right', 'right', 'left'], + ['left', 'right', 'right', 'right'], opts, ); for (const line of table) lines.push(` ${line}`); } if (tools.length > 0) { - lines.push('', colorize(opts, ANSI.bold, 'Tool calls (baseline-tracked)')); + lines.push('', colorize(opts, ANSI.bold, 'Tool calls (baseline-observed)')); const rows = tools .map(metricToRow) - .map((row) => [ - row.name, - row.actual, - row.baseline, - row.delta, - row.status === 'PASS' ? statusLabel('PASS', opts) : statusLabel('FAIL', opts), - ]); + .map((row) => [row.name, row.actual, row.baseline, row.delta]); const table = renderTable( - ['TOOL', 'ACTUAL', 'BASELINE', 'DELTA', 'STATUS'], + ['TOOL', 'ACTUAL', 'BASELINE', 'DELTA'], rows, - ['left', 'right', 'right', 'right', 'left'], + ['left', 'right', 'right', 'right'], opts, ); for (const line of table) lines.push(` ${line}`); @@ -212,15 +193,18 @@ function renderMetricsSection(result: BenchmarkResult, opts: ResolvedOptions): s return lines; } -function renderFailureSection(result: BenchmarkResult, opts: ResolvedOptions): string[] { +function renderStumbleSection(result: BenchmarkResult, opts: ResolvedOptions): string[] { const { failures, patternFailures, parseErrors } = result.audit; const { claudeExitCode, parserExitCode } = result.run; - const total = result.failureMetric.count; + const total = result.completion.issueCount; if (total === 0) { - return ['', `${statusLabel('PASS', opts)} failures/stumbles: 0`]; + return ['', `${statusLabel('OBSERVED', opts)} stumbles: 0`]; } - const lines: string[] = ['', `${statusLabel('FAIL', opts)} failures/stumbles: ${total}`]; + const lines: string[] = [ + '', + `${statusLabel(result.completion.completed ? 'OBSERVED' : 'INCOMPLETE', opts)} stumbles: ${total}`, + ]; if (claudeExitCode !== 0) { lines.push(` • claude exit code: ${claudeExitCode ?? 'null'}`); @@ -238,7 +222,7 @@ function renderFailureSection(result: BenchmarkResult, opts: ResolvedOptions): s } } if (failures.length > 0) { - lines.push(` • tool failures: ${failures.length}`); + lines.push(` • tool errors: ${failures.length}`); for (const failure of failures.slice(0, 5)) { const name = failure.shortName ?? failure.fullName ?? '(unknown)'; const msg = truncate(failure.message, 100); @@ -269,21 +253,14 @@ function truncate(text: string, max: number): string { } function renderSequenceSection(result: BenchmarkResult, opts: ResolvedOptions): string[] { - const expectedLen = result.sequence.expected.length; - if (expectedLen === 0) return []; + const baselineLen = result.sequence.baseline.length; + if (baselineLen === 0) return []; const lines: string[] = ['']; - const sequenceStatus = result.sequence.matched - ? 'PASS' - : result.sequence.mode === 'warn' - ? 'WARN' - : 'FAIL'; - const drift = result.sequence.matched + const comparison = result.sequence.matched ? 'matched' - : `drift: ${result.sequence.missing.length} missing, ${result.sequence.additional.length} additional`; - lines.push( - `${statusLabel(sequenceStatus, opts)} tool sequence (${result.sequence.mode}): ${drift}`, - ); + : `${result.sequence.missing.length} missing from baseline, ${result.sequence.additional.length} additional`; + lines.push(`${statusLabel('OBSERVED', opts)} tool sequence: ${comparison}`); if (result.sequence.diff.length === 0) return lines; @@ -294,20 +271,20 @@ function renderSequenceSection(result: BenchmarkResult, opts: ResolvedOptions): } function renderHunk(hunk: SequenceDiffHunk, opts: ResolvedOptions): string[] { - const expectedIndexes = hunk.lines - .map((l) => l.expectedIndex) + const baselineIndexes = hunk.lines + .map((l) => l.baselineIndex) .filter((v): v is number => v !== undefined); const actualIndexes = hunk.lines .map((l) => l.actualIndex) .filter((v): v is number => v !== undefined); - const expectedRange = formatRange(expectedIndexes); + const baselineRange = formatRange(baselineIndexes); const actualRange = formatRange(actualIndexes); - const headerText = ` @@ expected[${expectedRange}] actual[${actualRange}] @@`; + const headerText = ` @@ baseline[${baselineRange}] actual[${actualRange}] @@`; const lines = [colorize(opts, ANSI.cyan, headerText)]; - const expectedColWidth = Math.max( + const baselineColWidth = Math.max( 3, - ...hunk.lines.map((l) => (l.expectedIndex !== undefined ? String(l.expectedIndex).length : 0)), + ...hunk.lines.map((l) => (l.baselineIndex !== undefined ? String(l.baselineIndex).length : 0)), ); const actualColWidth = Math.max( 3, @@ -315,7 +292,7 @@ function renderHunk(hunk: SequenceDiffHunk, opts: ResolvedOptions): string[] { ); for (const line of hunk.lines) { - lines.push(renderHunkLine(line, expectedColWidth, actualColWidth, opts)); + lines.push(renderHunkLine(line, baselineColWidth, actualColWidth, opts)); } return lines; } @@ -329,21 +306,21 @@ function formatRange(indexes: number[]): string { function renderHunkLine( line: SequenceDiffLine, - expectedColWidth: number, + baselineColWidth: number, actualColWidth: number, opts: ResolvedOptions, ): string { const marker = line.kind === 'context' ? ' ' : line.kind === 'missing' ? '−' : '+'; - const expectedIdx = line.expectedIndex !== undefined ? String(line.expectedIndex) : ''; + const baselineIdx = line.baselineIndex !== undefined ? String(line.baselineIndex) : ''; const actualIdx = line.actualIndex !== undefined ? String(line.actualIndex) : ''; - const body = `${padStart(expectedIdx, expectedColWidth)} ${padStart(actualIdx, actualColWidth)} ${marker} ${line.tool}`; + const body = `${padStart(baselineIdx, baselineColWidth)} ${padStart(actualIdx, actualColWidth)} ${marker} ${line.tool}`; if (line.kind === 'missing') return ` ${colorize(opts, ANSI.red, body)}`; if (line.kind === 'additional') return ` ${colorize(opts, ANSI.green, body)}`; return ` ${colorize(opts, ANSI.dim, body)}`; } function renderInspectHints(result: BenchmarkResult, opts: ResolvedOptions): string[] { - if (result.pass && result.sequence.matched) return []; + if (result.completion.issueCount === 0) return []; const lines = ['', colorize(opts, ANSI.bold, 'Inspect')]; const runDir = relativePath(result.run.artifacts.runDirectory, opts.cwd); @@ -390,7 +367,7 @@ export function renderSuiteReport(result: BenchmarkResult, options?: RenderOptio sections.push(suiteBanner(result, opts)); sections.push(...renderMetadata(result, opts)); sections.push(...renderMetricsSection(result, opts)); - sections.push(...renderFailureSection(result, opts)); + sections.push(...renderStumbleSection(result, opts)); sections.push(...renderSequenceSection(result, opts)); sections.push(...renderInspectHints(result, opts)); return `${sections.join('\n')}\n`; @@ -416,9 +393,8 @@ export function renderAggregate( ): string { const opts = resolveOptions(options); const total = results.length; - const passed = results.filter((r) => r.pass).length; - const failed = total - passed; - const warned = results.filter((r) => r.pass && !r.sequence.matched).length; + const completed = results.filter((r) => r.completed).length; + const incomplete = total - completed; const wall = results.reduce((sum, r) => sum + r.run.wallClockSeconds, 0); const slowest = results.reduce( (acc, r) => (!acc || r.run.wallClockSeconds > acc.run.wallClockSeconds ? r : acc), @@ -428,16 +404,12 @@ export function renderAggregate( const lines: string[] = []; lines.push(header('Claude UI Benchmarks · Summary', opts)); - const passText = colorize(opts, ANSI.green, `${passed} passed`); - const failText = - failed > 0 - ? colorize(opts, ANSI.red, `${failed} failed`) - : colorize(opts, ANSI.dim, '0 failed'); - const warnText = - warned > 0 - ? colorize(opts, ANSI.yellow, `${warned} sequence warnings`) - : colorize(opts, ANSI.dim, '0 sequence warnings'); - lines.push(` Suites: ${total} total · ${passText} · ${failText} · ${warnText}`); + const completedText = colorize(opts, ANSI.green, `${completed} completed`); + const incompleteText = + incomplete > 0 + ? colorize(opts, ANSI.red, `${incomplete} incomplete`) + : colorize(opts, ANSI.dim, '0 incomplete'); + lines.push(` Suites: ${total} total · ${completedText} · ${incompleteText}`); const slowestText = slowest ? `${slowest.name} (${formatDuration(slowest.run.wallClockSeconds)})` : 'n/a'; @@ -451,17 +423,11 @@ export function renderAggregate( const rows = results.map((r) => { const status = overallStatus(r); const notes: string[] = []; - if (r.failureMetric.count > 0) { - notes.push(`${r.failureMetric.count} stumble${r.failureMetric.count === 1 ? '' : 's'}`); + if (r.completion.issueCount > 0) { + notes.push(`${r.completion.issueCount} stumble${r.completion.issueCount === 1 ? '' : 's'}`); } if (!r.sequence.matched) { - notes.push( - `sequence ${r.sequence.mode}: ${r.sequence.missing.length}m/${r.sequence.additional.length}a`, - ); - } - const failedMetrics = r.metrics.filter((m) => !m.pass).map((m) => m.name); - if (failedMetrics.length > 0) { - notes.push(`metrics: ${failedMetrics.slice(0, 3).join(', ')}`); + notes.push(`sequence delta: ${r.sequence.missing.length}m/${r.sequence.additional.length}a`); } return [ `${statusGlyph(status, opts)} ${statusLabel(status, opts)}`, diff --git a/src/benchmarks/claude-ui/simulator-deletion.ts b/src/benchmarks/claude-ui/simulator-deletion.ts new file mode 100644 index 00000000..8b7f259d --- /dev/null +++ b/src/benchmarks/claude-ui/simulator-deletion.ts @@ -0,0 +1,82 @@ +import { appendFile } from 'node:fs/promises'; +import { + runLoggedCommand, + type CreatedTemporarySimulator, + type LifecycleCommandExecutor, + type LifecycleLogWriter, +} from './simulator-lifecycle.ts'; + +const defaultLifecycleLogWriter: LifecycleLogWriter = async (logPath, message) => { + await appendFile(logPath, `${message}\n`, 'utf8'); +}; + +export interface DeleteTemporarySimulatorResult { + attempted: boolean; + succeeded: boolean; + exitCode: number | null; + error?: string; +} + +async function tryAppendLifecycleLog( + logPath: string, + message: string, + logWriter: LifecycleLogWriter = defaultLifecycleLogWriter, +): Promise { + try { + await logWriter(logPath, message); + return undefined; + } catch (error) { + return error instanceof Error ? error.message : String(error); + } +} + +export async function deleteTemporarySimulator( + simulator: CreatedTemporarySimulator, + opts: { + cwd: string; + executor?: LifecycleCommandExecutor; + logWriter?: LifecycleLogWriter; + }, +): Promise { + if (simulator.createdByHarness !== true) { + throw new Error('refusing to delete simulator not created by this harness'); + } + + const executor = opts.executor ?? runLoggedCommand; + const logWriter = opts.logWriter ?? defaultLifecycleLogWriter; + const logErrors: string[] = []; + const startLogError = await tryAppendLifecycleLog( + simulator.logPath, + `Deleting simulatorId: ${simulator.simulatorId}\nName: ${simulator.name}`, + logWriter, + ); + if (startLogError) logErrors.push(startLogError); + + try { + const result = await executor({ + command: 'xcrun', + args: ['simctl', 'delete', simulator.simulatorId], + cwd: opts.cwd, + logPath: simulator.logPath, + }); + const succeeded = result.exitCode === 0; + const resultLogError = await tryAppendLifecycleLog( + simulator.logPath, + `Delete ${succeeded ? 'succeeded' : 'failed'} for simulatorId: ${simulator.simulatorId}`, + logWriter, + ); + if (resultLogError) logErrors.push(resultLogError); + const deletion = { attempted: true, succeeded, exitCode: result.exitCode }; + return logErrors.length > 0 ? { ...deletion, error: logErrors.join('; ') } : deletion; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + logErrors.push(message); + const failureLogError = await tryAppendLifecycleLog( + simulator.logPath, + `Delete failed for simulatorId: ${simulator.simulatorId}\nError: ${message}`, + logWriter, + ); + if (failureLogError) logErrors.push(failureLogError); + return { attempted: true, succeeded: false, exitCode: null, error: logErrors.join('; ') }; + } +} diff --git a/src/benchmarks/claude-ui/simulator-lifecycle.ts b/src/benchmarks/claude-ui/simulator-lifecycle.ts index 71305ca8..a3d685bb 100644 --- a/src/benchmarks/claude-ui/simulator-lifecycle.ts +++ b/src/benchmarks/claude-ui/simulator-lifecycle.ts @@ -34,6 +34,7 @@ export interface TemporarySimulatorPlan { reason?: string; deviceTypeName?: string; existingSimulatorId?: string; + existingSimulatorName?: string; } export interface CreatedTemporarySimulator { @@ -44,15 +45,17 @@ export interface CreatedTemporarySimulator { logPath: string; } -export type LifecycleProgressReporter = (message: string) => void; - -export interface DeleteTemporarySimulatorResult { - attempted: boolean; - succeeded: boolean; - exitCode: number | null; - error?: string; +export interface ExistingSimulator { + createdByHarness: false; + simulatorId: string; + name: string; + logPath: string; } +export type PreparedSimulator = CreatedTemporarySimulator | ExistingSimulator; + +export type LifecycleProgressReporter = (message: string) => void; + function sessionDefaultString(config: BenchmarkConfig, key: SessionDefaultKey): string | undefined { const value = config.sessionDefaults?.[key]; if (value === undefined) return undefined; @@ -64,9 +67,15 @@ function sessionDefaultString(config: BenchmarkConfig, key: SessionDefaultKey): export function resolveTemporarySimulatorPlan(config: BenchmarkConfig): TemporarySimulatorPlan { const existingSimulatorId = sessionDefaultString(config, 'simulatorId'); + const deviceTypeName = sessionDefaultString(config, 'simulatorName'); if (config.temporarySimulator === false) { - return { enabled: false, reason: 'temporarySimulator is false', existingSimulatorId }; + return { + enabled: false, + reason: 'temporarySimulator is false', + existingSimulatorId, + existingSimulatorName: existingSimulatorId === undefined ? deviceTypeName : undefined, + }; } if (existingSimulatorId !== undefined) { @@ -82,7 +91,6 @@ export function resolveTemporarySimulatorPlan(config: BenchmarkConfig): Temporar }; } - const deviceTypeName = sessionDefaultString(config, 'simulatorName'); if (deviceTypeName === undefined) { throw new Error( `${config.name}: temporary simulator requires sessionDefaults.simulatorName or temporarySimulator: false`, @@ -93,7 +101,7 @@ export function resolveTemporarySimulatorPlan(config: BenchmarkConfig): Temporar } export function temporarySimulatorName(suiteSlug: string, timestamp: string): string { - return `XcodeBuildMCP Claude UI ${suiteSlug} ${timestamp}`; + return `Claude UI ${suiteSlug} ${timestamp}`; } async function appendLifecycleLog( @@ -130,6 +138,134 @@ function isAlreadyBooted(result: LoggedCommandResult): boolean { return /already booted|current state:\s*Booted|state:\s*Booted/i.test(commandOutput(result)); } +interface SimctlDevice { + name?: unknown; + udid?: unknown; + isAvailable?: unknown; +} + +interface SimctlListDevices { + devices?: Record; +} + +function resolveSimulatorIdFromList(output: string, simulatorName: string): string { + const parsed = JSON.parse(output) as SimctlListDevices; + for (const devices of Object.values(parsed.devices ?? {})) { + for (const device of devices) { + if ( + device.name === simulatorName && + device.isAvailable !== false && + typeof device.udid === 'string' + ) { + return device.udid; + } + } + } + throw new Error(`no available simulator found named '${simulatorName}'`); +} + +async function bootAndOpenSimulator(opts: { + configName: string; + simulatorId: string; + cwd: string; + logPath: string; + executor: LifecycleCommandExecutor; + onEvent?: LifecycleProgressReporter; + readinessDelayMs?: number; + logWriter?: LifecycleLogWriter; + readyLogPrefix: string; + bootstatusSubject: string; +}): Promise { + const bootArgs = ['simctl', 'boot', opts.simulatorId]; + opts.onEvent?.(`booting simulator ${opts.simulatorId}`); + const bootResult = await opts.executor({ + command: 'xcrun', + args: bootArgs, + cwd: opts.cwd, + logPath: opts.logPath, + }); + if (!isAlreadyBooted(bootResult)) { + throw new Error( + `${opts.configName}: failed to boot simulator with ${commandText('xcrun', bootArgs)} (exit ${bootResult.exitCode}); see ${opts.logPath}`, + ); + } + if (bootResult.exitCode !== 0) { + await appendLifecycleLog( + opts.logPath, + 'Boot command reported simulator was already booted; continuing', + opts.logWriter, + ); + } + + opts.onEvent?.(`waiting for simulator ${opts.simulatorId} bootstatus`); + const bootstatusArgs = ['simctl', 'bootstatus', opts.simulatorId, '-b']; + const bootstatusResult = await opts.executor({ + command: 'xcrun', + args: bootstatusArgs, + cwd: opts.cwd, + logPath: opts.logPath, + }); + if (bootstatusResult.exitCode !== 0) { + throw new Error( + `${opts.configName}: ${opts.bootstatusSubject} did not reach bootstatus with ${commandText('xcrun', bootstatusArgs)} (exit ${bootstatusResult.exitCode}); see ${opts.logPath}`, + ); + } + + opts.onEvent?.(`opening Simulator.app for ${opts.simulatorId}`); + const openArgs = ['-a', 'Simulator', '--args', '-CurrentDeviceUDID', opts.simulatorId]; + let openResult: LoggedCommandResult | undefined; + for (let attempt = 1; attempt <= 3; attempt += 1) { + openResult = await opts.executor({ + command: 'open', + args: openArgs, + cwd: opts.cwd, + logPath: opts.logPath, + }); + if (openResult.exitCode === 0) break; + if (attempt === 3) { + throw new Error( + `${opts.configName}: failed to open Simulator.app with ${commandText('open', openArgs)} (exit ${openResult.exitCode}); see ${opts.logPath}`, + ); + } + const delayMs = attempt * 2_000; + await appendLifecycleLog( + opts.logPath, + `Open Simulator.app attempt ${attempt} failed with exit ${openResult.exitCode}; retrying in ${(delayMs / 1000).toFixed(1)}s`, + opts.logWriter, + ); + if (/error -1712/i.test(commandOutput(openResult))) { + await appendLifecycleLog( + opts.logPath, + 'Simulator.app did not respond to LaunchServices; terminating the UI process before retry', + opts.logWriter, + ); + await opts.executor({ + command: 'killall', + args: ['-9', 'Simulator'], + cwd: opts.cwd, + logPath: opts.logPath, + }); + } + opts.onEvent?.(`Simulator.app open attempt ${attempt} failed; retrying`); + await new Promise((resolve) => { + setTimeout(resolve, delayMs); + }); + } + + await waitForReadinessDelay({ + logPath: opts.logPath, + milliseconds: opts.readinessDelayMs ?? 2_000, + onEvent: opts.onEvent, + logWriter: opts.logWriter, + }); + await appendLifecycleLog( + opts.logPath, + `${opts.readyLogPrefix}: ${opts.simulatorId}`, + opts.logWriter, + ); + opts.onEvent?.(`simulator ready ${opts.simulatorId}`); +} + async function waitForReadinessDelay(opts: { logPath: string; milliseconds: number; @@ -217,7 +353,7 @@ export async function prepareTemporarySimulator(opts: { logWriter?: LifecycleLogWriter; onEvent?: LifecycleProgressReporter; readinessDelayMs?: number; -}): Promise { +}): Promise { const plan = resolveTemporarySimulatorPlan(opts.config); const logWriter = opts.logWriter ?? defaultLifecycleLogWriter; @@ -235,7 +371,42 @@ export async function prepareTemporarySimulator(opts: { .join('\n'), logWriter, ); - return undefined; + if (!plan.existingSimulatorName) return undefined; + + const executor = opts.executor ?? runLoggedCommand; + opts.onEvent?.(`resolving simulator ${plan.existingSimulatorName}`); + const listResult = await executor({ + command: 'xcrun', + args: ['simctl', 'list', 'devices', 'available', '--json'], + cwd: opts.cwd, + logPath: opts.logPath, + }); + if (listResult.exitCode !== 0) { + throw new Error( + `${opts.config.name}: failed to list simulators (exit ${listResult.exitCode}); see ${opts.logPath}`, + ); + } + + const simulatorId = resolveSimulatorIdFromList(listResult.stdout, plan.existingSimulatorName); + opts.onEvent?.(`using simulator ${simulatorId}`); + await bootAndOpenSimulator({ + configName: opts.config.name, + simulatorId, + cwd: opts.cwd, + logPath: opts.logPath, + executor, + onEvent: opts.onEvent, + readinessDelayMs: opts.readinessDelayMs, + logWriter, + readyLogPrefix: 'Existing simulator ready', + bootstatusSubject: 'simulator', + }); + return { + createdByHarness: false, + simulatorId, + name: plan.existingSimulatorName, + logPath: opts.logPath, + }; } const executor = opts.executor ?? runLoggedCommand; @@ -281,63 +452,18 @@ export async function prepareTemporarySimulator(opts: { } satisfies CreatedTemporarySimulator; try { - opts.onEvent?.(`booting simulator ${simulatorId}`); - const bootArgs = ['simctl', 'boot', simulatorId]; - const bootResult = await executor({ - command: 'xcrun', - args: bootArgs, - cwd: opts.cwd, - logPath: opts.logPath, - }); - if (!isAlreadyBooted(bootResult)) { - throw new Error( - `${opts.config.name}: failed to boot temporary simulator with ${commandText('xcrun', bootArgs)} (exit ${bootResult.exitCode}); see ${opts.logPath}`, - ); - } - if (bootResult.exitCode !== 0) { - await appendLifecycleLog( - opts.logPath, - 'Boot command reported simulator was already booted; continuing', - logWriter, - ); - } - - opts.onEvent?.(`waiting for simulator ${simulatorId} bootstatus`); - const bootstatusArgs = ['simctl', 'bootstatus', simulatorId, '-b']; - const bootstatusResult = await executor({ - command: 'xcrun', - args: bootstatusArgs, - cwd: opts.cwd, - logPath: opts.logPath, - }); - if (bootstatusResult.exitCode !== 0) { - throw new Error( - `${opts.config.name}: temporary simulator did not reach bootstatus with ${commandText('xcrun', bootstatusArgs)} (exit ${bootstatusResult.exitCode}); see ${opts.logPath}`, - ); - } - - opts.onEvent?.(`opening Simulator.app for ${simulatorId}`); - const openArgs = ['-a', 'Simulator', '--args', '-CurrentDeviceUDID', simulatorId]; - const openResult = await executor({ - command: 'open', - args: openArgs, + await bootAndOpenSimulator({ + configName: opts.config.name, + simulatorId, cwd: opts.cwd, logPath: opts.logPath, - }); - if (openResult.exitCode !== 0) { - throw new Error( - `${opts.config.name}: failed to open Simulator.app with ${commandText('open', openArgs)} (exit ${openResult.exitCode}); see ${opts.logPath}`, - ); - } - - await waitForReadinessDelay({ - logPath: opts.logPath, - milliseconds: opts.readinessDelayMs ?? 2_000, + executor, onEvent: opts.onEvent, + readinessDelayMs: opts.readinessDelayMs, logWriter, + readyLogPrefix: 'Temporary simulator ready', + bootstatusSubject: 'temporary simulator', }); - await appendLifecycleLog(opts.logPath, `Temporary simulator ready: ${simulatorId}`, logWriter); - opts.onEvent?.(`simulator ready ${simulatorId}`); return simulator; } catch (error) { @@ -369,54 +495,3 @@ export async function prepareTemporarySimulator(opts: { throw error; } } - -export async function deleteTemporarySimulator( - simulator: CreatedTemporarySimulator, - opts: { - cwd: string; - executor?: LifecycleCommandExecutor; - logWriter?: LifecycleLogWriter; - }, -): Promise { - if (simulator.createdByHarness !== true) { - throw new Error('refusing to delete simulator not created by this harness'); - } - - const executor = opts.executor ?? runLoggedCommand; - const logWriter = opts.logWriter ?? defaultLifecycleLogWriter; - const logErrors: string[] = []; - const startLogError = await tryAppendLifecycleLog( - simulator.logPath, - `Deleting simulatorId: ${simulator.simulatorId}\nName: ${simulator.name}`, - logWriter, - ); - if (startLogError) logErrors.push(startLogError); - - try { - const result = await executor({ - command: 'xcrun', - args: ['simctl', 'delete', simulator.simulatorId], - cwd: opts.cwd, - logPath: simulator.logPath, - }); - const succeeded = result.exitCode === 0; - const resultLogError = await tryAppendLifecycleLog( - simulator.logPath, - `Delete ${succeeded ? 'succeeded' : 'failed'} for simulatorId: ${simulator.simulatorId}`, - logWriter, - ); - if (resultLogError) logErrors.push(resultLogError); - const deletion = { attempted: true, succeeded, exitCode: result.exitCode }; - return logErrors.length > 0 ? { ...deletion, error: logErrors.join('; ') } : deletion; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - logErrors.push(message); - const failureLogError = await tryAppendLifecycleLog( - simulator.logPath, - `Delete failed for simulatorId: ${simulator.simulatorId}\nError: ${message}`, - logWriter, - ); - if (failureLogError) logErrors.push(failureLogError); - return { attempted: true, succeeded: false, exitCode: null, error: logErrors.join('; ') }; - } -} diff --git a/src/benchmarks/claude-ui/transcript.ts b/src/benchmarks/claude-ui/transcript.ts index 5a3543ae..0ac9ad00 100644 --- a/src/benchmarks/claude-ui/transcript.ts +++ b/src/benchmarks/claude-ui/transcript.ts @@ -2,10 +2,13 @@ import type { PatternFailureRecord, ToolCallRecord, ToolFailureRecord, + ToolAnalysisConfig, + ToolMatcher, TranscriptAudit, + FailurePatternTarget, } from './types.ts'; -const UI_AUTOMATION_TOOLS = new Set([ +const DEFAULT_UI_AUTOMATION_TOOLS = [ 'batch', 'button', 'drag', @@ -20,11 +23,22 @@ const UI_AUTOMATION_TOOLS = new Set([ 'touch', 'type_text', 'wait_for_ui', -]); +]; interface AnalyzeOptions { - mcpToolPrefix: string; + mcpToolPrefix?: string; + toolAnalysis?: ToolAnalysisConfig; failurePatterns?: string[]; + failurePatternTargets?: FailurePatternTarget[]; + ignoredFailurePatterns?: string[]; +} + +interface ToolClassification { + shortName: string; + isMcp: boolean; + isUiAutomation: boolean; + offset: number; + matchLength: number; } function isRecord(value: unknown): value is Record { @@ -40,6 +54,19 @@ function shortToolName(fullName: string): string { return parts[parts.length - 1] ?? fullName; } +function defaultToolAnalysisConfig(mcpToolPrefix: string): ToolAnalysisConfig { + return { + matchers: [ + { + kind: 'namePrefix', + prefix: mcpToolPrefix, + shortName: 'afterLastDoubleUnderscore', + uiAutomationNames: DEFAULT_UI_AUTOMATION_TOOLS, + }, + ], + }; +} + function incrementCount(counts: Record, name: string): void { counts[name] = (counts[name] ?? 0) + 1; } @@ -114,16 +141,149 @@ function createPatternMatchers( return (patterns ?? []).map((pattern) => ({ pattern, regex: new RegExp(pattern, 'i') })); } +function matchesAnyPattern( + text: string, + matchers: Array<{ pattern: string; regex: RegExp }>, +): boolean { + return matchers.some((matcher) => matcher.regex.test(text)); +} + +function appendPatternFailures(opts: { + text: string; + line: number; + excerpt: string; + patternMatchers: Array<{ pattern: string; regex: RegExp }>; + ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>; + patternFailures: PatternFailureRecord[]; +}): void { + if (matchesAnyPattern(opts.text, opts.ignoredFailureMatchers)) return; + for (const matcher of opts.patternMatchers) { + if (matcher.regex.test(opts.text)) { + opts.patternFailures.push({ + pattern: matcher.pattern, + line: opts.line, + excerpt: opts.excerpt, + }); + } + } +} + +function commandFromInput(input: unknown): string | undefined { + if (!isRecord(input)) return undefined; + return asString(input.command); +} + +function classifyNamePrefixTool( + fullName: string, + matcher: Extract, +): ToolClassification | undefined { + if (!fullName.startsWith(matcher.prefix)) return undefined; + + let shortName: string; + switch (matcher.shortName) { + case 'afterPrefix': + shortName = fullName.slice(matcher.prefix.length); + break; + case 'full': + shortName = fullName; + break; + default: + shortName = shortToolName(fullName); + break; + } + + return { + shortName, + isMcp: matcher.prefix.includes('__'), + isUiAutomation: (matcher.uiAutomationNames ?? []).includes(shortName), + offset: 0, + matchLength: matcher.prefix.length, + }; +} + +function commandPrefixMatchesAt(command: string, prefix: string, index: number): boolean { + if (!command.startsWith(prefix, index)) return false; + + const before = command.slice(0, index).trimEnd(); + if (before.length > 0 && !/[;&|]$/.test(before)) return false; + + const next = command[index + prefix.length]; + return next === undefined || /\s/.test(next); +} + +function commandPrefixOffsets(command: string, prefix: string): number[] { + const offsets: number[] = []; + let start = 0; + while (start < command.length) { + const index = command.indexOf(prefix, start); + if (index === -1) break; + if (commandPrefixMatchesAt(command, prefix, index)) offsets.push(index); + start = index + prefix.length; + } + return offsets; +} + +function classifyBashCommandTool( + fullName: string, + input: unknown, + matcher: Extract, +): ToolClassification[] { + if (fullName !== 'Bash') return []; + const command = commandFromInput(input); + if (!command) return []; + return commandPrefixOffsets(command, matcher.commandPrefix).map((offset) => ({ + shortName: matcher.shortName, + isMcp: false, + isUiAutomation: matcher.uiAutomation === true, + offset, + matchLength: matcher.commandPrefix.length, + })); +} + +function classifyToolUse( + fullName: string, + input: unknown, + toolAnalysis: ToolAnalysisConfig, +): ToolClassification[] { + const classifications: ToolClassification[] = []; + for (const matcher of toolAnalysis.matchers) { + if (matcher.kind === 'namePrefix') { + const classification = classifyNamePrefixTool(fullName, matcher); + if (classification) classifications.push(classification); + } + if (matcher.kind === 'bashCommand') { + classifications.push(...classifyBashCommandTool(fullName, input, matcher)); + } + } + const mostSpecificByOffset = new Map(); + for (const classification of classifications) { + const existing = mostSpecificByOffset.get(classification.offset); + if (!existing || classification.matchLength > existing.matchLength) { + mostSpecificByOffset.set(classification.offset, classification); + } + } + return [...mostSpecificByOffset.values()].sort((left, right) => left.offset - right.offset); +} + export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): TranscriptAudit { - const toolNameById = new Map(); + const trackedToolsById = new Map>(); const parseErrors: string[] = []; const failures: ToolFailureRecord[] = []; const patternFailures: PatternFailureRecord[] = []; + const trackedSequence: ToolCallRecord[] = []; const mcpSequence: ToolCallRecord[] = []; const totalToolCallsByName: Record = {}; + const trackedToolCallsByName: Record = {}; const mcpToolCallsByName: Record = {}; const uiAutomationCallsByName: Record = {}; const patternMatchers = createPatternMatchers(options.failurePatterns); + const ignoredFailureMatchers = createPatternMatchers(options.ignoredFailurePatterns); + const failurePatternTargets = new Set( + options.failurePatternTargets ?? ['commands', 'toolResults'], + ); + const toolAnalysis = + options.toolAnalysis ?? + defaultToolAnalysisConfig(options.mcpToolPrefix ?? 'mcp__xcodebuildmcp-dev__'); let records = 0; let finalText: string | undefined; let resultSummary: Record | undefined; @@ -152,12 +312,6 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans const entryType = asString(entry.type); const lineText = rawLine.length > 600 ? `${rawLine.slice(0, 600)}…` : rawLine; - for (const matcher of patternMatchers) { - if (matcher.regex.test(rawLine)) { - patternFailures.push({ pattern: matcher.pattern, line, excerpt: lineText }); - } - } - if (entryType === 'result') { resultSummary = entry; finalText = asString(entry.result) ?? finalText; @@ -180,30 +334,50 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans const id = asString(block.id); if (!fullName || !id) continue; - toolNameById.set(id, fullName); + const command = commandFromInput(block.input); + if (command && failurePatternTargets.has('commands')) { + appendPatternFailures({ + text: command, + line, + excerpt: command.length > 600 ? `${command.slice(0, 600)}…` : command, + patternMatchers, + ignoredFailureMatchers, + patternFailures, + }); + } + incrementCount(totalToolCallsByName, fullName); - const shortName = shortToolName(fullName); - const isMcp = fullName.startsWith(options.mcpToolPrefix); - const isUiAutomation = isMcp && UI_AUTOMATION_TOOLS.has(shortName); + const classifications = classifyToolUse(fullName, block.input, toolAnalysis); + if (classifications.length === 0) continue; - if (isMcp) { - incrementCount(mcpToolCallsByName, shortName); + trackedToolsById.set( + id, + classifications.map((classification) => ({ ...classification, fullName })), + ); + for (const classification of classifications) { + incrementCount(trackedToolCallsByName, classification.shortName); const record: ToolCallRecord = { id, fullName, - shortName, + shortName: classification.shortName, input: block.input, line, timestamp, - isMcp, - isUiAutomation, + isTracked: true, + isMcp: classification.isMcp, + isUiAutomation: classification.isUiAutomation, }; - mcpSequence.push(record); - } + trackedSequence.push(record); + + if (classification.isMcp) { + incrementCount(mcpToolCallsByName, classification.shortName); + mcpSequence.push(record); + } - if (isUiAutomation) { - incrementCount(uiAutomationCallsByName, shortName); + if (classification.isUiAutomation) { + incrementCount(uiAutomationCallsByName, classification.shortName); + } } } continue; @@ -213,19 +387,34 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans for (const block of extractContentBlocks(entry)) { if (!isRecord(block) || block.type !== 'tool_result') continue; const id = asString(block.tool_use_id); - const fullName = id ? toolNameById.get(id) : undefined; - if (!fullName?.startsWith(options.mcpToolPrefix)) continue; + const trackedTools = id ? trackedToolsById.get(id) : undefined; + if (!trackedTools || trackedTools.length === 0) continue; const structured = extractStructuredResult(block, entry); + const message = stringifyContent(block.content); + if (failurePatternTargets.has('toolResults')) { + appendPatternFailures({ + text: message, + line, + excerpt: message.length > 600 ? `${message.slice(0, 600)}…` : message, + patternMatchers, + ignoredFailureMatchers, + patternFailures, + }); + } if (!resultDidError(block, structured)) continue; - failures.push({ - id, - fullName, - shortName: shortToolName(fullName), - line, - message: stringifyContent(block.content), - }); + if (matchesAnyPattern(message, ignoredFailureMatchers)) continue; + + for (const trackedTool of trackedTools) { + failures.push({ + id, + fullName: trackedTool.fullName, + shortName: trackedTool.shortName, + line, + message, + }); + } } } } @@ -235,6 +424,8 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans parseErrors, totalToolCalls: Object.values(totalToolCallsByName).reduce((sum, count) => sum + count, 0), totalToolCallsByName, + trackedToolCalls: trackedSequence.length, + trackedToolCallsByName, mcpToolCalls: mcpSequence.length, mcpToolCallsByName, uiAutomationCalls: Object.values(uiAutomationCallsByName).reduce( @@ -242,6 +433,7 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans 0, ), uiAutomationCallsByName, + trackedSequence, mcpSequence, failures, patternFailures, diff --git a/src/benchmarks/claude-ui/types.ts b/src/benchmarks/claude-ui/types.ts index fb073e03..58b5da00 100644 --- a/src/benchmarks/claude-ui/types.ts +++ b/src/benchmarks/claude-ui/types.ts @@ -1,32 +1,57 @@ import type { SessionDefaults } from '../../utils/session-store.ts'; -export interface AllowedVariance { - totalToolCalls: number; - mcpToolCalls: number; - uiAutomationCalls: number; - wallClockSeconds: number; - toolCalls: number; -} - export interface BenchmarkBaseline { totalToolCalls?: number; + trackedToolCalls?: number; mcpToolCalls?: number; uiAutomationCalls?: number; wallClockSeconds?: number; tools?: Record; } -export type SequenceMode = 'warn' | 'fail'; - -export interface SequenceConfig { - mode?: SequenceMode; -} +export type FailurePatternTarget = 'commands' | 'toolResults'; export interface FirstRunPromptDismissals { labels: string[]; timeoutSeconds?: number; } +export interface ClaudeInvocationConfig { + useMcpServer?: boolean; + permissionMode?: 'default' | 'bypassPermissions'; + tools?: string[]; + allowedTools?: string[]; + appendSystemPrompt?: string; + extraArgs?: string[]; + pluginDirs?: string[]; + skillDirs?: string[]; + activateSkill?: string; + isolatedWorkingDirectory?: boolean; + maxClaudeSeconds?: number; +} + +export type ToolMatcherShortName = 'afterLastDoubleUnderscore' | 'afterPrefix' | 'full'; + +export interface NamePrefixToolMatcher { + kind: 'namePrefix'; + prefix: string; + shortName?: ToolMatcherShortName; + uiAutomationNames?: string[]; +} + +export interface BashCommandToolMatcher { + kind: 'bashCommand'; + commandPrefix: string; + shortName: string; + uiAutomation?: boolean; +} + +export type ToolMatcher = NamePrefixToolMatcher | BashCommandToolMatcher; + +export interface ToolAnalysisConfig { + matchers: ToolMatcher[]; +} + export interface BenchmarkConfig { name: string; prompt: string; @@ -34,11 +59,14 @@ export interface BenchmarkConfig { sessionDefaults?: SessionDefaults; temporarySimulator?: boolean; firstRunPromptDismissals?: FirstRunPromptDismissals; + preflightCommands?: string[]; baseline?: BenchmarkBaseline; - expectedToolSequence?: string[]; - sequence?: SequenceConfig; - allowedVariance?: Partial; + baselineToolSequence?: string[]; failurePatterns?: string[]; + failurePatternTargets?: FailurePatternTarget[]; + ignoredFailurePatterns?: string[]; + claude?: ClaudeInvocationConfig; + toolAnalysis?: ToolAnalysisConfig; } export interface ToolCallRecord { @@ -48,6 +76,7 @@ export interface ToolCallRecord { input: unknown; line: number; timestamp?: string; + isTracked: boolean; isMcp: boolean; isUiAutomation: boolean; } @@ -71,10 +100,13 @@ export interface TranscriptAudit { parseErrors: string[]; totalToolCalls: number; totalToolCallsByName: Record; + trackedToolCalls: number; + trackedToolCallsByName: Record; mcpToolCalls: number; mcpToolCallsByName: Record; uiAutomationCalls: number; uiAutomationCallsByName: Record; + trackedSequence: ToolCallRecord[]; mcpSequence: ToolCallRecord[]; failures: ToolFailureRecord[]; patternFailures: PatternFailureRecord[]; @@ -85,9 +117,7 @@ export interface TranscriptAudit { export interface MetricResult { name: string; actual: number; - expected: number; - allowedVariance: number; - pass: boolean; + baseline: number; } export type SequenceDiffLineKind = 'context' | 'missing' | 'additional'; @@ -95,7 +125,7 @@ export type SequenceDiffLineKind = 'context' | 'missing' | 'additional'; export interface SequenceDiffLine { kind: SequenceDiffLineKind; tool: string; - expectedIndex?: number; + baselineIndex?: number; actualIndex?: number; } @@ -140,17 +170,15 @@ export interface BenchmarkRunMetadata { export interface BenchmarkResult { name: string; - pass: boolean; + completed: boolean; metrics: MetricResult[]; - failureMetric: { - pass: boolean; - count: number; + completion: { + completed: boolean; + issueCount: number; }; sequence: { - mode: SequenceMode; - pass: boolean; matched: boolean; - expected: string[]; + baseline: string[]; actual: string[]; diff: SequenceDiffHunk[]; missing: string[]; @@ -159,11 +187,3 @@ export interface BenchmarkResult { audit: TranscriptAudit; run: BenchmarkRunMetadata; } - -export const DEFAULT_ALLOWED_VARIANCE: AllowedVariance = { - totalToolCalls: 0, - mcpToolCalls: 0, - uiAutomationCalls: 0, - wallClockSeconds: 30, - toolCalls: 0, -}; From 3ab280ad53b6ad6ee484d1b5cf0609a53027e63a Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 03:02:39 +0100 Subject: [PATCH 2/9] fix(benchmarks): Enforce Claude UI failure conditions Treat configured failure pattern matches as incomplete benchmark runs so CI exits non-zero for explicitly declared failure conditions. Reject activateSkill configs without skillDirs during suite parsing to avoid late failures after expensive setup. Co-Authored-By: OpenAI Codex --- .../__tests__/claude-ui-benchmark.test.ts | 62 +++++++++++++++++++ src/benchmarks/claude-ui/compare.ts | 1 + src/benchmarks/claude-ui/config.ts | 10 ++- 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts index 2a0244d1..8c4ccca4 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts @@ -231,6 +231,52 @@ describe('Claude UI benchmark analysis', () => { ); expect(result.completion.issueCount).toBe(1); + expect(result.completion.completed).toBe(false); + expect(result.completed).toBe(false); + }); + + it('marks the benchmark incomplete when configured failure patterns match', () => { + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { type: 'tool_use', id: 'tool-1', name: `${toolPrefix}wait_for_ui`, input: {} }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [ + { + type: 'tool_result', + tool_use_id: 'tool-1', + is_error: false, + content: 'BUILD FAILED', + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { + mcpToolPrefix: toolPrefix, + failurePatterns: ['BUILD FAILED'], + }); + + expect(audit.failures).toEqual([]); + expect(audit.patternFailures).toHaveLength(1); + + const result = compareBenchmark( + { name: 'weather', prompt: 'prompt.md' }, + audit, + runMetadata(10), + ); + + expect(result.completion.issueCount).toBe(1); + expect(result.completion.completed).toBe(false); + expect(result.completed).toBe(false); }); it('counts parser failures once when malformed JSONL also records parse errors', () => { @@ -284,6 +330,22 @@ describe('Claude UI benchmark analysis', () => { ).toThrow('weather.yml.failurePatterns[1]: invalid regular expression'); }); + it('rejects activateSkill without skillDirs when loading config', () => { + expect(() => + readConfig( + { + name: 'weather', + prompt: 'prompt.md', + claude: { + activateSkill: 'vendor-cli', + isolatedWorkingDirectory: true, + }, + }, + 'weather.yml', + ), + ).toThrow('weather.yml.claude.activateSkill: requires skillDirs'); + }); + it('rejects invalid session defaults when loading config', () => { expect(() => readConfig( diff --git a/src/benchmarks/claude-ui/compare.ts b/src/benchmarks/claude-ui/compare.ts index de406bbe..267d6bcb 100644 --- a/src/benchmarks/claude-ui/compare.ts +++ b/src/benchmarks/claude-ui/compare.ts @@ -134,6 +134,7 @@ function processCompleted(run: BenchmarkRunMetadata, audit: TranscriptAudit): bo if (audit.parseErrors.length > 0) return false; if (run.claudeExitCode !== 0) return false; if (run.parserExitCode !== 0) return false; + if (audit.patternFailures.length > 0) return false; return !audit.failures.some(isTerminalClaudeFailure); } diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts index 8fc3f228..714064b4 100644 --- a/src/benchmarks/claude-ui/config.ts +++ b/src/benchmarks/claude-ui/config.ts @@ -115,6 +115,12 @@ function readClaudeInvocationConfig( ) { throw new Error(`${source}.permissionMode: expected 'default' or 'bypassPermissions'`); } + const skillDirs = readOptionalStringArray(raw, 'skillDirs', source); + const activateSkill = readOptionalString(raw, 'activateSkill', source); + if (activateSkill !== undefined && (!skillDirs || skillDirs.length === 0)) { + throw new Error(`${source}.activateSkill: requires skillDirs`); + } + return { useMcpServer: readOptionalBoolean(raw, 'useMcpServer', source), permissionMode, @@ -123,8 +129,8 @@ function readClaudeInvocationConfig( appendSystemPrompt: readOptionalString(raw, 'appendSystemPrompt', source), extraArgs: readOptionalStringArray(raw, 'extraArgs', source), pluginDirs: readOptionalStringArray(raw, 'pluginDirs', source), - skillDirs: readOptionalStringArray(raw, 'skillDirs', source), - activateSkill: readOptionalString(raw, 'activateSkill', source), + skillDirs, + activateSkill, isolatedWorkingDirectory: readOptionalBoolean(raw, 'isolatedWorkingDirectory', source), maxClaudeSeconds: readOptionalNumber(raw, 'maxClaudeSeconds', source), }; From b8e93535cf487cd923c70bf59755bf8b1dccfa3e Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 03:10:18 +0100 Subject: [PATCH 3/9] fix(benchmarks): Reject removed sequence config key Reject the old sequence suite config key with an explicit migration message so migrated benchmarks do not silently drop sequence checks. Co-Authored-By: OpenAI Codex --- .../claude-ui/__tests__/claude-ui-benchmark.test.ts | 13 +++++++++++++ src/benchmarks/claude-ui/config.ts | 1 + 2 files changed, 14 insertions(+) diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts index 8c4ccca4..5535556e 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts @@ -315,6 +315,19 @@ describe('Claude UI benchmark analysis', () => { 'weather.yml', ), ).toThrow('weather.yml.allowedVariance: removed; baselines are observed data only'); + + expect(() => + readConfig( + { + name: 'weather', + prompt: 'prompt.md', + sequence: { mode: 'fail' }, + }, + 'weather.yml', + ), + ).toThrow( + 'weather.yml.sequence: removed; use baselineToolSequence for observed sequence reporting', + ); }); it('rejects malformed failure pattern regexes when loading config', () => { diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts index 714064b4..707f9e87 100644 --- a/src/benchmarks/claude-ui/config.ts +++ b/src/benchmarks/claude-ui/config.ts @@ -247,6 +247,7 @@ function rejectRemovedConfigKeys(raw: Record, source: string): allowedVariance: 'removed; baselines are observed data only', expectedFailures: 'removed; benchmark stumbles are observed data', expectedToolSequence: 'renamed to baselineToolSequence', + sequence: 'removed; use baselineToolSequence for observed sequence reporting', }; for (const [key, message] of Object.entries(removedKeys)) { if (raw[key] !== undefined) throw new Error(`${source}.${key}: ${message}`); From 3dccc3b64dac2d5036ecb65b2cebe6f27df1450c Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 03:59:50 +0100 Subject: [PATCH 4/9] fix(benchmarks): Harden Claude UI review feedback Validate activated benchmark skills before setup, preserve authoritative Claude stream results only for harness-terminated runs, and make transcript failure accounting robust for missing or duplicate Bash tool results. Co-Authored-By: OpenAI Codex --- .../__tests__/claude-ui-benchmark.test.ts | 36 +++ .../__tests__/claude-ui-tool-config.test.ts | 240 ++++++++++++------ src/benchmarks/claude-ui/config.ts | 18 ++ src/benchmarks/claude-ui/harness.ts | 21 +- .../claude-ui/simulator-deletion.ts | 20 +- .../claude-ui/simulator-lifecycle.ts | 4 +- src/benchmarks/claude-ui/transcript.ts | 8 +- 7 files changed, 247 insertions(+), 100 deletions(-) diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts index 5535556e..d0e1bb88 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts @@ -359,6 +359,42 @@ describe('Claude UI benchmark analysis', () => { ).toThrow('weather.yml.claude.activateSkill: requires skillDirs'); }); + it('rejects activateSkill that does not match skillDirs when loading config', () => { + expect(() => + readConfig( + { + name: 'weather', + prompt: 'prompt.md', + claude: { + skillDirs: ['benchmarks/claude-ui/local/skills/vendor-cli'], + activateSkill: 'other-skill', + isolatedWorkingDirectory: true, + }, + }, + 'weather.yml', + ), + ).toThrow('weather.yml.claude.activateSkill: must match a basename from skillDirs'); + }); + + it('rejects duplicate skillDir basenames when loading config', () => { + expect(() => + readConfig( + { + name: 'weather', + prompt: 'prompt.md', + claude: { + skillDirs: [ + 'benchmarks/claude-ui/local/skills/vendor-cli', + 'benchmarks/claude-ui/fixtures/skills/vendor-cli', + ], + isolatedWorkingDirectory: true, + }, + }, + 'weather.yml', + ), + ).toThrow("weather.yml.claude.skillDirs: duplicate basename 'vendor-cli'"); + }); + it('rejects invalid session defaults when loading config', () => { expect(() => readConfig( diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts index 605ce457..c57c4ea3 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts @@ -1,16 +1,10 @@ -import { spawn } from 'node:child_process'; -import { mkdtemp, readdir, readFile, rm, writeFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; import path from 'node:path'; -import { fileURLToPath } from 'node:url'; import { buildClaudeArgs } from '../claude-invocation.ts'; import { compareBenchmark } from '../compare.ts'; import { readConfig } from '../config.ts'; import { analyzeClaudeJsonl } from '../transcript.ts'; import type { BenchmarkArtifacts, BenchmarkRunMetadata } from '../types.ts'; -const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../../../..'); - function line(value: unknown): string { return JSON.stringify(value); } @@ -42,28 +36,6 @@ function runMetadata(wallClockSeconds: number): BenchmarkRunMetadata { }; } -function runParserScript(args: string[]): Promise<{ - exitCode: number | null; - stdout: string; - stderr: string; -}> { - return new Promise((resolve, reject) => { - const child = spawn('python3', args, { stdio: ['ignore', 'pipe', 'pipe'] }); - const stdout: Buffer[] = []; - const stderr: Buffer[] = []; - child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk)); - child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk)); - child.on('error', reject); - child.on('close', (exitCode) => { - resolve({ - exitCode, - stdout: Buffer.concat(stdout).toString('utf8'), - stderr: Buffer.concat(stderr).toString('utf8'), - }); - }); - }); -} - describe('Claude UI benchmark tool configuration', () => { it('loads Claude invocation and tool analysis from suite config', () => { const config = readConfig( @@ -612,6 +584,120 @@ describe('Claude UI benchmark tool configuration', () => { }); }); + it('handles tracked tool results without content', () => { + const config = readConfig( + { + name: 'private CLI weather', + prompt: 'weather.md', + failurePatterns: ['WAIT_TIMEOUT'], + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'privatecli', + shortName: 'privatecli.other', + }, + ], + }, + }, + 'private-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'privatecli --version' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [{ type: 'tool_result', tool_use_id: 'tool-1', is_error: true }], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { + toolAnalysis: config.toolAnalysis, + failurePatterns: config.failurePatterns, + }); + + expect(audit.failures).toEqual([ + { + id: 'tool-1', + fullName: 'Bash', + shortName: 'privatecli.other', + line: 2, + message: '', + }, + ]); + expect(audit.patternFailures).toEqual([]); + }); + + it('counts repeated matches in one Bash failure result once', () => { + const config = readConfig( + { + name: 'private CLI weather', + prompt: 'weather.md', + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'privatecli', + shortName: 'privatecli.other', + }, + ], + }, + }, + 'private-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'privatecli one && privatecli two' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [ + { + type: 'tool_result', + tool_use_id: 'tool-1', + is_error: true, + content: 'Exit code 1', + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis }); + const result = compareBenchmark(config, audit, runMetadata(600)); + + expect(audit.trackedSequence.map((call) => call.shortName)).toEqual([ + 'privatecli.other', + 'privatecli.other', + ]); + expect(audit.failures).toHaveLength(1); + expect(result.completion.issueCount).toBe(1); + }); + it('marks the benchmark incomplete when Claude exits non-zero', () => { const config = readConfig( { @@ -633,53 +719,55 @@ describe('Claude UI benchmark tool configuration', () => { }); }); - it('lets the parser include configured non-MCP tool names', async () => { - const dir = await mkdtemp(path.join(tmpdir(), 'claude-ui-parser-')); - try { - const jsonlPath = path.join(dir, 'claude.jsonl'); - const outputPath = path.join(dir, 'parsed'); - await writeFile( - jsonlPath, - [ - line({ - type: 'assistant', - message: { - content: [ - { - type: 'tool_use', - id: 'tool-1', - name: 'Bash', - input: { command: 'vendorcli ui screen --json' }, - }, - ], - }, - }), - line({ - type: 'user', - message: { content: [{ type: 'tool_result', tool_use_id: 'tool-1', content: 'ok' }] }, - }), - ].join('\n'), - 'utf8', - ); - - const result = await runParserScript([ - path.join(repoRoot, 'benchmarks/claude-ui/parse_claude_conversation.py'), - jsonlPath, - outputPath, - '--tool-prefix=mcp__xcodebuildmcp', - '--tool-name=Bash', - ]); - - expect(result.exitCode).toBe(0); - expect(await readdir(outputPath)).toEqual([ - '0001_tool_call_Bash.md', - '0002_tool_result_Bash.md', - ]); - expect(await readFile(path.join(outputPath, '0001_tool_call_Bash.md'), 'utf8')).toContain( - 'vendorcli ui screen --json', - ); - } finally { - await rm(dir, { recursive: true, force: true }); - } + it('keeps configured non-MCP tool names in transcript analysis', () => { + const config = readConfig( + { + name: 'vendor CLI weather', + prompt: 'weather.md', + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'vendorcli ui screen', + shortName: 'vendorcli.screen', + }, + ], + }, + }, + 'vendor-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'vendorcli ui screen --json' }, + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { toolAnalysis: config.toolAnalysis }); + + expect( + audit.trackedSequence.map((call) => ({ + fullName: call.fullName, + shortName: call.shortName, + isUiAutomation: call.isUiAutomation, + line: call.line, + })), + ).toEqual([ + { + fullName: 'Bash', + shortName: 'vendorcli.screen', + isUiAutomation: false, + line: 1, + }, + ]); }); }); diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts index 707f9e87..958d44ce 100644 --- a/src/benchmarks/claude-ui/config.ts +++ b/src/benchmarks/claude-ui/config.ts @@ -1,4 +1,5 @@ import { readFile } from 'node:fs/promises'; +import path from 'node:path'; import { parse as parseYaml } from 'yaml'; import * as z from 'zod'; import { sessionDefaultsSchema } from '../../utils/session-defaults-schema.ts'; @@ -116,10 +117,27 @@ function readClaudeInvocationConfig( throw new Error(`${source}.permissionMode: expected 'default' or 'bypassPermissions'`); } const skillDirs = readOptionalStringArray(raw, 'skillDirs', source); + if (skillDirs !== undefined) { + const basenames = new Set(); + for (const skillDir of skillDirs) { + const basename = path.basename(skillDir); + if (basenames.has(basename)) { + throw new Error(`${source}.skillDirs: duplicate basename '${basename}'`); + } + basenames.add(basename); + } + } const activateSkill = readOptionalString(raw, 'activateSkill', source); if (activateSkill !== undefined && (!skillDirs || skillDirs.length === 0)) { throw new Error(`${source}.activateSkill: requires skillDirs`); } + if ( + activateSkill !== undefined && + skillDirs !== undefined && + !skillDirs.some((skillDir) => path.basename(skillDir) === activateSkill) + ) { + throw new Error(`${source}.activateSkill: must match a basename from skillDirs`); + } return { useMcpServer: readOptionalBoolean(raw, 'useMcpServer', source), diff --git a/src/benchmarks/claude-ui/harness.ts b/src/benchmarks/claude-ui/harness.ts index 0100928a..edac196a 100644 --- a/src/benchmarks/claude-ui/harness.ts +++ b/src/benchmarks/claude-ui/harness.ts @@ -218,6 +218,7 @@ function runCommand(opts: { const started = process.hrtime.bigint(); let stdoutBuffer = ''; let terminalResultExitCode: number | undefined; + let terminalResultRequestedTermination = false; let terminalResultTimer: NodeJS.Timeout | undefined; let timeoutTimer: NodeJS.Timeout | undefined; let hardKillTimer: NodeJS.Timeout | undefined; @@ -272,13 +273,20 @@ function runCommand(opts: { if (terminalResultExitCode !== undefined || opts.terminalJsonResultGraceMs === undefined) return; terminalResultExitCode = result.is_error === true ? 1 : 0; - terminalResultTimer = setTimeout(terminateChild, opts.terminalJsonResultGraceMs); + terminalResultTimer = setTimeout(() => { + terminalResultRequestedTermination = true; + terminateChild(); + }, opts.terminalJsonResultGraceMs); terminalResultTimer.unref(); }; if (opts.timeoutMs !== undefined) { timeoutTimer = setTimeout(() => { - timedOut = true; + if (terminalResultExitCode === undefined) { + timedOut = true; + } else { + terminalResultRequestedTermination = true; + } terminateChild(); }, opts.timeoutMs); timeoutTimer.unref(); @@ -325,12 +333,19 @@ function runCommand(opts: { clearTimeoutTimer(); clearHardKillTimer(); const durationSeconds = Number(process.hrtime.bigint() - started) / 1_000_000_000; + const resolvedExitCode = + terminalResultExitCode !== undefined && + (terminalResultRequestedTermination || exitCode === 0 || exitCode === null) + ? terminalResultExitCode + : timedOut + ? 143 + : (exitCode ?? null); stdout.end(); stderr.end(); Promise.all([finished(stdout), finished(stderr)]) .then(() => resolve({ - exitCode: timedOut ? 143 : (exitCode ?? terminalResultExitCode ?? null), + exitCode: resolvedExitCode, durationSeconds, }), ) diff --git a/src/benchmarks/claude-ui/simulator-deletion.ts b/src/benchmarks/claude-ui/simulator-deletion.ts index 8b7f259d..48868be1 100644 --- a/src/benchmarks/claude-ui/simulator-deletion.ts +++ b/src/benchmarks/claude-ui/simulator-deletion.ts @@ -1,15 +1,12 @@ -import { appendFile } from 'node:fs/promises'; import { + defaultLifecycleLogWriter, runLoggedCommand, + tryAppendLifecycleLog, type CreatedTemporarySimulator, type LifecycleCommandExecutor, type LifecycleLogWriter, } from './simulator-lifecycle.ts'; -const defaultLifecycleLogWriter: LifecycleLogWriter = async (logPath, message) => { - await appendFile(logPath, `${message}\n`, 'utf8'); -}; - export interface DeleteTemporarySimulatorResult { attempted: boolean; succeeded: boolean; @@ -17,19 +14,6 @@ export interface DeleteTemporarySimulatorResult { error?: string; } -async function tryAppendLifecycleLog( - logPath: string, - message: string, - logWriter: LifecycleLogWriter = defaultLifecycleLogWriter, -): Promise { - try { - await logWriter(logPath, message); - return undefined; - } catch (error) { - return error instanceof Error ? error.message : String(error); - } -} - export async function deleteTemporarySimulator( simulator: CreatedTemporarySimulator, opts: { diff --git a/src/benchmarks/claude-ui/simulator-lifecycle.ts b/src/benchmarks/claude-ui/simulator-lifecycle.ts index a3d685bb..cdaa5cad 100644 --- a/src/benchmarks/claude-ui/simulator-lifecycle.ts +++ b/src/benchmarks/claude-ui/simulator-lifecycle.ts @@ -25,7 +25,7 @@ export type LifecycleCommandExecutor = ( export type LifecycleLogWriter = (logPath: string, message: string) => Promise; -const defaultLifecycleLogWriter: LifecycleLogWriter = async (logPath, message) => { +export const defaultLifecycleLogWriter: LifecycleLogWriter = async (logPath, message) => { await appendFile(logPath, `${message}\n`, 'utf8'); }; @@ -112,7 +112,7 @@ async function appendLifecycleLog( await logWriter(logPath, message); } -async function tryAppendLifecycleLog( +export async function tryAppendLifecycleLog( logPath: string, message: string, logWriter: LifecycleLogWriter = defaultLifecycleLogWriter, diff --git a/src/benchmarks/claude-ui/transcript.ts b/src/benchmarks/claude-ui/transcript.ts index 0ac9ad00..1aad6fb2 100644 --- a/src/benchmarks/claude-ui/transcript.ts +++ b/src/benchmarks/claude-ui/transcript.ts @@ -85,7 +85,7 @@ function parseEmbeddedJson(value: unknown): unknown { function stringifyContent(value: unknown): string { if (typeof value === 'string') return value; - return JSON.stringify(value); + return JSON.stringify(value) ?? ''; } function extractContentBlocks(entry: Record): unknown[] { @@ -269,6 +269,7 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans const trackedToolsById = new Map>(); const parseErrors: string[] = []; const failures: ToolFailureRecord[] = []; + const failureKeys = new Set(); const patternFailures: PatternFailureRecord[] = []; const trackedSequence: ToolCallRecord[] = []; const mcpSequence: ToolCallRecord[] = []; @@ -407,6 +408,11 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans if (matchesAnyPattern(message, ignoredFailureMatchers)) continue; for (const trackedTool of trackedTools) { + const failureKey = [id, trackedTool.fullName, trackedTool.shortName, line, message].join( + '\0', + ); + if (failureKeys.has(failureKey)) continue; + failureKeys.add(failureKey); failures.push({ id, fullName: trackedTool.fullName, From cd53c71e7bb4ad51d8a440d569542369f56e372f Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 04:50:00 +0100 Subject: [PATCH 5/9] fix(benchmarks): Harden Claude UI process cleanup Detach timed Claude commands before process-group termination and fix RocketSim preflight launch detection for direct app/path commands. Co-Authored-By: OpenAI Codex --- .../claude-ui/__tests__/preflight-commands.test.ts | 14 ++++++++++++++ src/benchmarks/claude-ui/harness.ts | 2 +- src/benchmarks/claude-ui/preflight-commands.ts | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts b/src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts index 8236ff26..15a3c3e9 100644 --- a/src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts +++ b/src/benchmarks/claude-ui/__tests__/preflight-commands.test.ts @@ -22,6 +22,20 @@ describe('Claude UI benchmark preflight commands', () => { expect(preflightCommandsWithFocusResign({ commands })).toBe(commands); }); + it('detects simple and path-based RocketSim launch commands', () => { + expect( + preflightCommandsWithFocusResign({ + commands: ['open RocketSim', 'open /Applications/RocketSim.app'], + simulatorId: 'SIM-123', + }), + ).toEqual([ + 'open RocketSim', + "open -a Simulator --args -CurrentDeviceUDID 'SIM-123'", + 'open /Applications/RocketSim.app', + "open -a Simulator --args -CurrentDeviceUDID 'SIM-123'", + ]); + }); + it('shell-quotes simulator IDs used by the focus command', () => { expect( preflightCommandsWithFocusResign({ diff --git a/src/benchmarks/claude-ui/harness.ts b/src/benchmarks/claude-ui/harness.ts index edac196a..7bb9d99a 100644 --- a/src/benchmarks/claude-ui/harness.ts +++ b/src/benchmarks/claude-ui/harness.ts @@ -228,7 +228,7 @@ function runCommand(opts: { cwd: opts.cwd, env: opts.env ?? process.env, stdio: ['pipe', 'pipe', 'pipe'], - detached: opts.terminalJsonResultGraceMs !== undefined, + detached: opts.terminalJsonResultGraceMs !== undefined || opts.timeoutMs !== undefined, }); const clearTerminalResultTimer = (): void => { diff --git a/src/benchmarks/claude-ui/preflight-commands.ts b/src/benchmarks/claude-ui/preflight-commands.ts index 06f067f8..13bfb713 100644 --- a/src/benchmarks/claude-ui/preflight-commands.ts +++ b/src/benchmarks/claude-ui/preflight-commands.ts @@ -17,7 +17,7 @@ function shellSingleQuote(value: string): string { } function isRocketSimAppLaunchCommand(command: string): boolean { - return /^\s*open\s+.*(?:^|\s)RocketSim(?:\.app)?\s*$/.test(command); + return /^\s*open\s+(?:.*(?:\s|\/))?RocketSim(?:\.app)?\s*$/.test(command); } export function preflightCommandsWithFocusResign(opts: { From efffe6914a54910dbac1637fa362812cd1121675 Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 19:42:51 +0100 Subject: [PATCH 6/9] fix(benchmarks): Handle Claude command stdin pipe errors Handle stdin stream errors from benchmark child processes so an early child exit does not crash the harness with an unhandled EPIPE. Co-Authored-By: Codex --- src/benchmarks/claude-ui/harness.ts | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/benchmarks/claude-ui/harness.ts b/src/benchmarks/claude-ui/harness.ts index 7bb9d99a..d51d772d 100644 --- a/src/benchmarks/claude-ui/harness.ts +++ b/src/benchmarks/claude-ui/harness.ts @@ -246,6 +246,17 @@ function runCommand(opts: { hardKillTimer = undefined; }; + const rejectCommand = (error: Error): void => { + if (settled) return; + settled = true; + clearTerminalResultTimer(); + clearTimeoutTimer(); + clearHardKillTimer(); + stdout.destroy(); + stderr.destroy(); + reject(error); + }; + const killChild = (signal: NodeJS.Signals): void => { if (child.exitCode !== null || child.killed || child.pid === undefined) return; try { @@ -317,14 +328,11 @@ function runCommand(opts: { stderr.write(chunk); }); child.on('error', (error) => { - if (settled) return; - settled = true; - clearTerminalResultTimer(); - clearTimeoutTimer(); - clearHardKillTimer(); - stdout.destroy(); - stderr.destroy(); - reject(error); + rejectCommand(error); + }); + child.stdin.on('error', (error: NodeJS.ErrnoException) => { + if (error.code === 'EPIPE') return; + rejectCommand(error); }); child.on('close', (exitCode) => { if (settled) return; From 1a06b6da5d4c59b0549f34a6857fc5a81207af88 Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 20:24:08 +0100 Subject: [PATCH 7/9] fix(benchmarks): Isolate local suite discovery tests Allow Claude UI suite discovery helpers to receive suite directories so tests can exercise local suite lookup without writing fake files into the real repository tree. Co-Authored-By: Codex --- .../__tests__/claude-ui-benchmark.test.ts | 22 ++++++++++++------- src/benchmarks/claude-ui/harness.ts | 21 +++++++++++++----- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts index d0e1bb88..828c1250 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts @@ -89,16 +89,22 @@ describe('Claude UI benchmark harness', () => { }); it('discovers local private suites when present', async () => { - const localSuitesDir = path.join(repoRoot, 'benchmarks/claude-ui/local/suites'); - const suiteName = `unit-private-suite-${process.pid}`; - const suitePath = path.join(localSuitesDir, `${suiteName}.yml`); - await mkdir(localSuitesDir, { recursive: true }); - await writeFile(suitePath, `name: ${suiteName}\nprompt: ../prompts/weather.md\n`, 'utf8'); + const dir = await mkdtemp(path.join(tmpdir(), 'claude-ui-suites-')); try { - await expect(resolveSuitePath(suiteName)).resolves.toBe(suitePath); - await expect(listSuitePaths()).resolves.toContain(suitePath); + const suiteDirectories = { + suitesDir: path.join(dir, 'suites'), + localSuitesDir: path.join(dir, 'local-suites'), + }; + const suiteName = `unit-private-suite-${process.pid}`; + const suitePath = path.join(suiteDirectories.localSuitesDir, `${suiteName}.yml`); + await mkdir(suiteDirectories.suitesDir, { recursive: true }); + await mkdir(suiteDirectories.localSuitesDir, { recursive: true }); + await writeFile(suitePath, `name: ${suiteName}\nprompt: ../prompts/weather.md\n`, 'utf8'); + + await expect(resolveSuitePath(suiteName, suiteDirectories)).resolves.toBe(suitePath); + await expect(listSuitePaths(suiteDirectories)).resolves.toContain(suitePath); } finally { - await rm(suitePath, { force: true }); + await rm(dir, { recursive: true, force: true }); } }); }); diff --git a/src/benchmarks/claude-ui/harness.ts b/src/benchmarks/claude-ui/harness.ts index d51d772d..7042b29f 100644 --- a/src/benchmarks/claude-ui/harness.ts +++ b/src/benchmarks/claude-ui/harness.ts @@ -58,6 +58,10 @@ interface StreamJsonResult { type?: unknown; is_error?: unknown; } +interface SuiteDirectories { + suitesDir: string; + localSuitesDir: string; +} async function fileExists(filePath: string): Promise { try { await access(filePath); @@ -67,7 +71,10 @@ async function fileExists(filePath: string): Promise { } } -export async function resolveSuitePath(suite: string): Promise { +export async function resolveSuitePath( + suite: string, + directories: SuiteDirectories = { suitesDir, localSuitesDir }, +): Promise { if ( path.isAbsolute(suite) || suite.includes(path.sep) || @@ -78,8 +85,8 @@ export async function resolveSuitePath(suite: string): Promise { } const candidates = [ - path.join(suitesDir, `${suite}.yml`), - path.join(localSuitesDir, `${suite}.yml`), + path.join(directories.suitesDir, `${suite}.yml`), + path.join(directories.localSuitesDir, `${suite}.yml`), ]; const matches = []; for (const candidate of candidates) { @@ -110,10 +117,12 @@ async function listYamlFiles(directory: string, required: boolean): Promise { +export async function listSuitePaths( + directories: SuiteDirectories = { suitesDir, localSuitesDir }, +): Promise { return [ - ...(await listYamlFiles(suitesDir, true)), - ...(await listYamlFiles(localSuitesDir, false)), + ...(await listYamlFiles(directories.suitesDir, true)), + ...(await listYamlFiles(directories.localSuitesDir, false)), ]; } From 28893383c187fd086993af31a2c9c92d8fde5602 Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 20:46:51 +0100 Subject: [PATCH 8/9] fix(benchmarks): Terminate Claude command on stdin errors Stop the child process when benchmark command stdin fails with a non-EPIPE error, and ignore late stdout/stderr data after the command has settled. Co-Authored-By: Codex --- src/benchmarks/claude-ui/harness.ts | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/benchmarks/claude-ui/harness.ts b/src/benchmarks/claude-ui/harness.ts index 7042b29f..65d8e710 100644 --- a/src/benchmarks/claude-ui/harness.ts +++ b/src/benchmarks/claude-ui/harness.ts @@ -255,17 +255,6 @@ function runCommand(opts: { hardKillTimer = undefined; }; - const rejectCommand = (error: Error): void => { - if (settled) return; - settled = true; - clearTerminalResultTimer(); - clearTimeoutTimer(); - clearHardKillTimer(); - stdout.destroy(); - stderr.destroy(); - reject(error); - }; - const killChild = (signal: NodeJS.Signals): void => { if (child.exitCode !== null || child.killed || child.pid === undefined) return; try { @@ -289,6 +278,18 @@ function runCommand(opts: { hardKillTimer.unref(); }; + const rejectCommand = (error: Error): void => { + if (settled) return; + settled = true; + clearTerminalResultTimer(); + clearTimeoutTimer(); + clearHardKillTimer(); + terminateChild(); + stdout.destroy(); + stderr.destroy(); + reject(error); + }; + const recordTerminalResult = (result: StreamJsonResult): void => { if (terminalResultExitCode !== undefined || opts.terminalJsonResultGraceMs === undefined) return; @@ -330,10 +331,12 @@ function runCommand(opts: { }; child.stdout.on('data', (chunk: Buffer) => { + if (settled) return; stdout.write(chunk); scanStdoutForTerminalResult(chunk); }); child.stderr.on('data', (chunk: Buffer) => { + if (settled) return; stderr.write(chunk); }); child.on('error', (error) => { From d1625f502ea0f1adac8681c3c80b90ef1565fcb2 Mon Sep 17 00:00:00 2001 From: Cameron Cooke Date: Tue, 26 May 2026 20:59:41 +0100 Subject: [PATCH 9/9] fix(benchmarks): Harden Claude UI benchmark validation Tighten transcript failure suppression, validate Claude timeout config, and make aggregate artifact roots path-aware. Co-Authored-By: Codex --- .../__tests__/claude-ui-benchmark.test.ts | 45 +++++++++++++ .../__tests__/claude-ui-tool-config.test.ts | 67 +++++++++++++++++++ src/benchmarks/claude-ui/config.ts | 15 ++++- src/benchmarks/claude-ui/render.ts | 7 +- src/benchmarks/claude-ui/transcript.ts | 27 +++++++- 5 files changed, 157 insertions(+), 4 deletions(-) diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts index 828c1250..669839c6 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-benchmark.test.ts @@ -3,6 +3,7 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { compareBenchmark, diffToolSequence } from '../compare.ts'; +import { renderAggregate } from '../render.ts'; import { readConfig } from '../config.ts'; import { listSuitePaths, @@ -336,6 +337,21 @@ describe('Claude UI benchmark analysis', () => { ); }); + it('rejects invalid Claude timeout values when loading config', () => { + for (const maxClaudeSeconds of [0, -1, Number.NaN, Number.POSITIVE_INFINITY]) { + expect(() => + readConfig( + { + name: 'weather', + prompt: 'prompt.md', + claude: { maxClaudeSeconds }, + }, + 'weather.yml', + ), + ).toThrow('weather.yml.claude.maxClaudeSeconds: expected finite positive number'); + } + }); + it('rejects malformed failure pattern regexes when loading config', () => { expect(() => readConfig( @@ -602,6 +618,35 @@ describe('Claude UI benchmark analysis', () => { expect(result.completed).toBe(false); }); + it('renders path-aware aggregate artifact roots', () => { + const first = compareBenchmark( + { name: 'first', prompt: 'prompt.md' }, + analyzeClaudeJsonl('', { mcpToolPrefix: toolPrefix }), + { + ...runMetadata(10), + artifacts: { + ...runMetadata(10).artifacts, + runDirectory: '/tmp/run/first/20260101T000000Z', + }, + }, + ); + const second = compareBenchmark( + { name: 'second', prompt: 'prompt.md' }, + analyzeClaudeJsonl('', { mcpToolPrefix: toolPrefix }), + { + ...runMetadata(20), + artifacts: { + ...runMetadata(20).artifacts, + runDirectory: '/tmp/run-extra/second/20260101T000000Z', + }, + }, + ); + + expect(renderAggregate([first, second], { color: false, cwd: '/tmp' })).toContain( + 'Artifacts: /tmp/', + ); + }); + it('returns no sequence hunks when expected and actual match', () => { expect(diffToolSequence(['a', 'b'], ['a', 'b'])).toEqual([]); }); diff --git a/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts index c57c4ea3..fb43b88c 100644 --- a/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts +++ b/src/benchmarks/claude-ui/__tests__/claude-ui-tool-config.test.ts @@ -322,6 +322,73 @@ describe('Claude UI benchmark tool configuration', () => { ]); }); + it('reports real failures when ignored and reportable patterns share a result', () => { + const config = readConfig( + { + name: 'private CLI weather', + prompt: 'weather.md', + failurePatterns: ['WAIT_TIMEOUT'], + ignoredFailurePatterns: ['element_disabled'], + toolAnalysis: { + matchers: [ + { + kind: 'bashCommand', + commandPrefix: 'privatecli wait', + shortName: 'privatecli.wait', + uiAutomation: true, + }, + ], + }, + }, + 'private-cli.yml', + ); + const transcript = [ + line({ + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'tool-1', + name: 'Bash', + input: { command: 'privatecli wait element --label Weather --timeout 1' }, + }, + ], + }, + }), + line({ + type: 'user', + message: { + content: [ + { + type: 'tool_result', + tool_use_id: 'tool-1', + is_error: true, + content: 'Exit code 1\n{"error":{"code":"element_disabled"}}\nWAIT_TIMEOUT', + }, + ], + }, + }), + ].join('\n'); + + const audit = analyzeClaudeJsonl(transcript, { + toolAnalysis: config.toolAnalysis, + failurePatterns: config.failurePatterns, + ignoredFailurePatterns: config.ignoredFailurePatterns, + }); + const result = compareBenchmark(config, audit, runMetadata(10)); + + expect(audit.failures).toHaveLength(1); + expect(audit.patternFailures).toEqual([ + { + pattern: 'WAIT_TIMEOUT', + line: 2, + excerpt: 'Exit code 1\n{"error":{"code":"element_disabled"}}\nWAIT_TIMEOUT', + }, + ]); + expect(result.completed).toBe(false); + }); + it('ignores configured non-terminal tool failures', () => { const config = readConfig( { diff --git a/src/benchmarks/claude-ui/config.ts b/src/benchmarks/claude-ui/config.ts index 958d44ce..6bad073d 100644 --- a/src/benchmarks/claude-ui/config.ts +++ b/src/benchmarks/claude-ui/config.ts @@ -91,6 +91,19 @@ function readOptionalNumber( return raw; } +function readOptionalPositiveFiniteNumber( + value: Record, + key: string, + source: string, +): number | undefined { + const raw = readOptionalNumber(value, key, source); + if (raw === undefined) return undefined; + if (!Number.isFinite(raw) || raw <= 0) { + throw new Error(`${source}.${key}: expected finite positive number`); + } + return raw; +} + function readNumberMap(value: unknown, source: string): Record | undefined { if (value === undefined) return undefined; if (!isRecord(value)) throw new Error(`${source}: expected object`); @@ -150,7 +163,7 @@ function readClaudeInvocationConfig( skillDirs, activateSkill, isolatedWorkingDirectory: readOptionalBoolean(raw, 'isolatedWorkingDirectory', source), - maxClaudeSeconds: readOptionalNumber(raw, 'maxClaudeSeconds', source), + maxClaudeSeconds: readOptionalPositiveFiniteNumber(raw, 'maxClaudeSeconds', source), }; } diff --git a/src/benchmarks/claude-ui/render.ts b/src/benchmarks/claude-ui/render.ts index d02d9162..aa10f2f0 100644 --- a/src/benchmarks/claude-ui/render.ts +++ b/src/benchmarks/claude-ui/render.ts @@ -373,12 +373,17 @@ export function renderSuiteReport(result: BenchmarkResult, options?: RenderOptio return `${sections.join('\n')}\n`; } +function pathContainsOrEquals(root: string, target: string): boolean { + const relative = path.relative(root, target); + return relative === '' || (!relative.startsWith('..') && !path.isAbsolute(relative)); +} + function commonArtifactRoot(results: readonly BenchmarkResult[]): string | undefined { if (results.length === 0) return undefined; const dirs = results.map((r) => path.dirname(r.run.artifacts.runDirectory)); let root = dirs[0]!; for (const dir of dirs.slice(1)) { - while (!dir.startsWith(root)) { + while (!pathContainsOrEquals(root, dir)) { const next = path.dirname(root); if (next === root) return root; root = next; diff --git a/src/benchmarks/claude-ui/transcript.ts b/src/benchmarks/claude-ui/transcript.ts index 1aad6fb2..594d215c 100644 --- a/src/benchmarks/claude-ui/transcript.ts +++ b/src/benchmarks/claude-ui/transcript.ts @@ -148,6 +148,24 @@ function matchesAnyPattern( return matchers.some((matcher) => matcher.regex.test(text)); } +function patternMatcherIsIgnored( + matcher: { pattern: string }, + ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>, +): boolean { + return matchesAnyPattern(matcher.pattern, ignoredFailureMatchers); +} + +function hasReportablePatternMatch( + text: string, + patternMatchers: Array<{ pattern: string; regex: RegExp }>, + ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>, +): boolean { + return patternMatchers.some( + (matcher) => + matcher.regex.test(text) && !patternMatcherIsIgnored(matcher, ignoredFailureMatchers), + ); +} + function appendPatternFailures(opts: { text: string; line: number; @@ -156,8 +174,8 @@ function appendPatternFailures(opts: { ignoredFailureMatchers: Array<{ pattern: string; regex: RegExp }>; patternFailures: PatternFailureRecord[]; }): void { - if (matchesAnyPattern(opts.text, opts.ignoredFailureMatchers)) return; for (const matcher of opts.patternMatchers) { + if (patternMatcherIsIgnored(matcher, opts.ignoredFailureMatchers)) continue; if (matcher.regex.test(opts.text)) { opts.patternFailures.push({ pattern: matcher.pattern, @@ -405,7 +423,12 @@ export function analyzeClaudeJsonl(text: string, options: AnalyzeOptions): Trans } if (!resultDidError(block, structured)) continue; - if (matchesAnyPattern(message, ignoredFailureMatchers)) continue; + if ( + matchesAnyPattern(message, ignoredFailureMatchers) && + !hasReportablePatternMatch(message, patternMatchers, ignoredFailureMatchers) + ) { + continue; + } for (const trackedTool of trackedTools) { const failureKey = [id, trackedTool.fullName, trackedTool.shortName, line, message].join(