Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,4 @@ DerivedData
/repros
/.xcodebuildmcp
/out.nosync
/benchmarks/claude-ui/local/
162 changes: 111 additions & 51 deletions benchmarks/claude-ui/README.md

Large diffs are not rendered by default.

26 changes: 19 additions & 7 deletions benchmarks/claude-ui/parse_claude_conversation.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/usr/bin/env python3
"""Parse a Claude Code NDJSON session log into per-turn files.

Filters to: user prompts, assistant text replies, and XcodeBuildMCP tool
Filters to: user prompts, assistant text replies, and configured tool
calls/results. Strips screenshot image blobs to a short placeholder.

Usage:
parse_claude_conversation.py <session.jsonl> [output_dir] \\
[--tool-prefix=mcp__xcodebuildmcp]
[--tool-prefix=mcp__xcodebuildmcp] [--tool-name=Bash]
"""

from __future__ import annotations
Expand Down Expand Up @@ -101,10 +101,14 @@ def extract_user_text(entry: dict) -> str:
return "\n\n".join(parts)


def parse(path: Path, out_dir: Path, tool_prefix: str) -> bool:
def matches_tool_name(name: str, tool_prefixes: list[str], tool_names: set[str]) -> bool:
    """Return whether a tool call named *name* is selected by the configured filters.

    A name is selected when it is an exact member of *tool_names*, or when it
    starts with at least one entry of *tool_prefixes*.

    Args:
        name: Tool name taken from a ``tool_use`` block.
        tool_prefixes: Prefixes to match against the start of *name*.
        tool_names: Names that must match *name* exactly.

    Returns:
        True if *name* matches either filter, False otherwise (including when
        both filters are empty).
    """
    if name in tool_names:
        return True
    # str.startswith accepts a tuple of candidates, so every prefix is checked
    # in one call; an empty tuple never matches.
    return name.startswith(tuple(tool_prefixes))


def parse(path: Path, out_dir: Path, tool_prefixes: list[str], tool_names: set[str]) -> bool:
out_dir.mkdir(parents=True, exist_ok=True)

# Track tool_use_ids that target our prefix so we keep matching results.
# Track tool_use_ids that target configured tools so we keep matching results.
tracked_ids: set[str] = set()
tool_name_by_id: dict[str, str] = {}
counter = 0
Expand Down Expand Up @@ -185,7 +189,7 @@ def next_path(kind: str, label: str | None = None) -> Path:
)
elif btype == "tool_use":
name = block.get("name", "")
if not name.startswith(tool_prefix):
if not matches_tool_name(name, tool_prefixes, tool_names):
continue
tool_id = block.get("id", "")
tracked_ids.add(tool_id)
Expand Down Expand Up @@ -220,17 +224,25 @@ def main() -> int:
)
ap.add_argument(
"--tool-prefix",
default="mcp__xcodebuildmcp",
action="append",
default=None,
help="Only include tool calls whose name starts with this prefix",
)
ap.add_argument(
"--tool-name",
action="append",
default=[],
help="Also include tool calls whose name exactly matches this value",
)
args = ap.parse_args()

if not args.jsonl.is_file():
print(f"error: not a file: {args.jsonl}", file=sys.stderr)
return 1

out = args.output or args.jsonl.with_name(f"{args.jsonl.stem}_conversation")
return 0 if parse(args.jsonl, out, args.tool_prefix) else 1
tool_prefixes = args.tool_prefix or ["mcp__xcodebuildmcp"]
return 0 if parse(args.jsonl, out, tool_prefixes, set(args.tool_name)) else 1


if __name__ == "__main__":
Expand Down
26 changes: 13 additions & 13 deletions benchmarks/claude-ui/suites/contacts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,38 +9,38 @@ firstRunPromptDismissals:
- Continue
- Not Now
- OK
timeoutSeconds: 8
timeoutSeconds: 30
baseline:
totalToolCalls: 14
mcpToolCalls: 13
uiAutomationCalls: 11
wallClockSeconds: 97
totalToolCalls: 19
trackedToolCalls: 18
mcpToolCalls: 18
uiAutomationCalls: 16
wallClockSeconds: 102.94
tools:
session_show_defaults: 1
launch_app_sim: 1
snapshot_ui: 1
snapshot_ui: 6
tap: 5
type_text: 5
allowedVariance:
totalToolCalls: 3
mcpToolCalls: 3
uiAutomationCalls: 3
wallClockSeconds: 45
toolCalls: 2
expectedToolSequence:
baselineToolSequence:
- session_show_defaults
- launch_app_sim
- snapshot_ui
- tap
- snapshot_ui
- tap
- type_text
- snapshot_ui
- type_text
- type_text
- tap
- snapshot_ui
- type_text
- tap
- snapshot_ui
- type_text
- tap
- snapshot_ui
failurePatterns:
- STALE_ELEMENT_REF
- SNAPSHOT_MISSING
Expand Down
24 changes: 11 additions & 13 deletions benchmarks/claude-ui/suites/reminders.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,29 @@ firstRunPromptDismissals:
labels:
- Continue
- Not Now
timeoutSeconds: 12
timeoutSeconds: 30
baseline:
totalToolCalls: 15
mcpToolCalls: 14
uiAutomationCalls: 12
wallClockSeconds: 85
totalToolCalls: 17
trackedToolCalls: 16
mcpToolCalls: 16
uiAutomationCalls: 14
wallClockSeconds: 92.79
tools:
session_show_defaults: 1
launch_app_sim: 1
snapshot_ui: 1
tap: 4
tap: 5
wait_for_ui: 1
type_text: 4
key_press: 2
batch: 1
allowedVariance:
totalToolCalls: 5
mcpToolCalls: 5
uiAutomationCalls: 5
wallClockSeconds: 60
toolCalls: 3
expectedToolSequence:
baselineToolSequence:
- session_show_defaults
- launch_app_sim
- snapshot_ui
- tap
- wait_for_ui
- tap
- type_text
- tap
- tap
Expand Down
20 changes: 8 additions & 12 deletions benchmarks/claude-ui/suites/weather.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,20 @@ sessionDefaults:
scheme: Weather
simulatorName: iPhone 17 Pro Max
baseline:
totalToolCalls: 13
mcpToolCalls: 12
uiAutomationCalls: 10
wallClockSeconds: 98
totalToolCalls: 14
trackedToolCalls: 13
mcpToolCalls: 13
uiAutomationCalls: 11
wallClockSeconds: 100.03
tools:
session_show_defaults: 1
build_run_sim: 1
snapshot_ui: 1
snapshot_ui: 2
tap: 6
batch: 1
type_text: 1
swipe: 1
allowedVariance:
totalToolCalls: 2
mcpToolCalls: 2
uiAutomationCalls: 2
wallClockSeconds: 45
toolCalls: 2
expectedToolSequence:
baselineToolSequence:
- session_show_defaults
- build_run_sim
- snapshot_ui
Expand All @@ -36,6 +31,7 @@ expectedToolSequence:
- tap
- tap
- swipe
- snapshot_ui
- tap
failurePatterns:
- STALE_ELEMENT_REF
Expand Down
Loading
Loading