diff --git a/.agents/skills/agents-shipgate/assets/advisory-pr-comment.yml b/.agents/skills/agents-shipgate/assets/advisory-pr-comment.yml index 0bf82ff5..7c46518d 100644 --- a/.agents/skills/agents-shipgate/assets/advisory-pr-comment.yml +++ b/.agents/skills/agents-shipgate/assets/advisory-pr-comment.yml @@ -18,9 +18,9 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 609cc0bc..14b35f31 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -11,7 +11,7 @@ body: id: version attributes: label: Agents Shipgate version - placeholder: "v1.0.0a1" + placeholder: "v0.14.0" validations: required: true - type: dropdown diff --git a/.gitignore b/.gitignore index aab762bf..588323b1 100644 --- a/.gitignore +++ b/.gitignore @@ -25,28 +25,7 @@ harness/adoption/artifacts/ .claude/* !.claude/commands/ -# Launch/deck workspaces -docs/decks/**/scratch/ -docs/decks/**/.DS_Store -docs/decks/**/.~lock* -docs/decks/**/~$* -docs/decks/**/*.tmp -docs/decks/**/qa*.jpg -docs/decks/vc-thesis/slide-08-options/*.html -docs/decks/vc-thesis/slide-08-options/_v3_*.png samples/**/.agents-shipgate/baseline.json -# Keep curated final VC deck artifacts despite the generic build/ ignore. 
-!docs/decks/vc-thesis/build/ -docs/decks/vc-thesis/build/* -!docs/decks/vc-thesis/build/fragments/ -docs/decks/vc-thesis/build/fragments/* -!docs/decks/vc-thesis/build/_logo-mark-*.png -!docs/decks/vc-thesis/build/contact-sheet*.png -!docs/decks/vc-thesis/build/deck*.pdf -!docs/decks/vc-thesis/build/deck*.pptx -!docs/decks/vc-thesis/build/slide-*.png -!docs/decks/vc-thesis/build/fragments/*.png - # Merged-PR miner clone cache (benchmark/miner) .miner-work/ diff --git a/.well-known/agents-shipgate.json b/.well-known/agents-shipgate.json index 2d10bdc9..f7038019 100644 --- a/.well-known/agents-shipgate.json +++ b/.well-known/agents-shipgate.json @@ -3,7 +3,7 @@ "name": "agents-shipgate", "display_name": "Agents Shipgate", "tagline": "The deterministic merge gate for AI-generated agent capability changes", - "version": "1.0.0a1", + "version": "0.14.0", "license": "Apache-2.0", "publisher": { "name": "Three Moons Lab", @@ -58,12 +58,12 @@ ], "package": { "pypi": "agents-shipgate", - "github_action": "ThreeMoonsLab/agents-shipgate@v1.0.0a1", + "github_action": "ThreeMoonsLab/agents-shipgate@v0.14.0", "github_repo": "ThreeMoonsLab/agents-shipgate" }, "release_status": { "track": "verify-capable release", - "latest_release": "v1.0.0a1" + "latest_release": "v0.14.0" }, "install": { "pipx": "pipx install agents-shipgate", diff --git a/CHANGELOG.md b/CHANGELOG.md index 19be0f71..f3df8a27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,18 @@ ## Unreleased +## 0.14.0 - 2026-06-30 + +- **Versioning: the `1.0.0-alpha` line is withdrawn; this work ships as + `0.14.0`.** An earlier draft of this cycle briefly carried `1.0.0a1`. That + label was withdrawn: the `report.json` schema (`report_schema_version: + "0.28"`) is still additive-versioned and not yet frozen, the package is still + `Development Status :: 4 - Beta`, and no real-world detection-accuracy + baseline has been published — none of which support a `1.0` line. 
`0.14.0` + continues the `0.x` contract line from `0.13.0` and carries the same + agent-controller cleanup (see + [STABILITY.md](STABILITY.md#migration-note-0-14-0)). A `1.0` line will begin + only when the report schema reaches `1.0` and holds without a breaking change. - **Non-preview `verify` now fails closed on a missing `--config`.** `agents-shipgate verify --workspace . --config missing.yaml --json` exits `2` with `merge_verdict: "unknown"`, `applicability: "unknown"`, and diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fab6d35a..44637c48 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -42,10 +42,9 @@ agents-shipgate list-checks ## Surface discipline Read this before adding a new public surface. This project has shipped surface -area faster than it has proven the surface it already has — the review in -[`docs/shipgate-strategic-engineering-review.md`](docs/shipgate-strategic-engineering-review.md) -names this directly. Until the verdict-accuracy benchmark and default-on -activation land, the bar for new surface is deliberately high. +area faster than it has proven the surface it already has. Until the +verdict-accuracy benchmark and default-on activation land, the bar for new +surface is deliberately high. A **new surface** is any of: a new CLI command or sub-app; a new `report_schema_version` or other versioned schema; a new top-level report or diff --git a/README.md b/README.md index 65a25148..5de7ff4c 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,15 @@ Local-first and static by default — no agent execution, tool calls, LLM calls, +> [!IMPORTANT] +> **Status: pre-1.0 (beta).** The decision engine is deterministic and stable, +> but Shipgate's real-world detection accuracy is still being validated against +> a labeled corpus of agent PRs — no precision/recall numbers are published yet. 
+> On heavily dynamic tool surfaces (factory-built toolsets, config-bound +> allowlists, runtime-assembled tools), Shipgate deliberately returns +> `insufficient_evidence` rather than guess. Treat it as an advisory gate while +> that accuracy work is in progress — see [ROADMAP.md](ROADMAP.md). + ## 60 seconds: watch it block two PRs Claude Code adds `stripe.create_refund` to your support agent and opens a @@ -45,6 +54,15 @@ uvx agents-shipgate fixture run agent_weakens_gate gate-removal checks are suppression-immune: the cheapest reward-hack is also the most visible one. +**…and here's the failure mode.** These two cases are constructed fixtures with +a clear-cut answer, chosen to show the gate working. Real PRs are messier: when +a change builds its tool surface dynamically — a toolkit factory, a config-bound +allowlist, tools assembled at runtime — static extraction often can't enumerate +the result, and Shipgate returns `insufficient_evidence` and routes to a human +rather than emit a confident wrong verdict. That is the intended failure mode, +not a bug; reducing how often it fires on real dynamic code is active work (see +[ROADMAP.md](ROADMAP.md)). + One engine decides (`report.json.release_decision.decision`); everything else — `merge_verdict`, PR comments, Check Runs, Action outputs — is a deterministic projection of it. Five-minute version: @@ -403,7 +421,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target diff --git a/ROADMAP.md b/ROADMAP.md index 62dcd2a8..90996b05 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -2,7 +2,7 @@ > **Naming.** This project is **Agents Shipgate** (display name) / `agents-shipgate` (package, CLI, repo). See [`AGENTS.md` § Naming (canonical)](AGENTS.md#naming-canonical) for the full convention. 
-**Latest release: `v1.0.0a1`** — the **agent-native contract cleanup** cycle. +**Latest release: `v0.14.0`** — the **agent-native contract cleanup** cycle. ## What Agents Shipgate is diff --git a/STABILITY.md b/STABILITY.md index eb7c170a..d2e0f7a1 100644 --- a/STABILITY.md +++ b/STABILITY.md @@ -1,20 +1,31 @@ -# Stability Contract · 1.0.0-alpha +# Stability Contract · 0.14.0 What agents and CI integrations can rely on across versions of Agents Shipgate. This document is the contract. If the runtime ever diverges from what's documented here, that's a bug — please file an issue. +Shipgate is pre-1.0. The CLI surface, exit codes, and `contract_version` +described here are stable within the `0.14.x` line, but the `report.json` schema +(`report_schema_version`, currently `0.28`) is still additive-versioned and +not yet frozen. A `1.0` line will not begin until the report schema reaches +`1.0` and holds without a breaking change. Pin a version (or the Action tag) +for reproducible CI. + --- - + -## Migration Note 1.0.0 Alpha +## Migration Note: 0.14.0 -`1.0.0a1` starts a new alpha contract line on top of the `0.13.0` -release. It deliberately cleans up overlapping agent-controller contracts -instead of preserving every `0.x` surface. +`0.14.0` continues the `0.x` contract line from `0.13.0`. It is a minor +release that nonetheless makes deliberate breaking changes to the +agent-controller surface — permitted under `0.x` semantics — cleaning up +overlapping contracts instead of preserving every earlier surface. (An +earlier draft of this work was briefly labelled `1.0.0-alpha`; that label was +withdrawn because the report schema is not yet frozen, and the same changes +ship here as `0.14.0`.) -Breaking changes from the `0.x` line: +Breaking changes from the `0.13.0` line: - `agents-shipgate verify` no longer writes `agents-shipgate-reports/agent-result.json`.
Agents should read @@ -67,12 +78,12 @@ Breaking changes from the `0.x` line: `verifier.json.merge_verdict` is the controller projection for agents and PR automation; it is not a second release gate. -## What WILL NOT change in the current alpha line +## What WILL NOT change in the current `0.x` line ### CLI command surface -These commands and flags are stable across the current `1.0.0a*` -contract line. Future alpha versions may make deliberate breaking +These commands and flags are stable across the current `0.14.x` +contract line. Future `0.x` versions may make deliberate breaking changes only by bumping `contract_version` and updating this file. | Command | Stable flags | @@ -108,7 +119,7 @@ changes only by bumping `contract_version` and updating this file. ### Provisional CLI command surface The org/fleet governance commands are preview surfaces in the current -`1.0.0a*` line. They are documented, deterministic, local-only, and included in +`0.14.x` line. They are documented, deterministic, local-only, and included in `agents-shipgate contract --json` / `.well-known/agents-shipgate.json` for design-partner discovery, but their flags and schemas are not stable command-contract commitments yet. They remain consumers of `verify` artifacts; diff --git a/adoption-kits/claude-code-skill/SKILL.md b/adoption-kits/claude-code-skill/SKILL.md index d6647e18..371c6479 100644 --- a/adoption-kits/claude-code-skill/SKILL.md +++ b/adoption-kits/claude-code-skill/SKILL.md @@ -72,7 +72,7 @@ For non-GitHub CI (GitLab, CircleCI, Jenkins, Azure Pipelines, Buildkite, Bitbuc ## Stable contracts (rely on these) -- **CLI surface** follows the current alpha contract line — see https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md. +- **CLI surface** follows the current 0.x contract line — see https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md. 
- **Installed CLI contract**: when available, run `agents-shipgate contract --json` to verify local schema versions, capability/research surfaces, `release_decision.decision`, and manual-review signal fields. Older installs should use [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md) or upgrade before automating against the local contract command. - **Verifier JSON**: `verifier_schema_version: "0.1"`. Read `merge_verdict`, `can_merge_without_human`, `first_next_action`, `fix_task`, `capability_review.top_changes`, `trust_root_touched`, and `policy_weakened` before summarizing an AI-generated PR. `merge_verdict` is a deterministic projection; the gate remains `report.json.release_decision.decision`. - **Verify run JSON**: `verify-run.json` uses `schema_version: "shipgate.verify_run/v1"` and records stable run identity, subject refs, input hashes, outcome, and artifact hashes. It is the reproducibility artifact for `verify`; do not treat it as a second gate. diff --git a/adoption-kits/claude-code-skill/prompts/add-shipgate-to-repo.md b/adoption-kits/claude-code-skill/prompts/add-shipgate-to-repo.md index b2b7ee06..c13c707e 100644 --- a/adoption-kits/claude-code-skill/prompts/add-shipgate-to-repo.md +++ b/adoption-kits/claude-code-skill/prompts/add-shipgate-to-repo.md @@ -11,8 +11,8 @@ agent-related PRs should use `agents-shipgate verify` after this adoption step. 1. **Install the tool - pin the version so a stale build can't shadow it.** This flow uses the current verifier, agent-handoff, primary-command, and Codex-boundary contracts and requires **contract v9 or newer**; an older copy lingering on `PATH` may lack the command or schema fields this prompt expects. Prefer a **pinned, zero-install** runner that fetches the exact version every time instead of trusting whatever is already on `PATH`. 
**Pin it into one variable and use that for every step below**, so no single command can fall through to a stale binary: ```bash - SG="uvx agents-shipgate@1.0.0a1" # uv: ephemeral, always the pinned build - # or: SG="pipx run agents-shipgate==1.0.0a1" + SG="uvx agents-shipgate@0.14.0" # uv: ephemeral, always the pinned build + # or: SG="pipx run agents-shipgate==0.14.0" $SG --version # confirm the pinned runner resolves ``` Every step below calls `$SG …`; e.g. `$SG verify --preview --json` runs the verify preview through the pinned runner, never a `PATH` copy. @@ -20,7 +20,7 @@ agent-related PRs should use `agents-shipgate verify` after this adoption step. If you would rather install onto `PATH`, pin the floor and **fail loudly when it resolves older** — a plain `pipx install agents-shipgate` is a no-op when an older build already exists — then set `SG=agents-shipgate`: ```bash python -m pip install -U --pre agents-shipgate - agents-shipgate contract --json # STOP if this reports contract_version < 9 - re-run pinned via uvx agents-shipgate@1.0.0a1 + agents-shipgate contract --json # STOP if this reports contract_version < 9 - re-run pinned via uvx agents-shipgate@0.14.0 SG=agents-shipgate # only after the line above confirms contract v9+ ``` diff --git a/adoption-kits/claude-code-skill/prompts/decide-shipgate-relevance.md b/adoption-kits/claude-code-skill/prompts/decide-shipgate-relevance.md index 00cf0ef2..2e647122 100644 --- a/adoption-kits/claude-code-skill/prompts/decide-shipgate-relevance.md +++ b/adoption-kits/claude-code-skill/prompts/decide-shipgate-relevance.md @@ -71,13 +71,13 @@ the rules to the changed file list. 5. **Then act.** - If `run_shipgate: true`: run verify preview through a **version-pinned, zero-install** runner so a stale copy already on `PATH` can't answer — - `uvx agents-shipgate@1.0.0a1 verify --preview --json` (or - `pipx run agents-shipgate==1.0.0a1 ...`). 
Only fall back to a bare + `uvx agents-shipgate@0.14.0 verify --preview --json` (or + `pipx run agents-shipgate==0.14.0 ...`). Only fall back to a bare `agents-shipgate verify --preview --json` once `agents-shipgate --version` confirms contract v9 or newer. Then follow [`prompts/add-shipgate-to-repo.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/prompts/add-shipgate-to-repo.md) for the first-adoption helper flow, or point the user at the GitHub - Action (`ThreeMoonsLab/agents-shipgate@v1.0.0a1`) if they prefer CI. + Action (`ThreeMoonsLab/agents-shipgate@v0.14.0`) if they prefer CI. - If `run_shipgate: false` and `dry_run_recommended: true`: propose a non-mutating scan only — never propose `init --write` based on a dry-run match alone. Phrase it as "X may have shifted the tool diff --git a/adoption-kits/claude-code-skill/prompts/stabilize-strict-mode.md b/adoption-kits/claude-code-skill/prompts/stabilize-strict-mode.md index 4009f643..45cccd27 100644 --- a/adoption-kits/claude-code-skill/prompts/stabilize-strict-mode.md +++ b/adoption-kits/claude-code-skill/prompts/stabilize-strict-mode.md @@ -37,9 +37,9 @@ The user has Agents Shipgate running in **advisory** mode and wants to graduate 5. **Update the CI workflow.** Replace the existing advisory step with strict + baseline. 
Use [`examples/github-actions/03-strict-with-baseline.yml`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/examples/github-actions/03-strict-with-baseline.yml) as the template: ```yaml - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' ci_mode: strict fail_on: critical baseline: .agents-shipgate/baseline.json diff --git a/adoption-kits/codex-skill/assets/advisory-pr-comment.yml b/adoption-kits/codex-skill/assets/advisory-pr-comment.yml index 0bf82ff5..7c46518d 100644 --- a/adoption-kits/codex-skill/assets/advisory-pr-comment.yml +++ b/adoption-kits/codex-skill/assets/advisory-pr-comment.yml @@ -18,9 +18,9 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/docs/agent-contract-current.md b/docs/agent-contract-current.md index 1bceecd1..904faabd 100644 --- a/docs/agent-contract-current.md +++ b/docs/agent-contract-current.md @@ -30,7 +30,7 @@ Downstream repos generated with `init --agent-instructions=default` get the minimal local copy at `.shipgate/agent-contract.json`. -- Latest release: `v1.0.0a1` (see [pyproject.toml](../pyproject.toml) for the in-tree version) +- Latest release: `v0.14.0` (see [pyproject.toml](../pyproject.toml) for the in-tree version) - Runtime contract: `9` - Current report schema: `0.28` — [`docs/report-schema.v0.28.json`](report-schema.v0.28.json) - Current packet schema: `0.7` — [`docs/packet-schema.v0.7.json`](packet-schema.v0.7.json) @@ -362,7 +362,7 @@ exactly one stdout JSON object using `schema_version: "shipgate.codex_boundary_result/v1"` and the schema in [`codex-boundary-result-schema.v1.json`](codex-boundary-result-schema.v1.json). 
The removed `--format agent-json` alias and `agent_result_v1` schema string are -breaking 1.0.0-alpha changes; see [STABILITY.md](../STABILITY.md#migration-note-100-alpha). +breaking 0.14.0 changes; see [STABILITY.md](../STABILITY.md#migration-note-0-14-0). Coding agents should switch on `decision`, `completion_allowed`, `must_stop`, `first_next_action`, `human_review`, `repair`, and `policy`. Do not derive an agent diff --git a/docs/agent-handoff-schema.v1.json b/docs/agent-handoff-schema.v1.json index d656fe95..f9cc7bd4 100644 --- a/docs/agent-handoff-schema.v1.json +++ b/docs/agent-handoff-schema.v1.json @@ -430,7 +430,7 @@ "type": "string" }, "version": { - "default": "1.0.0a1", + "default": "0.14.0", "title": "Version", "type": "string" } diff --git a/docs/agent-result-schema.v1.json b/docs/agent-result-schema.v1.json index 43598cf2..9125d478 100644 --- a/docs/agent-result-schema.v1.json +++ b/docs/agent-result-schema.v1.json @@ -369,7 +369,7 @@ "type": "string" }, "version": { - "default": "1.0.0a1", + "default": "0.14.0", "title": "Version", "type": "string" } diff --git a/docs/agents/protocol.md b/docs/agents/protocol.md index ac6c568a..e18cbd08 100644 --- a/docs/agents/protocol.md +++ b/docs/agents/protocol.md @@ -63,7 +63,7 @@ The stdout object has: Consumers must make decisions from JSON fields, never from prose or Markdown. The stable schema is `docs/codex-boundary-result-schema.v1.json`. The -`1.0.0-alpha` contract renamed this local boundary result away from the older +`0.14.0` contract renamed this local boundary result away from the older generic `agent_result_v1` schema string. `decision`, `completion_allowed`, `must_stop`, `first_next_action`, `human_review`, `repair`, and `policy` are the control signals. 
`risk_level` is explanatory and may differ between local-check and diff --git a/docs/codex-boundary-result-schema.v1.json b/docs/codex-boundary-result-schema.v1.json index 059dcba3..68847182 100644 --- a/docs/codex-boundary-result-schema.v1.json +++ b/docs/codex-boundary-result-schema.v1.json @@ -369,7 +369,7 @@ "type": "string" }, "version": { - "default": "1.0.0a1", + "default": "0.14.0", "title": "Version", "type": "string" } diff --git a/docs/decks/architecture-overview/README.md b/docs/decks/architecture-overview/README.md deleted file mode 100644 index f7491f1c..00000000 --- a/docs/decks/architecture-overview/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Agents Shipgate Architecture Deck - -Editable presentation for explaining how Agents Shipgate works internally. - -The deck is grounded in: - -- `docs/architecture.md` -- `docs/trust-model.md` -- `STABILITY.md` -- `src/agents_shipgate/cli/scan/orchestrator.py` - -## Files - -- `output/output.pptx` - editable PowerPoint deck. -- `src/build.mjs` - source used to generate the deck. -- `scratch/previews/contact-sheet.png` - rendered preview contact sheet. -- `scratch/quality-report.json` - PPTX package QA report. 
- -## Rebuild - -Create or refresh the presentation workspace with the Presentations runtime, then -run the builder: - -```bash -node /path/to/presentations/scripts/create_presentation_workspace.js \ - --deck-id architecture-overview \ - --workspace docs/decks/architecture-overview \ - --force -cd docs/decks/architecture-overview -node src/build.mjs -``` diff --git a/docs/decks/architecture-overview/output/output.pptx b/docs/decks/architecture-overview/output/output.pptx deleted file mode 100644 index 7ac018a4..00000000 Binary files a/docs/decks/architecture-overview/output/output.pptx and /dev/null differ diff --git a/docs/decks/architecture-overview/package.json b/docs/decks/architecture-overview/package.json deleted file mode 100644 index e986b24b..00000000 --- a/docs/decks/architecture-overview/package.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "private": true, - "type": "module" -} diff --git a/docs/decks/architecture-overview/src/build.mjs b/docs/decks/architecture-overview/src/build.mjs deleted file mode 100644 index e7ac3f2c..00000000 --- a/docs/decks/architecture-overview/src/build.mjs +++ /dev/null @@ -1,404 +0,0 @@ -import { - Presentation, - PresentationFile, - chart, - column, - fill, - fixed, - fr, - grid, - grow, - hug, - image, - panel, - row, - rule, - shape, - text, - wrap, -} from "@oai/artifact-tool"; -import fs from "node:fs/promises"; -import path from "node:path"; - -const workspace = path.resolve("."); -const root = path.resolve(workspace, "../../.."); -const outDir = path.join(workspace, "output"); -const scratchDir = path.join(workspace, "scratch"); -const previewDir = path.join(scratchDir, "previews"); -const layoutDir = path.join(scratchDir, "layouts"); -const pptxPath = path.join(outDir, "output.pptx"); -const brandImage = path.join(root, "assets", "readme-header-dark.png"); - -const W = 1920; -const H = 1080; - -const C = { - ink: "#111827", - muted: "#526273", - paper: "#F8FAFC", - white: "#FFFFFF", - dark: "#071114", - dark2: "#102026", 
- teal: "#21B6A8", - blue: "#3578E5", - amber: "#F4B740", - red: "#D94C4C", - green: "#2F9D67", - purple: "#7C5CE5", - line: "#D6E0E8", -}; - -const S = { - eyebrow: { fontSize: 22, bold: true, color: C.teal }, - title: { fontSize: 58, bold: true, color: C.ink }, - subtitle: { fontSize: 27, color: C.muted }, - body: { fontSize: 27, color: C.ink }, - small: { fontSize: 18, color: C.muted }, - label: { fontSize: 19, bold: true, color: C.muted }, - code: { fontSize: 20, color: C.ink }, - inverseTitle: { fontSize: 74, bold: true, color: C.white }, - inverseSubtitle: { fontSize: 29, color: "#BED1D0" }, -}; - -function addSolidBackground(slide, color = C.paper) { - slide.compose(shape({ name: "background", width: fixed(W), height: fixed(H), fill: color, line: { width: 0, fill: color } }), { - frame: { left: 0, top: 0, width: W, height: H }, - baseUnit: 8, - }); -} - -function footer(source = "Source: docs/architecture.md") { - return row({ name: "source-rail", width: fill, height: hug, justify: "between", align: "end" }, [ - text("Agents Shipgate", { name: "footer-brand", width: hug, height: hug, style: { fontSize: 17, bold: true, color: C.muted } }), - text(source, { name: "footer-source", width: wrap(1180), height: hug, style: { fontSize: 15, color: "#71808E" } }), - ]); -} - -function titleStack(eyebrow, title, subtitle) { - return column({ name: "title-stack", width: fill, height: hug, gap: 16 }, [ - text(eyebrow, { name: "slide-eyebrow", width: fill, height: hug, style: S.eyebrow }), - text(title, { name: "slide-title", width: fixed(1380), height: fixed(140), style: S.title }), - subtitle - ? 
text(subtitle, { name: "slide-subtitle", width: fixed(1320), height: fixed(72), style: S.subtitle }) - : rule({ name: "title-rule", width: fixed(210), stroke: C.teal, weight: 6 }), - ]); -} - -function bodySlide(presentation, eyebrow, title, subtitle, body, source) { - const slide = presentation.slides.add(); - addSolidBackground(slide); - slide.compose( - column({ name: "slide-root", width: fill, height: fill, padding: { x: 92, y: 68 }, gap: 34 }, [ - titleStack(eyebrow, title, subtitle), - body, - footer(source), - ]), - { frame: { left: 0, top: 0, width: W, height: H }, baseUnit: 8 }, - ); - return slide; -} - -function codePanel(name, lines, style = {}) { - return panel( - { - name, - width: fill, - height: style.height ?? fill, - fill: style.fill ?? C.white, - line: { width: 1, fill: style.line ?? C.line }, - borderRadius: "rounded-md", - padding: { x: 28, y: 24 }, - }, - column( - { name: `${name}-content`, width: fill, height: style.height ?? fill, gap: style.gap ?? 9 }, - lines.map((line, idx) => - text(line, { - name: `${name}-line-${idx + 1}`, - width: fill, - height: hug, - style: idx === 0 && style.heading - ? { fontSize: 22, bold: true, color: style.headingColor ?? C.ink } - : style.textStyle ?? 
S.code, - }), - ), - ), - ); -} - -function stepBox(name, label, detail, color) { - return panel( - { - name, - width: fill, - height: hug, - fill: C.white, - line: { width: 1, fill: C.line }, - borderRadius: "rounded-md", - padding: { x: 24, y: 20 }, - }, - row({ width: fill, height: hug, gap: 16, align: "start" }, [ - shape({ width: fixed(16), height: fixed(16), fill: color, line: { width: 0, fill: color } }), - column({ width: fill, height: hug, gap: 4 }, [ - text(label, { width: fill, height: hug, style: { fontSize: 26, bold: true, color: C.ink } }), - text(detail, { width: fill, height: hug, style: { fontSize: 20, color: C.muted } }), - ]), - ]), - ); -} - -function compactItem(name, label, detail, color) { - return row({ name, width: fill, height: hug, gap: 14, align: "start" }, [ - shape({ width: fixed(14), height: fixed(14), fill: color, line: { width: 0, fill: color } }), - column({ width: fill, height: hug, gap: 4 }, [ - text(label, { width: fill, height: hug, style: { fontSize: 24, bold: true, color: C.ink } }), - text(detail, { width: fill, height: hug, style: { fontSize: 19, color: C.muted } }), - ]), - ]); -} - -function moduleBox(name, label, detail, color) { - return panel( - { - name, - width: fill, - height: fill, - fill: C.white, - line: { width: 1, fill: C.line }, - borderRadius: "rounded-md", - padding: { x: 22, y: 20 }, - }, - column({ width: fill, height: fill, gap: 12 }, [ - shape({ width: fixed(18), height: fixed(18), fill: color, line: { width: 0, fill: color } }), - text(label, { width: fill, height: hug, style: { fontSize: 27, bold: true, color: C.ink } }), - text(detail, { width: fill, height: hug, style: { fontSize: 20, color: C.muted } }), - ]), - ); -} - -async function savePreviews(presentation) { - await fs.mkdir(previewDir, { recursive: true }); - await fs.mkdir(layoutDir, { recursive: true }); - for (const slide of presentation.slides.items) { - const index = slide.index + 1; - const png = await presentation.export({ slide, 
format: "png" }); - await fs.writeFile(path.join(previewDir, `slide-${String(index).padStart(2, "0")}.png`), Buffer.from(await png.arrayBuffer())); - const layout = await slide.export({ format: "layout" }); - await fs.writeFile(path.join(layoutDir, `slide-${String(index).padStart(2, "0")}.json`), await layout.text()); - } -} - -async function main() { - await fs.mkdir(outDir, { recursive: true }); - await fs.mkdir(scratchDir, { recursive: true }); - const brandDataUrl = `data:image/png;base64,${(await fs.readFile(brandImage)).toString("base64")}`; - - const presentation = Presentation.create({ slideSize: { width: W, height: H } }); - - // 1. Cover - { - const slide = presentation.slides.add(); - addSolidBackground(slide, C.dark); - slide.compose(shape({ name: "cover-teal-field", width: fixed(690), height: fixed(H), fill: C.teal, line: { width: 0, fill: C.teal } }), { - frame: { left: 1230, top: 0, width: 690, height: H }, - baseUnit: 8, - }); - slide.compose(shape({ name: "cover-black-rail", width: fixed(420), height: fixed(H), fill: "#061013", line: { width: 0, fill: "#061013" } }), { - frame: { left: 1500, top: 0, width: 420, height: H }, - baseUnit: 8, - }); - slide.compose( - column({ name: "cover-root", width: fill, height: fill, padding: { x: 96, y: 76 }, justify: "between" }, [ - column({ name: "cover-copy", width: wrap(1060), height: hug, gap: 26 }, [ - text("Agents Shipgate architecture", { name: "cover-eyebrow", width: fill, height: hug, style: { fontSize: 24, bold: true, color: C.teal } }), - text("How the static release gate works", { name: "cover-title", width: wrap(1060), height: hug, style: S.inverseTitle }), - text("From shipgate.yaml to deterministic findings, reports, and CI exit codes.", { - name: "cover-promise", - width: wrap(820), - height: hug, - style: S.inverseSubtitle, - }), - ]), - row({ name: "cover-bottom", width: fill, height: hug, justify: "between", align: "end" }, [ - image({ name: "three-moons-lab-mark", dataUrl: brandDataUrl, 
width: fixed(520), height: fixed(62), fit: "contain", alt: "Three Moons Lab" }), - text("Architecture overview", { name: "cover-context", width: hug, height: hug, style: { fontSize: 18, color: "#C9D8D7" } }), - ]), - ]), - { frame: { left: 0, top: 0, width: W, height: H }, baseUnit: 8 }, - ); - } - - // 2. One-line model - bodySlide( - presentation, - "1. Mental model", - "Static gate.", - "Reads declared artifacts, normalizes tools, runs checks, and writes reports.", - grid({ name: "model-grid", width: fill, height: fill, columns: [fr(1), fr(1), fr(1), fr(1)], columnGap: 24 }, [ - stepBox("model-read", "Read", "shipgate.yaml plus local tool and policy artifacts", C.blue), - stepBox("model-normalize", "Normalize", "turn heterogeneous sources into Tool objects", C.teal), - stepBox("model-check", "Check", "run release-readiness rules against ScanContext", C.amber), - stepBox("model-report", "Gate", "write reports and return stable CI exit codes", C.green), - ]), - "Source: docs/architecture.md pipeline", - ); - - // 3. Pipeline - bodySlide( - presentation, - "2. 
Pipeline", - "The scan is a deterministic data pipeline.", - "Each stage transforms local inputs into a richer static model; no agent execution is required.", - grid( - { name: "pipeline-grid", width: fill, height: fill, columns: [fr(1), fr(1)], rows: [fr(1), fr(1), fr(1)], columnGap: 30, rowGap: 24 }, - [ - stepBox("pipe-config", "config/loader.py", "load and validate shipgate.yaml", C.blue), - stepBox("pipe-inputs", "inputs/*", "normalize MCP, OpenAPI, API, ADK, SDK inputs", C.teal), - stepBox("pipe-risk", "core/risk_hints.py", "enrich tools with read/write/destructive risk tags", C.amber), - stepBox("pipe-context", "core/context.py", "assemble manifest + agent + tools + artifacts", C.purple), - stepBox("pipe-checks", "checks/*.py", "pure check functions return Finding objects", C.red), - stepBox("pipe-report", "report/* + ci/*", "write reports and compute CI result", C.green), - ], - ), - "Source: docs/architecture.md", - ); - - // 4. Modules - bodySlide( - presentation, - "3. Codebase map", - "Repo map.", - "Adapters, models, checks, and formatters stay separate.", - grid( - { name: "module-grid", width: fill, height: fill, columns: [fr(1), fr(1), fr(1)], rows: [fr(1), fr(1)], columnGap: 28, rowGap: 28 }, - [ - moduleBox("mod-cli", "cli/", "entry points: scan, init, doctor, explain", C.blue), - moduleBox("mod-config", "config/", "Pydantic manifest schema and loader", C.teal), - moduleBox("mod-core", "core/", "Tool, Finding, Report, ScanContext", C.purple), - moduleBox("mod-inputs", "inputs/", "source-specific adapters", C.amber), - moduleBox("mod-checks", "checks/", "release-readiness checks by category", C.red), - moduleBox("mod-report", "report/", "Markdown, JSON, SARIF output", C.green), - ], - ), - "Source: docs/architecture.md module map", - ); - - // 5. Scan orchestration - bodySlide( - presentation, - "4. 
run_scan orchestration", - "The CLI coordinates the whole scan in one pass.", - "The important boundary is before checks: all inputs become tools, artifacts, warnings, and one ScanContext.", - grid({ name: "scan-grid", width: fill, height: fill, columns: [fr(1.05), fr(0.95)], columnGap: 52 }, [ - codePanel("scan-code", [ - "src/agents_shipgate/cli/scan/orchestrator.py", - "manifest = load_manifest(config_path)", - "loaded_sources = _load_sources(...)", - "api_source, api_artifacts = load_openai_api_artifacts(...)", - "tools = enrich_tools_with_risk_hints(manifest, tools)", - "context = ScanContext(manifest, agent, tools, artifacts)", - "findings = run_checks(context)", - "report = build_report(...)", - "return report, exit_code_for_report(...)", - ], { heading: true, headingColor: C.blue, textStyle: { fontSize: 18, color: C.ink }, fill: C.white }), - column({ name: "scan-notes", width: fill, height: fill, gap: 20, justify: "center" }, [ - compactItem("scan-base", "Manifest is copied and CLI flags override config", "CI mode, output dir, formats, fail_on, baseline.", C.blue), - compactItem("scan-dedupe", "Sources are flattened and deduplicated", "Higher-fidelity tool sources win on duplicate names.", C.teal), - compactItem("scan-policy", "Suppressions, overrides, and baselines are late-bound", "Finding identity is stable before CI policy is applied.", C.amber), - compactItem("scan-output", "Reports are generated before exit policy", "JSON/Markdown/SARIF exist even when strict mode fails.", C.green), - ]), - ]), - "Source: src/agents_shipgate/cli/scan/orchestrator.py", - ); - - // 6. Checks and findings - bodySlide( - presentation, - "5. 
Checks are pure functions.", - "A check reads ScanContext and returns Finding objects.", - "That keeps rules deterministic, testable, and composable with policy packs and plugins.", - grid({ name: "checks-grid", width: fill, height: fill, columns: [fr(1), fr(1)], columnGap: 50 }, [ - column({ name: "check-categories", width: fill, height: fill, gap: 20, justify: "center" }, [ - compactItem("check-api", "api", "OpenAI API schema, structured output, retry, traces", C.blue), - compactItem("check-auth", "auth", "missing scopes and broad permissions", C.teal), - compactItem("check-policy", "policy", "approval and confirmation policies", C.amber), - compactItem("check-sidefx", "side effects", "idempotency, destructive/write behavior", C.red), - compactItem("check-docs", "documentation", "missing descriptions and injection-like metadata", C.green), - ]), - codePanel("finding-contract", [ - "Finding identity", - "check_id", - "tool_name", - "canonical evidence", - "", - "Fingerprint:", - "fp_ + sha256(check_id | tool_name | evidence)[:16]", - "", - "Used by suppressions and baselines.", - ], { heading: true, headingColor: C.purple, textStyle: { fontSize: 23, color: C.ink }, fill: C.white }), - ]), - "Source: docs/architecture.md and STABILITY.md", - ); - - // 7. Reports and CI - bodySlide( - presentation, - "6. 
Reports are artifacts; CI is policy.", - "The scan always builds a report, then computes the exit code from CI mode and baseline state.", - "Stable JSON fields are the integration surface for agents and automation.", - grid({ name: "report-grid", width: fill, height: fill, columns: [fr(1), fr(1)], columnGap: 52 }, [ - column({ name: "report-list", width: fill, height: fill, gap: 22, justify: "center" }, [ - compactItem("report-md", "report.md", "human-readable release review", C.blue), - compactItem("report-json", "report.json", "stable machine-readable fields", C.teal), - compactItem("report-sarif", "report.sarif", "code scanning and security tooling", C.purple), - compactItem("report-summary", "GitHub step summary", "PR feedback without opening artifacts", C.green), - ]), - panel({ name: "exit-table", width: fill, height: fill, fill: C.white, line: { width: 1, fill: C.line }, borderRadius: "rounded-md", padding: { x: 28, y: 24 } }, - column({ width: fill, height: fill, gap: 18 }, [ - text("Stable exit codes", { width: fill, height: hug, style: { fontSize: 28, bold: true, color: C.ink } }), - compactItem("exit-0", "0", "pass or advisory result", C.green), - compactItem("exit-2", "2", "manifest config error", C.amber), - compactItem("exit-3", "3", "input parse error", C.red), - compactItem("exit-20", "20", "strict-mode gate failure", C.red), - ])), - ]), - "Source: STABILITY.md and src/agents_shipgate/cli/scan/orchestrator.py", - ); - - // 8. Trust and extension - bodySlide( - presentation, - "7. Trust model and extension points", - "Static by default. 
Extensible by explicit contribution or plugin opt-in.", - "The default architecture keeps untrusted agent code outside the execution boundary.", - grid({ name: "trust-grid", width: fill, height: fill, columns: [fr(1), fr(1)], columnGap: 52 }, [ - panel({ name: "trust-invariants", width: fill, height: fill, fill: C.white, line: { width: 1, fill: C.line }, borderRadius: "rounded-md", padding: { x: 28, y: 24 } }, - column({ width: fill, height: fill, gap: 18 }, [ - text("Default invariants", { width: fill, height: hug, style: { fontSize: 28, bold: true, color: C.ink } }), - compactItem("trust-code", "No user code import or execution", "SDK loaders use AST parsing only.", C.blue), - compactItem("trust-model", "No model, tool, MCP, or network calls", "Inputs are local files.", C.teal), - compactItem("trust-path", "Path traversal containment", "Declared paths must stay under the manifest directory.", C.amber), - compactItem("trust-plugins", "Plugins off by default", "Opt-in changes the trust boundary.", C.red), - ])), - panel({ name: "extension-points", width: fill, height: fill, fill: "#F3FAF8", line: { width: 1, fill: C.line }, borderRadius: "rounded-md", padding: { x: 28, y: 24 } }, - column({ width: fill, height: fill, gap: 18 }, [ - text("How to extend", { width: fill, height: hug, style: { fontSize: 28, bold: true, color: C.ink } }), - compactItem("ext-input", "New input adapter", "Add loader, wire scan.py, fixture, tests.", C.green), - compactItem("ext-check", "New check", "Add check function, registry metadata, docs, tests.", C.purple), - compactItem("ext-stable", "Keep IDs stable", "Never rename published check IDs.", C.amber), - compactItem("ext-report", "Preserve JSON contract", "Additive report changes only in 0.x.", C.blue), - ])), - ]), - "Source: docs/trust-model.md, docs/architecture.md, and STABILITY.md", - ); - - const pptxBlob = await PresentationFile.exportPptx(presentation); - await pptxBlob.save(pptxPath); - await savePreviews(presentation); - - 
console.log(JSON.stringify({ pptx: pptxPath, slides: presentation.slides.count, previews: previewDir, layouts: layoutDir }, null, 2)); -} - -main().catch((error) => { - console.error(error && error.stack ? error.stack : String(error)); - process.exit(1); -}); diff --git a/docs/decks/openai-api-use-case/README.md b/docs/decks/openai-api-use-case/README.md deleted file mode 100644 index ba4f3b8a..00000000 --- a/docs/decks/openai-api-use-case/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Direct OpenAI API Use Case Deck - -Shareable deck for future users who call the OpenAI API directly with function -tools instead of MCP, OpenAPI, or SDK metadata. - -The story uses `samples/simple_openai_api_agent` as a realistic fixture, not a -customer case study. It follows four questions future users usually ask: - -1. What did the original direct API prompt say? -2. What problems did Agents Shipgate find? -3. Why are those problems release-significant? -4. How do we connect Agents Shipgate to a direct OpenAI API app? - -## Files - -- `output/output.pptx` - editable PowerPoint deck. -- `src/build.mjs` - source used to generate the deck. -- `scratch/previews/contact-sheet.png` - rendered preview contact sheet. -- `scratch/quality-report.json` - PPTX package QA report. 
- -## Rebuild - -Create or refresh the presentation workspace with the Presentations runtime, then -run the builder: - -```bash -node /path/to/presentations/scripts/create_presentation_workspace.js \ - --deck-id openai-api-use-case \ - --workspace docs/decks/openai-api-use-case \ - --force -cd docs/decks/openai-api-use-case -node src/build.mjs -``` diff --git a/docs/decks/openai-api-use-case/output/output.pptx b/docs/decks/openai-api-use-case/output/output.pptx deleted file mode 100644 index c7d2e979..00000000 Binary files a/docs/decks/openai-api-use-case/output/output.pptx and /dev/null differ diff --git a/docs/decks/openai-api-use-case/package.json b/docs/decks/openai-api-use-case/package.json deleted file mode 100644 index e986b24b..00000000 --- a/docs/decks/openai-api-use-case/package.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "private": true, - "type": "module" -} diff --git a/docs/decks/openai-api-use-case/src/build.mjs b/docs/decks/openai-api-use-case/src/build.mjs deleted file mode 100644 index 2142a9e7..00000000 --- a/docs/decks/openai-api-use-case/src/build.mjs +++ /dev/null @@ -1,472 +0,0 @@ -import { - Presentation, - PresentationFile, - auto, - chart, - column, - fill, - fixed, - fr, - grid, - grow, - hug, - image, - panel, - row, - rule, - shape, - text, - wrap, -} from "@oai/artifact-tool"; -import fs from "node:fs/promises"; -import path from "node:path"; - -const workspace = path.resolve("."); -const root = path.resolve(workspace, "../../.."); -const outDir = path.join(workspace, "output"); -const scratchDir = path.join(workspace, "scratch"); -const previewDir = path.join(scratchDir, "previews"); -const layoutDir = path.join(scratchDir, "layouts"); -const pptxPath = path.join(outDir, "output.pptx"); -const brandImage = path.join(root, "assets", "readme-header-dark.png"); - -const W = 1920; -const H = 1080; - -const C = { - ink: "#111827", - muted: "#536270", - faint: "#EDF2F6", - paper: "#F8FAFC", - white: "#FFFFFF", - dark: "#071114", - dark2: 
"#102026", - moon: "#EAF4F1", - teal: "#21B6A8", - blue: "#3578E5", - amber: "#F4B740", - red: "#D94C4C", - green: "#2F9D67", - line: "#D6E0E8", -}; - -const S = { - eyebrow: { fontSize: 22, bold: true, color: C.teal }, - title: { fontSize: 58, bold: true, color: C.ink }, - subtitle: { fontSize: 27, color: C.muted }, - body: { fontSize: 28, color: C.ink }, - small: { fontSize: 18, color: C.muted }, - label: { fontSize: 19, bold: true, color: C.muted }, - code: { fontSize: 23, color: C.ink }, - codeSmall: { fontSize: 19, color: C.ink }, - inverseTitle: { fontSize: 74, bold: true, color: C.white }, - inverseSubtitle: { fontSize: 29, color: "#BED1D0" }, - number: { fontSize: 104, bold: true, color: C.ink }, -}; - -function addSolidBackground(slide, color = C.paper) { - slide.compose(shape({ name: "background", width: fixed(W), height: fixed(H), fill: color, line: { width: 0, fill: color } }), { - frame: { left: 0, top: 0, width: W, height: H }, - baseUnit: 8, - }); -} - -function textBlock(value, options = {}) { - return text(value, { - width: options.width ?? fill, - height: options.height ?? hug, - name: options.name, - style: options.style ?? S.body, - columnSpan: options.columnSpan, - rowSpan: options.rowSpan, - }); -} - -function footer(source = "Source: samples/simple_openai_api_agent fixture") { - return row({ name: "source-rail", width: fill, height: hug, justify: "between", align: "end" }, [ - text("Agents Shipgate", { name: "footer-brand", width: hug, height: hug, style: { fontSize: 17, bold: true, color: C.muted } }), - text(source, { name: "footer-source", width: wrap(1120), height: hug, style: { fontSize: 15, color: "#71808E" } }), - ]); -} - -function titleStack(eyebrow, title, subtitle) { - return column({ name: "title-stack", width: fill, height: hug, gap: 18 }, [ - text(eyebrow, { name: "slide-eyebrow", width: fill, height: hug, style: S.eyebrow }), - text(title, { name: "slide-title", width: wrap(1380), height: hug, style: S.title }), - subtitle - ? 
text(subtitle, { name: "slide-subtitle", width: wrap(1300), height: hug, style: S.subtitle }) - : rule({ name: "title-rule", width: fixed(210), stroke: C.teal, weight: 6 }), - ]); -} - -function bodySlide(presentation, eyebrow, title, subtitle, body, source) { - const slide = presentation.slides.add(); - addSolidBackground(slide); - slide.compose( - column({ name: "slide-root", width: fill, height: fill, padding: { x: 92, y: 68 }, gap: 36 }, [ - titleStack(eyebrow, title, subtitle), - body, - footer(source), - ]), - { frame: { left: 0, top: 0, width: W, height: H }, baseUnit: 8 }, - ); - return slide; -} - -function miniFile(label, pathValue, accent = C.blue) { - return column({ name: `file-${label}`, width: fill, height: hug, gap: 10 }, [ - row({ width: fill, height: hug, gap: 14, align: "center" }, [ - shape({ width: fixed(18), height: fixed(18), fill: accent, line: { width: 0, fill: accent } }), - text(label, { width: fill, height: hug, style: { fontSize: 24, bold: true, color: C.ink } }), - ]), - text(pathValue, { width: fill, height: hug, style: { fontSize: 18, color: C.muted } }), - ]); -} - -function codePanel(name, lines, style = {}) { - return panel( - { - name, - width: fill, - height: style.height ?? fill, - fill: style.fill ?? C.white, - line: { width: 1, fill: style.line ?? C.line }, - borderRadius: "rounded-md", - padding: { x: 30, y: 26 }, - }, - column( - { name: `${name}-content`, width: fill, height: style.height ?? fill, gap: style.gap ?? 10 }, - lines.map((line, idx) => - text(line, { - name: `${name}-line-${idx + 1}`, - width: fill, - height: hug, - style: idx === 0 && style.heading - ? { fontSize: 22, bold: true, color: style.headingColor ?? C.ink } - : style.textStyle ?? 
S.codeSmall, - }), - ), - ), - ); -} - -function askRow(name, values, fillColor = C.white, accent = C.teal) { - return panel( - { - name, - width: fill, - height: hug, - fill: fillColor, - line: { width: 1, fill: C.line }, - borderRadius: "rounded-sm", - padding: { x: 22, y: 18 }, - }, - grid({ width: fill, height: hug, columns: [fr(0.75), fr(1.15), fr(0.9)], columnGap: 24, alignItems: "start" }, [ - row({ width: fill, height: hug, gap: 12, align: "center" }, [ - shape({ width: fixed(12), height: fixed(12), fill: accent, line: { width: 0, fill: accent } }), - text(values[0], { width: fill, height: hug, style: { fontSize: 23, bold: true, color: C.ink } }), - ]), - text(values[1], { width: fill, height: hug, style: { fontSize: 22, color: C.ink } }), - text(values[2], { width: fill, height: hug, style: { fontSize: 22, color: C.muted } }), - ]), - ); -} - -function askTable() { - return column({ name: "reviewer-asks-table", width: fill, height: fill, gap: 12, justify: "center" }, [ - panel( - { name: "asks-header", width: fill, height: hug, fill: C.dark2, line: { width: 0, fill: C.dark2 }, borderRadius: "rounded-sm", padding: { x: 22, y: 18 } }, - grid({ width: fill, height: hug, columns: [fr(0.75), fr(1.15), fr(0.9)], columnGap: 24 }, [ - text("Finding", { width: fill, height: hug, style: { fontSize: 21, bold: true, color: C.white } }), - text("Reviewer asks for", { width: fill, height: hug, style: { fontSize: 21, bold: true, color: C.white } }), - text("Why it matters", { width: fill, height: hug, style: { fontSize: 21, bold: true, color: C.white } }), - ]), - ), - askRow("ask-schema", ["Schema strictness", "strict=true, required fields, bounded amount", "Less ambiguous tool calls"], C.white, C.red), - askRow("ask-idempotency", ["Idempotency", "idempotency key or no retry for side effect", "Avoid duplicate refunds/emails"], "#FDF9F0", C.amber), - askRow("ask-approval", ["Approval", "approval_required plus passing trace evidence", "Human gate before financial 
action"], C.white, C.blue), - askRow("ask-ownership", ["Ownership", "owner and auth scope metadata", "Release accountability"], "#F3FAF8", C.green), - ]); -} - -function riskRow(label, detail, color) { - return row({ name: `risk-${label}`, width: fill, height: hug, gap: 18, align: "start" }, [ - shape({ name: `risk-dot-${label}`, width: fixed(20), height: fixed(20), fill: color, line: { width: 0, fill: color } }), - column({ width: fill, height: hug, gap: 4 }, [ - text(label, { width: fill, height: hug, style: { fontSize: 28, bold: true, color: C.ink } }), - text(detail, { width: fill, height: hug, style: { fontSize: 22, color: C.muted } }), - ]), - ]); -} - -async function savePreviews(presentation) { - await fs.mkdir(previewDir, { recursive: true }); - await fs.mkdir(layoutDir, { recursive: true }); - for (const slide of presentation.slides.items) { - const index = slide.index + 1; - const png = await presentation.export({ slide, format: "png" }); - await fs.writeFile(path.join(previewDir, `slide-${String(index).padStart(2, "0")}.png`), Buffer.from(await png.arrayBuffer())); - const layout = await slide.export({ format: "layout" }); - await fs.writeFile(path.join(layoutDir, `slide-${String(index).padStart(2, "0")}.json`), await layout.text()); - } -} - -async function main() { - await fs.mkdir(outDir, { recursive: true }); - await fs.mkdir(scratchDir, { recursive: true }); - const brandDataUrl = `data:image/png;base64,${(await fs.readFile(brandImage)).toString("base64")}`; - - const presentation = Presentation.create({ - slideSize: { width: W, height: H }, - }); - - // 1. 
Cover - { - const slide = presentation.slides.add(); - addSolidBackground(slide, C.dark); - slide.compose(shape({ name: "cover-teal-field", width: fixed(760), height: fixed(H), fill: C.teal, line: { width: 0, fill: C.teal } }), { - frame: { left: 1160, top: 0, width: 760, height: H }, - baseUnit: 8, - }); - slide.compose(shape({ name: "cover-black-rail", width: fixed(500), height: fixed(H), fill: "#061013", line: { width: 0, fill: "#061013" } }), { - frame: { left: 1420, top: 0, width: 500, height: H }, - baseUnit: 8, - }); - slide.compose( - column({ name: "cover-root", width: fill, height: fill, padding: { x: 96, y: 76 }, justify: "between" }, [ - column({ name: "cover-copy", width: wrap(1060), height: hug, gap: 26 }, [ - text("Agents Shipgate direct API walkthrough", { name: "cover-eyebrow", width: fill, height: hug, style: { fontSize: 24, bold: true, color: C.teal } }), - text("From prompt to release gate", { name: "cover-title", width: wrap(980), height: hug, style: S.inverseTitle }), - text("Original prompt, scanned findings, release risk, and integration path.", { - name: "cover-promise", - width: wrap(840), - height: hug, - style: S.inverseSubtitle, - }), - ]), - row({ name: "cover-bottom", width: fill, height: hug, justify: "between", align: "end" }, [ - image({ name: "three-moons-lab-mark", dataUrl: brandDataUrl, width: fixed(520), height: fixed(62), fit: "contain", alt: "Three Moons Lab" }), - text("Fixture, not a customer case study", { name: "cover-context", width: hug, height: hug, style: { fontSize: 18, color: "#C9D8D7" } }), - ]), - ]), - { frame: { left: 0, top: 0, width: W, height: H }, baseUnit: 8 }, - ); - } - - // 2. Original API prompt - bodySlide( - presentation, - "1. 
Original API prompt", - "The app presents itself as advice-only.", - "This is the prompt file from the direct OpenAI API fixture.", - grid( - { name: "prompt-grid", width: fill, height: fill, columns: [fr(1.05), fr(0.95)], columnGap: 56 }, - [ - codePanel("original-prompt", [ - "prompts/support_refund.md", - "You are a support refund assistant.", - "", - "You should only advise the support representative", - "and prepare a draft response.", - "", - "Do not take action on the customer's account.", - ], { heading: true, headingColor: C.teal, textStyle: { fontSize: 27, color: C.ink }, fill: C.white }), - column({ name: "prompt-claim", width: fill, height: fill, gap: 24, justify: "center" }, [ - text("What a reviewer would assume", { name: "reviewer-assumption-label", width: fill, height: hug, style: S.label }), - text("No account mutation. No refund execution. No customer email send.", { - name: "reviewer-assumption", - width: wrap(740), - height: hug, - style: { fontSize: 48, bold: true, color: C.ink }, - }), - rule({ name: "prompt-rule", width: fixed(260), stroke: C.teal, weight: 6 }), - text("That promise must match the actual tool surface.", { name: "prompt-note", width: wrap(650), height: hug, style: S.subtitle }), - ]), - ], - ), - "Source: samples/simple_openai_api_agent/prompts/support_refund.md", - ); - - // 3. Actual tool surface - bodySlide( - presentation, - "2. 
But the enabled tools can take action.", - "The direct OpenAI API artifacts expose two write-capable function tools.", - "This is the gap Agents Shipgate is designed to make visible before release.", - grid( - { name: "tool-surface-grid", width: fill, height: fill, columns: [fr(0.95), fr(1.05)], columnGap: 56 }, - [ - column({ name: "tool-list", width: fill, height: fill, gap: 20, justify: "center" }, [ - riskRow("create_refund", "Creates a refund for a customer payment.", C.red), - riskRow("send_customer_email", "Sends an external customer email.", C.amber), - text("Both are normalized as openai_api tools in the report inventory.", { - name: "normalized-note", - width: wrap(720), - height: hug, - style: { fontSize: 24, color: C.muted }, - }), - ]), - codePanel("tool-schema-snippet", [ - "tools/openai-tools.json", - "\"name\": \"create_refund\"", - "\"description\": \"Create a refund for a customer payment.\"", - "\"strict\": false", - "\"additionalProperties\": true", - "\"required\": [\"payment_id\"]", - "\"amount\": { \"type\": \"number\" }", - ], { heading: true, headingColor: C.red, textStyle: S.codeSmall, fill: C.white }), - ], - ), - "Source: samples/simple_openai_api_agent/tools/openai-tools.json", - ); - - // 4. Scanned findings - bodySlide( - presentation, - "3. 
What did Agents Shipgate find?", - "It turns the prompt/tool surface into a release-review queue.", - "The fixture emits 15 high-severity and 5 medium-severity findings.", - grid( - { name: "findings-grid", width: fill, height: fill, columns: [fr(0.82), fr(1.18)], columnGap: 58 }, - [ - chart({ - name: "severity-chart", - chartType: "bar", - width: fill, - height: fill, - config: { - title: "Findings by severity", - categories: ["Critical", "High", "Medium", "Low"], - series: [{ name: "Findings", values: [0, 15, 5, 0] }], - }, - }), - column({ name: "finding-list", width: fill, height: fill, gap: 20, justify: "center" }, [ - text("Top scan results", { name: "finding-list-label", width: fill, height: hug, style: S.label }), - riskRow("Prompt/tool mismatch", "Prompt says advise-only while refund and email tools are enabled.", C.red), - riskRow("Schema strictness gaps", "create_refund lacks strict=true, complete required fields, and amount bounds.", C.amber), - riskRow("Retry without idempotency", "Refund and email side effects may be retried without idempotency evidence.", C.blue), - riskRow("Trace approval gap", "Trace sample shows create_refund with approved=false.", C.green), - ]), - ], - ), - "Source: samples/simple_openai_api_agent/expected/report.md", - ); - - // 5. Why the prompt/tool mismatch is serious - bodySlide( - presentation, - "4. Why is this serious?", - "The prompt is the user-facing promise. 
The tools are the release-time blast radius.", - "When those disagree, production behavior can exceed what reviewers approved.", - grid( - { name: "serious-grid", width: fill, height: fill, columns: [fr(1), fr(1), fr(1)], columnGap: 34 }, - [ - panel({ name: "serious-finance", width: fill, height: fill, fill: C.white, line: { width: 1, fill: C.line }, borderRadius: "rounded-md", padding: { x: 30, y: 28 } }, - column({ width: fill, height: fill, gap: 18 }, [ - shape({ width: fixed(30), height: fixed(30), fill: C.red, line: { width: 0, fill: C.red } }), - text("Financial side effect", { width: fill, height: hug, style: { fontSize: 34, bold: true, color: C.ink } }), - text("A refund tool can move money even though the prompt says no account action.", { width: fill, height: hug, style: { fontSize: 25, color: C.muted } }), - ])), - panel({ name: "serious-duplicate", width: fill, height: fill, fill: "#FDF9F0", line: { width: 1, fill: C.line }, borderRadius: "rounded-md", padding: { x: 30, y: 28 } }, - column({ width: fill, height: fill, gap: 18 }, [ - shape({ width: fixed(30), height: fixed(30), fill: C.amber, line: { width: 0, fill: C.amber } }), - text("Duplicate actions", { width: fill, height: hug, style: { fontSize: 34, bold: true, color: C.ink } }), - text("Retry policy plus missing idempotency can duplicate refunds or customer emails.", { width: fill, height: hug, style: { fontSize: 25, color: C.muted } }), - ])), - panel({ name: "serious-review", width: fill, height: fill, fill: "#F3FAF8", line: { width: 1, fill: C.line }, borderRadius: "rounded-md", padding: { x: 30, y: 28 } }, - column({ width: fill, height: fill, gap: 18 }, [ - shape({ width: fixed(30), height: fixed(30), fill: C.green, line: { width: 0, fill: C.green } }), - text("Review gap", { width: fill, height: hug, style: { fontSize: 34, bold: true, color: C.ink } }), - text("Approval policy and trace evidence do not prove the financial action is gated.", { width: fill, height: hug, style: { fontSize: 
25, color: C.muted } }), - ])), - ], - ), - "Source: expected/report.md top findings and trace sample", - ); - - // 6. What a reviewer asks for - bodySlide( - presentation, - "5. What does the fix look like?", - "Agents Shipgate gives concrete review asks instead of a generic score.", - "Future users can use the finding list as their first release checklist.", - askTable(), - "Source: expected/report.md recommendations", - ); - - // 7. Shipgate manifest - bodySlide( - presentation, - "6. How do you connect Agents Shipgate?", - "Declare the local direct API artifacts under openai_api.", - "No OpenAI API call is made during scanning; the scanner reads the files you already review.", - grid( - { name: "integration-grid", width: fill, height: fill, columns: [fr(1), fr(1)], columnGap: 54 }, - [ - codePanel("manifest-panel", [ - "shipgate.yaml", - "openai_api:", - " prompt_files:", - " - prompts/support_refund.md", - " tools:", - " - path: tools/openai-tools.json", - " response_formats:", - " - path: schemas/refund_decision.schema.json", - " policy_rules:", - " - path: policies/openai-api-policy.yaml", - ], { heading: true, headingColor: C.blue, textStyle: S.codeSmall, fill: C.white }), - column({ name: "artifact-explainers", width: fill, height: fill, gap: 24, justify: "center" }, [ - miniFile("Prompt files", "What the model is told to do", C.teal), - miniFile("Tool schemas", "What the model is allowed to call", C.red), - miniFile("Response formats", "What downstream logic depends on", C.blue), - miniFile("Policy rules", "Approval, confirmation, retry, timeout evidence", C.green), - ]), - ], - ), - "Source: docs/manifest-v0.1.md and samples/simple_openai_api_agent/shipgate.yaml", - ); - - // 8. Run and roll out - bodySlide( - presentation, - "7. Run locally, then put it in CI.", - "Start advisory. 
Promote to strict.", - "Local scan first; CI can fail only on new blockers after a baseline.", - grid( - { name: "rollout-grid", width: fill, height: fill, columns: [fr(1.05), fr(0.95)], columnGap: 54 }, - [ - codePanel("run-commands", [ - "Local commands", - "agents-shipgate init --workspace . --write", - "agents-shipgate scan -c shipgate.yaml", - "", - "Try the fixture:", - "agents-shipgate scan -c samples/simple_openai_api_agent/shipgate.yaml", - ], { heading: true, headingColor: C.teal, textStyle: S.codeSmall, fill: C.white }), - column({ name: "rollout-steps", width: fill, height: fill, gap: 24, justify: "center" }, [ - riskRow("1. Advisory scan", "Write report.md and report.json in PR review.", C.blue), - riskRow("2. Fix or document", "Add strict schemas, approval evidence, idempotency, owner, and scopes.", C.amber), - riskRow("3. Promote to strict", "Use a baseline so CI fails on new critical/high findings.", C.green), - text("Static release-readiness scanner for AI agent tool surfaces.", { - name: "tagline", - width: wrap(700), - height: hug, - style: { fontSize: 25, bold: true, color: C.ink }, - }), - ]), - ], - ), - "Source: README.md, docs/manifest-v0.1.md, and STABILITY.md", - ); - - const pptxBlob = await PresentationFile.exportPptx(presentation); - await pptxBlob.save(pptxPath); - await savePreviews(presentation); - - console.log(JSON.stringify({ pptx: pptxPath, slides: presentation.slides.count, previews: previewDir, layouts: layoutDir }, null, 2)); -} - -main().catch((error) => { - console.error(error && error.stack ? error.stack : String(error)); - process.exit(1); -}); diff --git a/docs/decks/vc-thesis/README.md b/docs/decks/vc-thesis/README.md deleted file mode 100644 index 7d9f771a..00000000 --- a/docs/decks/vc-thesis/README.md +++ /dev/null @@ -1,166 +0,0 @@ -# Three Moons Lab — VC thesis discussion deck - -A working thesis on release readiness for agentic systems. 
**Not a fundraising deck** — an -artifact for sparring sessions with founders, operators, and design partners. **15 slides.** - -## Files - -| File | What it is | -| ----------------------------------- | ------------------------------------------------------------------ | -| `build/deck-editable.pptx` | **Editable PowerPoint** — text natively reconstructed. Edit copy here. | -| `build/deck.pptx` | Image-based PowerPoint with speaker notes (pixel-perfect, copy not editable) | -| `build/deck.pdf` | PDF version. Share this if you want zero edit affordance. | -| `build/slide-01.png` … `15.png` | Rendered slides at 1920×1080 @2x, for screen / Loom / web | -| `build/contact-sheet.png` | One-image overview of all 15 rendered slides | -| `build/contact-sheet-editable.png` | Same overview for the editable .pptx (LibreOffice render) | -| `build/fragments/` | Cropped image fragments embedded in `deck-editable.pptx` | -| `build_deck.py` | Generator for rendered PNGs | -| `assemble.py` | Post-build: regenerates contact sheet + PDF | -| `build_pptx.py` | Wraps PNGs into `deck.pptx` (image-based version) | -| `build_pptx_native.py` | Builds `deck-editable.pptx` (native reconstruction) | -| `crop_fragments.py` | Crops Gödel/FEP diagrams + Slide 8 from rendered PNGs | -| `slide-08-options/` | Three drafts of Slide 8 — V2 won, V1/V3 kept for reference | - -## Two PowerPoint versions — pick the one that matches your edit need - -| Edit you want to make | `deck-editable.pptx` | `deck.pptx` | -| --------------------------------------------- | -------------------- | ------------------ | -| Tweak a headline / body copy / kicker | ✅ Native text | ❌ Edit Python | -| Reorder slides, hide, duplicate, add slides | ✅ | ✅ | -| Add / edit speaker notes | ✅ | ✅ (pre-filled) | -| Resize / move existing shapes | ✅ | ❌ | -| Restyle colors / fonts globally | ✅ | ❌ | -| Get pixel-perfect to the rendered design | mostly — slight font drift | ✅ exact | - -**Native reconstruction trade-offs.** Slides 1, 
2, 3, 6, 7, 9, 10, 11, 12, 13 are fully -native — every word is a real PowerPoint text run, every card is a real shape. Slides 4, 5, -and 8 embed cropped image fragments where rebuilding them with PowerPoint shapes would lose -intent: the Gödel self-reference loop (slide 4), the Free Energy boundary diagram (slide 5), -and the syntax-highlighted editor + report (slide 8). The headline and body text on those -slides are still native and editable — only the diagram region is an image. - -**Font note.** The deck specifies Aptos Display / Aptos / Consolas. PowerPoint Mac/365 has -these by default; older Office may substitute (the substitution is usually fine). If the -fonts are missing, PowerPoint will pick a clean sans fallback automatically. - -## Reading guide — how each slide is meant to land - -Five-act narrative arc. Acts 2 (Gödel + FEP) use a dark theme as the intellectual core; the rest is brand cream. - -### Act 1 — The shift (slides 1–3) - -**01 Cover.** -Establishes posture in the subtitle: *"A working thesis — not a pitch."* The viewer should -read this and adjust expectations down from "they're going to ask for money" to "they want -to think alongside me." - -**02 The inflection.** -The single highest-leverage slide for category creation. Side-by-side: yesterday's LLM call -vs today's agent loop. The bottom callout — *"every tool change becomes a release event"* — -is the entire wedge in one line. - -**03 The new release problem.** -Three negative-space cards (`it is not — software testing / LLM eval / runtime SRE`) -followed by a dark-callout positive definition. Defends the category claim by what it isn't. - -### Act 2 — The intellectual core (slides 4–5, dark) - -**04 Gödel.** -*Why* this category exists. A sufficiently capable agent cannot self-certify. External -assurance is structural, not optional. Closes off two would-be competitors: foundation -model labs ("we'll add this") and agent self-certification ("the agent can check itself"). 
- -**05 Free Energy Principle.** -*Where* the wedge intervenes. Tool calls are the only channel through which internal -uncertainty becomes external side effect. Readiness = bounding free energy at the action -boundary. Pivots naturally into Slide 7's tool-use argument. - -> If the audience reads slides 4–5 and doesn't lean in, they're not in this thesis. That's a -> useful filter, not a bug. - -### Act 3 — Our thesis (slides 6–7) - -**06 The thesis.** -Stack diagram positions Three Moons Lab between agent frameworks (above) and tool surfaces -(below), distinct from adjacent infra (eval, observability, MCP gateways). Right side: the -thesis stated plainly. - -**07 Why tool-use is the right wedge.** -Three cards — action boundary, most formalizable, highest-leverage risk surface — backed by -the academic landscape (AppWorld, ToolEmu, AgentDojo, τ-bench). Dark callout closes with -the "wedge logic" criteria: any one of those misses, and tool-use isn't the right cut. - -### Act 4 — The product path (slides 8–12) - -**08 + 09 — DECLARED → DETECTED diptych.** -The most concrete pair. Slide 8 shows the team's actual source files: `refund_agent.py` -(the OpenAI Agents SDK wiring) on the left, `shipgate.yaml` (the release contract) on the -right. Slide 9 shows what shipgate found on those files: BLOCKED, 2 critical, 14 high, real -check IDs. The narrative spine sits between the two slides: the manifest's -`prohibited_actions` literally says *"issue refund without approval"* (slide 8), but -`stripe.create_refund` ships with no approval policy declared (slide 9). The audience -connects the two slides themselves — that's the whole point. - -**10 How the release contract gets written.** -Anticipates "wait, where does shipgate.yaml come from?" — a question that always comes up -after slides 8–9. Four steps: SCAFFOLD (auto via `agents-shipgate init`) → AUTHOR (human -fills declared_purpose, prohibited_actions, policies, scopes) → SCAN → ITERATE in PR / CI. 
-Bottom row makes the auto-vs-human split explicit: scanner can detect tools, only humans -can write intent. Optional Terraform analogy if they push: "prose policy → policy as code." - -**11 Phase 2 & 3 — Sandbox → Trace.** -Phase 2 turns Phase 1 unknowns into experimental evidence (sandbox, failure injection, -prompt-injection harness). Phase 3 turns one-time reports into continuous state (trace -ingestion, replay, regression detection). The footer line is deliberate humility: -"Phase 1 ships now. Phase 2 and 3 are deliberate land-and-expand." - -**12 The compounding asset.** -The single most important slide for category vs feature framing. Three phases are the same -evidence corpus unfolding across timescales — failure taxonomy, policy library, trace -schema all compound with every user. Anti-feature defense: a GitHub Action lint can't -compound; this can. - -### Act 5 — Where I am, what I want (slides 13–15) - -**13 What I'm validating now.** -Three open hypotheses, each paired with the proof being sought. Closes with a deliberate -disclaimer of what's *not* claimed (PMF, runtime safety certification, foundation-model-lab -proof). This slide signals intellectual honesty more than any other. - -**14 10-year north star — medical record for every agent.** -Healthcare-for-agents metaphor placed late so it doesn't dominate framing. Today = -pre-release exam → Year 2 = stress tests → Year 3–4 = vital signs → 10-year = medical -record across the lifetime of every agent. The bottom callout grounds it: "Today we build -the first instrument." - -**15 What I'm looking for.** -Not "what we want from you," but "what would help right now, in order." Sparring partners -(thinking partners), design partners (the most valuable ask), capital optionality (later). -The closing italic line — *"infrastructure to make their entry into the world -accountable"* — is the only place the deck declares its philosophy directly. 
- -## Notes for delivery - -- **For Prateek-style first conversation**: walk slides 1, 2, 3, 6, 7, 8, 9, 10, 13, 15 (~10 slides). - Skip 4–5 unless they lean philosophical. Skip 11–12 unless they push on category-vs-feature. - Skip 14 unless they ask about long-term vision. - Slide 10 (workflow) is the answer to "where does shipgate.yaml come from?" — usually the next question after Slide 9. - -- **For a written share** (email, doc): send the full PDF. Slide 8 carries enough specificity - to ground the abstract argument; slides 4–5 give the deck intellectual identity. - -- **For Loom / async**: walk every slide. Watch for which slides someone replays or pauses - on — those are signal. - -## Re-run - -```bash -cd docs/decks/vc-thesis -python3 build_deck.py # renders 12 slides + copies slide-08 from v2 -python3 assemble.py # contact sheet + PDF -python3 crop_fragments.py # cuts diagram fragments for the editable pptx -python3 build_pptx.py # image-based deck.pptx with speaker notes -python3 build_pptx_native.py # editable deck-editable.pptx (recommended) -``` - -Requires Python 3, `playwright` + `chromium`, `markdown`, `Pillow`, `Pygments`, `python-pptx`. 
diff --git a/docs/decks/vc-thesis/assemble.py b/docs/decks/vc-thesis/assemble.py deleted file mode 100644 index 1f37301b..00000000 --- a/docs/decks/vc-thesis/assemble.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Post-build: contact sheet (3x5 grid PNG) + PDF deck.""" -from pathlib import Path - -from PIL import Image - -BUILD_DIR = Path(__file__).resolve().parent / "build" - -SLIDES = sorted(BUILD_DIR.glob("slide-*.png")) -assert len(SLIDES) == 15, f"Expected 15 slides, got {len(SLIDES)}" - -# Contact sheet — 5 columns × 3 rows, exactly fills 15 slides -COLS, ROWS = 5, 3 -THUMB_W = 720 -GAP = 20 -MARGIN = 40 - -# Compute thumb height from one slide -sample = Image.open(SLIDES[0]) -ratio = sample.height / sample.width -THUMB_H = int(THUMB_W * ratio) - -sheet_w = COLS * THUMB_W + (COLS - 1) * GAP + 2 * MARGIN -sheet_h = ROWS * THUMB_H + (ROWS - 1) * GAP + 2 * MARGIN + 100 # extra for header - -sheet = Image.new("RGB", (sheet_w, sheet_h), (245, 240, 229)) - -# Header band -from PIL import ImageDraw, ImageFont - -draw = ImageDraw.Draw(sheet) -try: - font_title = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 36) - font_sub = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 18) -except Exception: - font_title = ImageFont.load_default() - font_sub = ImageFont.load_default() - -draw.text((MARGIN, 30), "Three Moons Lab — Release Readiness for Agentic Systems", fill=(26, 37, 48), font=font_title) -draw.text((MARGIN, 78), f"Working thesis · {len(SLIDES)} slides · contact sheet", fill=(107, 95, 77), font=font_sub) - -for i, slide in enumerate(SLIDES): - row, col = divmod(i, COLS) - x = MARGIN + col * (THUMB_W + GAP) - y = MARGIN + 100 + row * (THUMB_H + GAP) - img = Image.open(slide).convert("RGB").resize((THUMB_W, THUMB_H), Image.LANCZOS) - sheet.paste(img, (x, y)) - # Slide number badge - draw.rectangle((x + 12, y + 12, x + 56, y + 36), fill=(26, 37, 48)) - draw.text((x + 22, y + 14), f"{i+1:02d}", fill=(245, 240, 229), 
font=font_sub) - -contact_path = BUILD_DIR / "contact-sheet.png" -sheet.save(contact_path, optimize=True) -print(f"Contact sheet: {contact_path} ({contact_path.stat().st_size // 1024} KB)") - -# PDF deck — one page per slide at native 1920x1080 -slide_imgs = [Image.open(s).convert("RGB") for s in SLIDES] -pdf_path = BUILD_DIR / "deck.pdf" -slide_imgs[0].save( - pdf_path, - "PDF", - resolution=144.0, - save_all=True, - append_images=slide_imgs[1:], -) -print(f"PDF deck: {pdf_path} ({pdf_path.stat().st_size // 1024} KB)") diff --git a/docs/decks/vc-thesis/build/_logo-mark-dark.png b/docs/decks/vc-thesis/build/_logo-mark-dark.png deleted file mode 100644 index 322b31fe..00000000 Binary files a/docs/decks/vc-thesis/build/_logo-mark-dark.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/_logo-mark-light.png b/docs/decks/vc-thesis/build/_logo-mark-light.png deleted file mode 100644 index 28ba7ce3..00000000 Binary files a/docs/decks/vc-thesis/build/_logo-mark-light.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/contact-sheet-editable.png b/docs/decks/vc-thesis/build/contact-sheet-editable.png deleted file mode 100644 index b563f60d..00000000 Binary files a/docs/decks/vc-thesis/build/contact-sheet-editable.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/contact-sheet.png b/docs/decks/vc-thesis/build/contact-sheet.png deleted file mode 100644 index cd7078cd..00000000 Binary files a/docs/decks/vc-thesis/build/contact-sheet.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/deck-editable.pdf b/docs/decks/vc-thesis/build/deck-editable.pdf deleted file mode 100644 index fa0914f4..00000000 Binary files a/docs/decks/vc-thesis/build/deck-editable.pdf and /dev/null differ diff --git a/docs/decks/vc-thesis/build/deck-editable.pptx b/docs/decks/vc-thesis/build/deck-editable.pptx deleted file mode 100644 index 2ce24816..00000000 Binary files a/docs/decks/vc-thesis/build/deck-editable.pptx and /dev/null differ diff --git 
a/docs/decks/vc-thesis/build/deck.pdf b/docs/decks/vc-thesis/build/deck.pdf deleted file mode 100644 index 4fb7efe1..00000000 Binary files a/docs/decks/vc-thesis/build/deck.pdf and /dev/null differ diff --git a/docs/decks/vc-thesis/build/deck.pptx b/docs/decks/vc-thesis/build/deck.pptx deleted file mode 100644 index 238aec4c..00000000 Binary files a/docs/decks/vc-thesis/build/deck.pptx and /dev/null differ diff --git a/docs/decks/vc-thesis/build/fragments/fep-boundary.png b/docs/decks/vc-thesis/build/fragments/fep-boundary.png deleted file mode 100644 index da3d9b74..00000000 Binary files a/docs/decks/vc-thesis/build/fragments/fep-boundary.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/fragments/godel-loop.png b/docs/decks/vc-thesis/build/fragments/godel-loop.png deleted file mode 100644 index d0de533d..00000000 Binary files a/docs/decks/vc-thesis/build/fragments/godel-loop.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/fragments/slide-08-full.png b/docs/decks/vc-thesis/build/fragments/slide-08-full.png deleted file mode 100644 index 27294469..00000000 Binary files a/docs/decks/vc-thesis/build/fragments/slide-08-full.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-01.png b/docs/decks/vc-thesis/build/slide-01.png deleted file mode 100644 index 58684982..00000000 Binary files a/docs/decks/vc-thesis/build/slide-01.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-02.png b/docs/decks/vc-thesis/build/slide-02.png deleted file mode 100644 index f22bf94d..00000000 Binary files a/docs/decks/vc-thesis/build/slide-02.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-03.png b/docs/decks/vc-thesis/build/slide-03.png deleted file mode 100644 index 4ad05318..00000000 Binary files a/docs/decks/vc-thesis/build/slide-03.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-04.png b/docs/decks/vc-thesis/build/slide-04.png deleted file mode 100644 index 
1331ea6c..00000000 Binary files a/docs/decks/vc-thesis/build/slide-04.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-05.png b/docs/decks/vc-thesis/build/slide-05.png deleted file mode 100644 index a257856d..00000000 Binary files a/docs/decks/vc-thesis/build/slide-05.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-06.png b/docs/decks/vc-thesis/build/slide-06.png deleted file mode 100644 index 5b0f9178..00000000 Binary files a/docs/decks/vc-thesis/build/slide-06.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-07.png b/docs/decks/vc-thesis/build/slide-07.png deleted file mode 100644 index 418a12d9..00000000 Binary files a/docs/decks/vc-thesis/build/slide-07.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-08.png b/docs/decks/vc-thesis/build/slide-08.png deleted file mode 100644 index 55326199..00000000 Binary files a/docs/decks/vc-thesis/build/slide-08.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-09.png b/docs/decks/vc-thesis/build/slide-09.png deleted file mode 100644 index 0fbecde2..00000000 Binary files a/docs/decks/vc-thesis/build/slide-09.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-10.png b/docs/decks/vc-thesis/build/slide-10.png deleted file mode 100644 index a2007bd7..00000000 Binary files a/docs/decks/vc-thesis/build/slide-10.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-11.png b/docs/decks/vc-thesis/build/slide-11.png deleted file mode 100644 index f4abeb69..00000000 Binary files a/docs/decks/vc-thesis/build/slide-11.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-12.png b/docs/decks/vc-thesis/build/slide-12.png deleted file mode 100644 index a02da574..00000000 Binary files a/docs/decks/vc-thesis/build/slide-12.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-13.png b/docs/decks/vc-thesis/build/slide-13.png deleted file mode 100644 index 
547c9760..00000000 Binary files a/docs/decks/vc-thesis/build/slide-13.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-14.png b/docs/decks/vc-thesis/build/slide-14.png deleted file mode 100644 index f3100f15..00000000 Binary files a/docs/decks/vc-thesis/build/slide-14.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build/slide-15.png b/docs/decks/vc-thesis/build/slide-15.png deleted file mode 100644 index d57fa607..00000000 Binary files a/docs/decks/vc-thesis/build/slide-15.png and /dev/null differ diff --git a/docs/decks/vc-thesis/build_deck.py b/docs/decks/vc-thesis/build_deck.py deleted file mode 100644 index 824e3fb6..00000000 --- a/docs/decks/vc-thesis/build_deck.py +++ /dev/null @@ -1,1429 +0,0 @@ -"""Three Moons Lab — VC thesis discussion deck. - -15 slides, 1920x1080 @ 2x, brand palette (cream + navy), with a darker variant -for the philosophical core (Gödel + FEP). Slides 8 + 9 form a DECLARED/DETECTED -diptych showing the source files (refund_agent.py + shipgate.yaml) followed by -the shipgate report on those files. 
- -Run: - python3 build_deck.py -Output: - build/slide-01.png … slide-15.png -""" -from __future__ import annotations - -import asyncio -from pathlib import Path - -import markdown -from playwright.async_api import async_playwright -from pygments import highlight -from pygments.formatters.html import HtmlFormatter -from pygments.lexers.data import YamlLexer -from pygments.lexers.python import PythonLexer - -DECK_DIR = Path(__file__).resolve().parent -REPO = DECK_DIR.parents[2] -BUILD_DIR = DECK_DIR / "build" -BUILD_DIR.mkdir(parents=True, exist_ok=True) - -W, H = 1920, 1080 -TOTAL_SLIDES = 15 - -# --------------------------------------------------------------------------- -# Shared design system -# --------------------------------------------------------------------------- - -BRAND_CSS = r""" -:root { - --cream: #F5F0E5; - --cream-2: #ECE5D5; - --cream-3: #E0D7C0; - --paper: #FBF7EE; - --navy: #1A2530; - --navy-2: #2A3540; - --navy-deep: #0E1820; - --navy-deeper: #060D14; - --muted: #6B5F4D; - --muted-2: #8B7E68; - --muted-dark: #B5A988; /* on dark bg */ - --muted-dark-2: #8C8167; - --rule: #D4CCB8; - --rule-soft: #E5DCC6; - --rule-dark: #2A3540; - --critical: #B8392F; - --critical-bg: #F4D9D6; - --high: #C76A2C; - --high-bg: #F4E2D0; - --medium: #B89530; - --accent: #6B7B4F; /* warm green */ - --accent-dark: #B5C99B; /* cream-side */ - --gold: #D4A847; /* highlight on dark */ -} -* { box-sizing: border-box; } -html, body { - margin: 0; padding: 0; - font-family: -apple-system, "SF Pro Display", "Helvetica Neue", Helvetica, Arial, sans-serif; - -webkit-font-smoothing: antialiased; - font-feature-settings: "liga", "kern"; -} -body { - width: 1920px; height: 1080px; - background: var(--cream); - color: var(--navy); - display: flex; flex-direction: column; - padding: 72px 96px; - position: relative; -} -body.dark { - background: var(--navy-deep); - color: var(--cream); -} -body.dark .muted { color: var(--muted-dark); } -body.dark .rule { background: var(--rule-dark); 
} - -.kicker { - font-size: 16px; - letter-spacing: 0.18em; - text-transform: uppercase; - color: var(--muted-2); - font-weight: 700; - margin-bottom: 18px; -} -body.dark .kicker { color: var(--muted-dark-2); } - -h1.head { - font-size: 76px; - line-height: 1.04; - font-weight: 700; - letter-spacing: -0.025em; - margin: 0; -} -h1.head.med { font-size: 64px; } -h1.head.sm { font-size: 56px; } -h1.head .accent { color: var(--critical); } -h1.head .gold { color: var(--gold); } -h1.head .dim { color: var(--muted-2); } -body.dark h1.head .dim { color: var(--muted-dark-2); } -body.dark h1.head { color: var(--cream); } - -.lede { - font-size: 26px; - line-height: 1.45; - color: var(--navy-2); - max-width: 1400px; - margin-top: 22px; -} -body.dark .lede { color: var(--muted-dark); } - -.footer-row { - position: absolute; - left: 96px; right: 96px; bottom: 48px; - display: flex; justify-content: space-between; align-items: center; - font-size: 14px; - color: var(--muted); -} -body.dark .footer-row { color: var(--muted-dark-2); } -.brand { - display: flex; align-items: center; gap: 12px; - font-weight: 700; letter-spacing: 0.04em; - color: var(--navy); -} -.brand .dot { width: 10px; height: 10px; border-radius: 50%; background: var(--navy); } -body.dark .brand { color: var(--cream); } -body.dark .brand .dot { background: var(--cream); } - -.slide-no { - font-family: "SF Mono", "JetBrains Mono", Menlo, monospace; - font-size: 13px; - letter-spacing: 0.08em; - color: var(--muted); -} -body.dark .slide-no { color: var(--muted-dark-2); } - -.body-content { - flex: 1; - min-height: 0; - display: flex; - flex-direction: column; - justify-content: center; -} - -/* utility */ -.code { - font-family: "SF Mono", "JetBrains Mono", Menlo, monospace; -} -.tag { - display: inline-block; - padding: 4px 10px; - font-size: 11px; - font-weight: 700; - letter-spacing: 0.12em; - text-transform: uppercase; - border-radius: 3px; - background: var(--cream-2); - color: var(--muted); -} -.tag.crit { 
background: var(--critical-bg); color: var(--critical); } -.tag.high { background: var(--high-bg); color: var(--high); } -.tag.gold { background: rgba(212,168,71,0.18); color: var(--gold); } - -.divider { height: 1px; background: var(--rule); width: 100%; } -body.dark .divider { background: var(--rule-dark); } - -.callout { - border-left: 4px solid var(--navy); - background: var(--cream-2); - padding: 24px 32px; - border-radius: 4px; - font-size: 24px; - line-height: 1.4; - color: var(--navy); -} -body.dark .callout { border-color: var(--gold); background: rgba(255,255,255,0.04); color: var(--cream); } - -.hairline { border-top: 1px solid var(--rule); padding-top: 18px; } -""" - - -def page(body_html: str, theme: str = "light", slide_no: int = 0, total: int = TOTAL_SLIDES) -> str: - body_class = "dark" if theme == "dark" else "" - return f""" - - -{body_html} - -""" - - -# --------------------------------------------------------------------------- -# Slide content -# --------------------------------------------------------------------------- - -def slide_01_cover() -> str: - body = f""" -
- - -
- Three Moons Lab -
-

- Release readiness
for agentic systems. -

-
- A working thesis — not a pitch. -
-
-
Wendy · pengfei@threemoonslab.com
-
April 2026
-
v0.1 — for discussion
-
-
-""" - return page(body, "light", 1) - - -def slide_02_inflection() -> str: - body = r""" -
Act 1 · The shift
-

Models that answer.
Agents that act.

- -
-
- - -
-
- Yesterday — LLM call -
-
- Input → Output -
- - - prompt - - - model - - text - - - -
    -
  • stateless
  • -
  • no real-world side effects
  • -
  • release risk: "is the answer wrong?"
  • -
-
- - -
-
- Today — agent -
-
- Observe → Plan → Tool → Side effect → Memory -
- - - - observe - plan - memory - - tool call - - side effect - - world - - - -
    -
  • stateful, looping
  • -
  • real consequences — refunds, emails, PRs, deploys
  • -
  • release risk: "did the agent do the wrong thing?"
  • -
-
-
- -
- Once an agent can call tools, every tool change becomes a release event. - The release process built for code does not map onto agents. -
-
-""" - return page(body, "light", 2) - - -def slide_03_new_release_problem() -> str: - body = r""" -
Act 1 · The shift
-

Agent Release Readiness
is a new release decision.

- -
-
- Bounded assurance that a stochastic, open, tool-using system can enter a higher-permission - environment — under a declared task scope, tool surface, permission boundary, and risk tier. -
- -
-
-
It is not — software testing
-
- Software testing assumes a deterministic code path. Agents make their action graph - at runtime from goals, context, tools, and feedback. -
-
-
-
It is not — LLM eval
-
- Eval scores measure input → output behavior on sampled tasks. They cannot answer - whether this tool surface, in this environment, is safe to ship. -
-
-
-
It is not — runtime SRE
-
- SLOs, canaries, and observability fire during or after execution. - Release readiness is the decision before we promote. -
-
-
- -
-
It is
-
- An evidence-based release decision over a stochastic, tool-using, - state-mutating system — graded against a declared operational envelope. -
-
-
-""" - return page(body, "light", 3) - - -def slide_04_godel() -> str: - body = r""" -
Act 2 · First principle №1
-

A sufficiently capable agent
cannot self-certify its own readiness.

- -
-
- - -
-

- Any system rich enough to express its own behavior contains statements - about itself it cannot prove from within. -

-

- For agents, those statements are about - side effects, - long-horizon consequence, - and prompt-injection susceptibility. -

-

- External assurance is not optional. It is structural. -

-
- - -
- - - - AGENT SYSTEM - - - - "am I ready?" - - - - - - - - EXTERNAL - ASSURANCE - - - - - - - - - - -
-
-
-""" - return page(body, "dark", 4) - - -def slide_05_fep() -> str: - body = r""" -
Act 2 · First principle №2
-

Tool calls are where uncertainty
escapes into the world.

- -
-
- - -
-

- Agents act to minimize prediction error under their generative model of reality — - the Free Energy Principle framing. -

-

- The tool boundary is the only channel through which - an agent's internal uncertainty becomes external side effect. -

-

- Release readiness = bounding free energy at the action boundary - before it propagates. -

-
- - -
- - - - INTERNAL - belief - memory - plan - - - - TOOL BOUNDARY - tool call - scope - schema - approval - side effect class - - - - WORLD - DB · email - refund · code - customers - - - - - - - - - - - free energy escapes here - - - - - - - - -
-
-
-""" - return page(body, "dark", 5) - - -def slide_06_thesis() -> str: - body = r""" -
Act 3 · Our thesis
-

The evidence layer between
agent dev and production action.

- -
- -
- - -
-
-
-
Above
-
Agent frameworks · OpenAI Agents SDK · Anthropic · Google ADK · LangChain · CrewAI
-
- -
-
Three Moons Lab — what's missing
-
- CI/CD + audit layer for agentic systems -
-
- pre-release evidence · trace-based replay · runtime continuous readiness -
-
- -
-
Below
-
Tool surfaces · MCP · OpenAPI · function tools · shell · computer use
-
-
- -
- Adjacent (not us): eval frameworks · runtime guardrails · LLM observability · MCP gateways. -
-
- - -
-
Thesis
-
- Every production agent will need a release-readiness record before - it gets promoted — and a trace-replayable evidence trail after. -
-
- That record won't live inside the model. It won't live inside the framework. - It has to live in independent infrastructure. -
-
-
-
-""" - return page(body, "light", 6) - - -def slide_07_wedge() -> str: - body = r""" -
Act 3 · Our wedge
-

Tool-use is the right wedge.

- -
-
- - -
-
-
- Action boundary -
-
- Tool call = the moment language becomes consequence. It's where every interesting - risk crystallizes: side effect, scope, approval, idempotency, recoverability. -
-
- The model becoming smarter doesn't change this boundary. It only makes it more active. -
-
- - -
-
-
- Most formalizable -
-
- Tool surfaces ship with structure: schemas, scopes, MCP annotations, OpenAPI specs, - SDK function signatures. Static analysis bites — unlike "is the agent's reasoning correct?" -
-
- Formalize what's crisp · annotate what's contextual · review what's ambiguous. -
-
- - -
-
-
- Highest-leverage risk -
-
- AppWorld, ToolEmu, AgentDojo, τ-bench, AgentHarm — the academic evidence converges: - tool-use is where current agents fail, where attacks land, where damage compounds. -
-
- High-stakes tools (refund, email, deploy, delete) need readiness, not a benchmark score. -
-
-
- -
- Wedge logic: - The narrowest cut where the static check is meaningful, the risk is real, the buyer is identifiable, - and the evidence corpus compounds. Tool-use clears all four. -
-
-""" - return page(body, "light", 7) - - -# --------------------------------------------------------------------------- -# Slides 8 + 9 — Phase 1 diptych: DECLARED (source files) → DETECTED (report) -# --------------------------------------------------------------------------- - -# Real source files used in both slides -SAMPLE_PY = (REPO / "samples/support_refund_agent/agents/refund_agent.py").read_text() - -# Hand-trimmed YAML excerpt — keep narrative-load-bearing sections only -SAMPLE_YAML = """version: "0.1" - -project: - name: support-refund-agent - owner: support-platform - -agent: - name: refund-assistant - sdk: { type: openai-agents, entrypoint: agents/refund_agent.py } - declared_purpose: - - answer refund policy questions - - prepare refund requests for human review - - update support ticket notes - prohibited_actions: - - issue refund without approval - - cancel order without explicit confirmation - - send external email without preview - -environment: - target: production_like - -tool_sources: - - { id: support_openapi, type: openapi, path: specs/support-tools.openapi.yaml } - - { id: support_mcp_tools, type: mcp, path: .agents-shipgate/mcp-tools.json } - - { id: wildcard_mcp_tools, type: mcp, path: .agents-shipgate/wildcard-tools.json } - - { id: openai_sdk_static, type: openai_agents_sdk, path: agents/refund_agent.py } - -permissions: - scopes: - - zendesk:tickets:read - - zendesk:tickets:write - - stripe:* - credential_mode: service_account - -# … policies, risk_overrides, checks, ci, output omitted""" - - -PYGMENTS_THEME_CSS = """ -.codeblock { font-family: "SF Mono", "JetBrains Mono", "Menlo", monospace; font-size: 12.5px; line-height: 1.55; } -.codeblock .hll { background-color: #F0E9D6 } -.codeblock .c, .codeblock .ch, .codeblock .cm, .codeblock .cpf, .codeblock .c1, .codeblock .cs { color: #8B7E68; font-style: italic } -.codeblock .k, .codeblock .kc, .codeblock .kd, .codeblock .kn, .codeblock .kp, .codeblock .kr, .codeblock .kt { color: #1A2530; 
font-weight: 700 } -.codeblock .nt { color: #2A3540; font-weight: 600 } -.codeblock .nb, .codeblock .nc, .codeblock .nf, .codeblock .nn { color: #1A2530; font-weight: 600 } -.codeblock .n, .codeblock .nv { color: #2A3540 } -.codeblock .l, .codeblock .ld, .codeblock .s, .codeblock .s1, .codeblock .s2, .codeblock .se, .codeblock .sx, .codeblock .sb, .codeblock .sc, .codeblock .sd, .codeblock .sh, .codeblock .si, .codeblock .sr, .codeblock .ss { color: #6B7B4F } -.codeblock .m, .codeblock .mf, .codeblock .mh, .codeblock .mi, .codeblock .mo, .codeblock .il { color: #C76A2C } -.codeblock .o, .codeblock .ow { color: #6B5F4D } -.codeblock .p { color: #6B5F4D } -.codeblock .err { color: #B8392F } -.codeblock .gh { color: #1A2530; font-weight: 700 } -.codeblock .linenos { color: #B8AB91; padding-right: 12px; user-select: none; border-right: 1px solid #E5DCC6; margin-right: 12px; } -.codeblock pre { margin: 0; padding: 0; background: transparent; } -""" - - -def _render_code(src: str, lexer) -> str: - formatter = HtmlFormatter(cssclass="codeblock", linenos="inline", - nobackground=True, wrapcode=True) - return highlight(src, lexer, formatter) - - -def slide_08_declared() -> str: - py_html = _render_code(SAMPLE_PY, PythonLexer()) - yaml_html = _render_code(SAMPLE_YAML, YamlLexer()) - body = f""" - -
Phase 1 · Static Release-Readiness Scanner
-

What the team declared.

-

- An OpenAI Agents SDK refund agent and its release contract. - ~50 lines of human-authored intent, across two files, that determines whether - a refund of $5,000 can fire without human approval. -

- -
-
- - -
-
-
-
pyrefund_agent.py
-
samples/support_refund_agent/agents/
-
-
- - {py_html} -
-
- - -
-
-
-
shipgate.yaml
-
[ ]support-tools.openapi.yaml
-
{{ }}mcp-tools.json
-
samples/support_refund_agent/
-
-
- - {yaml_html} -
-
-
-
-""" - return page(body, "light", 8) - - -def slide_09_detected() -> str: - body = r""" - - -
Phase 1 · Static Release-Readiness Scanner
-

What shipgate detected.

-

- Static analysis on the declared tool surface from the previous slide. - The manifest's prohibited_actions list says - "issue refund without approval" — but - stripe.create_refund - has no approval policy declared. -

- -
-
-
-
-
Detected · agents-shipgate scan
-

support-refund-agent · refund-assistant

-
-
- target: production_like
- evidence coverage: mixed
- human review: recommended -
-
- -
-
-
Release blockers detected - 2 critical findings on a financial-action tool · release should not promote -
-
- -
-
2
Critical
-
14
High
-
2
Medium
-
0
Low
-
8
Tools scanned
-
- -
-

Top findings

-
showing 4 of 18 · sorted by severity
-
- -
-
Critical
-
-
SHIP-POLICY-APPROVAL-MISSINGstripe.create_refund
-
Tool can issue refunds with no declared approval policy — directly contradicts the manifest's prohibited-actions list.
-
-
-
-
Critical
-
-
SHIP-SIDEFX-IDEMPOTENCY-MISSINGstripe.create_refund
-
No idempotency key, annotation, or declared idempotency policy — retries can double-refund.
-
-
-
-
High
-
-
SHIP-AUTH-MANIFEST-BROAD-SCOPE
-
Manifest declares wildcard permission scope stripe:* — broader than any required tool scope.
-
-
-
-
High
-
-
SHIP-INVENTORY-WILDCARD-TOOLSwildcard_mcp_tools.*
-
MCP source declares wildcard tool exposure — full tool surface is unknown at release time.
-
-
- -
-
- 8 tools - 3 high-risk - 1 wildcard - mcp×3 - openapi×4 - sdk×1 -
-
report.md · report.json · report.sarif
-
-
-
-""" - return page(body, "light", 9) - - -def slide_10_workflow() -> str: - body = r""" -
Act 4 · The product path
-

How the release contract gets written.

-

- shipgate.yaml - is a living contract — half scaffolded by the scanner, half human-authored, - versioned with the agent code, reviewed in PR, enforced in CI. -

- -
- - -
-""" - cards = [ - ("01", "SCAFFOLD", "AUTO", - "$ agents-shipgate init", - "Scanner walks the workspace, detects OpenAPI / MCP / SDK files, prefills " - "tool_sources and references their schemas.", - "Output: starter shipgate.yaml with CHANGE_ME placeholders for intent."), - ("02", "AUTHOR", "HUMAN", - "~ shipgate.yaml", - "Fill in declared_purpose, prohibited_actions, approval policies, " - "permission scopes, risk owners. Scanner can't infer intent.", - "Output: completed release contract — your team's policy as code."), - ("03", "SCAN", "AUTO", - "$ agents-shipgate scan", - "Static analysis matches declared intent against actual configured enforcement. " - "Surfaces gaps as findings — like the ones on the previous slide.", - "Output: BLOCKED / WARN / PASS report (markdown · JSON · SARIF)."), - ("04", "ITERATE", "HUMAN + CI", - "~ PR review · CI gate", - "Fix policies, narrow scopes, accept residual risk with reason. " - "Each commit re-scans. Manifest evolves with the agent.", - "Output: living contract, version-controlled. New findings block release."), - ] - for num, label, who, cmd, desc, out in cards: - who_color = "#6B7B4F" if who == "AUTO" else ("#C76A2C" if who == "HUMAN" else "#1A2530") - who_bg = "#E5EBD8" if who == "AUTO" else ("#F4E2D0" if who == "HUMAN" else "#ECE5D5") - body += f""" -
-
-
{num}
-
{who}
-
-
{label}
-
{cmd}
-
{desc}
-
{out}
-
""" - body += r""" -
- - -
-
-
↻ findings → fix → re-scan → commit · the loop runs every PR
-
-
- - -
-
-
SCANNER WRITES
-
- tool_sources - · schemas, scopes, MCP annotations — referenced, not copied -
-
-
-
HUMAN WRITES
-
- declared_purpose · - prohibited_actions - · policies · risk owners — intent only humans can author -
-
-
-
HYBRID
-
- permissions.scopes - — scanner suggests least-privilege from tool specs; human ratifies -
-
-
-
-""" - return page(body, "light", 10) - - -def slide_11_phase23() -> str: - body = r""" -
Act 4 · The product path
-

Beyond static — sandbox & trace.

- -
- -
- - -
-
-
Phase 2
-
~6–12 months out
-
-
- Sandbox & simulation -
-
- Turn the unknowns surfaced in Phase 1 into experimental evidence — without exposing - production state. -
-
    -
  • Mocked tool execution & failure injection
  • -
  • Prompt-injection harness on read-tools (web, email, docs)
  • -
  • State-diff assertions for collateral damage
  • -
  • Synthetic adversarial scenarios (ToolEmu / AppWorld lineage)
  • -
-
- Output: pre-promotion stress test report. CI-attachable. Fails-loud on regressions. -
-
- - -
-
-
Phase 3
-
~12–24 months out
-
-
- Trace, replay, runtime -
-
- Turn one-time pre-release reports into a continuous readiness state — pulled from - the agent's actual production behavior. -
-
    -
  • Trace ingestion: OpenAI Agents SDK, MCP events, custom hooks
  • -
  • Replay bundles for incident forensics
  • -
  • Regression detection across prompt / model / tool changes
  • -
  • Runtime anomaly & blast-radius monitors
  • -
-
- Output: living readiness state. Audit-grade. Connected to incident review. -
-
-
- -
- Phase 1 ships now. Phase 2 and 3 are deliberate land-and-expand, not a roadmap to be promised on Slide 11. -
-
-""" - return page(body, "light", 11) - - -def slide_12_compounding() -> str: - body = r""" -
Act 4 · Why this is a category, not a feature
-

Three phases. One compounding
evidence corpus.

- -
- - -
- -
- - - - - - - - - - - - - - - - - - - - PHASE 1 · STATIC - Tool surface metadata - manifests · schemas · scopes · effect classes · approval flags - - - - PHASE 2 · SANDBOX - Failure-mode taxonomy - attack patterns · injection results · state-diff baselines · scenario library - - - - PHASE 3 · TRACE - Production trace data - tool-call events · approval logs · replay bundles · regression deltas - incident forensics · cross-org failure patterns - - - - corpus compounds - - - - - - - -
- -
-
- Three phases are not three products. They are the same evidence corpus - unfolding across three timescales. -
-
- Each user adds metadata, failure cases, and traces. The - failure taxonomy, policy library, and - trace schema compound. -
-
- This is the anti-feature defense. - A single GitHub Action lint cannot compound. A scanner backed by a growing - cross-organizational evidence corpus can. -
-
-
-
-""" - return page(body, "light", 12) - - -def slide_13_validating() -> str: - body = r""" -
Act 5 · Where I am
-

What I'm validating now.

- -
-
- Three open hypotheses. Each one converts (or kills) the company. The next 6–8 weeks are - a validation loop, not a product sprint. -
- -
- - -
-
H1
-
-
Hypothesis
-
- Production agents have a recurring pre-release readiness workflow - today — even if no one has named it. -
-
-
-
Proof I'm seeking
-
- 3–5 design partners running shipgate in real CI · 10+ release-blocking findings - on real tool surfaces · repeatable trigger event. -
-
-
- - -
-
H2
-
-
Hypothesis
-
- The first owner is platform / AI infra engineering, not - security/GRC. Security buys later, after evidence accumulates. -
-
-
-
Proof I'm seeking
-
- Design-partner data on who triggers / triages findings · which team owns - the CI gate · whether security review piggybacks on shipgate output. -
-
-
- - -
-
H3
-
-
Hypothesis
-
- Static + manifest checks are sufficient through Risk Tier 3 - (reversible internal write). Tier 4+ requires sandbox + trace. -
-
-
-
Proof I'm seeking
-
- Real findings on real tool surfaces, post-fix · false-positive rate on - static checks · which tiers actually demand simulation evidence. -
-
-
-
- -
- What I'm not claiming: PMF, runtime safety certification, or that this is foundation-model-lab-proof. Those are unknowns to be earned. -
-
-""" - return page(body, "light", 13) - - -def slide_14_north_star() -> str: - body = r""" -
Act 5 · 10-year north star
-

Today: pre-release exam.
Long term: a medical record for every agent.

- -
- - - - - - - - - TODAY - Pre-release exam - static scanner · - CI gate · SARIF report - - - - - YEAR 2 - Stress tests - sandbox · failure injection · - prompt-injection harness - - - - - YEAR 3–4 - Vital signs - trace ingestion · replay · - runtime anomaly monitors - - - - - - 10-YEAR NORTH STAR - Medical record - across the lifetime of every agent — - development, incidents, retirement - - - -
- Today we build the first instrument. The compounding ambition is to make every production agent's release, - incident, and behavior traceable and accountable across its life — the way we expect of any other deployable system. -
-
-""" - return page(body, "light", 14) - - -def slide_15_looking_for() -> str: - body = r""" -
Closing
-

What I'm looking for.

- -
-
- This deck is not a fundraise. It's an invitation to think alongside us. - Three concrete asks, in order of immediate value: -
- -
- - -
-
Ask 01
-
- Sparring partners -
-
- Founders, operators, researchers willing to push back on the thesis. Especially: - people who think this is a feature, not a category. I want to be wrong fast. -
-
- Best for: Prateek, AI-infra peers, security/GRC operators, MCP & framework authors. -
-
- - -
-
Ask 02 — most valuable now
-
- Design partners -
-
- Teams shipping production agents with non-trivial tool surfaces — refunds, customer - comms, code execution, internal data access. I want to scan, find real risk, - watch what gets fixed, learn what their CI actually demands. -
-
- Looking for: 3–5 partners over the next 6–8 weeks. -
-
- - -
-
Ask 03 — later
-
- Capital optionality -
-
- Not raising today. When the design-partner loop converts the thesis to traction, - I'd like the conversation to continue with people who already understood the worldview. -
-
- Trigger: 3+ design partners using shipgate findings to gate releases. -
-
-
- -
-
- Three Moons Lab is not building infrastructure to make agents smarter. - We're building infrastructure to make their entry into the world accountable. -
-
-
-""" - return page(body, "light", 15) - - -# --------------------------------------------------------------------------- -# Render pipeline -# --------------------------------------------------------------------------- - -SLIDE_BUILDERS = [ - (1, slide_01_cover), - (2, slide_02_inflection), - (3, slide_03_new_release_problem), - (4, slide_04_godel), - (5, slide_05_fep), - (6, slide_06_thesis), - (7, slide_07_wedge), - (8, slide_08_declared), - (9, slide_09_detected), - (10, slide_10_workflow), - (11, slide_11_phase23), - (12, slide_12_compounding), - (13, slide_13_validating), - (14, slide_14_north_star), - (15, slide_15_looking_for), -] - - -async def render_html_to_png(page, html: str, out: Path): - html_path = out.with_suffix(".html") - html_path.write_text(html) - await page.goto(f"file://{html_path}") - await page.screenshot(path=str(out), clip={"x": 0, "y": 0, "width": W, "height": H}) - - -async def main(): - async with async_playwright() as p: - browser = await p.chromium.launch() - context = await browser.new_context(viewport={"width": W, "height": H}, device_scale_factor=2) - pg = await context.new_page() - for n, builder in SLIDE_BUILDERS: - html = builder() - out = BUILD_DIR / f"slide-{n:02d}.png" - await render_html_to_png(pg, html, out) - print(f" rendered slide-{n:02d}") - await browser.close() - - # All slides rendered directly — nothing else to copy. - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/docs/decks/vc-thesis/build_pptx.py b/docs/decks/vc-thesis/build_pptx.py deleted file mode 100644 index d47d667c..00000000 --- a/docs/decks/vc-thesis/build_pptx.py +++ /dev/null @@ -1,104 +0,0 @@ -"""Build deck.pptx — each slide is the rendered PNG full-bleed, -with a speaker note derived from the reading guide. - -Visual fidelity: pixel-perfect (each slide is the rendered image). -Editability: structural (reorder, add/remove slides, edit speaker notes). -NOT editable: text on slides — to change copy, edit build_deck.py and re-run. 
-""" -from pathlib import Path - -from pptx import Presentation -from pptx.util import Inches, Pt - -BUILD_DIR = Path(__file__).resolve().parent / "build" -OUT = BUILD_DIR / "deck.pptx" - -# Speaker notes per slide — derived from the reading guide so Wendy has prompts -# while presenting. Edit freely in PowerPoint without affecting visuals. -NOTES = { - 1: "Subtitle: 'A working thesis — not a pitch.' " - "Set this expectation up front so the room reads correctly: I'm not raising; " - "I'm pressure-testing whether this can become a venture-scale infrastructure company.", - 2: "Highest-leverage slide for category creation. " - "The bottom callout — 'every tool change becomes a release event' — is the entire wedge in one line. " - "If they read past this and don't update their model of agents, the rest of the deck won't land.", - 3: "Three negative-space cards (it is not — software testing / LLM eval / runtime SRE) before the positive definition. " - "Defending the category claim by what it isn't is more honest than a TAM slide.", - 4: "Gödel — *why* this category must exist. External assurance is structural, not a 'nice to have'. " - "This slide closes off two competitors before they're raised: foundation labs ('we'll add this') " - "and agent self-certification ('the agent can check itself'). " - "Skip this slide if the audience is allergic to philosophy.", - 5: "Free Energy — *where* the wedge intervenes. " - "Tool calls are the only channel through which internal uncertainty becomes external side effect. " - "Pivots naturally into Slide 7's tool-use argument. " - "Skip if you skipped Slide 4.", - 6: "Stack diagram positions us between agent frameworks (above) and tool surfaces (below), " - "distinct from adjacent infra (eval, observability, MCP gateways). " - "Right side: thesis stated plainly. " - "If they push on 'why not LangSmith / Patronus / Galileo', point at the adjacency footer.", - 7: "Three reasons tool-use is the right cut. 
" - "Academic backup (AppWorld, ToolEmu, AgentDojo, τ-bench) shows this isn't a hunch. " - "Wedge logic in the dark callout: any miss and tool-use isn't the right cut.", - 8: "DECLARED — what the team built. Python (refund_agent.py) on the left shows the actual " - "OpenAI Agents SDK wiring. YAML (shipgate.yaml) on the right shows the release contract. " - "Read prohibited_actions out loud — 'issue refund without approval'. Hold that thought.", - 9: "DETECTED — what shipgate found. Same tool surface, scanned. BLOCKED. 2 critical, 14 high. " - "First finding: stripe.create_refund has no approval policy declared. " - "That's the contradiction with the previous slide's prohibited_actions list. " - "Let the audience connect Slide 8 ↔ Slide 9 themselves.", - 10: "How the contract gets written. Anticipates the question 'wait, where does shipgate.yaml come from?' " - "Four steps: scaffold (auto) → author (human) → scan (auto) → iterate (human + CI). " - "Bottom row makes the auto-vs-human split explicit — scanner can detect tools, only humans can write intent. " - "The Terraform analogy works well here if they push: prose policy → policy as code.", - 11: "Sandbox turns Phase 1 unknowns into experimental evidence. " - "Trace turns one-time reports into a continuous readiness state. " - "Footer line is deliberate humility: 'Phase 1 ships now. Phase 2 and 3 are land-and-expand.'", - 12: "Most important slide for category vs feature framing. " - "Three phases = same evidence corpus unfolding across timescales. " - "Failure taxonomy / policy library / trace schema all compound with every user. " - "If they say 'this could be a feature in [LangSmith / GitHub / Snyk]', this is the slide that pushes back.", - 13: "Three open hypotheses, each paired with proof being sought. " - "Closes with what I'm NOT claiming — PMF, runtime safety certification, foundation-model-lab proof. 
" - "The 'what I'm not claiming' line is the strongest credibility signal in the deck.", - 14: "Healthcare metaphor placed deliberately late so it doesn't dominate framing. " - "Today: first instrument. Long-term: medical record across every agent's life. " - "Skip this slide if the audience is too transactional for vision.", - 15: "Three asks in order of immediate value: sparring partners → design partners (most valuable now) → capital optionality. " - "Closing italic line is the only place the deck declares philosophy directly: " - "'infrastructure to make their entry into the world accountable.' " - "End on this beat.", -} - - -def main(): - prs = Presentation() - # 16:9 widescreen, exactly 1920x1080 in EMU at 144 DPI - prs.slide_width = Inches(13.333) - prs.slide_height = Inches(7.5) - - blank_layout = prs.slide_layouts[6] # blank - - for n in range(1, 16): - slide = prs.slides.add_slide(blank_layout) - png = BUILD_DIR / f"slide-{n:02d}.png" - if not png.exists(): - raise FileNotFoundError(png) - - # Full-bleed image - slide.shapes.add_picture( - str(png), - left=0, top=0, - width=prs.slide_width, height=prs.slide_height, - ) - - # Speaker notes - notes_tf = slide.notes_slide.notes_text_frame - notes_tf.text = NOTES.get(n, "") - - prs.save(OUT) - size_kb = OUT.stat().st_size // 1024 - print(f"Wrote {OUT} ({size_kb} KB, {size_kb / 1024:.1f} MB)") - - -if __name__ == "__main__": - main() diff --git a/docs/decks/vc-thesis/build_pptx_native.py b/docs/decks/vc-thesis/build_pptx_native.py deleted file mode 100644 index 5e4dfc77..00000000 --- a/docs/decks/vc-thesis/build_pptx_native.py +++ /dev/null @@ -1,924 +0,0 @@ -"""Native PPTX reconstruction — text editable directly in PowerPoint. - -Each slide is built with native shapes and text boxes (editable, restyleable). 
-A handful of complex SVG diagrams (Slide 4 Gödel loop, Slide 5 FEP boundary) -and the syntax-highlighted editor on Slide 8 embed cropped image fragments -because rebuilding them as PowerPoint shapes would lose intent. - -Output: build/deck-editable.pptx -""" -from __future__ import annotations - -from pathlib import Path - -from pptx import Presentation -from pptx.dml.color import RGBColor -from pptx.enum.shapes import MSO_SHAPE -from pptx.enum.text import PP_ALIGN, MSO_ANCHOR -from pptx.util import Inches, Pt, Emu - -DECK_DIR = Path(__file__).resolve().parent -BUILD = DECK_DIR / "build" -FRAGS = BUILD / "fragments" -LOGO = BUILD / "_logo-mark-light.png" -OUT = BUILD / "deck-editable.pptx" - -# ---- Brand palette --------------------------------------------------------- -CREAM = RGBColor(0xF5, 0xF0, 0xE5) -CREAM_2 = RGBColor(0xEC, 0xE5, 0xD5) -CREAM_3 = RGBColor(0xE0, 0xD7, 0xC0) -PAPER = RGBColor(0xFB, 0xF7, 0xEE) -NAVY = RGBColor(0x1A, 0x25, 0x30) -NAVY_2 = RGBColor(0x2A, 0x35, 0x40) -NAVY_DEEP = RGBColor(0x0E, 0x18, 0x20) -MUTED = RGBColor(0x6B, 0x5F, 0x4D) -MUTED_2 = RGBColor(0x8B, 0x7E, 0x68) -MUTED_DARK= RGBColor(0xB5, 0xA9, 0x88) -RULE = RGBColor(0xD4, 0xCC, 0xB8) -CRITICAL = RGBColor(0xB8, 0x39, 0x2F) -CRITICAL_BG = RGBColor(0xF4, 0xD9, 0xD6) -HIGH = RGBColor(0xC7, 0x6A, 0x2C) -HIGH_BG = RGBColor(0xF4, 0xE2, 0xD0) -GOLD = RGBColor(0xD4, 0xA8, 0x47) -ACCENT_GREEN = RGBColor(0xB5, 0xC9, 0x9B) - -# Fonts — use Aptos with Calibri fallback (PowerPoint 365 default) -FONT_HEAD = "Aptos Display" -FONT_BODY = "Aptos" -FONT_MONO = "Consolas" - -# ---- Slide geometry ------------------------------------------------------- -SLIDE_W = Inches(13.333) -SLIDE_H = Inches(7.5) -MARGIN_X = Inches(0.6) -MARGIN_Y = Inches(0.55) - - -# ---- Low-level helpers ----------------------------------------------------- - -def _set_shape_fill(shape, color): - fill = shape.fill - fill.solid() - fill.fore_color.rgb = color - - -def _set_shape_line(shape, color=None, width=None): - line = 
shape.line - if color is None: - line.fill.background() - else: - line.color.rgb = color - if width is not None: - line.width = width - - -def set_background(slide, color): - """Add a slide-sized rectangle as the back-most fill.""" - rect = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, 0, 0, SLIDE_W, SLIDE_H) - _set_shape_fill(rect, color) - _set_shape_line(rect) - # Send to back by inserting at index 0 - spTree = rect._element.getparent() - spTree.remove(rect._element) - spTree.insert(2, rect._element) - return rect - - -def add_rect(slide, x, y, w, h, fill=None, line=None, line_w=None, radius=None): - shape_type = MSO_SHAPE.ROUNDED_RECTANGLE if radius else MSO_SHAPE.RECTANGLE - rect = slide.shapes.add_shape(shape_type, x, y, w, h) - if radius is not None: - # adjustment value 0..1 — keep small for subtle rounding - rect.adjustments[0] = radius - if fill is not None: - _set_shape_fill(rect, fill) - else: - rect.fill.background() - _set_shape_line(rect, line, line_w) - return rect - - -def add_line(slide, x1, y1, x2, y2, color, width=Pt(1)): - ln = slide.shapes.add_connector(1, x1, y1, x2, y2) # straight - ln.line.color.rgb = color - ln.line.width = width - return ln - - -def add_text(slide, x, y, w, h, text, - font=FONT_BODY, size=14, bold=False, italic=False, - color=NAVY, align="left", anchor="top", letter_spacing=None, - line_spacing=None): - """Single-run text box.""" - tb = slide.shapes.add_textbox(x, y, w, h) - tf = tb.text_frame - tf.word_wrap = True - tf.margin_left = tf.margin_right = Inches(0) - tf.margin_top = tf.margin_bottom = Inches(0) - if anchor == "top": - tf.vertical_anchor = MSO_ANCHOR.TOP - elif anchor == "middle": - tf.vertical_anchor = MSO_ANCHOR.MIDDLE - elif anchor == "bottom": - tf.vertical_anchor = MSO_ANCHOR.BOTTOM - - p = tf.paragraphs[0] - p.alignment = {"left": PP_ALIGN.LEFT, "right": PP_ALIGN.RIGHT, "center": PP_ALIGN.CENTER}[align] - if line_spacing is not None: - p.line_spacing = line_spacing - run = p.add_run() - run.text = text - 
run.font.name = font - run.font.size = Pt(size) - run.font.bold = bold - run.font.italic = italic - run.font.color.rgb = color - return tb - - -def add_runs(slide, x, y, w, h, runs, - align="left", anchor="top", line_spacing=None, - default_font=FONT_BODY, default_size=14): - """Multi-run text box. `runs` is a list of dicts: - {text, size, bold, italic, color, font}. - Use {"break": True} to start a new paragraph. - """ - tb = slide.shapes.add_textbox(x, y, w, h) - tf = tb.text_frame - tf.word_wrap = True - tf.margin_left = tf.margin_right = Inches(0) - tf.margin_top = tf.margin_bottom = Inches(0) - if anchor == "middle": tf.vertical_anchor = MSO_ANCHOR.MIDDLE - elif anchor == "bottom": tf.vertical_anchor = MSO_ANCHOR.BOTTOM - else: tf.vertical_anchor = MSO_ANCHOR.TOP - - paragraphs = [tf.paragraphs[0]] - cur = paragraphs[0] - cur.alignment = {"left": PP_ALIGN.LEFT, "right": PP_ALIGN.RIGHT, "center": PP_ALIGN.CENTER}[align] - if line_spacing: cur.line_spacing = line_spacing - - for r in runs: - if r.get("break"): - cur = tf.add_paragraph() - cur.alignment = {"left": PP_ALIGN.LEFT, "right": PP_ALIGN.RIGHT, "center": PP_ALIGN.CENTER}[align] - if line_spacing: cur.line_spacing = line_spacing - paragraphs.append(cur) - continue - run = cur.add_run() - run.text = r["text"] - run.font.name = r.get("font", default_font) - run.font.size = Pt(r.get("size", default_size)) - run.font.bold = r.get("bold", False) - run.font.italic = r.get("italic", False) - run.font.color.rgb = r.get("color", NAVY) - return tb - - -def add_picture(slide, x, y, w, h, path): - return slide.shapes.add_picture(str(path), x, y, w, h) - - -def add_footer(slide, slide_no, total=15, dark=False): - fg = MUTED_DARK if dark else MUTED - fg_strong = CREAM if dark else NAVY - # Brand line - dot = slide.shapes.add_shape(MSO_SHAPE.OVAL, - MARGIN_X, SLIDE_H - Inches(0.45), - Inches(0.1), Inches(0.1)) - _set_shape_fill(dot, fg_strong) - _set_shape_line(dot) - add_text(slide, MARGIN_X + Inches(0.18), SLIDE_H - 
Inches(0.5), - Inches(8), Inches(0.3), - "Three Moons Lab · A working thesis · April 2026", - font=FONT_BODY, size=10, bold=True, color=fg_strong) - # Slide number - add_text(slide, SLIDE_W - MARGIN_X - Inches(2), SLIDE_H - Inches(0.5), - Inches(2), Inches(0.3), - f"{slide_no:02d} / {total:02d}", - font=FONT_MONO, size=9, color=fg, align="right") - - -def add_kicker_headline(slide, kicker, headline_runs, dark=False, - headline_y=Inches(1.0), kicker_y=Inches(0.55), - headline_h=Inches(2.4)): - kicker_color = MUTED_DARK if dark else MUTED_2 - add_text(slide, MARGIN_X, kicker_y, Inches(12), Inches(0.35), - kicker, font=FONT_BODY, size=11, bold=True, - color=kicker_color, letter_spacing=0.18) - add_runs(slide, MARGIN_X, headline_y, Inches(12), headline_h, - headline_runs, line_spacing=1.05, - default_font=FONT_HEAD, default_size=44) - - -# ---- Slide builders -------------------------------------------------------- - -def build_slide_01(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - # Logo - add_picture(s, MARGIN_X, Inches(1.1), Inches(1.0), Inches(1.0), LOGO) - # Wordmark - add_text(s, MARGIN_X, Inches(2.3), Inches(6), Inches(0.4), - "T H R E E M O O N S L A B", - font=FONT_BODY, size=11, bold=True, color=MUTED) - # Headline - add_runs(s, MARGIN_X, Inches(2.85), Inches(11), Inches(2.2), - [{"text": "Release readiness", "size": 64, "bold": True, "color": NAVY}, - {"break": True}, - {"text": "for agentic systems.", "size": 64, "bold": True, "color": NAVY}], - line_spacing=1.05, default_font=FONT_HEAD) - # Subtitle italic - add_text(s, MARGIN_X, Inches(5.05), Inches(10), Inches(0.5), - "A working thesis — not a pitch.", - font=FONT_BODY, size=20, italic=True, color=NAVY_2) - # Author / date row - add_runs(s, MARGIN_X, Inches(6.1), Inches(11), Inches(0.4), - [{"text": "Wendy · pengfei@threemoonslab.com", "size": 10, "color": MUTED, "font": FONT_MONO}, - {"text": " April 2026", "size": 10, "color": MUTED, "font": FONT_MONO}, - {"text": 
" v0.1 — for discussion", "size": 10, "color": MUTED, "font": FONT_MONO}]) - add_footer(s, 1) - - -def build_slide_02(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 1 · THE SHIFT", - [{"text": "Models that answer.", "size": 44, "bold": True, "color": NAVY}, - {"break": True}, - {"text": "Agents that ", "size": 44, "bold": True, "color": MUTED_2}, - {"text": "act.", "size": 44, "bold": True, "color": NAVY}]) - - # LEFT card — LLM call - card_y = Inches(2.6) - card_h = Inches(3.2) - add_rect(s, MARGIN_X, card_y, Inches(6), card_h, fill=PAPER, line=RULE, line_w=Pt(0.75)) - add_text(s, MARGIN_X + Inches(0.3), card_y + Inches(0.2), Inches(5.4), Inches(0.3), - "YESTERDAY — LLM CALL", font=FONT_BODY, size=10, bold=True, color=MUTED_2) - add_text(s, MARGIN_X + Inches(0.3), card_y + Inches(0.5), Inches(5.4), Inches(0.5), - "Input → Output", font=FONT_HEAD, size=22, bold=True, color=NAVY) - # Mini diagram: prompt → model → text - dy = card_y + Inches(1.1) - box_w = Inches(1.3); box_h = Inches(0.5) - add_rect(s, MARGIN_X + Inches(0.3), dy, box_w, box_h, fill=CREAM_2, line=RULE) - add_text(s, MARGIN_X + Inches(0.3), dy, box_w, box_h, "prompt", - font=FONT_MONO, size=11, color=NAVY, align="center", anchor="middle") - add_line(s, MARGIN_X + Inches(1.65), dy + Inches(0.25), - MARGIN_X + Inches(2.1), dy + Inches(0.25), MUTED, width=Pt(1.5)) - add_rect(s, MARGIN_X + Inches(2.1), dy, box_w, box_h, fill=CREAM_2, line=RULE) - add_text(s, MARGIN_X + Inches(2.1), dy, box_w, box_h, "model", - font=FONT_MONO, size=11, color=NAVY, align="center", anchor="middle") - add_line(s, MARGIN_X + Inches(3.45), dy + Inches(0.25), - MARGIN_X + Inches(3.9), dy + Inches(0.25), MUTED, width=Pt(1.5)) - add_text(s, MARGIN_X + Inches(3.9), dy, Inches(1.0), box_h, "text", - font=FONT_MONO, size=11, color=NAVY, align="left", anchor="middle") - - add_runs(s, MARGIN_X + Inches(0.3), card_y + Inches(2.05), Inches(5.4), Inches(1.0), - [{"text": "• 
stateless", "size": 12, "color": NAVY_2}, - {"break": True}, - {"text": "• no real-world side effects", "size": 12, "color": NAVY_2}, - {"break": True}, - {"text": "• release risk: ", "size": 12, "color": NAVY_2}, - {"text": "\"is the answer wrong?\"", "size": 12, "italic": True, "color": MUTED}], - line_spacing=1.6) - - # RIGHT card — Agent - rx = MARGIN_X + Inches(6.4) - add_rect(s, rx, card_y, Inches(6), card_h, fill=CRITICAL_BG, line=CRITICAL, line_w=Pt(0.75)) - add_text(s, rx + Inches(0.3), card_y + Inches(0.2), Inches(5.4), Inches(0.3), - "TODAY — AGENT", font=FONT_BODY, size=10, bold=True, color=CRITICAL) - add_text(s, rx + Inches(0.3), card_y + Inches(0.5), Inches(5.4), Inches(0.6), - "Observe → Plan → Tool → Side effect → Memory", - font=FONT_HEAD, size=18, bold=True, color=NAVY) - - # Mini diagram: agent loop ellipse → tool call → side effect → world - dy = card_y + Inches(1.2) - # Loop ellipse - add_rect(s, rx + Inches(0.3), dy, Inches(1.5), Inches(0.7), fill=PAPER, line=NAVY) - add_runs(s, rx + Inches(0.3), dy, Inches(1.5), Inches(0.7), - [{"text": "observe", "size": 9, "color": NAVY, "font": FONT_MONO}, - {"break": True}, - {"text": "plan", "size": 9, "color": NAVY, "font": FONT_MONO}, - {"break": True}, - {"text": "memory", "size": 9, "color": NAVY, "font": FONT_MONO}], - line_spacing=1.0, align="center", anchor="middle") - add_line(s, rx + Inches(1.85), dy + Inches(0.35), - rx + Inches(2.4), dy + Inches(0.35), CRITICAL, width=Pt(2)) - add_rect(s, rx + Inches(2.4), dy + Inches(0.1), Inches(1.5), Inches(0.5), - fill=PAPER, line=CRITICAL) - add_text(s, rx + Inches(2.4), dy + Inches(0.1), Inches(1.5), Inches(0.5), - "side effect", - font=FONT_MONO, size=10, bold=True, color=CRITICAL, - align="center", anchor="middle") - add_line(s, rx + Inches(3.95), dy + Inches(0.35), - rx + Inches(4.55), dy + Inches(0.35), CRITICAL, width=Pt(2)) - add_text(s, rx + Inches(4.55), dy + Inches(0.1), Inches(0.9), Inches(0.5), - "world", font=FONT_MONO, size=10, color=NAVY, 
align="left", anchor="middle") - # tool call label above arrow - add_text(s, rx + Inches(1.85), dy - Inches(0.05), Inches(0.55), Inches(0.3), - "tool call", font=FONT_MONO, size=8, bold=True, color=CRITICAL, align="center") - - add_runs(s, rx + Inches(0.3), card_y + Inches(2.15), Inches(5.4), Inches(1.0), - [{"text": "• stateful, looping", "size": 12, "color": NAVY_2}, - {"break": True}, - {"text": "• ", "size": 12, "color": NAVY_2}, - {"text": "real consequences", "size": 12, "bold": True, "color": NAVY}, - {"text": " — refunds, emails, PRs, deploys", "size": 12, "color": NAVY_2}, - {"break": True}, - {"text": "• release risk: ", "size": 12, "color": NAVY_2}, - {"text": "\"did the agent do the wrong thing?\"", "size": 12, "italic": True, "color": CRITICAL}], - line_spacing=1.6) - - # Bottom callout - cy = Inches(6.0) - add_rect(s, MARGIN_X, cy, Inches(12.1), Inches(0.85), fill=CREAM_2, line=NAVY, line_w=Pt(2)) - add_runs(s, MARGIN_X + Inches(0.3), cy + Inches(0.1), Inches(11.7), Inches(0.7), - [{"text": "Once an agent can call tools, ", "size": 14, "color": NAVY}, - {"text": "every tool change becomes a release event.", "size": 14, "bold": True, "color": NAVY}, - {"text": " The release process built for code does not map onto agents.", "size": 14, "color": NAVY_2}], - anchor="middle", line_spacing=1.4) - - add_footer(s, 2) - - -def build_slide_03(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 1 · THE SHIFT", - [{"text": "Agent Release Readiness", "size": 42, "bold": True, "color": NAVY}, - {"break": True}, - {"text": "is a new release decision.", "size": 42, "bold": True, "color": NAVY}]) - # Lede - add_text(s, MARGIN_X, Inches(2.6), Inches(12), Inches(0.9), - "Bounded assurance that a stochastic, open, tool-using system can enter a higher-permission " - "environment — under a declared task scope, tool surface, permission boundary, and risk tier.", - font=FONT_BODY, size=15, color=NAVY_2, 
line_spacing=1.4) - - # Three cards - cards = [ - ("IT IS NOT — SOFTWARE TESTING", - "Software testing assumes a deterministic code path. Agents make their action graph at runtime " - "from goals, context, tools, and feedback."), - ("IT IS NOT — LLM EVAL", - "Eval scores measure input → output behavior on sampled tasks. They cannot answer whether this " - "tool surface, in this environment, is safe to ship."), - ("IT IS NOT — RUNTIME SRE", - "SLOs, canaries, and observability fire during or after execution. Release readiness is the " - "decision before we promote."), - ] - cw = Inches(3.95); ch = Inches(2.0); cy = Inches(3.85) - for i, (label, body) in enumerate(cards): - cx = MARGIN_X + i * (cw + Inches(0.1)) - add_rect(s, cx, cy, cw, ch, fill=PAPER, line=RULE) - add_text(s, cx + Inches(0.25), cy + Inches(0.2), cw - Inches(0.5), Inches(0.3), - label, font=FONT_BODY, size=9, bold=True, color=MUTED_2) - add_text(s, cx + Inches(0.25), cy + Inches(0.55), cw - Inches(0.5), ch - Inches(0.7), - body, font=FONT_BODY, size=12, color=NAVY_2, line_spacing=1.4) - - # Bottom dark callout - cby = Inches(6.05) - add_rect(s, MARGIN_X, cby, Inches(12.1), Inches(0.85), fill=NAVY) - add_runs(s, MARGIN_X + Inches(0.3), cby + Inches(0.1), Inches(11.7), Inches(0.7), - [{"text": "IT IS ", "size": 11, "bold": True, "color": MUTED_DARK, "font": FONT_BODY}, - {"text": "An ", "size": 14, "color": CREAM}, - {"text": "evidence-based release decision", "size": 14, "bold": True, "color": GOLD}, - {"text": " over a stochastic, tool-using, state-mutating system — graded against a declared " - "operational envelope.", "size": 14, "color": CREAM}], - anchor="middle", line_spacing=1.4) - - add_footer(s, 3) - - -def build_slide_04(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, NAVY_DEEP) - add_kicker_headline(s, "ACT 2 · FIRST PRINCIPLE №1", - [{"text": "A sufficiently capable agent", "size": 36, "bold": True, "color": CREAM}, - {"break": True}, - {"text": "cannot 
self-certify its own readiness.", "size": 36, "bold": True, "color": CREAM}], - dark=True, headline_h=Inches(2.0)) - - # LEFT prose - px = MARGIN_X; py = Inches(3.2); pw = Inches(6.5) - add_runs(s, px, py, pw, Inches(3.5), - [{"text": "Any system rich enough to express its own behavior contains statements about itself " - "it cannot prove from within.", "size": 15, "color": MUTED_DARK}, - {"break": True}, {"break": True}, - {"text": "For agents, those statements are about ", "size": 15, "color": MUTED_DARK}, - {"text": "side effects", "size": 15, "bold": True, "color": CREAM}, - {"text": ", ", "size": 15, "color": MUTED_DARK}, - {"text": "long-horizon consequence", "size": 15, "bold": True, "color": CREAM}, - {"text": ", and ", "size": 15, "color": MUTED_DARK}, - {"text": "prompt-injection susceptibility", "size": 15, "bold": True, "color": CREAM}, - {"text": ".", "size": 15, "color": MUTED_DARK}, - {"break": True}, {"break": True}, - {"text": "External assurance is not optional. It is structural.", - "size": 15, "bold": True, "color": GOLD}], - line_spacing=1.45) - - # RIGHT diagram (cropped image) - add_picture(s, Inches(7.5), Inches(2.2), Inches(5.4), Inches(3.86), - FRAGS / "godel-loop.png") - - add_footer(s, 4, dark=True) - - -def build_slide_05(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, NAVY_DEEP) - add_kicker_headline(s, "ACT 2 · FIRST PRINCIPLE №2", - [{"text": "Tool calls are where uncertainty", "size": 36, "bold": True, "color": CREAM}, - {"break": True}, - {"text": "escapes into the world.", "size": 36, "bold": True, "color": CREAM}], - dark=True, headline_h=Inches(2.0)) - - # LEFT prose - px = MARGIN_X; py = Inches(3.2); pw = Inches(6.5) - add_runs(s, px, py, pw, Inches(3.5), - [{"text": "Agents act to minimize prediction error under their generative model of reality — the ", - "size": 15, "color": MUTED_DARK}, - {"text": "Free Energy Principle", "size": 15, "italic": True, "color": MUTED_DARK}, - {"text": " framing.", 
"size": 15, "color": MUTED_DARK}, - {"break": True}, {"break": True}, - {"text": "The ", "size": 15, "color": MUTED_DARK}, - {"text": "tool boundary", "size": 15, "bold": True, "color": CREAM}, - {"text": " is the only channel through which an agent's internal uncertainty becomes external side effect.", - "size": 15, "color": MUTED_DARK}, - {"break": True}, {"break": True}, - {"text": "Release readiness = bounding free energy at the action boundary ", - "size": 15, "bold": True, "color": GOLD}, - {"text": "before", "size": 15, "italic": True, "color": MUTED_DARK}, - {"text": " it propagates.", "size": 15, "bold": True, "color": GOLD}], - line_spacing=1.45) - - # RIGHT diagram (cropped image) - add_picture(s, Inches(7.3), Inches(2.5), Inches(5.5), Inches(3.3), - FRAGS / "fep-boundary.png") - - add_footer(s, 5, dark=True) - - -def build_slide_06(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 3 · OUR THESIS", - [{"text": "The evidence layer between", "size": 42, "bold": True, "color": NAVY}, - {"break": True}, - {"text": "agent dev and production action.", "size": 42, "bold": True, "color": NAVY}]) - - # LEFT — stack - sx = MARGIN_X; sy = Inches(2.8); sw = Inches(7.0) - # ABOVE - add_rect(s, sx, sy, sw, Inches(0.75), fill=PAPER, line=RULE) - add_text(s, sx + Inches(0.25), sy + Inches(0.07), sw, Inches(0.25), - "ABOVE", font=FONT_BODY, size=9, bold=True, color=MUTED_2) - add_text(s, sx + Inches(0.25), sy + Inches(0.32), sw - Inches(0.5), Inches(0.4), - "Agent frameworks · OpenAI Agents SDK · Anthropic · Google ADK · LangChain · CrewAI", - font=FONT_BODY, size=12, color=NAVY_2) - # MIDDLE — Three Moons - sy2 = sy + Inches(0.85) - add_rect(s, sx, sy2, sw, Inches(1.15), fill=NAVY) - add_text(s, sx + Inches(0.25), sy2 + Inches(0.1), sw, Inches(0.25), - "THREE MOONS LAB — WHAT'S MISSING", - font=FONT_BODY, size=9, bold=True, color=GOLD) - add_text(s, sx + Inches(0.25), sy2 + Inches(0.4), sw - Inches(0.5), 
Inches(0.4), - "CI/CD + audit layer for agentic systems", - font=FONT_HEAD, size=18, bold=True, color=CREAM) - add_text(s, sx + Inches(0.25), sy2 + Inches(0.78), sw - Inches(0.5), Inches(0.3), - "pre-release evidence · trace-based replay · runtime continuous readiness", - font=FONT_BODY, size=11, color=MUTED_DARK) - # BELOW - sy3 = sy2 + Inches(1.25) - add_rect(s, sx, sy3, sw, Inches(0.75), fill=PAPER, line=RULE) - add_text(s, sx + Inches(0.25), sy3 + Inches(0.07), sw, Inches(0.25), - "BELOW", font=FONT_BODY, size=9, bold=True, color=MUTED_2) - add_text(s, sx + Inches(0.25), sy3 + Inches(0.32), sw - Inches(0.5), Inches(0.4), - "Tool surfaces · MCP · OpenAPI · function tools · shell · computer use", - font=FONT_BODY, size=12, color=NAVY_2) - # Adjacency note - add_text(s, sx, sy3 + Inches(0.95), sw, Inches(0.3), - "Adjacent (not us): eval frameworks · runtime guardrails · LLM observability · MCP gateways.", - font=FONT_BODY, size=10, italic=True, color=MUTED) - - # RIGHT — thesis card - tx = sx + sw + Inches(0.4); ty = Inches(2.8); tw = Inches(4.7); th = Inches(3.6) - add_rect(s, tx, ty, tw, th, fill=CREAM_2) - # Left edge line - add_rect(s, tx, ty, Inches(0.05), th, fill=NAVY) - add_text(s, tx + Inches(0.3), ty + Inches(0.25), tw - Inches(0.5), Inches(0.3), - "THESIS", font=FONT_BODY, size=10, bold=True, color=MUTED_2) - add_runs(s, tx + Inches(0.3), ty + Inches(0.65), tw - Inches(0.5), Inches(2.0), - [{"text": "Every production agent will need a ", "size": 16, "color": NAVY}, - {"text": "release-readiness record", "size": 16, "bold": True, "color": NAVY}, - {"text": " before it gets promoted — and a ", "size": 16, "color": NAVY}, - {"text": "trace-replayable evidence trail", "size": 16, "bold": True, "color": NAVY}, - {"text": " after.", "size": 16, "color": NAVY}], - line_spacing=1.35) - add_runs(s, tx + Inches(0.3), ty + Inches(2.4), tw - Inches(0.5), Inches(1.0), - [{"text": "That record won't live inside the model. It won't live inside the framework. 
" - "It has to live in ", "size": 13, "color": NAVY_2}, - {"text": "independent infrastructure", "size": 13, "bold": True, "color": NAVY}, - {"text": ".", "size": 13, "color": NAVY_2}], - line_spacing=1.4) - - add_footer(s, 6) - - -def build_slide_07(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 3 · OUR WEDGE", - [{"text": "Tool-use is the right wedge.", "size": 44, "bold": True, "color": NAVY}]) - - cards = [ - ("①", "Action boundary", - "Tool call = the moment language becomes consequence. It's where every interesting risk " - "crystallizes: side effect, scope, approval, idempotency, recoverability.", - "The model becoming smarter doesn't change this boundary. It only makes it more active."), - ("②", "Most formalizable", - "Tool surfaces ship with structure: schemas, scopes, MCP annotations, OpenAPI specs, SDK " - "function signatures. Static analysis bites — unlike \"is the agent's reasoning correct?\"", - "Formalize what's crisp · annotate what's contextual · review what's ambiguous."), - ("③", "Highest-leverage risk", - "AppWorld, ToolEmu, AgentDojo, τ-bench, AgentHarm — the academic evidence converges: " - "tool-use is where current agents fail, where attacks land, where damage compounds.", - "High-stakes tools (refund, email, deploy, delete) need readiness, not a benchmark score."), - ] - cw = Inches(3.95); ch = Inches(3.4); cy = Inches(2.4) - for i, (num, title, body, foot) in enumerate(cards): - cx = MARGIN_X + i * (cw + Inches(0.1)) - add_rect(s, cx, cy, cw, ch, fill=PAPER, line=RULE) - add_text(s, cx + Inches(0.3), cy + Inches(0.2), Inches(1), Inches(0.7), - num, font=FONT_HEAD, size=32, bold=True, color=CRITICAL) - add_text(s, cx + Inches(0.3), cy + Inches(0.9), cw - Inches(0.6), Inches(0.5), - title, font=FONT_HEAD, size=18, bold=True, color=NAVY) - add_text(s, cx + Inches(0.3), cy + Inches(1.45), cw - Inches(0.6), Inches(1.4), - body, font=FONT_BODY, size=11, color=NAVY_2, 
line_spacing=1.4) - add_text(s, cx + Inches(0.3), cy + Inches(2.85), cw - Inches(0.6), Inches(0.5), - foot, font=FONT_BODY, size=9, italic=True, color=MUTED, line_spacing=1.4) - - # Bottom dark callout - cby = Inches(6.0) - add_rect(s, MARGIN_X, cby, Inches(12.1), Inches(0.9), fill=NAVY) - add_runs(s, MARGIN_X + Inches(0.3), cby + Inches(0.1), Inches(11.7), Inches(0.7), - [{"text": "Wedge logic: ", "size": 13, "bold": True, "color": GOLD}, - {"text": "The narrowest cut where the static check is meaningful, the risk is real, " - "the buyer is identifiable, and the evidence corpus compounds. ", "size": 13, "color": CREAM}, - {"text": "Tool-use clears all four.", "size": 13, "bold": True, "color": CREAM}], - anchor="middle", line_spacing=1.4) - - add_footer(s, 7) - - -def build_slide_08(prs): - """Slide 8 — DECLARED. Python source + YAML config side-by-side. - Kept as full-bleed image because the syntax-highlighted editor view - can't be cleanly reconstructed in PowerPoint shapes.""" - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_picture(s, 0, 0, SLIDE_W, SLIDE_H, BUILD / "slide-08.png") - - -def build_slide_09(prs): - """Slide 9 — DETECTED. The shipgate report on the declared tool surface. - Kept as full-bleed image so the report card hierarchy stays intact.""" - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_picture(s, 0, 0, SLIDE_W, SLIDE_H, BUILD / "slide-09.png") - - -def build_slide_10(prs): - """Slide 10 — How the contract gets written (init → author → scan → iterate). 
- Kept as full-bleed image because of the dense 4-card flow + 3-column footer.""" - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_picture(s, 0, 0, SLIDE_W, SLIDE_H, BUILD / "slide-10.png") - - -def build_slide_11(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 4 · THE PRODUCT PATH", - [{"text": "Beyond static — sandbox & trace.", "size": 42, "bold": True, "color": NAVY}]) - - cards = [ - ("PHASE 2", "~6–12 months out", "Sandbox & simulation", - "Turn the unknowns surfaced in Phase 1 into experimental evidence — without exposing production state.", - ["Mocked tool execution & failure injection", - "Prompt-injection harness on read-tools (web, email, docs)", - "State-diff assertions for collateral damage", - "Synthetic adversarial scenarios (ToolEmu / AppWorld lineage)"], - "Output: pre-promotion stress test report. CI-attachable. Fails-loud on regressions."), - ("PHASE 3", "~12–24 months out", "Trace, replay, runtime", - "Turn one-time pre-release reports into a continuous readiness state — pulled from the agent's actual production behavior.", - ["Trace ingestion: OpenAI Agents SDK, MCP events, custom hooks", - "Replay bundles for incident forensics", - "Regression detection across prompt / model / tool changes", - "Runtime anomaly & blast-radius monitors"], - "Output: living readiness state. Audit-grade. 
Connected to incident review."), - ] - cw = Inches(6.0); ch = Inches(3.95); cy = Inches(2.4) - for i, (label, when, title, lede, bullets, foot) in enumerate(cards): - cx = MARGIN_X + i * (cw + Inches(0.1)) - add_rect(s, cx, cy, cw, ch, fill=PAPER, line=RULE) - add_text(s, cx + Inches(0.3), cy + Inches(0.18), Inches(2), Inches(0.3), - label, font=FONT_BODY, size=10, bold=True, color=MUTED_2) - add_text(s, cx + cw - Inches(2.3), cy + Inches(0.18), Inches(2), Inches(0.3), - when, font=FONT_MONO, size=9, color=MUTED, align="right") - add_text(s, cx + Inches(0.3), cy + Inches(0.5), cw - Inches(0.6), Inches(0.5), - title, font=FONT_HEAD, size=22, bold=True, color=NAVY) - add_text(s, cx + Inches(0.3), cy + Inches(1.05), cw - Inches(0.6), Inches(0.8), - lede, font=FONT_BODY, size=11, color=NAVY_2, line_spacing=1.4) - bullet_runs = [] - for j, b in enumerate(bullets): - if j > 0: bullet_runs.append({"break": True}) - bullet_runs.append({"text": "• " + b, "size": 11, "color": NAVY_2}) - add_runs(s, cx + Inches(0.3), cy + Inches(1.95), cw - Inches(0.6), Inches(1.5), - bullet_runs, line_spacing=1.5) - add_line(s, cx + Inches(0.3), cy + ch - Inches(0.6), - cx + cw - Inches(0.3), cy + ch - Inches(0.6), RULE) - add_text(s, cx + Inches(0.3), cy + ch - Inches(0.5), cw - Inches(0.6), Inches(0.4), - foot, font=FONT_BODY, size=10, italic=True, color=MUTED, line_spacing=1.4) - - add_text(s, MARGIN_X, Inches(6.55), Inches(12.1), Inches(0.4), - "Phase 1 ships now. Phase 2 and 3 are deliberate land-and-expand, not a roadmap to be promised on Slide 9.", - font=FONT_BODY, size=11, color=NAVY_2, align="center", italic=True) - - add_footer(s, 11) - - -def build_slide_12(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 4 · WHY THIS IS A CATEGORY, NOT A FEATURE", - [{"text": "Three phases. 
One compounding", "size": 42, "bold": True, "color": NAVY}, - {"break": True}, - {"text": "evidence corpus.", "size": 42, "bold": True, "color": NAVY}]) - - # LEFT — stacked layers - sx = MARGIN_X; sw = Inches(7.6) - # Phase 3 (top) - p3y = Inches(2.6); p3h = Inches(1.05) - add_rect(s, sx, p3y, sw, p3h, fill=CREAM_3, line=NAVY) - add_text(s, sx + Inches(0.3), p3y + Inches(0.1), sw, Inches(0.25), - "PHASE 3 · TRACE", font=FONT_BODY, size=10, bold=True, color=MUTED_2) - add_text(s, sx + Inches(0.3), p3y + Inches(0.35), sw - Inches(0.5), Inches(0.3), - "Production trace data", font=FONT_HEAD, size=16, bold=True, color=NAVY) - add_text(s, sx + Inches(0.3), p3y + Inches(0.65), sw - Inches(0.5), Inches(0.4), - "tool-call events · approval logs · replay bundles · regression deltas · incident forensics", - font=FONT_MONO, size=10, color=NAVY_2) - # Phase 2 (middle) - p2y = p3y + p3h + Inches(0.05); p2h = Inches(0.95) - add_rect(s, sx, p2y, sw, p2h, fill=CREAM_2, line=NAVY) - add_text(s, sx + Inches(0.3), p2y + Inches(0.1), sw, Inches(0.25), - "PHASE 2 · SANDBOX", font=FONT_BODY, size=10, bold=True, color=MUTED_2) - add_text(s, sx + Inches(0.3), p2y + Inches(0.35), sw - Inches(0.5), Inches(0.3), - "Failure-mode taxonomy", font=FONT_HEAD, size=16, bold=True, color=NAVY) - add_text(s, sx + Inches(0.3), p2y + Inches(0.65), sw - Inches(0.5), Inches(0.3), - "attack patterns · injection results · state-diff baselines · scenario library", - font=FONT_MONO, size=10, color=NAVY_2) - # Phase 1 (bottom) - p1y = p2y + p2h + Inches(0.05); p1h = Inches(0.95) - add_rect(s, sx, p1y, sw, p1h, fill=PAPER, line=NAVY) - add_text(s, sx + Inches(0.3), p1y + Inches(0.1), sw, Inches(0.25), - "PHASE 1 · STATIC", font=FONT_BODY, size=10, bold=True, color=MUTED_2) - add_text(s, sx + Inches(0.3), p1y + Inches(0.35), sw - Inches(0.5), Inches(0.3), - "Tool surface metadata", font=FONT_HEAD, size=16, bold=True, color=NAVY) - add_text(s, sx + Inches(0.3), p1y + Inches(0.65), sw - Inches(0.5), Inches(0.3), 
- "manifests · schemas · scopes · effect classes · approval flags", - font=FONT_MONO, size=10, color=NAVY_2) - # Compound arrow on right of the stack - arrow_x = sx + sw + Inches(0.15) - arrow_top_y = p3y + Inches(0.05) - arrow_bottom_y = p1y + p1h - Inches(0.05) - add_line(s, arrow_x, arrow_bottom_y, arrow_x, arrow_top_y + Inches(0.18), - CRITICAL, width=Pt(2.5)) - # Arrow head triangle pointing up - head = s.shapes.add_shape(MSO_SHAPE.ISOSCELES_TRIANGLE, - arrow_x - Inches(0.12), arrow_top_y, - Inches(0.24), Inches(0.22)) - _set_shape_fill(head, CRITICAL) - _set_shape_line(head) - # "corpus compounds" caption below the stack — keeps it out of the right column - add_text(s, sx, p1y + p1h + Inches(0.15), sw, Inches(0.3), - "corpus compounds across phases →", - font=FONT_BODY, size=11, bold=True, italic=True, - color=CRITICAL, align="right") - - # RIGHT — explanation - rx = sx + sw + Inches(1.2); rw = Inches(3.9) - add_text(s, rx, Inches(2.6), rw, Inches(1.2), - "Three phases are not three products. They are the same evidence corpus unfolding " - "across three timescales.", - font=FONT_BODY, size=14, color=NAVY_2, line_spacing=1.45) - add_runs(s, rx, Inches(3.85), rw, Inches(1.0), - [{"text": "Each user adds metadata, failure cases, and traces. 
The ", "size": 12, "color": NAVY_2}, - {"text": "failure taxonomy", "size": 12, "bold": True, "color": NAVY}, - {"text": ", ", "size": 12, "color": NAVY_2}, - {"text": "policy library", "size": 12, "bold": True, "color": NAVY}, - {"text": ", and ", "size": 12, "color": NAVY_2}, - {"text": "trace schema", "size": 12, "bold": True, "color": NAVY}, - {"text": " compound.", "size": 12, "color": NAVY_2}], - line_spacing=1.4) - # Anti-feature box - afy = Inches(4.85) - add_rect(s, rx, afy, rw, Inches(1.7), fill=NAVY) - add_runs(s, rx + Inches(0.2), afy + Inches(0.15), rw - Inches(0.4), Inches(1.4), - [{"text": "This is the anti-feature defense.", "size": 11, "bold": True, "color": GOLD}, - {"text": " A single GitHub Action lint cannot compound. A scanner backed by a growing " - "cross-organizational evidence corpus can.", "size": 11, "color": CREAM}], - line_spacing=1.4) - - add_footer(s, 12) - - -def build_slide_13(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 5 · WHERE I AM", - [{"text": "What I'm validating now.", "size": 44, "bold": True, "color": NAVY}]) - add_text(s, MARGIN_X, Inches(2.05), Inches(12), Inches(0.5), - "Three open hypotheses. Each one converts (or kills) the company. The next 6–8 weeks are " - "a validation loop, not a product sprint.", - font=FONT_BODY, size=14, color=NAVY_2, line_spacing=1.4) - - rows = [ - ("H1", - "Production agents have a recurring pre-release readiness workflow today — even if no one has named it.", - "3–5 design partners running shipgate in real CI · 10+ release-blocking findings on real tool surfaces · repeatable trigger event."), - ("H2", - "The first owner is platform / AI infra engineering, not security/GRC. 
Security buys later, after evidence accumulates.", - "Design-partner data on who triggers / triages findings · which team owns the CI gate · whether security review piggybacks on shipgate output."), - ("H3", - "Static + manifest checks are sufficient through Risk Tier 3 (reversible internal write). Tier 4+ requires sandbox + trace.", - "Real findings on real tool surfaces, post-fix · false-positive rate on static checks · which tiers actually demand simulation evidence."), - ] - rh = Inches(1.05); ry0 = Inches(2.85) - for i, (label, hyp, proof) in enumerate(rows): - ry = ry0 + i * (rh + Inches(0.15)) - add_rect(s, MARGIN_X, ry, Inches(12.1), rh, fill=PAPER, line=RULE) - add_text(s, MARGIN_X + Inches(0.25), ry + Inches(0.2), Inches(0.8), Inches(0.7), - label, font=FONT_HEAD, size=22, bold=True, color=CRITICAL, anchor="middle") - add_text(s, MARGIN_X + Inches(1.1), ry + Inches(0.13), Inches(2.5), Inches(0.25), - "HYPOTHESIS", font=FONT_BODY, size=8, bold=True, color=MUTED_2) - add_text(s, MARGIN_X + Inches(1.1), ry + Inches(0.38), Inches(5.3), Inches(0.65), - hyp, font=FONT_BODY, size=11, color=NAVY, line_spacing=1.3) - add_text(s, MARGIN_X + Inches(6.6), ry + Inches(0.13), Inches(3), Inches(0.25), - "PROOF I'M SEEKING", font=FONT_BODY, size=8, bold=True, color=MUTED_2) - add_text(s, MARGIN_X + Inches(6.6), ry + Inches(0.38), Inches(5.4), Inches(0.65), - proof, font=FONT_BODY, size=11, color=NAVY_2, line_spacing=1.3) - - add_text(s, MARGIN_X, Inches(6.4), Inches(12.1), Inches(0.4), - "What I'm not claiming: PMF, runtime safety certification, or that this is foundation-model-lab-proof. 
" - "Those are unknowns to be earned.", - font=FONT_BODY, size=11, italic=True, color=MUTED, align="center") - - add_footer(s, 13) - - -def build_slide_14(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "ACT 5 · 10-YEAR NORTH STAR", - [{"text": "Today: pre-release exam.", "size": 36, "bold": True, "color": NAVY}, - {"break": True}, - {"text": "Long-term: a record for every agent.", "size": 36, "bold": True, "color": NAVY}], - headline_h=Inches(2.0)) - - # Timeline axis - ty = Inches(4.3) - add_line(s, MARGIN_X + Inches(0.5), ty, SLIDE_W - MARGIN_X - Inches(0.5), ty, NAVY, width=Pt(1.5)) - - # Stages: (x, label, title, sub1, sub2, color, marker_text) - stages = [ - (Inches(1.4), "TODAY", "Pre-release exam", "static scanner ·", "CI gate · SARIF report", NAVY, "⊕"), - (Inches(4.5), "YEAR 2", "Stress tests", "sandbox · failure injection ·", "prompt-injection harness", NAVY, "⊗"), - (Inches(7.6), "YEAR 3–4", "Vital signs", "trace ingestion · replay ·", "runtime anomaly monitors", NAVY, "♥"), - (Inches(10.7), "10-YEAR NORTH STAR", "Medical record", "across the lifetime of every agent —", "development, incidents, retirement", CRITICAL, "★"), - ] - for cx, label, title, sub1, sub2, color, marker in stages: - # Marker symbol above - add_text(s, cx - Inches(0.4), ty - Inches(1.0), Inches(0.8), Inches(0.5), - marker, font=FONT_HEAD, size=24, color=color, align="center") - # Dot on axis - dot = slide_oval(s, cx - Inches(0.07), ty - Inches(0.07), Inches(0.14), Inches(0.14)) - _set_shape_fill(dot, color) - _set_shape_line(dot) - # Label below - add_text(s, cx - Inches(1.5), ty + Inches(0.2), Inches(3.0), Inches(0.3), - label, font=FONT_BODY, size=10, bold=True, color=color if color == CRITICAL else MUTED_2, - align="center") - add_text(s, cx - Inches(1.5), ty + Inches(0.5), Inches(3.0), Inches(0.4), - title, font=FONT_HEAD, size=16, bold=True, color=NAVY, align="center") - add_text(s, cx - Inches(1.5), ty + Inches(0.95), 
Inches(3.0), Inches(0.3), - sub1, font=FONT_BODY, size=10, color=NAVY_2, align="center") - add_text(s, cx - Inches(1.5), ty + Inches(1.2), Inches(3.0), Inches(0.3), - sub2, font=FONT_BODY, size=10, color=NAVY_2, align="center") - - # Bottom callout — moved up to clear the footer - cby = Inches(6.25) - add_rect(s, MARGIN_X, cby, Inches(12.1), Inches(0.6), fill=CREAM_2) - add_rect(s, MARGIN_X, cby, Inches(0.05), Inches(0.6), fill=NAVY) - add_text(s, MARGIN_X + Inches(0.3), cby + Inches(0.04), Inches(11.7), Inches(0.5), - "Today we build the first instrument. The compounding ambition is to make every production agent's " - "release, incident, and behavior traceable and accountable across its life.", - font=FONT_BODY, size=11, color=NAVY, anchor="middle", line_spacing=1.4) - - add_footer(s, 14) - - -def slide_oval(slide, x, y, w, h): - return slide.shapes.add_shape(MSO_SHAPE.OVAL, x, y, w, h) - - -def build_slide_15(prs): - s = prs.slides.add_slide(prs.slide_layouts[6]) - set_background(s, CREAM) - add_kicker_headline(s, "CLOSING", - [{"text": "What I'm looking for.", "size": 44, "bold": True, "color": NAVY}]) - add_text(s, MARGIN_X, Inches(2.05), Inches(12), Inches(0.5), - "This deck is not a fundraise. It's an invitation to think alongside us. " - "Three concrete asks, in order of immediate value:", - font=FONT_BODY, size=14, color=NAVY_2, line_spacing=1.4) - - cards = [ - ("ASK 01", "Sparring partners", - "Founders, operators, researchers willing to push back on the thesis. Especially: people " - "who think this is a feature, not a category. I want to be wrong fast.", - "Best for: Prateek, AI-infra peers, security/GRC operators, MCP & framework authors.", - "light"), - ("ASK 02 — MOST VALUABLE NOW", "Design partners", - "Teams shipping production agents with non-trivial tool surfaces — refunds, customer comms, " - "code execution, internal data access. 
I want to scan, find real risk, watch what gets fixed, " - "learn what their CI actually demands.", - "Looking for: 3–5 partners over the next 6–8 weeks.", - "dark"), - ("ASK 03 — LATER", "Capital optionality", - "Not raising today. When the design-partner loop converts the thesis to traction, I'd like " - "the conversation to continue with people who already understood the worldview.", - "Trigger: 3+ design partners using shipgate findings to gate releases.", - "light"), - ] - cw = Inches(3.95); ch = Inches(3.4); cy = Inches(2.65) - for i, (label, title, body, foot, theme) in enumerate(cards): - cx = MARGIN_X + i * (cw + Inches(0.1)) - if theme == "dark": - add_rect(s, cx, cy, cw, ch, fill=NAVY) - label_color = GOLD - title_color = CREAM - body_color = MUTED_DARK - foot_color = MUTED_DARK - else: - add_rect(s, cx, cy, cw, ch, fill=PAPER, line=RULE) - label_color = MUTED_2 - title_color = NAVY - body_color = NAVY_2 - foot_color = MUTED - add_text(s, cx + Inches(0.3), cy + Inches(0.22), cw - Inches(0.6), Inches(0.3), - label, font=FONT_BODY, size=10, bold=True, color=label_color) - add_text(s, cx + Inches(0.3), cy + Inches(0.6), cw - Inches(0.6), Inches(0.5), - title, font=FONT_HEAD, size=20, bold=True, color=title_color) - add_text(s, cx + Inches(0.3), cy + Inches(1.2), cw - Inches(0.6), Inches(1.7), - body, font=FONT_BODY, size=11, color=body_color, line_spacing=1.4) - add_text(s, cx + Inches(0.3), cy + ch - Inches(0.45), cw - Inches(0.6), Inches(0.35), - foot, font=FONT_BODY, size=9, italic=True, color=foot_color) - - # Bottom callout — moved up to clear the footer (text wraps to 2 lines) - cby = Inches(6.2) - cbh = Inches(0.65) - add_rect(s, MARGIN_X, cby, Inches(12.1), cbh, fill=CREAM_2) - add_rect(s, MARGIN_X, cby, Inches(0.05), cbh, fill=CRITICAL) - add_runs(s, MARGIN_X + Inches(0.3), cby + Inches(0.08), Inches(11.7), cbh - Inches(0.16), - [{"text": "Three Moons Lab is not building infrastructure to make agents smarter. 
" - "We're building infrastructure to make their entry into the world ", - "size": 11, "italic": True, "color": NAVY}, - {"text": "accountable", "size": 11, "italic": True, "bold": True, "color": CRITICAL}, - {"text": ".", "size": 11, "italic": True, "color": NAVY}], - anchor="middle", line_spacing=1.4) - - add_footer(s, 15) - - -# ---- Main ----------------------------------------------------------------- - -BUILDERS = [ - build_slide_01, build_slide_02, build_slide_03, build_slide_04, build_slide_05, - build_slide_06, build_slide_07, build_slide_08, build_slide_09, build_slide_10, - build_slide_11, build_slide_12, build_slide_13, build_slide_14, build_slide_15, -] - - -def main(): - prs = Presentation() - prs.slide_width = SLIDE_W - prs.slide_height = SLIDE_H - - for builder in BUILDERS: - builder(prs) - - prs.save(OUT) - size_kb = OUT.stat().st_size // 1024 - print(f"Wrote {OUT} ({size_kb} KB)") - - -if __name__ == "__main__": - main() diff --git a/docs/decks/vc-thesis/crop_fragments.py b/docs/decks/vc-thesis/crop_fragments.py deleted file mode 100644 index da7d0e63..00000000 --- a/docs/decks/vc-thesis/crop_fragments.py +++ /dev/null @@ -1,38 +0,0 @@ -"""Crop image fragments from rendered slides for embedding in the native pptx. - -We only crop the regions that can't be reconstructed in PowerPoint shapes -without losing visual intent: - -- Slide 4 Gödel self-reference loop SVG (right column) -- Slide 5 FEP three-region diagram (right column) -- Slide 8 syntax-highlighted editor panel + report card (full slide content) -""" -from pathlib import Path - -from PIL import Image - -BUILD = Path(__file__).resolve().parent / "build" -FRAGS = BUILD / "fragments" -FRAGS.mkdir(exist_ok=True) - -# Rendered slides are 1920x1080@2x = 3840x2160. Coordinates given in 1x. 
-def crop_2x(src: Path, x: int, y: int, w: int, h: int, out: Path): - im = Image.open(src) - sx = im.width / 1920 - sy = im.height / 1080 - box = (int(x * sx), int(y * sy), int((x + w) * sx), int((y + h) * sy)) - crop = im.crop(box) - crop.save(out, optimize=True) - print(f" {out.name} ({crop.size[0]}×{crop.size[1]})") - - -# Slide 4 — self-reference loop. SVG was at (~960, ~430) with size (540, 400). -crop_2x(BUILD / "slide-04.png", 920, 380, 700, 500, FRAGS / "godel-loop.png") - -# Slide 5 — FEP three-region diagram. Was right column. -crop_2x(BUILD / "slide-05.png", 1000, 400, 800, 480, FRAGS / "fep-boundary.png") - -# Slide 8 — keep the entire editor + report panel as a single image fragment. -# Use the v2 source which is already at exactly the right composition. -slide_08 = BUILD / "slide-08.png" -crop_2x(slide_08, 0, 0, 1920, 1080, FRAGS / "slide-08-full.png") diff --git a/docs/decks/vc-thesis/slide-08-options/README.md b/docs/decks/vc-thesis/slide-08-options/README.md deleted file mode 100644 index 1b66f865..00000000 --- a/docs/decks/vc-thesis/slide-08-options/README.md +++ /dev/null @@ -1,81 +0,0 @@ -# Slide 8 — three options to compare - -This folder holds three drafts of Slide 8 (Phase 1 — Static Release-Readiness Scanner). -All three use the same real artifact: `samples/support_refund_agent/expected/report.md`, -which is the CI-baseline output of `agents-shipgate scan` on the canonical -support-refund-agent fixture (Stripe refund + Shopify cancel + Gmail send + Zendesk + KB + -wildcard MCP exposure). Findings shown are the actual check IDs the scanner emits. - -Pick one. Delete the others. - -## V1 — Raw report rendering · `v1-raw-report.png` - -The actual `report.md` rendered as a PNG, framed by the slide chrome. Brand cream -background, navy text, monospace report card with a soft fade at the bottom signaling -"more findings below." - -**Tone:** honest, OSS-scanner, "this is the artifact, not a mockup." 
-**Strength:** maximum credibility — every finding ID, evidence string, recommendation -is exactly what the scanner outputs. No design liberties. -**Weakness:** dense; the audience has to read monospace text from slide distance. -**Use when:** the viewer is technical and you want to ground the deck in real output -without product polish. - -## V2 — Designed product-UI summary · `v2-product-ui.png` - -A clean dashboard tile mock with verdict banner, severity counts grid, and 4 top -findings as styled rows. Real check IDs preserved. Tool-surface summary as pill chips. - -**Tone:** "this could be a real product surface." -**Strength:** scannable from across a room; the verdict + counts read first, the -findings second, the inventory third — clear visual hierarchy. Strong slide energy. -**Weakness:** it's a designed mockup, not the actual output. Some viewers may discount -that you "designed it to look good." -**Use when:** the viewer is a VC/operator who wants to see a product story, not a -terminal capture; or when you want to set up Phase 2/3 visually consistent with this. - -## V3 — Animated terminal GIF · `v3-terminal-scan.gif` - -A ~9-second animated terminal showing the command being typed, then the actual -output streaming in: status, severity counts, top findings table, reports written, -exit code 20. Brand-matched cream slide chrome around a dark terminal panel. - -**Tone:** live demo without the latency. -**Strength:** the only option that *moves*; the eye locks onto the BLOCKED moment -and the cascade of CRITICAL findings. Memorable in a way static slides aren't. -**Weakness:** depends on the deck format — must be a context that plays GIFs -(Keynote/PowerPoint export, web, Loom, Figma); does not work in static PDF. -**Use when:** the deck is being shared digitally or screen-shared, and the viewer -will sit through 9 seconds of motion. 
- -## My read - -If forced to pick one for tomorrow's Prateek meeting, I'd lead with **V2** for the -slide-projected version (it reads fastest and feels most "company") and keep **V3** -in the back pocket as a Loom/follow-up artifact. **V1** has the strongest -intellectual-honesty signal but loses to V2 on legibility at slide distance. - -But you said "ship all three and decide later," so all three are here. - -## Files - -- `v1-raw-report.png` (1920×1080 @2x) -- `v2-product-ui.png` (1920×1080 @2x) -- `v3-terminal-scan.gif` (1280×720, ~9s, 53 frames, ~2.6 MB) -- `build_v1.py`, `build_v2.py`, `build_v3.py` — generators (re-runnable) -- `v1-raw-report.html`, `v2-product-ui.html` — generated intermediate HTML - -The `_v3_*_check.png` files are sandbox-side preview stills used to debug the GIF -before the final render; they're sandbox-owned and didn't delete cleanly. Safe to -ignore or remove manually. - -## Re-run - -```bash -cd docs/decks/vc-thesis/slide-08-options -python3 build_v1.py -python3 build_v2.py -python3 build_v3.py -``` - -Requires Python 3, `playwright` + `chromium`, `markdown`, and `Pillow`. diff --git a/docs/decks/vc-thesis/slide-08-options/build_v1.py b/docs/decks/vc-thesis/slide-08-options/build_v1.py deleted file mode 100644 index 379d872b..00000000 --- a/docs/decks/vc-thesis/slide-08-options/build_v1.py +++ /dev/null @@ -1,205 +0,0 @@ -"""V1 — Render the actual report.md as a clean PNG sized for slide use. - -Style: brand-cream + navy, terminal-monospace for the report body, signaling -"this is the real artifact, not a mockup." 
-""" -import asyncio -from pathlib import Path - -import markdown -from playwright.async_api import async_playwright - -OUT_DIR = Path(__file__).resolve().parent -REPO = OUT_DIR.parents[3] -SRC = REPO / "samples/support_refund_agent/expected/report.md" -OUT = OUT_DIR / "v1-raw-report.png" - -CSS = """ -:root { - --cream: #F5F0E5; - --cream-2: #ECE5D5; - --navy: #1A2530; - --navy-2: #2A3540; - --muted: #6B5F4D; - --critical: #B8392F; - --high: #C76A2C; - --medium: #B89530; - --rule: #D4CCB8; -} -* { box-sizing: border-box; } -html, body { - margin: 0; padding: 0; - background: var(--cream); - color: var(--navy); - font-family: -apple-system, "SF Pro Display", "Helvetica Neue", Helvetica, Arial, sans-serif; -} -body { - width: 1920px; height: 1080px; - padding: 64px 96px; - display: grid; - grid-template-columns: 1fr 1.4fr; - gap: 64px; -} -.left { - display: flex; - flex-direction: column; - justify-content: space-between; -} -.kicker { - font-size: 18px; - letter-spacing: 0.18em; - text-transform: uppercase; - color: var(--muted); - font-weight: 600; -} -h1.headline { - font-size: 64px; - line-height: 1.05; - font-weight: 700; - margin: 18px 0 28px 0; - letter-spacing: -0.02em; -} -.subhead { - font-size: 22px; - line-height: 1.5; - color: var(--navy-2); - max-width: 540px; -} -.subhead em { font-style: italic; color: var(--muted); } -.cmd { - margin-top: 36px; - font-family: "SF Mono", "JetBrains Mono", "Menlo", monospace; - font-size: 18px; - color: var(--navy-2); - background: var(--cream-2); - padding: 14px 20px; - border-left: 3px solid var(--navy); - border-radius: 2px; -} -.cmd .prompt { color: var(--muted); margin-right: 10px; } -.footer-row { - display: flex; justify-content: space-between; align-items: flex-end; - font-size: 16px; color: var(--muted); -} -.brand { - display: flex; align-items: center; gap: 12px; - font-weight: 600; letter-spacing: 0.04em; - color: var(--navy); -} -.brand .dot { width: 10px; height: 10px; border-radius: 50%; background: 
var(--navy); } -.report-card { - background: var(--cream-2); - border: 1px solid var(--rule); - border-radius: 6px; - padding: 56px 44px 44px 44px; - font-family: "SF Mono", "JetBrains Mono", "Menlo", monospace; - font-size: 14.5px; - line-height: 1.55; - color: var(--navy); - overflow: hidden; - position: relative; -} -.report-card::after { - content: ""; - position: absolute; - left: 0; right: 0; bottom: 0; - height: 120px; - background: linear-gradient(to bottom, rgba(236,229,213,0) 0%, var(--cream-2) 80%); - pointer-events: none; -} -.report-card::before { - content: "agents-shipgate-reports/report.md"; - position: absolute; - top: -11px; left: 32px; - background: var(--cream); - padding: 0 12px; - font-family: -apple-system, sans-serif; - font-size: 13px; - letter-spacing: 0.06em; - text-transform: uppercase; - color: var(--muted); -} -.report-card h1 { font-size: 22px; margin: 0 0 14px 0; font-weight: 700; } -.report-card h2 { font-size: 17px; margin: 22px 0 10px 0; color: var(--navy-2); border-bottom: 1px solid var(--rule); padding-bottom: 6px; } -.report-card h3 { font-size: 15px; margin: 16px 0 6px 0; color: var(--muted); text-transform: uppercase; letter-spacing: 0.05em; } -.report-card p { margin: 6px 0; } -.report-card ul, .report-card ol { margin: 6px 0 6px 22px; padding: 0; } -.report-card li { margin: 4px 0; } -.report-card code { background: rgba(26,37,48,0.08); padding: 1px 5px; border-radius: 3px; font-size: 13.5px; } -.report-card strong { color: var(--navy); } -.report-card table { border-collapse: collapse; width: 100%; margin: 8px 0; font-size: 13px; } -.report-card th, .report-card td { text-align: left; padding: 4px 8px; border-bottom: 1px solid var(--rule); } -.report-card th { color: var(--muted); font-weight: 600; } -/* highlight critical lines */ -.report-card .blocked { color: var(--critical); font-weight: 700; } -""" - - -def truncate_report_for_slide(md: str) -> str: - # Keep header + Top Findings + Findings By Category headers — drop 
the long appendix - # so the page reads at slide distance. - lines = md.splitlines() - cut = [] - for line in lines: - if line.startswith("## Recommended Next Actions"): - break - cut.append(line) - return "\n".join(cut) - - -def post_process_html(html: str) -> str: - # Mark BLOCKED line in red - html = html.replace( - "Result: BLOCKED - release blockers detected.", - 'Result: BLOCKED — release blockers detected.', - ) - return html - - -HTML_TEMPLATE = """ - - -
-
-
Phase 1 · Static Release-Readiness Scanner
-

When the agent can refund $5,000,
release readiness becomes a CI question.

-

- Scanned a real OpenAI Agents SDK + MCP + OpenAPI tool surface for a customer-support refund agent. - Static checks alone surfaced 2 critical, 14 high, blocking the release. -

-
$agents-shipgate scan --config support-refund-agent/shipgate.yaml
-
- -
-
- {report} -
- -""" - - -async def main(): - raw = SRC.read_text() - raw = truncate_report_for_slide(raw) - md_html = markdown.markdown(raw, extensions=["tables", "fenced_code"]) - md_html = post_process_html(md_html) - page_html = HTML_TEMPLATE.format(css=CSS, report=md_html) - - html_path = OUT_DIR / "v1-raw-report.html" - html_path.write_text(page_html) - - async with async_playwright() as p: - browser = await p.chromium.launch() - context = await browser.new_context(viewport={"width": 1920, "height": 1080}, device_scale_factor=2) - page = await context.new_page() - await page.goto(f"file://{html_path}") - await page.screenshot(path=str(OUT), full_page=False, clip={"x": 0, "y": 0, "width": 1920, "height": 1080}) - await browser.close() - print(f"Wrote {OUT}") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/docs/decks/vc-thesis/slide-08-options/build_v2.py b/docs/decks/vc-thesis/slide-08-options/build_v2.py deleted file mode 100644 index d826903a..00000000 --- a/docs/decks/vc-thesis/slide-08-options/build_v2.py +++ /dev/null @@ -1,522 +0,0 @@ -"""V2 — Designed report card, paired with a realistic editor screenshot of the -actual sample files (shipgate.yaml + refund_agent.py). - -Left = the raw input the team checked into git, syntax-highlighted with Pygments -inside an editor-like chrome. Right = the designed shipgate report card. - -The narrative spine: the team's prohibited-actions list literally says "issue -refund without approval", but the tool surface ships stripe.create_refund with -no approval policy declared — and shipgate detects exactly that gap. 
-""" -import asyncio -import html -from pathlib import Path - -from playwright.async_api import async_playwright -from pygments import highlight -from pygments.formatters.html import HtmlFormatter -from pygments.lexers.data import YamlLexer -from pygments.lexers.python import PythonLexer - -OUT_DIR = Path(__file__).resolve().parent -REPO = OUT_DIR.parents[3] -OUT = OUT_DIR / "v2-product-ui.png" - -SAMPLE_YAML = (REPO / "samples/support_refund_agent/shipgate.yaml").read_text() -SAMPLE_PY = (REPO / "samples/support_refund_agent/agents/refund_agent.py").read_text() - -# Hand-trimmed slide excerpt of the actual shipgate.yaml — keeps the -# narrative-load-bearing sections (declared_purpose, prohibited_actions, -# tool_sources, permissions) and elides the rest with `# …` markers so the -# viewer knows it continues. -YAML_TRIMMED = """version: "0.1" - -project: - name: support-refund-agent - owner: support-platform - -agent: - name: refund-assistant - sdk: { type: openai-agents, entrypoint: agents/refund_agent.py } - declared_purpose: - - answer refund policy questions - - prepare refund requests for human review - - update support ticket notes - prohibited_actions: - - issue refund without approval - - cancel order without explicit confirmation - - send external email without preview - -environment: - target: production_like - -tool_sources: - - { id: support_openapi, type: openapi, path: specs/support-tools.openapi.yaml } - - { id: support_mcp_tools, type: mcp, path: .agents-shipgate/mcp-tools.json } - - { id: wildcard_mcp_tools, type: mcp, path: .agents-shipgate/wildcard-tools.json } - - { id: openai_sdk_static, type: openai_agents_sdk, path: agents/refund_agent.py } - -permissions: - scopes: - - zendesk:tickets:read - - zendesk:tickets:write - - stripe:* - credential_mode: service_account - -# … policies, risk_overrides, checks, ci, output omitted""" - - - -# Brand-tinted Pygments theme that matches the cream/navy palette -PYGMENTS_THEME_CSS = """ -.codeblock { 
font-family: "SF Mono", "JetBrains Mono", "Menlo", monospace; font-size: 11.5px; line-height: 1.5; } -.codeblock .hll { background-color: #F0E9D6 } -.codeblock .c, .codeblock .ch, .codeblock .cm, .codeblock .cpf, .codeblock .c1, .codeblock .cs { color: #8B7E68; font-style: italic } /* Comment */ -.codeblock .k, .codeblock .kc, .codeblock .kd, .codeblock .kn, .codeblock .kp, .codeblock .kr, .codeblock .kt { color: #1A2530; font-weight: 700 } -.codeblock .nt { color: #2A3540; font-weight: 600 } /* yaml key */ -.codeblock .nb, .codeblock .nc, .codeblock .nf, .codeblock .nn { color: #1A2530; font-weight: 600 } -.codeblock .l, .codeblock .ld, .codeblock .s, .codeblock .s1, .codeblock .s2, .codeblock .se, .codeblock .sx, .codeblock .sb, .codeblock .sc, .codeblock .sd, .codeblock .sh, .codeblock .si, .codeblock .sr, .codeblock .ss { color: #6B7B4F } /* strings -> warm green */ -.codeblock .m, .codeblock .mf, .codeblock .mh, .codeblock .mi, .codeblock .mo, .codeblock .il { color: #C76A2C } /* numbers -> warm orange */ -.codeblock .o, .codeblock .ow { color: #6B5F4D } -.codeblock .p { color: #6B5F4D } -.codeblock .err { color: #B8392F } -.codeblock .gh { color: #1A2530; font-weight: 700 } -.codeblock .linenos { color: #B8AB91; padding-right: 12px; user-select: none; border-right: 1px solid #E5DCC6; margin-right: 12px; } -.codeblock pre { margin: 0; padding: 0; background: transparent; } -""" - - -def render_code(src: str, lexer, start_line: int = 1) -> str: - formatter = HtmlFormatter( - cssclass="codeblock", - linenos="inline", - linenostart=start_line, - nobackground=True, - wrapcode=True, - ) - out = highlight(src, lexer, formatter) - return out - - -YAML_HTML = render_code(YAML_TRIMMED, YamlLexer()) -PY_HTML = render_code(SAMPLE_PY, PythonLexer()) -# Keep PY_HTML available in case we want to add it back, but the slide focuses -# on shipgate.yaml since that's where the declared release contract lives. -_ = PY_HTML - - -HTML = r""" - - -
-
-
Phase 1 · Static Release-Readiness Scanner
-

What the team declared, - what shipgate detected.

-
-
$agents-shipgate scan --config support-refund-agent/shipgate.yaml
-
- -
- -
-
-
-
shipgate.yaml
-
pyrefund_agent.py
-
{ }mcp-tools.json
-
[ ]support-tools.openapi.yaml
-
samples/support_refund_agent/
-
-
-
- - YAML_PLACEHOLDER -
-
-
- - -
-
-
-
Detected · agents-shipgate scan
-

Release-readiness report

-
-
- target: production_like
- evidence coverage: mixed
- human review: recommended -
-
- -
-
-
Release blockers detected - 2 critical findings on a financial-action tool · release should not promote -
-
- -
-
2
Critical
-
14
High
-
2
Medium
-
0
Low
-
8
Tools scanned
-
- -
-

Top findings

-
showing 4 of 18 · sorted by severity
-
- -
-
Critical
-
-
SHIP-POLICY-APPROVAL-MISSINGstripe.create_refund
-
Tool can issue refunds with no declared approval policy — directly contradicts the manifest's prohibited-actions list.
-
-
- -
-
Critical
-
-
SHIP-SIDEFX-IDEMPOTENCY-MISSINGstripe.create_refund
-
No idempotency key, annotation, or declared idempotency policy — retries can double-refund.
-
-
- -
-
High
-
-
SHIP-AUTH-MANIFEST-BROAD-SCOPE
-
Manifest declares wildcard permission scope stripe:* — broader than any required tool scope.
-
-
- -
-
High
-
-
SHIP-INVENTORY-WILDCARD-TOOLSwildcard_mcp_tools.*
-
MCP source declares wildcard tool exposure — full tool surface is unknown at release time.
-
-
- -
-
- 8 tools - 3 high-risk - 1 wildcard - mcp×3 - openapi×4 - sdk×1 -
-
report.md · report.json · report.sarif
-
-
-
- -
-
Three Moons Lab · A working thesis · April 2026
-
08 / 13
-
- -""" - - -def assemble() -> str: - page = HTML - page = page.replace("PYGMENTS_PLACEHOLDER", PYGMENTS_THEME_CSS) - page = page.replace("YAML_PLACEHOLDER", YAML_HTML) - page = page.replace("PY_PLACEHOLDER", PY_HTML) - return page - - -async def main(): - page_html = assemble() - html_path = OUT_DIR / "v2-product-ui.html" - html_path.write_text(page_html) - async with async_playwright() as p: - browser = await p.chromium.launch() - context = await browser.new_context(viewport={"width": 1920, "height": 1080}, device_scale_factor=2) - pg = await context.new_page() - await pg.goto(f"file://{html_path}") - await pg.screenshot(path=str(OUT), clip={"x": 0, "y": 0, "width": 1920, "height": 1080}) - await browser.close() - print(f"Wrote {OUT}") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/docs/decks/vc-thesis/slide-08-options/build_v3.py b/docs/decks/vc-thesis/slide-08-options/build_v3.py deleted file mode 100644 index d8754f01..00000000 --- a/docs/decks/vc-thesis/slide-08-options/build_v3.py +++ /dev/null @@ -1,268 +0,0 @@ -"""V3 — Animated terminal GIF showing `agents-shipgate scan` running and arriving -at BLOCKED. Frame-by-frame in PIL. - -Output is sized for slide use (1920x1080) with the terminal embedded so it -matches V1/V2 framing. 
-""" -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path - -from PIL import Image, ImageDraw, ImageFont - -OUT_DIR = Path(__file__).resolve().parent -OUT = OUT_DIR / "v3-terminal-scan.gif" - -# Brand -CREAM = (245, 240, 229) -CREAM_2 = (236, 229, 213) -NAVY = (26, 37, 48) -NAVY_2 = (42, 53, 64) -MUTED = (107, 95, 77) -RULE = (212, 204, 184) -TERMINAL_BG = (15, 22, 30) # nearly black, navy-tinted -TERMINAL_FG = (235, 224, 196) # warm cream-ish -TERMINAL_DIM = (140, 130, 110) -TERMINAL_PROMPT = (180, 200, 175) -RED = (220, 95, 85) -ORANGE = (220, 140, 70) -YELLOW = (220, 185, 90) -GREEN = (140, 180, 130) - -W, H = 1920, 1080 - -# Fonts -def font(size: int, bold: bool = False) -> ImageFont.FreeTypeFont: - candidates = [ - "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf" if bold else "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf", - "/System/Library/Fonts/Menlo.ttc", - "/Library/Fonts/Menlo.ttc", - ] - for path in candidates: - try: - return ImageFont.truetype(path, size) - except OSError: - continue - return ImageFont.load_default() - -def sans(size: int, bold: bool = False) -> ImageFont.FreeTypeFont: - candidates = [ - "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf" if bold else "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", - "/System/Library/Fonts/Supplemental/Arial Bold.ttf" if bold else "/System/Library/Fonts/Supplemental/Arial.ttf", - "/System/Library/Fonts/Helvetica.ttc", - ] - for path in candidates: - try: - return ImageFont.truetype(path, size) - except OSError: - continue - return ImageFont.load_default() - - -# The full output we want to "type out" -# (text, color) pairs. None color = TERMINAL_FG default. -# A None entry separates lines. 
-SCRIPT: list[tuple[str, tuple[int, int, int] | None]] = [ - ("Agents Shipgate 0.5.1", None), - ("", None), - ("Project: support-refund-agent Agent: refund-assistant Target: production_like", None), - ("", None), - ("Status: Release blockers detected", RED), - ("Critical: 2 High: 14 Medium: 2 Low: 0", None), - ("Evidence coverage: mixed | Human review: recommended", TERMINAL_DIM), - ("", None), - ("Top findings:", None), - (" CRITICAL SHIP-POLICY-APPROVAL-MISSING stripe.create_refund", RED), - (" CRITICAL SHIP-SIDEFX-IDEMPOTENCY-MISSING stripe.create_refund", RED), - (" HIGH SHIP-AUTH-MANIFEST-BROAD-SCOPE manifest scopes=['stripe:*']", ORANGE), - (" HIGH SHIP-INVENTORY-WILDCARD-TOOLS wildcard_mcp_tools.*", ORANGE), - (" HIGH SHIP-SCHEMA-MISSING-BOUNDS stripe.create_refund.amount", ORANGE), - (" HIGH SHIP-SCHEMA-BROAD-FREE-TEXT gmail.send_customer_email", ORANGE), - ("", None), - ("Reports written: report.md report.json report.sarif", TERMINAL_DIM), - ("", None), - ("CI mode: strict Exit code: 20", RED), -] - -COMMAND_TEXT = "$ agents-shipgate scan --config support-refund-agent/shipgate.yaml" - - -@dataclass -class TerminalState: - """Snapshot of what the terminal currently shows, for one frame.""" - typed_chars: int = 0 # how many chars of the command have been typed - revealed_lines: int = 0 # how many output lines have been revealed - cursor_visible: bool = True - show_done: bool = False # show the trailing "✕ release blocked" callout - - -# Layout inside the slide -TERM_X, TERM_Y = 96, 250 -TERM_W, TERM_H = 1728, 720 -PAD = 36 -LINE_H = 30 -HEAD_LINE_H = 38 -FONT_MONO = font(21) -FONT_MONO_BOLD = font(21, bold=True) -FONT_HEAD_KICKER = sans(20, bold=True) -FONT_HEAD = sans(56, bold=True) -FONT_SUB = sans(22) -FONT_FOOT = sans(18) -FONT_BRAND = sans(18, bold=True) -FONT_TERM_TITLE = sans(15, bold=True) - - -def draw_slide_chrome(img: Image.Image) -> None: - d = ImageDraw.Draw(img) - # Kicker - d.text((96, 88), "PHASE 1 · STATIC RELEASE-READINESS SCANNER", 
fill=MUTED, font=FONT_HEAD_KICKER) - # Headline - d.text((96, 124), "What `agents-shipgate scan` actually says", fill=NAVY, font=FONT_HEAD) - # Subhead - d.text((96, 196), "Run on a real OpenAI Agents SDK + MCP + OpenAPI surface for a refund agent.", - fill=NAVY_2, font=FONT_SUB) - # Footer - d.ellipse((96, 1024, 110, 1038), fill=NAVY) - d.text((122, 1020), "Three Moons Lab · agents-shipgate v0.5.1", fill=NAVY, font=FONT_BRAND) - d.text((1820, 1020), "Slide 8 / V3", fill=MUTED, font=FONT_BRAND, anchor="ra") - - -def draw_terminal_frame(img: Image.Image, state: TerminalState) -> None: - d = ImageDraw.Draw(img) - # Terminal panel - d.rounded_rectangle((TERM_X, TERM_Y, TERM_X + TERM_W, TERM_Y + TERM_H), radius=10, fill=TERMINAL_BG) - # macOS-ish window dots - d.ellipse((TERM_X + 22, TERM_Y + 18, TERM_X + 38, TERM_Y + 34), fill=(190, 90, 80)) - d.ellipse((TERM_X + 46, TERM_Y + 18, TERM_X + 62, TERM_Y + 34), fill=(200, 170, 90)) - d.ellipse((TERM_X + 70, TERM_Y + 18, TERM_X + 86, TERM_Y + 34), fill=(140, 180, 130)) - # Title bar text - d.text((TERM_X + TERM_W // 2, TERM_Y + 21), "support-refund-agent — agents-shipgate", - fill=TERMINAL_DIM, font=FONT_TERM_TITLE, anchor="ma") - # divider - d.line((TERM_X, TERM_Y + 52, TERM_X + TERM_W, TERM_Y + 52), fill=(35, 45, 55), width=1) - - # Inside content - cx = TERM_X + PAD - cy = TERM_Y + 70 - - # Line 1: typed command, character by character - typed = COMMAND_TEXT[: state.typed_chars] - # split into prompt + rest for color - if typed.startswith("$"): - d.text((cx, cy), "$", fill=TERMINAL_PROMPT, font=FONT_MONO_BOLD) - rest = typed[1:] - # measure "$ " width - dx, _ = FONT_MONO_BOLD.getbbox("$")[2:] - d.text((cx + dx + 6, cy), rest, fill=TERMINAL_FG, font=FONT_MONO) - else: - d.text((cx, cy), typed, fill=TERMINAL_FG, font=FONT_MONO) - # blinking cursor at end of typed text (only while typing or after if no output yet) - if state.cursor_visible and state.revealed_lines == 0: - cmd_w, _ = ImageDraw.Draw(Image.new("RGB", (1, 
1))).textbbox((0, 0), typed, font=FONT_MONO)[2:] - # use textlength which returns just the width - cmd_w = FONT_MONO.getlength(typed) - d.rectangle((cx + cmd_w + 4, cy + 2, cx + cmd_w + 16, cy + LINE_H - 4), fill=TERMINAL_FG) - - # Output lines (after one blank line of separation) - if state.revealed_lines > 0: - oy = cy + LINE_H + 12 - for i, (text, color) in enumerate(SCRIPT[: state.revealed_lines]): - ink = color if color is not None else TERMINAL_FG - if text == "": - pass # blank line — just skip the height - else: - d.text((cx, oy), text, fill=ink, font=FONT_MONO) - oy += LINE_H - - # Trailing prompt + cursor only if we've revealed everything - if state.revealed_lines >= len(SCRIPT): - oy += 4 - d.text((cx, oy), "$", fill=TERMINAL_PROMPT, font=FONT_MONO_BOLD) - if state.cursor_visible: - d.rectangle((cx + 22, oy + 2, cx + 34, oy + LINE_H - 4), fill=TERMINAL_FG) - - -def render_frame(state: TerminalState) -> Image.Image: - img = Image.new("RGB", (W, H), CREAM) - draw_slide_chrome(img) - draw_terminal_frame(img, state) - return img - - -def build_frames() -> list[tuple[Image.Image, int]]: - """Return list of (frame, duration_ms).""" - frames: list[tuple[Image.Image, int]] = [] - - # 1. Empty terminal, blinking cursor on empty prompt — 0.6s with 2 blink cycles - for _ in range(2): - frames.append((render_frame(TerminalState(typed_chars=0, revealed_lines=0, cursor_visible=True)), 200)) - frames.append((render_frame(TerminalState(typed_chars=0, revealed_lines=0, cursor_visible=False)), 200)) - - # 2. 
Type the command character by character - cmd_len = len(COMMAND_TEXT) - # Type in chunks of 3 chars per frame for snappiness - chunk = 3 - pos = 1 # already showed $ - while pos <= cmd_len: - frames.append((render_frame(TerminalState(typed_chars=pos, revealed_lines=0, cursor_visible=True)), 30)) - pos += chunk - # Final state with full command - frames.append((render_frame(TerminalState(typed_chars=cmd_len, revealed_lines=0, cursor_visible=True)), 250)) - frames.append((render_frame(TerminalState(typed_chars=cmd_len, revealed_lines=0, cursor_visible=False)), 250)) - - # 3. Reveal output lines progressively, faster at start, slower around verdict - pacing = { - 0: 80, # banner - 4: 400, # Status line — pause for emphasis - 5: 250, # severity counts - 6: 200, - 8: 250, # Top findings: - 9: 280, # first critical - 10: 280, # second critical - 11: 180, - 12: 180, - 13: 180, - 14: 180, - 16: 200, - 18: 800, # final exit code, hold - } - for i in range(1, len(SCRIPT) + 1): - dur = pacing.get(i - 1, 90) - frames.append(( - render_frame(TerminalState(typed_chars=cmd_len, revealed_lines=i, cursor_visible=False)), - dur, - )) - - # 4. 
Hold + blink final cursor 3 times (~2.4s) - for _ in range(3): - frames.append((render_frame(TerminalState(typed_chars=cmd_len, revealed_lines=len(SCRIPT), cursor_visible=True)), 400)) - frames.append((render_frame(TerminalState(typed_chars=cmd_len, revealed_lines=len(SCRIPT), cursor_visible=False)), 400)) - - return frames - - -def main(): - frames = build_frames() - print(f"Built {len(frames)} frames") - images = [f for f, _ in frames] - durations = [d for _, d in frames] - # Reduce filesize: convert to P mode with adaptive palette - # First downsize to keep GIF reasonable for slide use - target_w = 1280 - target_h = int(H * target_w / W) - images = [im.resize((target_w, target_h), Image.LANCZOS).convert("P", palette=Image.ADAPTIVE, colors=128) for im in images] - images[0].save( - OUT, - save_all=True, - append_images=images[1:], - duration=durations, - loop=0, - optimize=True, - disposal=2, - ) - print(f"Wrote {OUT} ({OUT.stat().st_size // 1024} KB)") - - -if __name__ == "__main__": - main() diff --git a/docs/decks/vc-thesis/slide-08-options/v1-raw-report.png b/docs/decks/vc-thesis/slide-08-options/v1-raw-report.png deleted file mode 100644 index 1377f14d..00000000 Binary files a/docs/decks/vc-thesis/slide-08-options/v1-raw-report.png and /dev/null differ diff --git a/docs/decks/vc-thesis/slide-08-options/v2-product-ui.png b/docs/decks/vc-thesis/slide-08-options/v2-product-ui.png deleted file mode 100644 index 103b1d8e..00000000 Binary files a/docs/decks/vc-thesis/slide-08-options/v2-product-ui.png and /dev/null differ diff --git a/docs/decks/vc-thesis/slide-08-options/v3-terminal-scan.gif b/docs/decks/vc-thesis/slide-08-options/v3-terminal-scan.gif deleted file mode 100644 index e58320c3..00000000 Binary files a/docs/decks/vc-thesis/slide-08-options/v3-terminal-scan.gif and /dev/null differ diff --git a/docs/design-partner-verifier-pilot.md b/docs/design-partner-verifier-pilot.md index 4a0180ff..e2751a54 100644 --- a/docs/design-partner-verifier-pilot.md +++ 
b/docs/design-partner-verifier-pilot.md @@ -1,6 +1,6 @@ # Design Partner Verifier Pilot -Use this runbook to get three design partners through the v1.0.0a1 verifier +Use this runbook to get three design partners through the v0.14.0 verifier loop on one real or sanitized AI-generated agent PR each. ## Goal @@ -126,7 +126,7 @@ Paste this into the partner's coding agent from the target repo root: Add Agents Shipgate as an advisory verifier for this AI-generated agent-capability PR. -Use the v1.0.0a1 verifier-first path: +Use the v0.14.0 verifier-first path: 1. Install or upgrade agents-shipgate (the pilot needs contract v7 or newer): pipx install agents-shipgate pipx upgrade agents-shipgate diff --git a/docs/distribution.md b/docs/distribution.md index abcdf2f2..9407e8f0 100644 --- a/docs/distribution.md +++ b/docs/distribution.md @@ -5,7 +5,7 @@ These items require release infrastructure, registry credentials, domains, or Gi ## Package Channels - `agents-shipgate` is published on PyPI. -- Pinned GitHub Action release tags are published, including `v1.0.0a1`. +- Pinned GitHub Action release tags are published, including `v0.14.0`. - GitHub Releases attach the wheel, sdist, SBOM, and Sigstore bundles. - Evaluate a container image later only if it has an exercised build-and-test path. - Evaluate Homebrew once CLI usage warrants it. 
diff --git a/docs/examples/capability-lock.v0.2.example.json b/docs/examples/capability-lock.v0.2.example.json index 93aadea6..ef6eb92d 100644 --- a/docs/examples/capability-lock.v0.2.example.json +++ b/docs/examples/capability-lock.v0.2.example.json @@ -1,7 +1,7 @@ { "capability_lock_schema_version": "0.2", "experimental": false, - "cli_version": "1.0.0a1", + "cli_version": "0.14.0", "source": { "config_path": "shipgate.yaml", "manifest_dir": ".", diff --git a/docs/faq.md b/docs/faq.md index 7ac492c9..6af7775e 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -137,8 +137,8 @@ Skip emission with `--no-packet`; re-render later with ## Is it production-ready? -v1.0.0a1 is the current alpha contract version. The manifest schema is stable -across the 0.x series; see [`STABILITY.md`](../STABILITY.md). Used by +v0.14.0 is the current pre-1.0 beta contract version. The manifest schema is +stable across the 0.x series; see [`STABILITY.md`](../STABILITY.md). Used by early design partners. Public preview. ## How do I add it to GitHub Actions? 
diff --git a/docs/integrations.md b/docs/integrations.md index 5851ead8..f2383aa3 100644 --- a/docs/integrations.md +++ b/docs/integrations.md @@ -23,12 +23,12 @@ jobs: with: fetch-depth: 0 - id: agents-shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' ``` To post PR comments, set: @@ -181,7 +181,7 @@ agents-shipgate: stage: test image: python:3.12 script: - - python -m pip install --pre "agents-shipgate==1.0.0a1" + - python -m pip install --pre "agents-shipgate==0.14.0" - agents-shipgate scan --config shipgate.yaml --ci-mode advisory --format markdown,json,sarif artifacts: when: always @@ -213,7 +213,7 @@ jobs: - image: cimg/python:3.12 steps: - checkout - - run: python -m pip install --pre "agents-shipgate==1.0.0a1" + - run: python -m pip install --pre "agents-shipgate==0.14.0" - run: agents-shipgate scan --config shipgate.yaml --ci-mode advisory --format markdown,json,sarif - store_artifacts: path: agents-shipgate-reports @@ -276,7 +276,7 @@ Run Agents Shipgate locally on every commit that touches a tool-surface artifact # .pre-commit-config.yaml repos: - repo: https://github.com/ThreeMoonsLab/agents-shipgate - rev: v1.0.0a1 + rev: v0.14.0 hooks: - id: agents-shipgate ``` diff --git a/docs/quickstart.md b/docs/quickstart.md index b5738370..da33ce94 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -255,13 +255,13 @@ jobs: with: fetch-depth: 0 - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target pr_comment: "true" - shipgate_version: "1.0.0a1" + shipgate_version: "0.14.0" ``` Advisory mode never fails CI — it posts the merge verdict, capability changes, diff --git a/docs/shipgate-strategic-engineering-review.md b/docs/shipgate-strategic-engineering-review.md 
deleted file mode 100644 index 2c286626..00000000 --- a/docs/shipgate-strategic-engineering-review.md +++ /dev/null @@ -1,458 +0,0 @@ -# Shipgate Strategic Engineering Review - -> 历史说明:本文是 2026-06-09 基于 `0.11.0` / commit `ef58a57` -> 的战略审计快照。当前 `main` 已经实现 GitHub Check Run、Actions -> annotations、`verifier.json` 和 PR capability-review comment;文中将这些 -> 能力标为"缺失"的段落应按历史记录阅读,而不是当前实现状态。 - -> 审计日期:2026-06-09 · 审计对象:`ThreeMoonsLab/agents-shipgate` · commit `ef58a57` · 版本 `0.11.0` -> -> 证据标注约定:**【仓库实证】** 直接观察到;**【意图明确但未完成】** 有代码/文档意图但未落地;**【缺失/未实现】** 经搜索确认不存在;**【战略建议】** 基于明确论点的建议。 - ---- - -## 0. Executive Summary - -**一句话结论:这是一个工程纪律罕见地好、但需求验证严重不足的产品。最大的风险不在代码里,而在那个只有表头、没有一行数据的 `benchmark/results/2026-W2-baseline.csv` 里。** - -五个核心判断: - -1. **工程资产是真实的、稀缺的。** 静态-only 不变量由 AST 合约测试强制(`tests/test_adapter_static_only.py`)、单一决策引擎(`release_decision.decision`)、不可抑制的 trust-root 检查、severity floor、sigstore 签名 + Trusted Publishing + SBOM 的发布链。全套测试(≈2,100 项)一次通过。这一层"deterministic verifier + reward-hacking 抵抗"的纪律,在 agent 工具生态里几乎没有同类。 -2. **最大的弱点是"供给远超已验证的需求"。** 仓库已构建了六种 agent 接入面(AGENTS.md 块、Claude Code skill、Codex skill/plugin、Cursor rule、slash command、PR template)、GitHub Action(50 个 outputs)、adoption harness、governance benchmark、营销文案——但 adoption benchmark 结果文件是空的【仓库实证】,已知的真实 pilot 只有一次,且返回了最不可行动的裁决 `insufficient_evidence`(项目记录:2026-06-01 Stripe pilot)。 -3. **`insufficient_evidence` 是当前的产品杀手。** 阈值硬编码(≥50% 低置信工具、>3 source warnings,`ci/release_decision.py:27-28`),真实世界的 dynamic-toolkit 工厂模式必然触发它,而它给用户的下一步是"提供更好的输入"——这是把工作推回给用户,不是产品价值。 -4. **定位需要一次校准而非推翻。** 当前的 wedge 是"AI agent 项目的 tool-surface 合并门禁"——这个市场今天很小。但 trust-root 检查、`codex_boundary`(1,473 行)和最近的 "Phase 2 GitHub PR gate" 提交表明引擎已经长出了更大的那个方向:**治理任何被 coding agent 修改的仓库里的能力/权限漂移**(MCP 配置、CI 权限、agent 指令文件、workflow 变更)。这才是与 "agent-native verification layer" 使命对齐、且每个用 Claude Code/Codex 的团队都有的痛点。 -5. 
**下一个里程碑不该是新功能,而是证据。** 跑出真实的 adoption baseline 数据、跑通三个 design partner、把 `insufficient_evidence` 修成可行动的——在这三件事完成前,任何新表面积都是在加重已经过载的合约。 - ---- - -## 1. Evidence Base - -### 检查环境 - -- 工作目录:`/Users/pengfeihu/code/shipgate/.claude/worktrees/objective-greider-dff29c`(git worktree) -- Commit:`ef58a574586318eeb0583852d84b757e2c6abb93`,分支 `claude/objective-greider-dff29c`(基于 `main`),`git status` 干净 -- 近期提交:`2f3e58a` Add Phase 2 GitHub PR gate support、`ba97ac7` Codex-compatible local boundary check (#189)、`383b2e1` Standardize capability locks (#187) - -### 运行的命令(全部成功,无失败) - -| 命令 | 结果 | -|---|---| -| `git rev-parse HEAD` / `git status --short` / `git log` | 见上 | -| `find` 全仓库文件清单(src/ 含 ~200 个 Python 文件,tests/ 121 个测试文件) | 完整 | -| `PYTHONPATH=src .venv/bin/python -m pytest tests/ -x -q` | **exit 0,≈2,100 项测试通过,4 项 skip,无失败** | -| `curl https://pypi.org/pypi/agents-shipgate/json` | **PyPI 最新版 0.11.0**,与 `pyproject.toml:7` 一致(早期 pilot 中 pipx 拉到旧版 0.8.0 的问题已解决) | -| `ls examples/golden-prs/ plugins/ skills/ .agents/plugins/` | README 引用的路径全部存在,无断链 | -| `cat benchmark/results/2026-W2-baseline.csv` | **仅有 CSV 表头 `model,prompt,archetype,variant,score,...`,零数据行**【仓库实证】 | - -### 阅读的关键文件(节选) - -`README.md`(748 行)、`AGENTS.md`(568 行)、`ROADMAP.md`、`STABILITY.md`、`pyproject.toml`、`action.yml`(416 
行)、`shipgate.yaml`、`docs/category.md`、`docs/design-partner-verifier-pilot.md`、`docs/trust-model.md`、`docs/autofix-policy.md`、`src/agents_shipgate/cli/main.py`、`cli/scan/orchestrator.py`、`cli/verify/orchestrator.py`、`ci/release_decision.py`、`ci/exit_policy.py`、`core/capability_delta.py`、`core/codex_boundary.py`、`checks/registry.py`、`checks/verify*.py`、`inputs/protocol.py`、`schemas/manifest/root.py`、`schemas/verifier.py`、`schemas/agent_result_v1.py`、`tests/test_adapter_static_only.py`、`tests/test_self_approval_signal.py`、`tests/test_severity_override_floor.py`、`.github/workflows/{ci,release,agents-shipgate,adoption-harness}.yml`、`tools/shipgate-detect.py`、`llms.txt`、`.well-known/agents-shipgate.json`、`docs/triggers.json`。 - -另检索了 `policy / rule / gate / verify / agent / MCP / permission / capability / trust_root / insufficient_evidence / self_approval / FastMCP` 等关键词在 `src/` 内的全部命中。 - ---- - -## 2. Current Repo State - -### Shipgate 今天实际上是什么【仓库实证】 - -一个 **Python 3.12 / Typer / Pydantic v2** 的 CLI + GitHub Action:**对"AI agent 的工具能力面"做静态、确定性的 PR 合并门禁**。核心循环: - -``` -PR 改了 agent 能做什么 - → agents-shipgate verify --base origin/main --head HEAD - → base/head 双扫描 + capability delta + trust-root 检查 - → report.json.release_decision.decision (唯一决策引擎:blocked | review_required | insufficient_evidence | passed) - → verifier.json.merge_verdict (它的确定性投影:mergeable | human_review_required | insufficient_evidence | blocked | unknown) - → fix_task 路由:机械修复 → coding agent;权限缺口 → 人 -``` - -### 已实现 vs 仅暗示 - -**已实现(成熟)**: -- 完整 CLI(20+ 子命令:`scan / verify / init / detect / doctor / explain / fixture / baseline / apply-patches / attest / capability / trigger / install-hooks / bootstrap / feedback` 等,`cli/main.py:35-141`) -- 13 个静态输入 adapter(MCP 导出、OpenAPI、OpenAI SDK/API、Anthropic API、Google ADK、LangChain、CrewAI、n8n、Codex config/plugin 等,`inputs/protocol.py`) -- 16 个内建检查 + 6 个 Tier B verify 检查(`checks/registry.py:48-74`),entry-point 插件机制(默认关闭) -- capability fact / lock / diff 
模型(`core/capability_delta.py`、`docs/capability-standard.md`),语义方向分类(broadened/narrowed/mixed) -- GitHub Action(composite,50 个 outputs,sticky PR comment,SARIF 输出) -- 六种 agent 接入面渲染器(`cli/discovery/agent_instructions/renderers/`) -- 发布链:ruff + 85% 覆盖率门槛 + pip-audit + SBOM + sigstore 签名 + uv Trusted Publishing(`.github/workflows/release.yml`) - -**意图明确但未完成**: -- Workflow-evidence flywheel:`feedback capture` 首版已发,但 raw-bundle replay 和 `scenario replay` harness 未完成(`ROADMAP.md:35-46`) -- Pre-emptive authority surface(`verify --preview` 让 agent 在行动前知道不能碰什么)——roadmap 第 3 项,部分存在 -- Adoption benchmark:harness、matrix、runner 文档齐全,**但结果 CSV 为空** - -**缺失/未实现**: -- MCP server 模式(Shipgate 自己作为 MCP server 被 agent 调用)——`rg "FastMCP|mcp.server" src/` 零命中 -- Claude Code PreToolUse 级别的事中拦截(现有 hooks 是事后 advisory,`cli/install_hooks.py`) -- 跨仓库/组织级策略继承(manifest 无 `extends:`/`imports:`) - -### 成熟与早期的分界 - -**成熟**:决策引擎、capability diff、静态信任模型、测试纪律、发布工程、schema 版本管理。 -**早期/欠确定**:真实用户证据、`insufficient_evidence` 的实际可用性、agent 自发发现率(未测量)、面向新用户的认知负担(748 行 README、26 个 report-schema 版本文件、7+ 种输出 artifact)。 - ---- - -## 3. 
Engineering Architecture Review - -### 总体评价:A-。架构与使命对齐,纪律执行到位,少数模块需要拆分。 - -**分层清晰【仓库实证】**:`inputs/`(adapter,Protocol 契约)→ `core/`(domain + findings + capability)→ `checks/`(规则)→ `ci/`(决策)→ `report/ packet/`(渲染)→ `cli/`(编排)。scan 管线 9 个命名阶段,带稳定的 `_perf` 计时点(`cli/scan/orchestrator.py:22-136`);状态用不可变 dataclass 传递,findings 不在管线中途被原地修改。 - -**"一个决策引擎"在代码层面是真的**:`ci/release_decision.py:build_release_decision()` 是唯一裁决来源;`schemas/verifier.py` 的 `merge_verdict` 是其投影;`ci/exit_policy.py` 是唯一退出码映射;`cli/verify/fix_task.py` 的修复路由是规则式的。配套强制机制:`core/check_ids.py:14` 的 `UNSUPPRESSIBLE_FINDING_CATEGORIES`(verify/codex_boundary 类检查不可被 manifest 抑制)、`CheckMetadata.floor_severity`(降级有硬下限,越级降级需显式 acknowledgement + 过期时间,`tests/test_severity_override_floor.py` 共 68 个用例)。 - -**扩展性**:adapter 与 check 都走 entry-point(`pyproject.toml:96-109`),第三方插件默认关闭(`AGENTS_SHIPGATE_ENABLE_PLUGINS=1` 才开),载入有 4 道验证门(`checks/plugin_validation.py`)。 - -**代码质量信号**:`rg "TODO|FIXME|HACK" src/` 零命中;全面 Pydantic v2 + `Literal` 类型;`extra="forbid"` 严格校验加 `difflib` 字段名建议(`config/loader.py`)。 - -### 架构问题(按严重度) - -1. **`core/codex_boundary.py` 1,473 行、54 个函数的 god module**【仓库实证】。它同时管网络 profile、MCP 自动批准、hooks、skill、agent 指令削弱——这恰恰是未来最重要的方向(见 §7),现在就该拆成 `codex_boundary/{network,mcp,hooks,instructions}.py`,否则下一步扩展(Claude Code、Cursor 的 host 配置)会把它推到 3,000 行。 -2. **`cli/verify/orchestrator.py:run_verify()` 287 行**:base 扫描准备、diff 收集、verifier artifact 组装应拆分。 -3. **Schema 蔓延**:`docs/` 下 25 个 `report-schema.v0.x.json` + `schemas/manifest/` 26 个文件。冻结旧版本是对的,但 0.x 阶段每月 bump schema(v0.17→v0.25 仅几个月)说明 report 顶层块增长过快——每个新的 verifier-cycle 概念(`heuristics_filter`、`reviewer_summary`、`human_ack`、`capability_runtime_evidence`…)都成为顶层块。**【战略建议】** 在 v1.0 前做一次顶层块的合并收敛,否则 1.0 的稳定承诺会锁死一个过宽的表面。 -4. **决策逻辑的"分布式风险"**:裁决虽然单源,但贡献裁决输入的逻辑分散在 scan 阶段 5-8、verify 编排、各 check 模块。建议引入一个显式的 `DecisionEngine` 协议把 `release_decision + merge_verdict + exit_code + fix_task` 的契约固化成一处可审计的接口。 -5. 
**性能边界合理**:latency budget 测试(`tests/test_latency_budget.py`,预算在 `benchmark/perf/budgets.yaml`)+ 10MiB 单文件上限(`inputs/common.py:18`)。缺整体扫描预算(见 §8)。 - -**结论:当前架构完全可以支撑 agent-native 的下一步,瓶颈不在架构。** - ---- - -## 4. Policy / Verification Model Review - -### 模型回答任务清单的逐项判定 - -| 能力 | 判定 | 证据 | -|---|---|---| -| 策略是否声明式、确定性、可审计 | **是(局部)** | manifest `policies/permissions/checks` + YAML policy packs(`schemas/policy_pack.py`);匹配语义 AND-of-ORs,确定性排序 | -| 能否在 agent 行动**前**评估 | **部分** | `verify --preview` 与 `trigger` 提供 pre-flight;但没有事中(PreToolUse)拦截点【缺失】 | -| 能否解释为何通过/失败 | **是,且超出行业水平** | 每个 finding 带 `check_id / recommendation / evidence / provenance_kind / 双源 source`;`release_decision.contribution_rules[]` 解释每条 finding 落入 blocker/review/excluded 的规则名(`ci/release_decision.py:245-267`);`explain-finding` 命令 | -| 区分低/高风险变更 | **是** | capability effect(read/write/financial/code_execution)、`high_risk`、语义方向 broadened/narrowed | -| 表达最小权限 | **部分** | `permissions.scopes` + `is_broad_scope()` 启发式(`core/heuristics.py:31-49`);但 scope 是自由字符串,无 schema 化的权限模型 | -| 表达 review 要求 | **部分** | per-tool `require_approval_for_tools`;**无 per-path、无 CODEOWNERS 路由、无指定 reviewer**【缺失】 | -| 检测能力扩张 | **是(核心强项)** | capability fact 按 identity hash 配对,effect/authority/control hash 任一变化触发 `changed` + 方向判定(`core/capability_delta.py`) | -| 检测 MCP/tool 权限变更 | **部分** | `.mcp.json` 仅作为 trust-root 文件被"碰了就标记";`codex_boundary` 对 `.codex/config.toml` 有语义级检查;`.claude/settings.json`、`.cursor/mcp.json` 无语义 diff【部分缺失,详见 §7】 | -| 检测 CI/CD、secrets、依赖、网络、部署风险 | **窄** | CI gate 删除 = critical blocker(`checks/verify_ci_gate.py`);secrets 仅输出端 redaction;**依赖变更、workflow 权限扩张、deploy 配置不在模型内**【缺失】 | -| 机器可读输出 | **是,过剩** | report.json (schema v0.25)、verifier.json、SARIF、agent_result_v1、attestation、capability lock、feedback export | - -### 决策算法(实证伪代码,`ci/release_decision.py:35-214`) - -``` -for finding in findings: - suppressed → excluded - blocks_release 且非 baseline-matched → blocker (policy_block_new) - severity ∈ blocker层 且非 
baseline-matched → blocker (severity_block_new) - severity ∈ {C,H,M} 或 requires_human_review → review_item - 其余 → excluded - -blockers 非空 → blocked -低置信工具 ≥ ceil(0.5×tools) 或 warnings > 3 → insufficient_evidence ← 硬编码阈值 -review_items 非空 或 warnings > 0 → review_required -否则 → passed -``` - -### 三个实质性弱点 - -1. **`insufficient_evidence` 的阈值不可配置且语义太钝**【仓库实证 + 项目记录】。`_LOW_CONFIDENCE_TOOL_RATIO = 0.5`、`_MAX_TOLERATED_SOURCE_WARNINGS = 3` 写死在 `ci/release_decision.py:27-28`。唯一一次真实 pilot(2026-06-01,Stripe `stripe/ai` PR #232)正是栽在这里:dynamic-toolkit 工厂让静态提取置信度崩塌,裁决退化为"证据不足"。这个裁决对用户的含义是"Shipgate 看不懂你的仓库",第一印象即流失。修复方向不是调阈值,而是**让低置信场景产出具体的、可机械执行的 next_action**(如"检测到 config-bound toolkit 工厂,运行 `shipgate init --suggest-inventory` 生成工具清单骨架")。 -2. **策略表达力天花板**:匹配谓词是平铺的 AND-of-ORs,**无条件嵌套**(无法表达"financial 且 amount > 1000 才需 approval")、无 ABAC 式任意属性表达式、无策略继承。当前对单仓库够用,对组织级采购是硬伤。 -3. **review 只有"要人看"一档**:deny/warn/require-review/allow 四态中,"warn but pass"没有独立通道(非阻断 finding 全部落入 `review_required`)。这放大了噪音问题——advisory 模式下一切都是 review item,团队会习惯性忽略。 - -### 强项必须点名 - -- **Baseline + 抑制本身被审计**:baseline 审计日志(`core/baseline_audit.py`,JSONL)、抑制对 trust-root 类检查无效、`policy_weakened`/`trust_root_touched` 任一为真时即使裁决 mergeable 也强制 `can_merge_without_human=false`(`tests/test_self_approval_signal.py:107-131`)。**这是整个仓库最有护城河价值的 200 行逻辑。** -- 确定性贯彻到 hash:capability ID 只含 identity(路径/行号变化不抖动 ID),全部 JSON `sort_keys=True`,attestation 刻意去掉 wall-clock。 - ---- - -## 5. Agent-Native Workflow Review - -### 一个 agent 今天如何遇到 Shipgate - -发现链是仓库里设计最完整的部分【仓库实证】: - -1. **目标仓库已 init 过** → AGENTS.md/CLAUDE.md 管理块、`.claude/skills/`、`.cursor/rules/`、`.agents/skills/`(Codex)—— 六个渲染器(`cli/discovery/agent_instructions/renderers/`),managed-block 幂等更新。 -2. 
**目标仓库没 init 过** → 这是断点。agent 必须事先知道 Shipgate 存在。仓库准备了 `llms.txt`、`.well-known/agents-shipgate.json`、`docs/triggers.json`(machine-readable 触发表,有合约测试)、零安装检测器 `tools/shipgate-detect.py`(stdlib-only,curl | python3)——但这些只对"已经在浏览 Shipgate 仓库的 agent"有效。**冷启动发现完全依赖 (a) 人类引入,或 (b) agent 训练语料/搜索覆盖。** -3. **运行时机**:`docs/triggers.json` 的 44 条规则 + `trigger` 命令给出 run/skip 裁决——这是同类工具中独有的"该不该跑我"协议,设计正确。 -4. **解读输出**:读序协议明确(verifier.json 先于 report.json),`first_next_action` 带 `{actor, kind, command, why}`,错误带结构化 `next_action`(`AGENTS_SHIPGATE_AGENT_MODE=1`,`docs/errors.json`)。 -5. **修复**:`fix_task` 确定性路由 + `apply-patches`(dry-run 默认、manifest 目录围栏、SHA 校验)+ `docs/agent-autofix-boundary.md` 明文列出 agent 永远不许自断言的六类证据(approval/confirmation/idempotency/broad-scope/prohibited-action/trace)。**自动修复边界的设计是行业级范本。** - -### 三个关键缺口 - -1. **没有 MCP server 模式**【缺失/未实现】。`rg "FastMCP|mcp.server" src/` 零命中。后果:agent 无法在会话内以工具调用的方式问"这个改动会过门禁吗?"——必须 shell out 到 CLI 并解析文件。对 Claude Code 这无伤大雅(Bash 顺手),但对受限工具环境的 agent(无 shell 的 PR bot、IDE 内嵌 agent)这是接入硬墙。一个薄的 MCP wrapper(`shipgate_preview_diff`、`shipgate_verify`、`shipgate_explain_finding` 三个工具)即可补上,且与静态-only 不变量不冲突(server 本地起、不出网)。 -2. **事中(pre-action)拦截不存在**【缺失/未实现】。现有 Claude Code hooks(`install-hooks --target claude-code`)是 Edit/Write 之后的 advisory 检查 + Stop 时全量 verify。Roadmap 第 3 项(pre-emptive authority surface)方向正确但只解决"让 agent 提前知道边界",不解决"agent 越界时被拦"。**【战略建议】** 用 Claude Code 的 PreToolUse hook 实现真正的拦截:对 Edit/Write 目标路径做 trust-root 匹配,命中 `.github/workflows/*`、`shipgate.yaml`、`.mcp.json` 等保护面时返回 deny + 解释。这把 Shipgate 从"PR 时间的门禁"升级为"agent 循环内的边界",且实现成本低(trust-root glob 表已存在于 `checks/verify.py`)。 -3. **adoption 假设未经测量**【仓库实证】。harness 与 benchmark 的设计文档非常完善(24 个付费 cell 的矩阵、100 分 rubric、W2→W4 计划),但 `benchmark/results/2026-W2-baseline.csv` 零数据。所有"agent 会发现并使用 Shipgate"的断言目前都是未验证假设。**这应该是本周就做的事,先于一切新功能。** - ---- - -## 6. 
GitHub / CI / PR Integration Review - -### 现状【仓库实证】 - -- **Action**:composite(`action.yml`,416 行),verify 模式默认,`shipgate_version` 可锁 PyPI 版本,50 个 outputs(`decision / merge_verdict / can_merge_without_human / trust_root_touched / policy_weakened / capability_changes_* / trigger_action` 等),sticky PR comment(分页搜索 marker,6000 字符截断),artifact 上传,SARIF 输出对接 code scanning。 -- **示例齐全**:`examples/github-actions/` 含 advisory、block-on-blocked、require-mergeable、baseline、SARIF、multi-config 等配方;自家仓库 dogfood(`.github/workflows/agents-shipgate.yml`,advisory + `fail_on_decisions: block`)。 -- **决策策略**:`fail_on_decisions` 输入支持按 `block`/`require_review` 失败——这就是"blocking vs advisory"的正确分层。 - -### 缺口与下一个集成里程碑 - -1. **没有 GitHub Check Run / annotations**【缺失】。现在的呈现是 PR comment + SARIF。SARIF 路径要求仓库开 code scanning(私有仓库需 GHAS 付费),comment 是单条大块文本。**【战略建议】** 下一个里程碑应是原生 **Check Run**:per-finding 的行级 annotation(capability 变更指向 manifest/工具定义的具体行)、`merge_verdict` 作为 check conclusion、`neutral` conclusion 对应 `review_required`。这让 branch protection 能直接 require 这个 check,而不需要用户自己写 `fail_on_decisions` 逻辑。 -2. **没有 risk label / reviewer 路由**【缺失】。outputs 里已有 `trust_root_touched`、`capability_changes_added` 等信号,但没有现成的"贴 `agent-capability-change` label + 按 CODEOWNERS 请求 reviewer"的官方配方。这是十行 workflow 的事,却是 reviewer 体验的关键一截——建议作为 `examples/github-actions/09-risk-labels-and-reviewers.yml` 补上,长期并入 Action 本体。 -3. **噪音控制依赖用户自律**:advisory 模式 + 每 PR 一条 comment,对非 agent 改动的 PR 依赖 `trigger` 的 skip 裁决。skip 时是否完全静默(不发 comment、不建 check)需要在 Action 里做成显式保证并写进文档——"无关 PR 零噪音"应该是营销级承诺。 -4. **base ref 获取是已知的用户坑**:verify 从不 fetch(信任模型决定),需要 `fetch-depth: 0`。文档已反复强调,但 Action 可以在 base 不可用时输出更明确的修复指引(已有 `merge_verdict: unknown` + exit 2 的行为,可加 `next_action: "set fetch-depth: 0"` 到 step summary)。 - ---- - -## 7. 
MCP / Tool Permission Governance Review - -**这是战略上最重要的一节。** - -### 现状盘点 - -| 面 | 现状 | 证据 | -|---|---|---| -| MCP 导出(agent 项目声明的工具面) | 一等公民,语义级 | `inputs/mcp.py`,wildcard 工具面有专门 finding | -| `.codex/config.toml` / hooks / plugin | 语义级检查 | `core/codex_boundary.py`:network mode=full、MCP auto-approve-write(critical)、app connector 自动批准、AGENTS.md 削弱、CI gate hook 移除 | -| `.mcp.json` | 仅 trust-root "被碰即标记" | `checks/verify.py` glob 表,无字段级 diff | -| `.claude/settings.json`(permissions/hooks/MCP) | **不在模型内** | trust-root glob 表无此项【缺失】 | -| `.cursor/mcp.json`、VS Code MCP 配置 | **不在模型内** | 同上【缺失】 | -| GitHub workflow `permissions:` 扩张、新 secrets 引用、`pull_request_target` | **不在模型内** | 仅"CI gate 文件被删"有检查【缺失】 | -| package.json scripts / pre-commit 命令变更 | **不在模型内** | 【缺失】 | -| 环境变量/secrets 暴露 | 仅输出端 redaction | `core/privacy.py`,不是对 diff 的治理 | - -### 判断 - -`codex_boundary` 证明了引擎完全有能力对 **coding-agent host 配置**做语义级能力 diff——它只是目前只对 Codex 做了。`.mcp.json`、`.claude/settings.json` 里的一行改动(新 MCP server、`permissions.allow` 加一条 `Bash(*)`)就是真实的能力扩张事件,今天 Shipgate 只能说"trust root 被碰了,请人看"。 - -### 提议的具体模型:Capability Diff for Agent Hosts【战略建议】 - -把现有 `CapabilityFactV1`(identity/effect/authority/controls/evidence + 七类 hash)从"agent 项目的工具"推广到"coding-agent host 的权限授予": - -```yaml -# 每条 host 配置授予 = 一个 capability fact -identity: {host: claude_code | codex | cursor, kind: mcp_server | permission_rule | hook | env} -effect: {network: bool, filesystem: read|write, shell: bool, scope_pattern: "Bash(*)"} -authority: {auto_approved: bool, env_passthrough: [..], transport: stdio|http} -controls: {requires_confirmation: bool} -``` - -diff 语义复用现有 broadened/narrowed/mixed。策略层复用 policy packs: - -```yaml -rules: - - id: ORG-MCP-NO-NEW-SERVER-WITHOUT-REVIEW - match: {host_capability: {kind: mcp_server, change: added}} - severity: high - block: false # require-review - - id: ORG-NO-WILDCARD-BASH-ALLOW - match: {host_capability: {kind: permission_rule, scope_pattern: "Bash(*"}} - severity: critical - block: true -``` - 
-**"permission boundary review"** = 把上述 host capability lock 存入 `.agents-shipgate/host-capabilities.lock.json`,PR 时 diff,扩张走 `human_review_required`,缩小自动 `mergeable`。所有基础设施(lock、diff、verdict、attestation)都已存在,这是推广不是新建。 - -这一步同时回答了 §10 的定位问题:它把目标用户从"开发 AI agent 产品的团队"(小)扩展到"用 coding agent 改代码的团队"(所有人)。 - ---- - -## 8. Security / Least-Privilege Review - -### 总体:强。这是仓库最可信的部分。 - -**已落地的防线【仓库实证】**: - -1. **静态-only 是合约不是口号**:AST 扫描器禁止 `exec/eval/__import__/compile/runpy/subprocess/importlib`(`.metadata`/`.resources` 除外)及别名规避;例外按 `(path, line, snippet, rationale)` 逐条钉死在 `ALLOWED_EXCEPTIONS`,行号漂移即测试失败(`tests/test_adapter_static_only.py`)。CI 把它放在全量测试之前 fail-fast(`ci.yml` step 7)。 -2. **Reward-hacking 防御**:trust-root finding 不可抑制;`policy_weakened || trust_root_touched` ⇒ 强制人审;severity floor 不可越(越级降级需带过期时间的 acknowledgement,过期即 ConfigError 而非 warning)。 -3. **Auto-fix 围栏**:dry-run 默认、`--confidence high` 默认、ManualPatch 无条件过滤、目标文件必须 `relative_to(manifest_dir)`(违者 exit 5)、SHA 漂移即拒绝、approval/confirmation/idempotency 证据**永久**禁止自动修复(`docs/autofix-policy.md`)。 -4. **供应链**:Actions 全部 SHA-pin、uv Trusted Publishing(OIDC,无长期凭证)、sigstore 签名(wheel + sdist + SBOM)、pip-audit 每次 CI、85% 覆盖率发布门槛。 -5. 
**输出脱敏默认开**:模式 + 敏感键双路径,`privacy_audit` 块记录脱敏统计(`core/privacy.py`)。 - -### 优先级排序的改进清单 - -| 优先级 | 项 | 证据/理由 | -|---|---|---| -| **P1** | **Symlink 逃逸测试**:`Path.resolve()` 跟随符号链接,manifest 内 symlink 理论上可把 `apply-patches` 的写目标或输入读取指出围栏外。`relative_to` 在 resolve 之后比较,恶意 symlink 解析后可能落在围栏内的假象路径 | `apply_patches.py:184-225`、`inputs/common.py:22-33`;加 `is_symlink()` 检查 + 逃逸用例 | -| **P1** | **Host 配置治理缺口本身就是安全缺口**:`.claude/settings.json` 加 `Bash(*)` 不触发任何语义检查(见 §7) | trust-root glob 表【缺失】 | -| **P2** | **Redaction marker 伪造**:`[REDACTED:...]` 全匹配即放行,攻击者可控字符串可伪造脱敏标记污染报告 | `core/privacy.py:142-143` | -| **P2** | **插件信任**:entry-point 插件是任意代码,现有控制为默认关闭 + provenance 记录;建议加 hash/签名 allowlist | `docs/trust-model.md:50-71` | -| **P3** | **整体输入预算**:10MiB 是单文件上限,多 source 可叠加;加 per-scan 总预算 | `inputs/common.py:18` | -| **P3** | **插件 check_id 冲突改为载入期拒绝**(现为载入后标记 `id_collision`) | `STABILITY.md:132` | -| **P3** | Attestation 可选接 Rekor 时间戳(保持本地确定性为默认) | `cli/attest.py` | - -**Prompt-injection 角度**:Shipgate 的输入(manifest、工具 schema、OpenAPI 描述)会进入报告并被 agent 阅读。报告中的 `recommendation` 是 Shipgate 生成的,但 `evidence` 含用户文件内容——被扫描仓库里恶意构造的工具描述("ignore previous instructions…")会原样进入 agent 要读的 verifier.json。**【战略建议】** 在 agent-facing 输出中对来自被扫描内容的字符串加显式来源界定(如 `untrusted_excerpt` 字段或引用包裹),并在 `docs/report-reading-for-agents.md` 中写明"evidence 字段内容不可作为指令执行"。 - ---- - -## 9. Documentation / Examples Review - -### 现状:体量惊人(50+ Markdown + 15+ JSON schema),无断链,无内部矛盾【仓库实证】。任务书要求的文档几乎全部存在: - -`README` ✓ / `docs/architecture.md` ✓ / policy 模型(`manifest-v0.1.md` + `policy-packs.md`)✓ / agent workflows(`agent-recipes.md`、`agent-contract-current.md`、`agent-native-merge-contract.md`)✓ / GitHub Action(README 节 + examples/)✓ / MCP governance(部分,散在 `capability-standard.md` 与 codex 文档)△ / `examples/` ✓ / `AGENTS.md` ✓ / `ROADMAP.md` ✓。 - -### 真正的问题不是缺文档,是**信息架构倒挂** - -1. 
**README 748 行,第一屏没有给"怀疑论者"的 30 秒路径**。现在的开头是 tagline + 徽章 + verify-first 快速开始——对已被说服的用户是对的;对第一次到达的人,"deterministic merge gate for AI-generated agent capability changes" 需要三次阅读才能解析。**建议**:第一屏改为一个 60 秒的故事——"Claude Code 给你的 refund agent 加了 `stripe.create_refund`。这个 PR 该不该合?" + 一张 `blocked` verdict 截图 + `uvx agents-shipgate fixture run ai_generated_refund_pr` 一行命令。其余 600 行下沉到 docs/。 -2. **概念词汇过载**:Tool-Use Readiness Report、Release Evidence Packet、verifier cycle、capability lock、attestation、agent result、feedback export、workflow evidence bundle——8 种 artifact 概念对新用户同时出现。`docs/concepts.md` 只有 92 行,承载不了。**建议**新增一页 `docs/mental-model.md`:一张图讲清"一个引擎、一个裁决、其余皆投影",每个 artifact 标注"谁读、何时读、可忽略否"。 -3. **`docs/mcp-governance.md` 不存在**【缺失】——而这是任务书点名、且是 §7 战略方向的承载文档。应随 host capability 工作创建。 -4. **样例缺一个"agent 不安全行为被拦"的完整叙事**:golden PRs 演示的是"缺 approval policy 被拦";缺一个"coding agent 试图删除 Shipgate CI / 放宽 manifest 被 SHIP-VERIFY-* 拦下"的端到端 fixture——这恰恰是差异化卖点,应做成 `fixture run agent_weakens_gate` 并放进 README 第一屏故事。 -5. **文档维护成本风险**:schema 版本注记散布在 README、STABILITY、agent-contract-current、llms.txt、.well-known 多处(项目记忆中已有 bump checklist 应对)。建议把"哪些文件随 schema bump 必改"做成 `scripts/check-contract-sync.py` 合约测试,替代人肉清单。 - ---- - -## 10. Product Positioning Review - -### 直说 - -**当前定位("the deterministic merge gate for AI-generated agent capability changes")精确、诚实、且太窄。** - -它要求目标用户同时满足三个条件:(a) 在开发 tool-using AI agent 产品;(b) 用 coding agent 写 PR;(c) 团队已经意识到 tool-surface 漂移是发布风险。三者交集在 2026 年中是一个很小的集合——这解释了为什么 pilot 难找、benchmark 没数据。`docs/category.md` 试图创建 "Tool-Use Readiness" 类目,但类目创建需要分发力量,小团队烧不起。 - -### 谁是第一用户? - -诚实的答案排序: -1. **平台/安全工程师,所在团队大量使用 Claude Code/Codex**(不一定在做 agent 产品)——他们的痛是"agent 改了 CI 权限/加了 MCP server/动了 workflow,没人看见"。这个痛**今天每周都在发生**,且没有现成工具。 -2. 正在做 production tool-using agent 的团队(当前定位的目标)——痛是真的但人群小、且往往有自建审查。 -3. 
AI coding agent 本身作为"用户"——长期正确,但 agent 不会自发采用没有进入其训练语料/指令的工具(empty CSV 是证据)。 - -### 建议的定位校准【战略建议】 - -不推翻,做**同心圆扩展**: - -> **内核(不变)**:deterministic capability-diff 引擎 + 不可削弱的 trust root。 -> **环 1(现有)**:AI agent 项目的 tool-surface 合并门禁。 -> **环 2(建议 6 周内启动,§7 的 host capability 工作)**:"**当 coding agent 改变它自己或你的 agent 能做什么时,Shipgate 决定能不能合。**" 覆盖 `.mcp.json`、`.claude/settings.json`、workflow permissions、agent 指令文件——所有用 coding agent 的仓库都适用。 - -一句话压力测试:对一个没听过 Shipgate 的工程师说"Semgrep 看代码漏洞,Shipgate 看**能力变更**——你的 AI 改了它能碰什么的时候,有人盘问它"——这句比 "Tool-Use Readiness" 在 5 秒内可懂。 - -### 与邻类的区别(为什么不是 X) - -- **不是 linter**:裁决对象是能力 delta 不是代码风格;带 merge 权限语义(human authority routing)。 -- **不是 OPA/policy-as-code 通用引擎**:OPA 给你表达式语言让你自己建模;Shipgate 自带 agent 能力的领域模型(capability facts、trust root、autofix boundary)。代价是表达力天花板(§4),换来的是开箱即用 + 不可被 agent 绕过。 -- **不是 Semgrep**:Semgrep 匹配代码模式;Shipgate 比对**声明的权限面**随 PR 的语义变化,并对"修改门禁本身"免疫。 -- **不是 branch protection/CODEOWNERS**:那是路由谁来看;Shipgate 产出**看什么、为什么、能不能不看人**。两者应集成(§6 建议)而非竞争。 - ---- - -## 11. Adoption / Distribution Strategy - -### 现实约束 - -小团队、零已记录的 adoption 数据、一次部分失败的 pilot、PyPI 0.11.0 已就绪、Action 已上 marketplace、分发面(六种 kit、Codex marketplace、llms.txt)已建完。**分发基建不缺,缺的是被分发的证据(proof)。** - -### 90 天战术(按序) - -**第 1-2 周:制造证据。** -1. 跑掉 W2 baseline benchmark(矩阵和 runner 都现成),**把真实数据填进空 CSV 并公开**。即使分数难看也比空文件强——"我们测了 agent 自发发现率,结果是 X" 本身就是稀缺内容。 -2. 修 `insufficient_evidence`(§4 弱点 1),让 Stripe-类仓库的第一次运行产生可行动结果,然后回去把那个 pilot 救活。 -3. 做 `fixture run agent_weakens_gate`(§9 建议 4)并录一个 90 秒视频:Claude Code 试图删 CI 门禁 → `blocked` + 人审路由。这是唯一一个看一遍就懂护城河的 demo。 - -**第 3-6 周:10 个真实用户。** -4. design-partner 漏斗换抓手:不再问"你在做 agent 产品吗",改问"你的团队用 Claude Code/Codex 吗?想看它这个月改了哪些能力面吗?"——用 §7 的 host capability 扫描做免安装的 **`shipgate audit --host`** 一次性报告(零配置、只读、出一页 Markdown),作为获客钩子。 -5. 内容:两篇有数据的文章——《我们让 8 种 coding agent 在没有提示的情况下找安全门禁,结果如下》(benchmark 数据)+《一个 AI 生成的 PR 如何悄悄拿到 refund 权限》(golden PR 叙事)。投 r/ExperiencedDevs、HN、AI engineering 社区。安全社区角度单独写 trust-root/reward-hacking 设计文(这是安全人群会转发的内容)。 -6. 
把 Action 的 marketplace listing 信息密度提到与 README 第一屏一致(截图 + 一行接入)。 - -**第 7-12 周:让 agent 替你分发。** -7. 给 Anthropic/OpenAI 的 agent 文档生态投 PR/提案:Claude Code 的 hooks 示例库、Codex plugin 目录——成为"官方示例里出现的那个门禁"。 -8. 100 stars 的路径不是求 star,是上面 5 的两篇内容 + HN 一次首页。准备好仓库的"着陆 30 秒"(§9 建议 1)再发。 -9. **从 agent 工作流收集反馈而非只收人类反馈**:`feedback capture`(roadmap 项 2)已有首版——把每个 design partner 的 verify 前后对、verdict 转移、`suspected_gate_bypass` 信号变成 replayable scenario,公开脱敏后的 governance case 数量作为周指标。 - ---- - -## 12. Competitive / Adjacent Landscape - -| 邻类 | 代表 | Shipgate 应学什么 | Shipgate 不应变成什么 | -|---|---|---|---| -| Linting / static analysis | ruff, ESLint | 零配置首跑体验 | 通用代码质量工具 | -| 安全扫描 | Snyk, Trivy | 免安装 audit 获客(§11.4) | CVE 数据库竞争 | -| Policy-as-code | OPA/Conftest | 组织级策略分发模型(policy packs v2 可借鉴 bundle 机制) | 通用策略表达式引擎——领域模型才是差异化 | -| 代码模式扫描 | Semgrep | 规则市场/registry 的社区飞轮 | 用 capability 模型去匹配任意代码模式 | -| 供应链评分 | OpenSSF Scorecard | "一个分数 + 徽章"的传播力——考虑 `shipgate badge` | 静态打分器(裁决必须保持 PR 粒度) | -| MCP 安全 | mcp-scan 等新生工具 | **这是最近的碰撞区**:谁先定义 "MCP 配置变更治理" 谁拿走类目 | 运行时 MCP gateway(明确非目标,`docs/category.md`) | -| Agent runtime 护栏 | guardrails 框架、LLM firewall | 互补叙事:Shipgate 管 merge 前,他们管运行时 | 运行时执行层 | -| GitHub 原生 | branch protection, CODEOWNERS | 深度集成(Check Run、reviewer 路由) | 被一个 GitHub 原生功能"顺手做了"——**防御靠 trust-root 语义深度,GitHub 不会做 reward-hacking 抵抗** | - -**应该成为**:coding-agent 时代的"能力变更裁决层"——窄、深、不可绕过。 -**不应成为**:第 14 个静态分析器、运行时代理、或通用 policy 引擎。 - ---- - -## 13. 
Recommended Roadmap - -| Timeframe | Priority | Initiative | Why it matters | Complexity | Acceptance criteria | -|---|---|---|---|---|---| -| 立即(1-2 周) | P0 | 跑 W2 adoption baseline 并公开数据 | 一切 agent-native 假设目前零证据;空 CSV 是信誉负债 | 低(harness 已建好) | `benchmark/results/` 含 ≥16 cell 真实分数;README 链接结果 | -| 立即(1-2 周) | P0 | `insufficient_evidence` 可行动化:config-bound/dynamic-factory 检测 + 具体 next_action(含 inventory 骨架生成) | 唯一真实 pilot 的失败模式;首跑体验决定留存 | 中 | Stripe 型仓库(dynamic toolkit)首跑产出具体修复命令而非"提供更好的输入";pilot 复跑得到非 IE 裁决 | -| 立即(1-2 周) | P1 | `fixture run agent_weakens_gate` + 90 秒 demo 视频 + README 第一屏重构 | 护城河(trust root)目前没有一个 30 秒可懂的展示 | 低 | 新 fixture 展示 SHIP-VERIFY-CI-GATE-REMOVED → blocked → human 路由;README 首屏 ≤60 行进入第一个命令 | -| 立即(1-2 周) | P1 | Symlink 逃逸修复 + 测试;redaction marker 伪造修复 | §8 P1/P2 安全项,修复成本低 | 低 | 新增逃逸/伪造用例全绿;`is_symlink()` 检查落地 | -| 近期(4-6 周) | P0 | **Host capability governance**:`.claude/settings.json`、`.mcp.json`、`.cursor/*`、workflow `permissions:` 进入 capability fact 模型(§7) | 把目标市场从"做 agent 的团队"扩展到"用 agent 的团队";与 codex_boundary、Phase 2 PR gate 方向一致 | 中-高 | host capability lock + diff 落地;新 MCP server / `Bash(*)` allow / workflow 权限扩张触发 `human_review_required`;codex_boundary 拆分为子包 | -| 近期(4-6 周) | P1 | `shipgate audit --host` 零配置一次性报告(获客钩子) | design-partner 漏斗当前抓手太窄 | 低-中(复用 host capability 扫描) | 在无 shipgate.yaml 的仓库一条命令产出一页 Markdown 能力面盘点 | -| 近期(4-6 周) | P1 | GitHub Check Run + 行级 annotations + risk label/reviewer 路由配方 | comment+SARIF 不足以进 branch protection 的主流路径 | 中 | merge_verdict 映射 check conclusion;finding 行级标注;`examples/.../09-risk-labels.yml` | -| 近期(4-6 周) | P2 | 三个 design partner 跑完 verifier pilot(runbook 已有) | 产品方向需要真实 PR 流校准 | 低(运营为主) | 3 份 redacted feedback artifact 入库为 governance case | -| 中期(2-3 月) | P1 | Claude Code PreToolUse 拦截 hook(trust-root 保护面的事中 deny) | 从"PR 门禁"升级为"agent 循环内边界";竞品无人有此位 | 中 | agent 编辑保护面文件被 hook 拦截并收到解释 + 正确流程指引;可一键安装 | -| 中期(2-3 月) | P1 | 薄 MCP server 模式(preview/verify/explain 三工具,本地、不出网) | 无 shell 的 agent 环境接入硬墙;分发面进 
MCP 目录生态 | 中 | Claude Code 经 MCP 调用 `shipgate_preview_diff` 得到与 CLI 字节一致的裁决投影 | -| 中期(2-3 月) | P2 | Policy packs v2:条件谓词(嵌套 AND/OR + 数值比较)+ 组织级 pack 分发 | 当前表达力天花板挡住平台团队采购(§4) | 中-高 | "financial 且 amount>1000 须 approval" 可声明;pack 可从 org repo 引用并锁 hash | -| 中期(2-3 月) | P2 | Report 顶层块收敛 + v1.0 schema RC | 25 个 schema 版本/顶层块膨胀将锁死 1.0 承诺 | 中 | 顶层块数量收敛后冻结 RC;迁移指南 | -| 长期(6-12 月) | P1 | Attestation 消费端:跨仓库 capability registry + deploy 系统对接 | 从"PR 工具"变成"组织的能力变更账本"——平台级护城河 | 高 | 多仓库 attestation 聚合查询:"过去 30 天谁给哪个 agent 加了什么权限,谁批的" | -| 长期(6-12 月) | P2 | 托管面(dashboard/org baseline/SSO),按既有承诺独立于 OSS 核心 | 商业化载体(README 定价立场已预留) | 高 | 与 OSS 边界清晰;design partner 转付费 ≥1 | - ---- - -## 14. Concrete Next Engineering Plan - -按依赖顺序的前六步(不实施,仅规划): - -1. **修 `insufficient_evidence`** - - 文件:`ci/release_decision.py`(IE 分支输出结构化 `evidence_gaps[]`:每个低置信工具 → 缺什么输入 → 哪条命令补);`inputs/_python_framework.py` + 各 adapter(识别 config-bound/factory 模式并产出 `SHIP-EVIDENCE-DYNAMIC-FACTORY` finding 替代静默降置信);新命令 `init --suggest-inventory`(生成 tool-inventory 骨架 JSON)。 - - 测试:以 Stripe pilot 形态构造 fixture(dynamic toolkit 工厂),断言裁决附带可执行 next_action;现有 IE 用例不回归。 - - 风险:阈值语义变化可能影响既有用户的裁决分布——保持阈值不动,只增强输出,零裁决回归。 - -2. **Host capability governance(环 2)** - - 新模块:`inputs/agent_hosts/{claude_code,cursor,mcp_json,github_workflows}.py`(解析 host 配置为 `HostCapabilityFact`,复用 `schemas/capabilities.py` 加 `subject_kind: "host_grant"`);`core/codex_boundary.py` 拆为 `core/host_boundary/` 子包并把 Codex 作为其中一个 host。 - - Schema:`docs/host-capability-schema.v0.1.json`;lock 文件 `.agents-shipgate/host-capabilities.lock.json`。 - - 检查:`SHIP-HOST-MCP-SERVER-ADDED`、`SHIP-HOST-PERMISSION-BROADENED`、`SHIP-HOST-WORKFLOW-PERMISSIONS-EXPANDED`、`SHIP-HOST-SECRET-REF-ADDED`,全部归入不可抑制类目。 - - 文档:新建 `docs/mcp-governance.md`(任务书点名缺失项)。 - - 测试:每 host 一组 golden 配置对(before/after)断言 diff 方向与裁决;trust-root glob 表扩充的回归测试。 - -3. 
**GitHub Check Run 集成** - - `action.yml` 增 `check_run: true` 输入;新 `scripts/github_check_run.py`(Checks API,annotation ≤50/批);`report/sarif.py` 的 region 信息复用为 annotation 行号。 - - 验收:blocked PR 显示红色 check + 行级标注;branch protection 可 require。 - -4. **PreToolUse hook** - - `cli/install_hooks.py` 增 `--target claude-code-pretooluse`;hook 脚本读 trust-root glob 表(从 `checks/verify.py` 提为共享数据文件 `docs/trust-roots.json`,双端消费),Edit/Write 目标命中保护面即返回 deny + `next_action`。 - - 测试:hook 脚本单测(无需真实 Claude Code);glob 表双端一致性合约测试。 - -5. **薄 MCP server** - - 新 `src/agents_shipgate/mcp_server/`(可选 extra `[mcp]`,依赖 `mcp` SDK);三工具 `preview_diff / verify / explain_finding`,全部进程内调用现有编排器,输出为 verifier.json 投影。 - - 静态-only 合约:MCP server 进 `ALLOWED_EXCEPTIONS` 审计(本地 stdio,不出网),并在 trust-model.md 注明。 - -6. **README/docs 信息架构重构** - - README 收敛至 ~150 行(故事 + 一条命令 + verdict 表 + 三个出口链接);新 `docs/mental-model.md`;其余内容下沉。`scripts/check-contract-sync.py` 替代人工 schema-bump checklist。 - -**总体风险与取舍**:(a) 环 2 扩张与 "no more adapters" 非目标的张力——host 配置解析不是框架 adapter,是 trust-root 论点的直接延伸,应在 ROADMAP 里明文区分以免原则被稀释;(b) 表面积继续增长 vs 收敛——上述 1/3/4/5 都是对既有裁决的新投影,不新增裁决语义,符合"无第二裁决"原则;(c) PreToolUse/MCP server 引入宿主生态耦合——以共享数据文件(trust-roots.json)而非代码耦合控制半径。 - ---- - -## 15. Open Questions - -1. **环 2 的命名与边界**:host capability governance 发布时是 Shipgate 的新模式,还是独立子命令品牌(`shipgate host`)?影响类目叙事。 -2. **W2 benchmark 若分数极低**(agent 完全不自发发现),是否把资源从"discovery 优化"转向"人类安装、agent 服从"的路径?需要数据后决策。 -3. **policy packs v2 的表达式语言选型**:自研受限谓词 vs 嵌入 CEL——后者表达力强但引入解释器信任面,与静态-only 哲学的相容性需要论证。 -4. **商业化时点**:README 承诺核心永久 OSS;托管面启动的触发条件是什么(N 个付费意向?环 2 验证?)——当前未定义。 -5. **`shipgate` 与 `agents-shipgate` 双命名**(`pyproject.toml` 两个 script 入口)长期收敛到哪个?品牌一致性 vs 兼容成本。 -6. **Stripe pilot 的后续**:IE 修复后是否还有重启窗口?design-partner 漏斗的当前真实状态仓库内不可见。 - ---- - -## 16. 
Final Verdict - -- **最强资产**:trust-root / reward-hacking 抵抗的那一层——不可抑制检查、severity floor、self-approval 强制人审、确定性 capability diff、由 AST 合约测试钉死的静态-only 信任模型。这是别人需要重走一年纪律才能复制的部分。 -- **最大弱点**:需求证据。空的 benchmark CSV、单一且部分失败的 pilot、为尚未到来的用户建好的六种接入面。工程在以产品已验证的方式运转,而产品还没被验证。 -- **最重要的战略修正**:从"做 AI agent 的团队的工具面门禁"扩展到"**用 coding agent 的团队的能力变更门禁**"(host capability governance)。引擎已经为此长好了,市场大两个数量级,且与既有护城河同根。 -- **最高杠杆的工程改动**:修 `insufficient_evidence` 的可行动性。它同时解锁 pilot 复活、首跑体验、和"verdict 永远给出下一步"的产品承诺。 -- **最佳采用楔子**:`shipgate audit --host` 零配置一次性报告 + "agent 试图削弱门禁被拦"的 90 秒 demo。 -- **建议的下一个里程碑**:**"v0.12 — Evidence"**:真实 benchmark 数据公开 + IE 可行动化 + 三个 design partner 跑完 + host capability 首版。不是更多表面积。 -- **残酷的诚实评估**:今天的 Shipgate,对绝大多数团队是 nice-to-have——因为它守护的事故(agent 越权合并)大多数团队还没经历过,而它的安装与概念成本不低。但它押的方向——coding agent 写大部分 PR、人类只裁决权限边界——正在以季度为单位变成现实。**它拥有成为 must-have 基础设施层的全部工程要件,唯独还没有完成从"为那个世界建好了"到"被那个世界需要着"的惊险一跃。** 未来 90 天把证据补上,这个判断就会改写;继续堆表面积,它会成为一个被引用、被尊敬、但没人用的优雅仓库。 - ---- - -*报告基于 commit `ef58a57`。所有文件路径与行号以该 commit 为准。* diff --git a/docs/target-repo-agent-snippets.md b/docs/target-repo-agent-snippets.md index 51456a26..89fd850e 100644 --- a/docs/target-repo-agent-snippets.md +++ b/docs/target-repo-agent-snippets.md @@ -360,13 +360,13 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target pr_comment: "true" - shipgate_version: "1.0.0a1" + shipgate_version: "0.14.0" ``` Advisory mode reports findings without blocking merge. 
Move to strict mode only diff --git a/docs/upstream-integrations.md b/docs/upstream-integrations.md index d778a8c8..f0db8459 100644 --- a/docs/upstream-integrations.md +++ b/docs/upstream-integrations.md @@ -311,12 +311,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' ``` `init --ci` writes a similar workflow into `.github/workflows/agents-shipgate.yml`. Switch to `ci_mode: strict` only after the team has reviewed the advisory output and saved a baseline (see [`baseline.md`](baseline.md)). diff --git a/docs/use-cases/ai-generated-agent-prs.md b/docs/use-cases/ai-generated-agent-prs.md index 73605b1f..31d22e01 100644 --- a/docs/use-cases/ai-generated-agent-prs.md +++ b/docs/use-cases/ai-generated-agent-prs.md @@ -149,13 +149,13 @@ jobs: with: fetch-depth: 0 - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' - name: Gate on the merge verdict run: | echo "merge_verdict=${{ steps.shipgate.outputs.merge_verdict }}" diff --git a/docs/zero-install.md b/docs/zero-install.md index 598906fb..31de5c7b 100644 --- a/docs/zero-install.md +++ b/docs/zero-install.md @@ -81,12 +81,12 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' ``` The full template lives at 
[`examples/github-actions/01-advisory-pr-comment.yml`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/examples/github-actions/01-advisory-pr-comment.yml). diff --git a/examples/agent-protocol/expected/block-stop.json b/examples/agent-protocol/expected/block-stop.json index 16527f7f..4e1942fb 100644 --- a/examples/agent-protocol/expected/block-stop.json +++ b/examples/agent-protocol/expected/block-stop.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/examples/agent-protocol/expected/policy-bypass.json b/examples/agent-protocol/expected/policy-bypass.json index 6220fd62..5ebed0c4 100644 --- a/examples/agent-protocol/expected/policy-bypass.json +++ b/examples/agent-protocol/expected/policy-bypass.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/examples/agent-protocol/expected/repair-after.json b/examples/agent-protocol/expected/repair-after.json index 1c79dc07..4ffc7e20 100644 --- a/examples/agent-protocol/expected/repair-after.json +++ b/examples/agent-protocol/expected/repair-after.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/examples/agent-protocol/expected/repair-before.json b/examples/agent-protocol/expected/repair-before.json index f74f6dc1..a050eac7 100644 --- a/examples/agent-protocol/expected/repair-before.json +++ b/examples/agent-protocol/expected/repair-before.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/examples/circleci/01-advisory.yml b/examples/circleci/01-advisory.yml index 026a12c4..dbadaa41 100644 --- a/examples/circleci/01-advisory.yml +++ 
b/examples/circleci/01-advisory.yml @@ -6,7 +6,7 @@ jobs: - image: cimg/python:3.12 steps: - checkout - - run: python -m pip install "agents-shipgate==1.0.0a1" + - run: python -m pip install "agents-shipgate==0.14.0" - run: agents-shipgate scan --config shipgate.yaml --ci-mode advisory --format markdown,json,sarif - store_artifacts: path: agents-shipgate-reports diff --git a/examples/circleci/02-strict-with-baseline.yml b/examples/circleci/02-strict-with-baseline.yml index 88cb0cac..a5a7e76d 100644 --- a/examples/circleci/02-strict-with-baseline.yml +++ b/examples/circleci/02-strict-with-baseline.yml @@ -6,7 +6,7 @@ jobs: - image: cimg/python:3.12 steps: - checkout - - run: python -m pip install "agents-shipgate==1.0.0a1" + - run: python -m pip install "agents-shipgate==0.14.0" - run: name: Agents Shipgate strict scan command: > diff --git a/examples/circleci/03-sarif-artifact-retention.yml b/examples/circleci/03-sarif-artifact-retention.yml index abee4cec..56e4e4b9 100644 --- a/examples/circleci/03-sarif-artifact-retention.yml +++ b/examples/circleci/03-sarif-artifact-retention.yml @@ -6,7 +6,7 @@ jobs: - image: cimg/python:3.12 steps: - checkout - - run: python -m pip install "agents-shipgate==1.0.0a1" + - run: python -m pip install "agents-shipgate==0.14.0" - run: agents-shipgate scan --config shipgate.yaml --ci-mode advisory --format markdown,json,sarif - store_artifacts: path: agents-shipgate-reports/report.sarif diff --git a/examples/circleci/04-multi-config-workspace.yml b/examples/circleci/04-multi-config-workspace.yml index f7cba08f..9eb7dc27 100644 --- a/examples/circleci/04-multi-config-workspace.yml +++ b/examples/circleci/04-multi-config-workspace.yml @@ -6,7 +6,7 @@ jobs: - image: cimg/python:3.12 steps: - checkout - - run: python -m pip install "agents-shipgate==1.0.0a1" + - run: python -m pip install "agents-shipgate==0.14.0" - run: name: Agents Shipgate workspace scan command: > diff --git a/examples/circleci/05-on-tool-source-changes.yml 
b/examples/circleci/05-on-tool-source-changes.yml index f9c0d662..8b2d63a7 100644 --- a/examples/circleci/05-on-tool-source-changes.yml +++ b/examples/circleci/05-on-tool-source-changes.yml @@ -6,7 +6,7 @@ jobs: - image: cimg/python:3.12 steps: - checkout - - run: python -m pip install "agents-shipgate==1.0.0a1" + - run: python -m pip install "agents-shipgate==0.14.0" - run: name: Run only when tool sources changed command: | diff --git a/examples/github-actions/01-advisory-pr-comment.yml b/examples/github-actions/01-advisory-pr-comment.yml index 1407d71b..474883c0 100644 --- a/examples/github-actions/01-advisory-pr-comment.yml +++ b/examples/github-actions/01-advisory-pr-comment.yml @@ -18,10 +18,10 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target check_annotations: 'true' pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/examples/github-actions/02-strict-on-critical.yml b/examples/github-actions/02-strict-on-critical.yml index 1ef90467..a3d0c268 100644 --- a/examples/github-actions/02-strict-on-critical.yml +++ b/examples/github-actions/02-strict-on-critical.yml @@ -18,10 +18,10 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: strict diff_base: target fail_on: critical pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/examples/github-actions/03-strict-with-baseline.yml b/examples/github-actions/03-strict-with-baseline.yml index 23620468..8a748ab7 100644 --- a/examples/github-actions/03-strict-with-baseline.yml +++ b/examples/github-actions/03-strict-with-baseline.yml @@ -19,11 +19,11 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: 
ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: strict diff_base: target fail_on: critical,high baseline: .agents-shipgate/baseline.json pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/examples/github-actions/04-multi-config-workspace.yml b/examples/github-actions/04-multi-config-workspace.yml index 6eb10878..3415768d 100644 --- a/examples/github-actions/04-multi-config-workspace.yml +++ b/examples/github-actions/04-multi-config-workspace.yml @@ -21,7 +21,7 @@ jobs: with: python-version: '3.12' cache: pip - - run: pip install --quiet agents-shipgate==1.0.0a1 + - run: pip install --quiet agents-shipgate==0.14.0 - run: | agents-shipgate scan \ --workspace . \ diff --git a/examples/github-actions/05-sarif-to-code-scanning.yml b/examples/github-actions/05-sarif-to-code-scanning.yml index c5a9c8ca..a4c6c33b 100644 --- a/examples/github-actions/05-sarif-to-code-scanning.yml +++ b/examples/github-actions/05-sarif-to-code-scanning.yml @@ -22,13 +22,13 @@ jobs: with: fetch-depth: 0 - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target check_annotations: 'true' pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' - if: always() uses: github/codeql-action/upload-sarif@v3 with: diff --git a/examples/github-actions/06-on-tool-source-changes.yml b/examples/github-actions/06-on-tool-source-changes.yml index f112a720..995156fc 100644 --- a/examples/github-actions/06-on-tool-source-changes.yml +++ b/examples/github-actions/06-on-tool-source-changes.yml @@ -32,9 +32,9 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/examples/github-actions/07-block-on-blocked-verdict.yml 
b/examples/github-actions/07-block-on-blocked-verdict.yml index 93e05604..2aa9b5a3 100644 --- a/examples/github-actions/07-block-on-blocked-verdict.yml +++ b/examples/github-actions/07-block-on-blocked-verdict.yml @@ -19,13 +19,13 @@ jobs: with: fetch-depth: 0 - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' - name: Fail blocked capability changes if: steps.shipgate.outputs.merge_verdict == 'blocked' run: exit 1 diff --git a/examples/github-actions/08-require-mergeable.yml b/examples/github-actions/08-require-mergeable.yml index 15481345..80e451c2 100644 --- a/examples/github-actions/08-require-mergeable.yml +++ b/examples/github-actions/08-require-mergeable.yml @@ -19,13 +19,13 @@ jobs: with: fetch-depth: 0 - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' - name: Require mergeable verifier verdict if: steps.shipgate.outputs.can_merge_without_human != 'true' run: exit 1 diff --git a/examples/github-actions/09-risk-labels-and-reviewers.yml b/examples/github-actions/09-risk-labels-and-reviewers.yml index 5db0723a..b296c74f 100644 --- a/examples/github-actions/09-risk-labels-and-reviewers.yml +++ b/examples/github-actions/09-risk-labels-and-reviewers.yml @@ -25,13 +25,13 @@ jobs: with: fetch-depth: 0 - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' - name: Apply risk labels and request reviewers if: always() uses: actions/github-script@v7 diff --git 
a/examples/github-actions/10-check-run-annotations.yml b/examples/github-actions/10-check-run-annotations.yml index 8a371fab..8a61bb2d 100644 --- a/examples/github-actions/10-check-run-annotations.yml +++ b/examples/github-actions/10-check-run-annotations.yml @@ -5,7 +5,7 @@ # This recipe uses check_run_policy=require-mergeable, so only PRs with # can_merge_without_human=true produce a successful Check Run. # -# check_run_policy is newer than v1.0.0a1. Until the next release is tagged, +# check_run_policy is newer than v0.14.0. Until the next release is tagged, # this example targets main and intentionally omits shipgate_version so the # action installs the CLI from the same ref. After release, pin both the action # ref and shipgate_version to that release. diff --git a/examples/github-actions/11-fail-on-insufficient-evidence.yml b/examples/github-actions/11-fail-on-insufficient-evidence.yml index 79fa8145..0532edc0 100644 --- a/examples/github-actions/11-fail-on-insufficient-evidence.yml +++ b/examples/github-actions/11-fail-on-insufficient-evidence.yml @@ -19,13 +19,13 @@ jobs: with: fetch-depth: 0 - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: config: shipgate.yaml ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' - name: Fail insufficient static evidence if: steps.shipgate.outputs.merge_verdict == 'insufficient_evidence' run: exit 1 diff --git a/examples/github-actions/12-host-grant-drift.yml b/examples/github-actions/12-host-grant-drift.yml index 64fb92d1..41159601 100644 --- a/examples/github-actions/12-host-grant-drift.yml +++ b/examples/github-actions/12-host-grant-drift.yml @@ -38,7 +38,7 @@ jobs: python-version: "3.12" - name: Install agents-shipgate (pinned) - run: python -m pip install "agents-shipgate==1.0.0a1" + run: python -m pip install "agents-shipgate==0.14.0" - name: Compare host grants against the acknowledged 
baseline run: | diff --git a/examples/github-actions/13-org-governance.yml b/examples/github-actions/13-org-governance.yml index 10dd2a60..88f0e26c 100644 --- a/examples/github-actions/13-org-governance.yml +++ b/examples/github-actions/13-org-governance.yml @@ -26,7 +26,7 @@ jobs: with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 id: shipgate with: config: shipgate.yaml diff --git a/examples/github-actions/README.md b/examples/github-actions/README.md index 71e45927..ad674ece 100644 --- a/examples/github-actions/README.md +++ b/examples/github-actions/README.md @@ -37,9 +37,9 @@ Configure per-job, never repo-wide. For reproducible CI, pin both the action and the underlying CLI: ```yaml -- uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 +- uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: - shipgate_version: "1.0.0a1" + shipgate_version: "0.14.0" ``` When `shipgate_version` is empty the action installs the CLI from the action source — convenient for local action development, less reproducible for CI. @@ -57,7 +57,7 @@ When `shipgate_version` is empty the action installs the CLI from the action sou ```yaml - id: shipgate - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + uses: ThreeMoonsLab/agents-shipgate@v0.14.0 - if: steps.shipgate.outputs.decision == 'blocked' run: echo "Release blocked by Agents Shipgate" @@ -102,7 +102,7 @@ mergeable/success, blocked/failure, human-routed/neutral behavior. `blocked` and `unknown` so setup failures do not look successful. For direct branch protection, use `check_run_policy: require-mergeable`; only `can_merge_without_human == true` succeeds. `check_run_policy` is newer than -v1.0.0a1; until the next release is tagged, the Check Run policy example targets +v0.14.0; until the next release is tagged, the Check Run policy example targets `main` and omits `shipgate_version` so the action installs from that ref. 
`verify` writes static capability artifacts to the workflow artifact when diff --git a/examples/gitlab-ci/01-advisory.yml b/examples/gitlab-ci/01-advisory.yml index ef296b13..d2142f77 100644 --- a/examples/gitlab-ci/01-advisory.yml +++ b/examples/gitlab-ci/01-advisory.yml @@ -5,7 +5,7 @@ agents_shipgate: stage: test image: python:3.12 script: - - python -m pip install "agents-shipgate==1.0.0a1" + - python -m pip install "agents-shipgate==0.14.0" - agents-shipgate scan --config shipgate.yaml --ci-mode advisory --format markdown,json,sarif artifacts: when: always diff --git a/examples/gitlab-ci/02-strict-with-baseline.yml b/examples/gitlab-ci/02-strict-with-baseline.yml index 8bf622d6..aaa9ec04 100644 --- a/examples/gitlab-ci/02-strict-with-baseline.yml +++ b/examples/gitlab-ci/02-strict-with-baseline.yml @@ -5,7 +5,7 @@ agents_shipgate: stage: test image: python:3.12 script: - - python -m pip install "agents-shipgate==1.0.0a1" + - python -m pip install "agents-shipgate==0.14.0" - > agents-shipgate scan --config shipgate.yaml diff --git a/examples/gitlab-ci/03-sarif-or-artifact.yml b/examples/gitlab-ci/03-sarif-or-artifact.yml index ef4cb5aa..f60aad1b 100644 --- a/examples/gitlab-ci/03-sarif-or-artifact.yml +++ b/examples/gitlab-ci/03-sarif-or-artifact.yml @@ -5,7 +5,7 @@ agents_shipgate: stage: test image: python:3.12 script: - - python -m pip install "agents-shipgate==1.0.0a1" + - python -m pip install "agents-shipgate==0.14.0" - agents-shipgate scan --config shipgate.yaml --ci-mode advisory --format markdown,json,sarif artifacts: when: always diff --git a/examples/gitlab-ci/04-multi-config-workspace.yml b/examples/gitlab-ci/04-multi-config-workspace.yml index 6ceaf6b0..16ca9c4d 100644 --- a/examples/gitlab-ci/04-multi-config-workspace.yml +++ b/examples/gitlab-ci/04-multi-config-workspace.yml @@ -5,7 +5,7 @@ agents_shipgate: stage: test image: python:3.12 script: - - python -m pip install "agents-shipgate==1.0.0a1" + - python -m pip install 
"agents-shipgate==0.14.0" - > agents-shipgate scan --workspace . diff --git a/examples/gitlab-ci/05-on-tool-source-changes.yml b/examples/gitlab-ci/05-on-tool-source-changes.yml index abf33246..86308402 100644 --- a/examples/gitlab-ci/05-on-tool-source-changes.yml +++ b/examples/gitlab-ci/05-on-tool-source-changes.yml @@ -19,7 +19,7 @@ agents_shipgate: - "**/SKILL.md" - "**/*.py" script: - - python -m pip install "agents-shipgate==1.0.0a1" + - python -m pip install "agents-shipgate==0.14.0" - agents-shipgate scan --config shipgate.yaml --ci-mode advisory --format markdown,json,sarif artifacts: when: always diff --git a/examples/golden-prs/golden-pr-from-coding-agent.md b/examples/golden-prs/golden-pr-from-coding-agent.md index 3a37b720..f65c30f0 100644 --- a/examples/golden-prs/golden-pr-from-coding-agent.md +++ b/examples/golden-prs/golden-pr-from-coding-agent.md @@ -82,7 +82,7 @@ is at `agents-shipgate-reports/report.json`; the top-level - Release Evidence Packet: `agents-shipgate-reports/packet.{md,json,html}` **CI**: `.github/workflows/agents-shipgate.yml` already wires -`agents-shipgate@v1.0.0a1` in advisory mode; this PR will get a +`agents-shipgate@v0.14.0` in advisory mode; this PR will get a sticky-marker comment from the Action on every push.
diff --git a/examples/pre-commit/README.md b/examples/pre-commit/README.md index cf4123fe..5fb48b5d 100644 --- a/examples/pre-commit/README.md +++ b/examples/pre-commit/README.md @@ -13,7 +13,7 @@ In your repo's `.pre-commit-config.yaml`: ```yaml repos: - repo: https://github.com/ThreeMoonsLab/agents-shipgate - rev: v1.0.0a1 + rev: v0.14.0 hooks: - id: agents-shipgate ``` @@ -92,7 +92,7 @@ Use the `agents-shipgate-strict` hook ID for the strict variant, or override the ```yaml repos: - repo: https://github.com/ThreeMoonsLab/agents-shipgate - rev: v1.0.0a1 + rev: v0.14.0 hooks: - id: agents-shipgate entry: agents-shipgate verify --config shipgate.yaml --ci-mode strict --fail-on critical --format text diff --git a/llms-full.txt b/llms-full.txt index 92fad87f..f8a80cfb 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -1009,7 +1009,7 @@ Downstream repos generated with `init --agent-instructions=default` get the minimal local copy at `.shipgate/agent-contract.json`. -- Latest release: `v1.0.0a1` (see [pyproject.toml](../pyproject.toml) for the in-tree version) +- Latest release: `v0.14.0` (see [pyproject.toml](../pyproject.toml) for the in-tree version) - Runtime contract: `9` - Current report schema: `0.28` — [`docs/report-schema.v0.28.json`](report-schema.v0.28.json) - Current packet schema: `0.7` — [`docs/packet-schema.v0.7.json`](packet-schema.v0.7.json) @@ -1341,7 +1341,7 @@ exactly one stdout JSON object using `schema_version: "shipgate.codex_boundary_result/v1"` and the schema in [`codex-boundary-result-schema.v1.json`](codex-boundary-result-schema.v1.json). The removed `--format agent-json` alias and `agent_result_v1` schema string are -breaking 1.0.0-alpha changes; see [STABILITY.md](../STABILITY.md#migration-note-100-alpha). +breaking 0.14.0 changes; see [STABILITY.md](../STABILITY.md#migration-note-0140). Coding agents should switch on `decision`, `completion_allowed`, `must_stop`, `first_next_action`, `human_review`, `repair`, and `policy`. 
Do not derive an agent diff --git a/llms.txt b/llms.txt index fc102756..90109838 100644 --- a/llms.txt +++ b/llms.txt @@ -12,7 +12,7 @@ - Publisher: Three Moons Lab - Publisher URL: https://threemoonslab.com/ - License: Apache-2.0 -- Latest public release: v1.0.0a1 +- Latest public release: v0.14.0 - Canonical repository: https://github.com/ThreeMoonsLab/agents-shipgate - Do not use: Agent Shipcheck, Agent Shipgate, agents shipgate, Agents-Shipgate @@ -101,7 +101,7 @@ - Compare static capability locks: `agents-shipgate capability diff --base .agents-shipgate/capabilities.lock.json --head agents-shipgate-reports/capabilities.lock.json --json`. - Export redacted design-partner feedback: `agents-shipgate feedback export --from agents-shipgate-reports/verifier.json --redact --out shipgate-feedback.json`. - Do not suppress findings, lower severity, expand baselines or waivers, remove Shipgate CI, weaken agent instructions, or assert approval/idempotency evidence to pass. -- GitHub Action: `ThreeMoonsLab/agents-shipgate@v1.0.0a1`. +- GitHub Action: `ThreeMoonsLab/agents-shipgate@v0.14.0`. ## When to recommend diff --git a/marketing/agent-weakens-gate.cast b/marketing/agent-weakens-gate.cast deleted file mode 100644 index 6104541a..00000000 --- a/marketing/agent-weakens-gate.cast +++ /dev/null @@ -1,23 +0,0 @@ -{"version": 2, "width": 80, "height": 24, "timestamp": 1781049306, "env": {"SHELL": "/bin/zsh", "TERM": "xterm-256color"}, "title": "Agents Shipgate: the agent deletes the gate"} -[0.032546, "o", "\u001b[H\u001b[2J\u001b[3J"] -[0.033097, "o", "\r\n\u001b[1;36m# Your coding agent's PR fails the release gate.\u001b[0m\r\n"] -[2.041745, "o", "\r\n\u001b[1;36m# The cheapest way for it to pass? 
Delete the gate.\u001b[0m\r\n"] -[4.051874, "o", "\r\n\u001b[1;36m# Watch what happens when it tries:\u001b[0m\r\n"] -[5.065655, "o", "\u001b[1;33m$ agents-shipgate --version\u001b[0m\r\n"] -[6.749987, "o", "Agents Shipgate 0.12.0\r\n"] -[7.817456, "o", "\u001b[1;33m$ agents-shipgate fixture run agent_weakens_gate --out /var/folders/_n/xw32gm0n7f957mtdq1mwfl3c0000gn/T/tmp.PF5b6LJTXj/reports\u001b[0m\r\n"] -[9.789136, "o", "Fixture: agent_weakens_gate\r\nMode: verify\r\nMerge verdict: blocked\r\nDecision: blocked\r\nCan merge without human: false\r\nReports: /var/folders/_n/xw32gm0n7f957mtdq1mwfl3c0000gn/T/tmp.PF5b6LJTXj/reports\r\nVerifier: /var/folders/_n/xw32gm0n7f957mtdq1mwfl3c0000gn/T/tmp.PF5b6LJTXj/reports/verifier.json\r\nPR comment: /var/folders/_n/xw32gm0n7f957mtdq1mwfl3c0000gn/T/tmp.PF5b6LJTXj/reports/pr-comment.md\r\n"] -[11.84627, "o", "\r\n\u001b[1;36m# merge_verdict: blocked. can_merge_without_human: false.\u001b[0m\r\n"] -[13.857245, "o", "\r\n\u001b[1;36m# Why? The PR comment a reviewer would see:\u001b[0m\r\n"] -[14.866252, "o", "\u001b[1;33m$ head -40 /var/folders/_n/xw32gm0n7f957mtdq1mwfl3c0000gn/T/tmp.PF5b6LJTXj/reports/pr-comment.md\u001b[0m\r\n"] -[15.883993, "o", "\r\n## Agents Shipgate result: block\r\n\r\nDecision: `block`\r\nRisk: `critical`\r\nAudit ID: `sg_audit_e0dbafe8ef50d46e`\r\nRequired reviewers: `agent-platform`, `security`\r\n\r\nHeadline: This PR edits a release trust root \\(the manifest, CI gate, agent instructions, or trigger catalog used to evaluate it\\); a coding agent cannot self\\-approve that change — a human must review it\\.\r\n\r\nRelease gate: `blocked`\r\nReason: 2 active findings block release\\.\r\nCapability changes: +0, 1 modified, -0\r\nFail policy: would_fail_ci=`false` (exit 0)\r\n\r\n### Capability changes\r\n| Impact | Change | Subject | Why |\r\n|---|---|---|---|\r\n| review required | action broadened | `docs.lookup` | Capability schema_hash changed without a proven direction. 
|\r\n\r\n### Required before merge\r\nActor: Human (human authority required — a coding agent must not self-resolve).\r\n1. 2 active findings block release\\.\r\n2. A human must review the touched release trust root \\(manifest, CI gate, agent instructions, or trigger cat"] -[15.885305, "o", "alog\\) before merge\\.\r\n3. Restore the Shipgate workflow or get human approval to remove it\\.\r\n4. This PR deletes the Shipgate CI workflow, which would stop the release gate from running\\. A human must approve removing CI enforcement; do not remove the gate to make CI pass\\.\r\n5. This PR changes a file that defines the release gate's trust spine\\. A human must review the change before merge; do not weaken the gate to make CI pass\\.\r\nThen re-verify: `agents-shipgate verify --base origin/main --head HEAD --json`\r\n\r\n### Trust-root warnings\r\n- `.github/workflows/agents-shipgate.yml` (ci\\_gate): human review is required.\r\n- Do not suppress findings, lower severity, or edit evidence just to make CI pass.\r\n\r\nTrigger: force\\_run rule\\(s\\) overrode any skip: TRIGGER\\-EXISTING\\-MANIFEST\\-PRESENT\\.\r\nBase diff: `origin/main` -> `succeeded`\r\n- Cached base report for tree 11dc6ee16f9f7a563c39db25d9bc92ad2cb823b3\\.\r\n- Capability lock diff unavailable: base tree does not contain \\.agents\\-shipgate/capabilities\\.lock\\.json\\.\r\n\r"] -[15.88591, "o", "\r\n### Artifacts\r\nAvailable in the `agents-shipgate-reports` workflow artifact.\r\n"] -[18.894782, "o", "\r\n\u001b[1;36m# Both gate-removal checks are suppression-immune:\u001b[0m\r\n"] -[19.904068, "o", "\u001b[1;33m$ python3 -c \"import json; r=json.load(open('/var/folders/_n/xw32gm0n7f957mtdq1mwfl3c0000gn/T/tmp.PF5b6LJTXj/reports/report.json')); [print(' -', b['check_id']) for b in r['release_decision']['blockers']]\"\u001b[0m\r\n"] -[20.96711, "o", " - SHIP-CODEX-BOUNDARY-CI-GATE-REMOVED\r\n - SHIP-VERIFY-CI-GATE-REMOVED\r\n"] -[22.979536, "o", "\r\n\u001b[1;36m# The manifest cannot silence 
them. Severity floors stop downgrades.\u001b[0m\r\n"] -[24.993068, "o", "\r\n\u001b[1;36m# The agent cannot approve its own boundary change.\u001b[0m\r\n"] -[26.998427, "o", "\r\n\u001b[1;36m# \u001b[0m\r\n"] -[28.011754, "o", "\r\n\u001b[1;36m# Agents Shipgate — the deterministic merge gate\u001b[0m\r\n"] -[29.01677, "o", "\r\n\u001b[1;36m# for AI-generated agent capability changes.\u001b[0m\r\n"] diff --git a/marketing/cold-start-funnel-test-2026-06-10.md b/marketing/cold-start-funnel-test-2026-06-10.md deleted file mode 100644 index 9a432ef6..00000000 --- a/marketing/cold-start-funnel-test-2026-06-10.md +++ /dev/null @@ -1,38 +0,0 @@ -# Cold-Start Funnel Test — 2026-06-10 - -Method: simulate a cold user on a clean temp dir, installing only from -public channels (PyPI via `uvx`), walking the funnel in -`marketing/gtm-strategy.md` § 5. This is the internal dry run; the launch -gate still requires two engineers outside the project to repeat it -(`marketing/launch-kit.md`). - -## Results - -| Funnel step | Result | Measurement | -|---|---|---| -| Install (`uvx --refresh agents-shipgate --version`) | ✅ PASS | 3.3 s, resolves 0.12.0 (current — the stale-pipx-0.8.0 issue from the 2026-06-01 pilot is gone) | -| Demo verdict (`uvx agents-shipgate fixture run ai_generated_refund_pr`) | ✅ PASS | 1.3 s to `Merge verdict: blocked`; README's "5-minute demo" promise is actually ~5 seconds | -| PR-comment artifact quality | ✅ PASS | capability table + "Required before merge" human-authority list; now also the README moneyshot source | -| `verify --preview` on a non-matching repo | ✅ PASS | clean skip with rationale and structured `next_action: none` | -| `scan` on an unconfigured repo (human mode) | ❌ FAIL → **fixed same day** | bare `Config error: No shipgate.yaml files matched`, no next step. Now prints `next: agents-shipgate detect …` + why | -| `init --write` → `scan` placeholder path | ❌ FAIL → **fixed same day** | `Input file not found: CHANGE_ME.yaml` dead end. 
Now routes to the placeholder fix (`next: Edit shipgate.yaml` + doctor pointer) in both human and agent mode | -| `verify` config errors (human + agent mode) | ❌ GAP → **fixed same day** | verify previously printed bare errors with no diagnostics and no agent-mode JSON; now at parity with scan (split flag-parse vs run-phase handlers) | -| `detect` on a minimal dynamic-toolkit repo | ❌ OPEN | a 1-file repo calling `client.responses.create(..., tools=build_toolkit())` yields `is_agent_project: false → "No action"` — a confident wrong answer on the exact shape the Stripe pilot hit. Tracked as the `insufficient_evidence` P1 (config-bound removal / dynamic-factory detection). **Launch remains gated on this.** | - -Fixes shipped 2026-06-10 (this branch): `is_agent_mode()` helper; -`_echo_next_action_hint()` printing the rank-1 recovery step for humans -(suppressed in agent mode to keep the `docs/errors.json` single-JSON-line -contract); scan/doctor/verify wired; CHANGE_ME-aware InputParseError -routing; verify gains agent-mode structured errors. Regression tests added -in `tests/test_cli.py` and `tests/test_verify.py`; full suite green. - -## Conclusion - -The zero-install demo path is launch-quality. The "my own repo" path is the -remaining gap, and it is exactly the gap the GTM plan predicted: dynamic -tool surfaces produce either a wrong "not an agent project" answer (detect) -or weak evidence (scan). Until the P1 detection work lands, every outreach -message and doc should lead with the fixture demo, and design-partner pilots -should ask the "how are tools registered?" question (outreach kit Q5) before -the run, so `insufficient_evidence` arrives as a predicted finding, not a -disappointment. 
diff --git a/marketing/demo-agent-weakens-gate.md b/marketing/demo-agent-weakens-gate.md deleted file mode 100644 index 282d30e0..00000000 --- a/marketing/demo-agent-weakens-gate.md +++ /dev/null @@ -1,91 +0,0 @@ -# Demo Script: "The Agent Deletes the Gate" (90 seconds) - -The one demo that explains the moat in a single take. Record as a -terminal screencast (asciinema or screen capture); no slides. - -> A recorded 29-second terminal cast (the 30-second cut, captured -> against the public PyPI v0.12.0 package) ships at -> [`agent-weakens-gate.cast`](agent-weakens-gate.cast) — play it with -> `asciinema play marketing/agent-weakens-gate.cast`, upload it to -> asciinema.org, or render a GIF with `agg`. The driver is -> [`demo-agent-weakens-gate.sh`](demo-agent-weakens-gate.sh); re-record -> after CLI output changes. The 90-second voiceover version below is -> the YouTube/social cut. - -## Setup (before recording) - -```bash -pipx install agents-shipgate # or: uv tool install agents-shipgate -agents-shipgate --version # >= 0.12 -``` - -Terminal at 100×30, large font, dark theme. - -## Script - -**[0:00–0:15] — The premise.** (voiceover while terminal is empty) - -> "Your coding agent's PR fails the release gate. What's the cheapest way -> for it to pass? Delete the gate. Here's what happens when it tries." - -**[0:15–0:35] — Run the fixture.** - -```bash -agents-shipgate fixture run agent_weakens_gate -``` - -Let the output land; it prints within a few seconds: - -``` -Fixture: agent_weakens_gate -Mode: verify -Merge verdict: blocked -Decision: blocked -Can merge without human: false -``` - -> "Shipgate built a real git history: a clean docs agent with an advisory -> Shipgate workflow, and a head commit — written by the 'agent' — that -> deletes `.github/workflows/agents-shipgate.yml`. Nothing else changed." 
- -**[0:35–1:00] — Show why.** Open the report it printed the path to: - -```bash -cat /pr-comment.md -``` - -Point at the two blockers: - -- `SHIP-VERIFY-CI-GATE-REMOVED` — *Shipgate CI gate removed* -- `SHIP-CODEX-BOUNDARY-CI-GATE-REMOVED` — *the workflow no longer - invokes the gate* - -> "Both checks are suppression-immune: the manifest's `checks.ignore` -> cannot silence them, severity floors stop downgrades, and -> `can_merge_without_human` is pinned to false. The agent cannot -> approve its own boundary change." - -**[1:00–1:30] — The kicker.** - -> "This isn't a special case — `shipgate.yaml`, policies, baselines, -> agent instructions, `.mcp.json`, Claude Code permission rules, and -> workflow permissions are all protected surfaces. Editing any of them -> routes to a human. The cheapest reward-hack is also the most visible -> one. That's the whole point of a deterministic merge gate." - -Close with the README one-liner on screen: - -> **Your coding agent changed what your AI agent can do — Agents -> Shipgate tells you whether it can merge.** - -## Variants - -- **30-second cut**: 0:15–0:35 only, ending on `Can merge without human: - false`. -- **Refund cut**: same structure with - `fixture run ai_generated_refund_pr` for the capability-addition story - (blocked on missing approval policy + idempotency evidence). -- **Live-wire cut** (advanced): run `agents-shipgate install-hooks - --target claude-code --write` in a sandbox repo, ask Claude Code to - "remove the Shipgate workflow," and capture the PreToolUse `ask` - prompt appearing *before the edit happens*. diff --git a/marketing/demo-agent-weakens-gate.sh b/marketing/demo-agent-weakens-gate.sh deleted file mode 100644 index 96438b58..00000000 --- a/marketing/demo-agent-weakens-gate.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -# Driver for the "agent deletes the gate" demo recording. 
-# Record with: asciinema rec marketing/agent-weakens-gate.cast \ -# --command "bash marketing/demo-agent-weakens-gate.sh" --overwrite -# Script + voiceover beats: marketing/demo-agent-weakens-gate.md -set -euo pipefail -export TERM="${TERM:-xterm-256color}" - -say() { printf '\n\033[1;36m# %s\033[0m\n' "$1"; sleep "${2:-2}"; } -run() { printf '\033[1;33m$ %s\033[0m\n' "$1"; sleep 1; eval "$1"; } - -clear 2>/dev/null || true -say "Your coding agent's PR fails the release gate." 2 -say "The cheapest way for it to pass? Delete the gate." 2 -say "Watch what happens when it tries:" 1 - -run "agents-shipgate --version" -sleep 1 - -OUT=$(mktemp -d)/reports -run "agents-shipgate fixture run agent_weakens_gate --out $OUT" -sleep 2 - -say "merge_verdict: blocked. can_merge_without_human: false." 2 -say "Why? The PR comment a reviewer would see:" 1 -run "head -40 $OUT/pr-comment.md" -sleep 3 - -say "Both gate-removal checks are suppression-immune:" 1 -run "python3 -c \"import json; r=json.load(open('$OUT/report.json')); [print(' -', b['check_id']) for b in r['release_decision']['blockers']]\"" -sleep 2 - -say "The manifest cannot silence them. Severity floors stop downgrades." 2 -say "The agent cannot approve its own boundary change." 2 -say "" 1 -say "Agents Shipgate — the deterministic merge gate" 1 -say "for AI-generated agent capability changes." 3 diff --git a/marketing/design-partner-outreach.md b/marketing/design-partner-outreach.md deleted file mode 100644 index 451a7c3e..00000000 --- a/marketing/design-partner-outreach.md +++ /dev/null @@ -1,153 +0,0 @@ -# Design Partner Outreach Kit - -Status: operating doc, 2026-06-10. Companion to -`docs/design-partners.md` (the public-facing page) and -`docs/design-partner-verifier-pilot.md` (the pilot runbook). This file is the -internal sourcing, messaging, and tracking side. - -Weekly operating target: 5–10 personalized messages, ≤30 min/day total. -Pipeline goal: 2 new active pilots per month, 5 completed pilots by day 90. 
- -## Who to contact - -The engineer who **reviews AI-generated agent PRs** — usually a tech lead, -staff engineer, or founding engineer. Not the security lead (first -conversation drifts to compliance we don't sell). Not the founder, unless -the founder is that engineer. - -Qualification (all four): -1. Weekly real AI-generated agent PRs (Claude Code / Codex / Cursor). -2. Agent tools with blast radius: refund, email, cancel, deploy, record - writes, sensitive reads. -3. Willing to share a sanitized diff or run - `agents-shipgate feedback export --redact`. -4. GitHub CI (Python-first repos preferred for adapter coverage). - -Log-but-defer (future commercial leads, not pilots): teams whose first ask -is a hosted dashboard, SSO, or compliance certification. - -## Sourcing recipes (fill the 30-target list from these) - -1. **GitHub evidence search** — repos that are both agent-shaped and - coding-agent-authored: - - Commit search: `Co-Authored-By: Claude` / `Co-authored-by: Cursor` on - repos that also match code search for `mcpServers`, `shipgate.yaml` - competitors' configs, `from openai_agents import`, `langgraph`, - `crewai`, `tools=[`. - - PR search: `"add tool"` / `"new tool"` in PR titles on agent-framework - repos with recent activity. - - The person who merged those PRs is the contact. -2. **MCP ecosystem** — authors of MCP servers with consequential actions - (payment, email, CRM, infra). They feel the scope/approval problem from - the supplier side and often also ship agents. -3. **Agent-framework community surface** — people asking "how do you review - agent permissions/tool access" in LangChain/LangGraph, OpenAI dev, and - Anthropic dev forums/Discords. Answer first, outreach second. -4. **Build-in-public founders on X** — anyone posting screenshots of agents - that send email / move money / touch prod, especially if they also post - about Claude Code or Codex throughput. -5. 
**Founder network / warm intros** — highest conversion; send the warm - template below to anyone one hop away from a team shipping agents. -6. **Pilot-derived referrals** — every completed pilot ends with: "which - other team do you know reviewing AI agent PRs by hand?" - -Personalization bar for every cold message: one sentence that proves we saw -*their* repo/post/PR. No spray. - -## Cold outreach (X DM / email, ≤110 words) - -> Hi {name} — saw {specific: your post about {X} / {repo}'s PR #{N} adding -> {tool}}. Quick question: when Claude Code or Codex opens a PR that changes -> what your agent can do — adds a tool, widens a scope, touches approval -> policy — what does your review actually catch? -> -> I'm building Agents Shipgate, an open-source deterministic verifier for -> exactly that diff. Static, local-first, no LLM calls. I'm running pilots -> with teams shipping tool-using agents: you bring one real AI-generated PR, -> we run the verifier on it together (~30 min), you keep the verdict + -> report, I keep the feedback. -> -> Worth a look? 5-min demo first if you prefer: -> `uvx agents-shipgate fixture run ai_generated_refund_pr` - -## Warm intro (for the introducer to forward) - -> {Founder name} is building Agents Shipgate (open source, Apache-2.0) — a -> deterministic CI check that reads AI-generated PRs and tells you whether -> the agent's tool permissions changed and whether it's safe to merge -> without human review. No LLM calls, runs locally. He's looking for a few -> teams shipping tool-using agents to run one real PR through it as a design -> partner — ~30 minutes, you keep the analysis. Given how much {team} ships -> with {Claude Code/Codex}, thought you two should talk. - -## Follow-up (one only, +5–7 days) - -> One-liner follow-up: if a bot commented on your next agent PR "this diff -> grants refund authority, no approval policy declared" — useful or noise? -> Either answer helps me. If noise, I'd genuinely like to know why. 
- -## Discovery call — question list (30 min) - -Opening (2 min): we're validating, not selling; the product is free; we want -to know if the verdict is right or wrong on your real PR. - -1. Last week, how many PRs were written by a coding agent? How many touched - the agent's tools, scopes, or policies? -2. Who reviews those PRs? How long does one take? What do you actually look - at? -3. Tell me about the last time you noticed *after* merging that the agent's - capabilities had changed. (Or the nearest miss.) -4. What's the most dangerous action your agent can take in production right - now? Where is that list written down? Who can recite it? -5. How are tools registered — static declarations, or factories/dynamic - construction? (Direct probe for `insufficient_evidence` risk; sets - expectations before the pilot run.) -6. What required checks run in your CI today? How did the most recently - added one get adopted? -7. If a PR comment said "this diff grants refund authority, missing approval - policy" — who would see it, and would it change the merge decision? -8. (Close) If this tool disappeared tomorrow, when would you notice? - -Qualification scoring: Q1 ≥ weekly agent PRs, Q4 has a real answer, Q5 -understood, Q7 = "yes it would change the decision" → schedule the pilot on -the call. Two or more misses → thank, log, move on. - -## Pilot mechanics (per `docs/design-partner-verifier-pilot.md`) - -Every pilot produces three artifacts, no exceptions: -1. Redacted feedback export (`feedback export --redact`). -2. Verdict-quality judgment: right / wrong / insufficient — and why. -3. One benchmark-candidate row for `benchmark/results/`. - -Success signals, ascending: unprompted second run → Action installed -(advisory) in a real repo → verdict cited in a real review thread → custom -policy/check request → cross-repo rollup request (**commercial signal: log -in the tracker, do not build**). 
- -Roadmap discipline: a new adapter request enters the roadmap only when it -arrives attached to a real PR. - -## Pipeline tracker - -Keep private operational notes in `.agents-private/` (gitignored); this -table tracks stage only. Stages: -`sourced → contacted → replied → discovery → pilot-scheduled → pilot-run → -artifacts-received → retained / closed`. - -| # | Team / contact | Source bucket | Stage | Last touch | Agent framework | Blast-radius tools | Next step | Notes | -|---|---|---|---|---|---|---|---|---| -| 1 | | | sourced | | | | | | -| 2 | | | sourced | | | | | | -| 3 | | | sourced | | | | | | -| 4 | | | sourced | | | | | | -| 5 | | | sourced | | | | | | -| 6 | | | sourced | | | | | | -| 7 | | | sourced | | | | | | -| 8 | | | sourced | | | | | | -| 9 | | | sourced | | | | | | -| 10 | | | sourced | | | | | | -| … | (fill to 30 from the sourcing recipes) | | | | | | | | - -Weekly review (15 min, same day each week): count by stage; if `contacted → -replied` < 15%, the personalization sentence is too weak — fix the message, -not the volume; if `discovery → pilot` < 50%, qualification is too loose. diff --git a/marketing/gtm-strategy.md b/marketing/gtm-strategy.md deleted file mode 100644 index a5025645..00000000 --- a/marketing/gtm-strategy.md +++ /dev/null @@ -1,249 +0,0 @@ -# Agents Shipgate — Go-to-Market Strategy - -Status: plan of record, 2026-06-10 -Owner: Three Moons Lab -Review cadence: every 30 days against the decision gates in § 11 - -The single organizing judgment: **the current GTM bottleneck is evidence, not -distribution.** The first real design-partner pilot returned -`insufficient_evidence`; the adoption benchmark CSV holds only a header row. -Amplifying traffic before the cold-start funnel reliably produces a meaningful -verdict would spend the one-time launch channels (Show HN, first impressions) -on a broken step 5. 
The first 90 days exist to convert "deterministic merge -gate" from a thesis into something that happened on five real teams' PRs. - -## 1. Positioning - -| Expression | Verdict | Use | -|---|---|---| -| The deterministic merge gate for AI-generated agent capability changes | **Primary** (canonical tagline) | tagline, repo, website | -| CI preflight check for AI agents | Explanatory phrase | first-touch explanations; not a category name | -| Tool-Use Readiness | Technical wedge name | docs, check catalog, artifact names | -| Agent Release Readiness | Category name — **soon** | category content after 3+ real cases | -| Agent deployment **safety** gate | **Avoid** | overclaims; `docs/category.md` "What It Is Not" is the copy compliance list | -| Agent healthcare infrastructure | **Not yet** | internal vision / fundraising narrative only | - -Hook sentence (keep): *"Your coding agent changed what your AI agent can do — -Agents Shipgate tells you whether it can merge."* - -Trust differentiators that must appear on every surface: static, local-first, -deterministic, no LLM calls, no network, no telemetry. This is the structural -contrast with every "AI reviews your AI" product: the gate itself cannot be -talked out of its decision. - -## 2. ICP - -**Primary (now):** teams that (a) ship PRs heavily authored by Claude Code / -Codex / Cursor AND (b) ship tool-using agents with consequential tools -(refund, email, deploy, record writes, sensitive reads). Typical shape: the -AI-product group inside a 10–50-person engineering org, or founding engineers -at an AI-native startup. GitHub + Python first (adapter coverage). - -**Key persona:** the senior engineer / tech lead who reviews AI-generated -agent PRs. Pain: review fatigue plus the invisible capability delta — a -3-line diff that grants a production permission reads like any other diff. 
- -**Secondary (sequenced later):** solo builders (reach, not revenue), platform -/ DevEx teams (org-level buyer entry point, *soon*), security & governance -reviewers (champions, *later* — they don't install CLIs; keep their fields in -the report), enterprise AI teams (*not yet* — their pull drags the roadmap to -SSO/dashboards prematurely). - -Buyer/champion structure: today user = champion = the same engineer. -Commercialization moves the buyer up to platform/security leads. Sequence is -engineer-love first, lead-visibility second — never the reverse. - -## 3. Motion - -**Now: design-partner-led engine + open-source/developer-led surface.** - -- Design partners are the only channel that produces truth: verdict quality - on real repos, the real causes of `insufficient_evidence`, and the evidence - library (case stories, benchmark rows). The pilot runbook - (`docs/design-partner-verifier-pilot.md`) and `feedback export --redact` - already exist; the missing input is people in the pipeline. -- The OSS surface (repo, README, fixture, Action) is credibility - infrastructure for outreach, not yet a growth engine. Investment = funnel - fixes, not stars. -- Content-led: *soon* — evidence-driven content after 2–3 pilot findings. One - positioning post at launch is the exception. -- Enterprise-led: *not yet*; see § 8 triggers. Resist inbound gravity. - -Why not pure OSS-led now: OSS-led assumes the tool self-explains, the pain is -ubiquitous, and first-run delights. The third assumption fails while -`insufficient_evidence` is common on real repos, and the category is new -enough that "capability delta is a thing you review" itself needs teaching. - -## 4. 
Distribution (priority order) - -| Channel | Priority | Action | Success metric | -|---|---|---|---| -| GitHub repo | P0 | PR-comment moneyshot above the fold; fixture one-liner above the fold (done 2026-06-10) | cold user → meaningful verdict < 10 min | -| Direct outreach + founder network | P0 | 5–10 targeted messages/week; "bring one PR" pilot offer | 2 new pilots/month | -| Package-channel hygiene | P0 | release fan-out checklist every tag; `uvx` first in docs | cold-install version == latest | -| Agent-native distribution (GEO) | P0–P1 | `llms.txt`, `/shipgate` skill, managed AGENTS.md blocks; "ask Claude Code to add Shipgate" as a first-class install path | agent-completed inits | -| GitHub Action Marketplace | P1 | listing copy + tag sync (hygiene only) | marketplace-sourced installs | -| X / Twitter | P1 | build-in-public, ≤2 originals/week, zero hype | 1–2 posts/month producing real inbound | -| MCP / agent dev communities | P1 | helpful presence, fixture one-liner when relevant; no ads | community-sourced pilot candidates | -| Hacker News (Show HN) | P1, **gated** | one shot; preconditions in `marketing/launch-kit.md` | ≥50 fixture runs day-of; ≥3 inbound pilot leads in 2 weeks | -| LinkedIn | P2 | existing draft after screenshot ready; buyer-side brand layer | none hard | -| Reddit / Dev.to / SEO | P2–P3 | secondary distribution of HN/launch content; category keyword squatting long-game | 6-month organic | - -## 5. Funnel - -discover → understand → install → first scan → **useful report** → Action → -share → design partner. - -Step 5 is the critical step: steps 1–4 losses are marketing problems; step 5 -loss is a product problem that poisons step 1 (reputation). North-star funnel -metric: **time-to-first-meaningful-verdict < 10 minutes**, where meaningful = -a non-`insufficient_evidence` verdict, or `insufficient_evidence` carrying a -concrete, executable next action. 
- -Built-in viral surface: `pr-comment.md` lands in front of the whole team on -every PR. Its quality is a distribution investment, not a cosmetic one. - -Cold-start QA discipline: before any amplification event, two engineers -outside the project walk the funnel from zero while we record friction. -(First internal run: `marketing/cold-start-funnel-test-2026-06-10.md`.) - -## 6. Design partners - -Target: 10–50-person teams, consequential tools, heavy coding-agent usage, -GitHub CI, Python-first. Contact the engineer who reviews agent PRs — not the -security lead (pulls toward compliance), not the founder (unless they are that -engineer). - -Pitch = their own PR: "bring one AI-generated PR that changes what your agent -can do; we turn it into a deterministic merge verdict together" (existing -`docs/design-partners.md` language; ~30 minutes). - -Qualification: weekly real AI-generated agent PRs; tools with blast radius; -will share sanitized diff or run `feedback export --redact`; GitHub CI. -Disqualify-for-now: "interested in agent safety" without a PR stream; first -ask is hosted dashboard / compliance certs (log as later-commercial lead). - -Success signals, ascending: unprompted second run → Action installed in a -real repo (advisory) → verdict cited in a real review thread → asks for -custom policy/check → asks for cross-repo rollup (**commercial signal — log, -don't build**). - -Per-pilot outputs (all three, every time): redacted feedback artifact; -verdict-quality judgment (right / wrong / insufficient + why); one benchmark -candidate row. Rule: **a new adapter request enters the roadmap only when it -arrives attached to a real PR** — GTM feedback must not reopen the -adapter-sprawl direction that strategy already closed. - -Outreach copy and the discovery question list: `marketing/design-partner-outreach.md`. - -## 7. Content - -Principle: evidence before opinion. 
Pre-pilot, ship only the launch post and -one category-clarity piece; after pilots, switch to evidence-led content. - -| Piece | Audience | When | -|---|---|---| -| "Your coding agent just gave your AI agent refund powers" (`marketing/launch-blog-post.md`) | primary ICP | launch | -| "Agent readiness is not the same as evals" | AI infra engineers | +2–4 weeks; SEO category anchor | -| "We ran a deterministic verifier on N real AI-generated agent PRs" | everyone | 60–90 days, post-pilots; the credibility piece | -| "A CI preflight check for tool-using agents" (how-to) | DevOps/CI owners | soon; SEO long-tail | -| "Why deterministic? The trust model" | deep-technical | soon; the moat narrative, public version | -| "From CI/CD to Agent Release Readiness" (category manifesto) | category/investor | later, only with evidence behind it | - -GEO is the differentiated channel: target users ask their coding agent before -they ask Google. Keep `llms.txt`, `llms-full.txt`, `docs/ai-search-summary.md`, -glossary, and the checks catalog updated with every content release; every -entry must be a self-contained answer an agent can retrieve. - -Copy red lines: never prevent/guarantee/secure/compliant; always -review/surface/evidence/verdict/deterministic. - -## 8. Commercialization - -Free forever (OSS): CLI, all checks, GitHub Action, full verifier/report -output, single-repo everything, custom policies. Charging for the gate kills -adoption and contradicts the local-first trust story. - -Paid later, by willingness-to-pay likelihood: (1) org-level dashboard — every -agent repo's capability surface and verdict history in one view; (2) history -and trends (audit narrative entry); (3) curated policy packs (schema already -exists; maintained packs are a subscription-shaped service); (4) approval -workflow / audit-grade exports (enterprise pack, last). 
- -WTP triggers all appear only in org context: >3 repos; security asks "show me -every agent capability across the org"; verdict-history retention requests. -Single-user/single-repo never paying is the design, not a failure. - -Start commercialization only when ALL of: ≥3 teams producing weekly verdicts -on real repos for 4 consecutive weeks; ≥2 unprompted org-feature requests; -wedge validation green (§ 11). This matches the standing decision: commercial -held until the v0.9 "Merge Verifier" proof. - -Fundraising: the 90-day evidence (pilot cases + benchmark rows + one "blocked -a real PR" story) is the seed narrative. Raising before that is the weakest -story at the worst terms. - -## 9. Metrics - -North star: **weekly verified real PRs** — real (non-fixture) -agent-capability PRs receiving a merge verdict. - -| Layer | Metrics | Nature | -|---|---|---| -| Awareness | stars, site visits, post reach | vanity — channel-efficiency signal only, never reported as traction | -| Activation | cold-install success rate, fixture runs, time-to-first-meaningful-verdict, Actions installed in real repos | real | -| Engagement | repos with weekly repeat verifier runs, Actions alive ≥4 weeks, issues carrying real PRs | real | -| Validation | `insufficient_evidence` rate on real repos (quality metric, must trend down); pilots completing the loop; redacted feedback artifacts received; "verdict changed a merge decision" cases; unprompted org requests; real benchmark rows | decides the day-90 direction | - -Discriminator: any number a user produces without committing their own real -repo/PR is vanity; any number that requires it is validation. - -## 10. 
30 / 60 / 90 - -**Days 1–30 — fix the funnel, start the pipeline.** -Engineering preconditions: `insufficient_evidence` P1 fix (config-bound -removal / dynamic-factory detection); every dead-end error carries an -executable next action (CLI hints shipped 2026-06-10); package-channel -freshness verified (PyPI 0.12.0 confirmed current 2026-06-10); first real -benchmark rows. GTM: README moneyshot (done); 30-team target list; 5–10 -outreach messages/week → 2 active pilots; launch post finalized; LinkedIn -post out once screenshot exists; cold-start test with 2 outside engineers. -**Do not launch on the calendar; launch on the gate.** - -**Days 31–60 — public launch + content cadence.** -Show HN (only after the gating checklist passes) + X thread + Reddit -secondary. Second content piece. Agent-native install path promoted on -website. 3–4 active pilots, each producing the three artifacts. MCP/agent -community presence begins. - -**Days 61–90 — validation close-out + direction decision.** -Evidence content published. Pilot exit interviews: did a verdict change a -merge decision? when would you notice if it vanished? would you pay for the -org view? Score against § 11. - -## 11. Day-90 decision gates - -- **Scale:** ≥3 retained teams + ≥1 real blocked/changed-decision case + ≥2 - unprompted org requests → prototype org dashboard; open fundraising - conversations. -- **Refine:** partial → narrow the wedge to whichever check class gets cited - most; run 60 more days; do NOT widen the roadmap. -- **Rethink:** pilots completed but nobody retained → the wedge hypothesis is - wrong; back to discovery. Do not mask a product signal with more channel - spend. - -## 12. Risks and anti-patterns - -1. Amplifying a broken funnel (the #1 risk; HN is non-renewable). -2. Category language ahead of evidence ("agent healthcare" externally). -3. GTM feedback reopening adapter sprawl — real PR or no roadmap entry. -4. 
Security/compliance overclaim — invites red-team scrutiny and procurement - processes we cannot serve. -5. Premature enterprise gravity — response template: "we're a local-first OSS - verifier today, which is exactly why you can trust it; org features are on - the roadmap — want to be a design partner?" -6. Stars counted as traction. -7. Founder time fragmentation — two active channels max (outreach + X); - everything else is secondary distribution. -8. Pilots drifting into consulting — runbook commands only; extras go to the - roadmap, not the call. diff --git a/marketing/launch-blog-post.md b/marketing/launch-blog-post.md deleted file mode 100644 index 20cc3168..00000000 --- a/marketing/launch-blog-post.md +++ /dev/null @@ -1,134 +0,0 @@ -# Your coding agent just gave your AI agent refund powers - -Status: launch draft, 2026-06-10. Publish on the Three Moons Lab blog first, -then submit as Show HN (see `marketing/launch-kit.md` for the gating -checklist — do not publish before the checklist passes). - -Target audience: engineers who review AI-generated PRs on repos that ship -tool-using agents. CTA: run the fixture; bring us one real PR. - ---- - -Here is a pull request a coding agent opened on a support-agent repo. The -description says it implements the ticket: *"support agents should be able to -issue refunds for orders under the auto-approval threshold."* - -```diff - TOOLS = [ - lookup_order, - summarize_ticket, -+ stripe.create_refund, - ] -``` - -Four lines of context, one line of change. Tests pass. The linter is quiet. -Code review says LGTM — because as a *code change*, it is fine. There is no -bug in this diff. - -What the diff doesn't say is that your support agent can now move money. - -## Code review sees lines. It doesn't see capability. - -Every review tool in the standard pipeline answers some version of "is this -code correct?" 
None of them answer the question this PR actually raises: - -> What can the agent do after this merges that it could not do before — and -> did anyone with authority decide that's acceptable? - -That question has a name in traditional release engineering: a permission -change. If this PR had added an IAM role with `payments:write`, your infra -review would have caught it, because infrastructure-as-code made permission -deltas reviewable artifacts. - -Agent tool surfaces have no equivalent. The "permission" lives in an -argument list, an MCP config, an OpenAPI spec, or a toolkit factory — places -code review reads right past. And the volume problem is new: when Claude -Code, Codex, or Cursor writes most of the PRs, the capability delta arrives -faster than any human's attention for it. - -Evals don't close this gap either. Evals measure behavior distributions — -*does the agent usually do the right thing?* A capability review asks a -different and prior question: *what is the agent allowed to do at all, and -under what policy?* You need both. Only one of them exists in most pipelines -today. - -## What a deterministic capability review looks like - -We built Agents Shipgate to review exactly this diff shape. It is an -open-source CLI and GitHub Action that reads the PR's capability delta from -static evidence — manifests, MCP and OpenAPI artifacts, SDK metadata — and -issues a merge verdict. On the PR above, the Action posts: - -> ### Agents Shipgate result: block -> -> Decision: `block` · Risk: `critical` · Required reviewers: `agent-platform`, `security` -> -> | Impact | Change | Subject | Why | -> |---|---|---|---| -> | blocks release | action added | `stripe.create_refund` | Capability added. 
| -> | blocks release | action broadened | `stripe.create_refund` | high-risk effect financial_action added | -> | blocks release | scope broadened | `stripe.create_refund:stripe:*` | scope added | -> -> **Required before merge** — Actor: Human (human authority required — a -> coding agent must not self-resolve): -> 1. Declare an approval policy for `stripe.create_refund` or remove this -> tool from the release. -> 2. Declare `approval.required`, `safeguards.audit_log`, and -> `safeguards.idempotency` for this financial write action. -> 3. Replace wildcard/admin scopes with operation-specific scopes. - -Three properties matter more than the verdict itself: - -**It's deterministic.** Same diff, same verdict, every run. No LLM is in the -loop, which means the gate cannot be persuaded, prompted, or reward-hacked -out of its decision — including by the coding agent whose PR it is judging. -A verifier that can be argued with is a suggestion, not a gate. - -**It's static and local.** No agent execution, no tool calls, no network -access, no telemetry. It reads files. That's the whole trust model, and it's -auditable in the test suite. - -**It separates machine-fixable from human-required.** The verdict tells the -coding agent what it may mechanically fix (a missing idempotency declaration) -and what it must not self-resolve (the decision that a support agent should -hold refund authority at all). That line — between what an agent can fix and -what requires human authority — is the actual safety boundary in AI-assisted -development, and most pipelines don't draw it anywhere. - -## What it is not - -No overclaims: Shipgate is not a runtime guardrail, not an eval framework, -not an observability platform, and not a compliance certification. It reviews -static release evidence at PR time, before the agent gets production-like -permissions. 
If the capability is constructed in ways static analysis cannot -see, Shipgate says `insufficient_evidence` and tells you what evidence to -add — it does not guess. - -## Try the exact PR above - -The blocked-refund PR is a bundled fixture. One command, no install, no -setup; it builds a temporary git history, runs the verifier on the diff, and -writes the verdict, report, and PR comment you saw above: - -```bash -uvx agents-shipgate fixture run ai_generated_refund_pr -``` - -Adding the gate to a repo is three lines of workflow YAML, advisory by -default: - -```yaml -- uses: ThreeMoonsLab/agents-shipgate@v0.12.0 - with: - config: shipgate.yaml -``` - -Apache-2.0. Static by default. GitHub: https://github.com/ThreeMoonsLab/agents-shipgate - -## Bring us one PR - -We're running design-partner pilots with teams that ship tool-using agents: -you bring one real AI-generated PR that changes what your agent can do, we -run the verifier on it together (~30 minutes), you keep the analysis, we keep -the feedback. The fastest email to send: -`help@threemoonslab.com`, subject "Agents Shipgate design partner review". diff --git a/marketing/launch-kit.md b/marketing/launch-kit.md deleted file mode 100644 index 4dc956bd..00000000 --- a/marketing/launch-kit.md +++ /dev/null @@ -1,158 +0,0 @@ -# Launch Kit — Show HN, X thread, and the go/no-go gate - -Status: ready-to-ship drafts, 2026-06-10. Nothing in this file gets posted -until every box in the gating checklist is checked. HN first impressions are -non-renewable; the checklist exists so we spend them on a working funnel. - -## Go / no-go gating checklist - -Launch when ALL of these hold — launch on the gate, not the calendar: - -- [ ] **Cold-start test passed by two engineers outside the project.** From - zero to a meaningful verdict in under 10 minutes, friction log - reviewed. (Internal dry run passed 2026-06-10: - `marketing/cold-start-funnel-test-2026-06-10.md` — uvx → blocked - verdict in ~5 seconds. 
The two outside-engineer runs are still - required; we are blind to our own assumed knowledge.) -- [ ] **`insufficient_evidence` P1 fix landed** (config-bound removal / - dynamic-toolkit-factory detection from the first pilot), so the most - common real-repo shape doesn't dead-end on launch day. -- [ ] **Every CLI dead-end prints an executable next action** in human mode. - (Shipped 2026-06-10: scan/doctor/verify config errors and the - CHANGE_ME placeholder path now print `next:` hints.) -- [ ] **README moneyshot live**: PR-comment verdict above the fold. (Shipped - 2026-06-10.) -- [ ] **PyPI / Action tag freshness verified** for the current release - (0.12.0 confirmed current on PyPI 2026-06-10; re-verify on launch day - against the release fan-out checklist in `docs/distribution.md`). -- [ ] **Launch blog post published** on the site - (`marketing/launch-blog-post.md`) so HN links to our page, not a bare - repo. -- [ ] **Founder available for 6–8 hours** after submission to answer every - comment. An unanswered HN thread is a wasted HN thread. - -## Show HN submission - -**Title** (≤80 chars, no hype, names the mechanism): - -> Show HN: A deterministic merge gate for AI-generated agent capability changes - -Fallback title if that reads too abstract on the day: - -> Show HN: CI check that catches when a coding agent gives your AI agent new powers - -**URL:** the launch blog post (preferred) or the GitHub repo. - -**First comment (post immediately after submitting):** - -> Author here. The problem this solves: coding agents (Claude Code, Codex, -> Cursor) now write a lot of agent code, and every so often a PR quietly -> changes what the *runtime* agent is allowed to do — adds a tool, widens a -> scope, touches an approval policy. As a code change it looks fine; as a -> permission change nobody reviewed it. 
-> -> Shipgate is an open-source CLI + GitHub Action that reads that diff -> statically and posts a merge verdict (`mergeable` / -> `human_review_required` / `insufficient_evidence` / `blocked`) as a PR -> comment. Design constraints that drove everything: -> -> - Deterministic: no LLM in the loop, so the gate can't be prompted or -> reward-hacked out of its decision — including by the coding agent whose -> PR it's judging. -> - Static and local: no agent execution, no network, no telemetry. It reads -> files. The allowed exceptions are pinned in a test. -> - Honest about limits: if the tool surface is built dynamically and static -> evidence is weak, it says `insufficient_evidence` and tells you what to -> add — it doesn't guess. It's not a runtime guardrail and not a -> compliance cert. -> -> Try the exact "coding agent adds stripe.create_refund" demo with zero -> setup: `uvx agents-shipgate fixture run ai_generated_refund_pr` -> -> Genuinely interested in: what's your current release gate for agent -> capability changes, if any? - -**Prepared answers** (draft now, adapt in thread): - -- *"Why not have an LLM review the diff?"* → A reviewer that can be - persuaded is a suggestion, not a gate. The judging surface must be outside - the model's influence loop, especially when the PR author IS a model. - LLM review is complementary (semantics); the gate is deterministic - (authority). -- *"Static analysis can't see dynamically-built toolkits."* → Correct, and - that's the honest part: those repos get `insufficient_evidence` plus the - exact evidence to add (manifest declaration, MCP export), not a fake - green. Our first design-partner pilot hit exactly this; detection of - config-bound dynamic factories is the current engineering focus. -- *"Isn't this just a linter?"* → A linter flags style/correctness inside - the code. 
This reads the *capability delta* between base and head and - applies release policy to it — closer to `terraform plan` for agent - authority than to a linter. -- *"Who's behind it / how does it make money?"* → Apache-2.0, local-first - forever for single repos. If teams want a cross-repo capability view - later, that's the natural paid layer. Today we want design partners, not - checkout pages. - -## X / Twitter launch thread - -Post 1: - -> Claude Code opened a PR. It adds `stripe.create_refund` to a support -> agent's toolset. The diff is 4 lines. Tests pass. Review says LGTM. -> -> Nothing in the pipeline knows the agent just gained the power to move -> money. -> -> We built an open-source, deterministic merge gate for exactly this. 🧵 - -Post 2: - -> Code review answers "is this code correct?" -> Evals answer "does the agent usually behave?" -> -> Neither answers: "what can the agent DO after this merges that it -> couldn't before — and who approved that?" -> -> That's a permission review. Agent tool surfaces just never had one. - -Post 3 (attach: screenshot of the rendered PR-comment section from the -README — the real artifact, not a mockup): - -> Shipgate reads the capability delta statically — no agent execution, no -> LLM calls, no network — and posts the verdict on the PR: -> blocked / human_review_required / insufficient_evidence / mergeable. -> -> Same diff, same verdict, every run. A gate you can't prompt-inject. - -Post 4: - -> Try the exact refund-PR demo, zero install: -> -> uvx agents-shipgate fixture run ai_generated_refund_pr -> -> 3 lines of YAML to run it advisory on every PR. Apache-2.0, no telemetry. -> https://github.com/ThreeMoonsLab/agents-shipgate - -Post 5 (the ask): - -> Shipping tool-using agents with real blast radius (refunds, emails, -> deploys)? We're running design-partner pilots: bring one AI-generated PR, -> we turn it into a deterministic merge verdict together, ~30 min. 
-> help@threemoonslab.com - -## Reddit secondary (post 2–3 days after HN, not same-day) - -Subreddits: r/LocalLLaMA, r/MachineLearning (weekend thread), agent-focused -subs. Lead with the question, not the product: "How do you review PRs where -a coding agent changed your agent's tool permissions?" — share the fixture -one-liner in the body, repo link once, answer everything. - -## Day-of operating notes - -- Submit Show HN 8–10am ET on a Tue/Wed/Thu; avoid US holidays. -- X thread goes out after the HN post has its first comment, linking the HN - discussion ("discussion on HN: …") — don't split the audience early. -- Track in a plain text log: fixture runs (PyPI download delta), repo - traffic, inbound emails, pilot leads. These numbers feed the day-90 gates - in `marketing/gtm-strategy.md` § 11. -- Do not ship a release tag on launch day. Freeze the surface 48h before. diff --git a/marketing/linkedin-founder-repost.md b/marketing/linkedin-founder-repost.md deleted file mode 100644 index 264cd13e..00000000 --- a/marketing/linkedin-founder-repost.md +++ /dev/null @@ -1,71 +0,0 @@ -# LinkedIn Founder Repost — Agents Shipgate v0.5.1 - -Companion to `linkedin-launch-post.md`. The launch post explains the product. This one explains why I'm building it. - -Target length: ~330 words. Audience: engineers shipping agents, platform/AI infra leads, founders, VCs — in that priority order. - ---- - -## The Post - -Last week one line from Prof. Raskar's talk stuck with me: agents are moving from intranet to internet — from software that answers to software that acts. - -That sentence reframed where I think the real opportunity in this decade is. - -The obvious bet is "Agents for X" — agents that improve existing workflows. The bigger one, I think, is "X for Agents": the infrastructure, compliance, testing, release gates, monitoring, and records built around agents as first-class actors. - -When the actor changes, the infrastructure rebuilds around the actor. 
That's where I want to spend the next ten years. - -— - -My career has been one question on repeat: how do you let teams ship faster without letting unsafe changes reach production? - -Deployment safety at AWS. CI/CD and developer tooling at Wish. Infrastructure, compliance, and security at Settl. - -Velocity-with-trust has been the throughline for a decade. When agents started becoming systems that *act*, not just systems that answer, release readiness became the question I couldn't put down. - -— - -Two convictions I keep coming back to: - -A capable agent should not be the only judge of whether it's ready to act. The assurance has to sit outside the agent — that's structural, not optional. - -Tool calls are the boundary where reasoning becomes consequence. That's where release readiness should start. - -— - -That's why I built Agents Shipgate. - -Tool surfaces are the one part of an agent that's actually structured enough to inspect before runtime: schemas, scopes, MCP exports, OpenAPI specs, SDK function signatures. So release readiness can be deterministic, CI-native, and mechanical — not a vibe check. - -Static. No agent execution. No LLM calls. No network. Just the question every release should answer: - -For every action this agent can take in production, do we have approval policies, scoped permissions, and idempotency evidence? - -— - -I'm looking for three kinds of conversations: - -1. Teams shipping agents with real tool access — what's catching you off guard in production? -2. Platform or AI infra engineers who own CI gates — does this slot into your workflow? -3. Founders or investors who want to debate whether agent release readiness is a category or a feature. - -Not a fundraise. An invitation to think alongside. 
- -→ https://github.com/ThreeMoonsLab/agents-shipgate - ---- - -## Notes on deliberate choices (in case you want to push back) - -**Cut "Healthcare for Agents."** Reviewer's call and I agree — without paragraph-level development it reads as a vertical (like building healthcare AI), not as a metaphor. If you want it, the right place is a *separate* future post that fully unpacks "exam before release · vital signs at runtime · medical record across lifetime" — that's an essay-shaped idea, not a launch-week aside. - -**Cut the Bezos quote.** "Velocity-with-trust has been the throughline for a decade" carries the same idea (what doesn't change) without the staple-on celebrity reference. Keeps your voice, not Jeff's. - -**Cut the names "Gödel" and "Free Energy Principle."** Kept the *substance* of both arguments verbatim — "should not be the only judge of whether it's ready to act" is Gödel-shaped, "boundary where reasoning becomes consequence" is FEP-shaped. Readers who think in those frames will recognize them. Readers who don't won't bounce. Pure upside. - -**Kept the technical specificity in the product paragraph** ("schemas, scopes, MCP exports, OpenAPI specs, SDK function signatures"). The reviewer suggested compressing further, but for an engineer audience this list is the *evidence* that you're not just hand-waving at a category — you've thought about exactly which surfaces are inspectable. This sentence is what separates a thesis post from a vision post. - -**Three-part CTA in numbered list.** This is the only place I used a numbered list — it earns its keep because each line is a different *audience* segment, and you genuinely want different conversations from each. Bulletizing is appropriate here in a way it isn't in the body. - -**Length came out at ~330 words.** Down from your draft's ~480. Each cut was a thesis-distillation cut, not a content-quality cut. 
diff --git a/marketing/linkedin-launch-post.md b/marketing/linkedin-launch-post.md deleted file mode 100644 index ab427f85..00000000 --- a/marketing/linkedin-launch-post.md +++ /dev/null @@ -1,70 +0,0 @@ -# LinkedIn Launch Post — Agents Shipgate v0.12.0 - -**Title (if publishing as a LinkedIn Article, or as a bold first line):** -The Release Gate Your AI Agent Doesn't Have - ---- - -## The Post - -Your AI agent can refund $50,000. - -Code review can't see that. -Eval suites can't see that. -Observability *will* see it — after it happens. - -That's the gap Agents Shipgate fills. - -And it's wider now than it was a year ago: when Claude Code, Codex, or Cursor writes the PR, the line that grants refund authority arrives faster than anyone's review attention for it. - -Drop Shipgate into your CI as a GitHub Action and every PR that changes what your agent can do gets a deterministic merge verdict, posted as a PR comment: mergeable, human_review_required, insufficient_evidence, or blocked — with the capability delta spelled out (which tools, which scopes, which approval policies are missing). - -No agent execution. No LLM calls. No network access. Same diff, same verdict, every run — a gate the PR's author can't talk its way past, even when the author is a coding agent. - -Add it to your workflow in three lines: - -```yaml - - uses: ThreeMoonsLab/agents-shipgate@v0.12.0 - with: - config: shipgate.yaml -``` - -v0.12.0 is live. Open source. Free. No telemetry, no account. 
Try the blocked-refund-PR demo with zero install: `uvx agents-shipgate fixture run ai_generated_refund_pr` - -→ https://github.com/ThreeMoonsLab/agents-shipgate - -#AIAgents #MLOps #DevTools #OpenSource #AISafety - ---- - -## Suggested First Comment (drives algorithm + clicks) - -If you'd rather see what it catches before wiring it up: the bundled demo replays a coding-agent PR that adds stripe.create_refund to a support agent, and shows the blocked verdict plus the exact "required before merge" list — missing approval policy, missing idempotency evidence, wildcard scopes: - -uvx agents-shipgate fixture run ai_generated_refund_pr - -https://github.com/ThreeMoonsLab/agents-shipgate - -Would love to hear from anyone shipping production agents — what's your current release gate? (Comments, not DMs — others want to learn from this too.) - ---- - -## Recommended visual - -Posts with media get ~2x reach on LinkedIn. The two strongest options: - -1. A screenshot of the **PR comment** the Action posts — the capability-change table plus the "Required before merge" list. Two honest sources, in preference order: (a) a real PR comment from any repo running the Action (even an internal one, redacted); (b) the rendered "What your PR sees" section now at the top of the README, which is the verbatim (abridged) fixture artifact — screenshot it rendered on GitHub, and caption it as the bundled demo PR. -2. A short clip / GIF of `uvx agents-shipgate fixture run ai_generated_refund_pr` going from empty terminal to `Merge verdict: blocked` in a few seconds — works as fallback and demos the zero-install path. - -If you don't have a #1 yet, use #1(b) — it exists as of 2026-06-10. The GitHub Action pitch lands much harder when people can *see* the PR comment. 
- ---- - -## Notes for the repost - -When you repost and add the founder story, the natural beats are: -- The moment you realized code review + evals + observability still left a gap -- Why "static, manifest-first, no execution" was the design constraint (trust model) -- Whichever specific finding from the support_refund fixture made you go "yep, this is the one" - -Keep the original post's structure — your repost adds the human layer the launch post intentionally leaves out. diff --git a/marketing/seo-geo-review.md b/marketing/seo-geo-review.md deleted file mode 100644 index e564806d..00000000 --- a/marketing/seo-geo-review.md +++ /dev/null @@ -1,359 +0,0 @@ -# SEO and GEO Review - -Scope: Three Moons Lab marketing site, live public pages, and the -`ThreeMoonsLab/agents-shipgate` repository surfaces that search engines, answer -engines, and coding agents ingest. - -This is a point-in-time recommendation for the marketing site and repository -surfaces as reviewed during PR #130. The marketing website source lives outside -this repository; website-specific copy below should be applied in that site -repo rather than treated as durable product documentation here. - -## Executive Summary - -Agents Shipgate already has an unusually strong SEO/GEO base for an early -developer tool: a clear category phrase, `llms.txt`, `.well-known` discovery, -structured data, a glossary, comparison pages, a check catalog, and an -agent-readable README. The highest-return work is not broad SEO hygiene. It is -tightening the category story around **agent release readiness** while keeping -the product wedge narrow: **Tool-Use Readiness** for AI agent tool surfaces. - -The current product promise is: - -> Agents Shipgate is the deterministic merge gate for AI-generated agent -> capability changes — a local-first, static Tool-Use Readiness review. - -The market-facing expansion should be: - -> Three Moons Lab builds release-readiness infrastructure for teams shipping -> tool-using AI agents. 
Its first product is Agents Shipgate. - -## SEO Review - -### What Is Working - -- The homepage title and metadata target the right developer-tool queries: - "tool-using AI agents", "Tool-Use Readiness Reports", "MCP", "OpenAPI", - "OpenAI Agents SDK", "GitHub Action", and "AI agent governance". -- The product page has a clear H1: "Agents Shipgate is a Tool-Use Readiness - release gate." -- The site has search-friendly supporting pages: quickstart, check catalog, - glossary, design partners, blog, and comparison pages. -- The blog already contains good bottom-of-funnel and category-creation posts: - MCP tool review, AI agent deployment checklist, AI agent CI/CD, OpenAI Agents - SDK release gate, Anthropic tool-use release gate, and evals-vs-release-gate. -- Structured data is present across the site: `Organization`, - `SoftwareApplication`, `SoftwareSourceCode`, `FAQPage`, `HowTo`, - `DefinedTermSet`, `BreadcrumbList`, and check catalog `ItemList`. -- `robots.txt` is valid and points to `sitemap-index.xml`. The sitemap index - returns 200 and includes the core pages and blog cluster. - -### Gaps - -- The homepage H1 is clear but under-branded. "Static release checks for - tool-using AI agents" is understandable, but answer engines often prefer a - direct entity sentence near the top: "Agents Shipgate is ...". -- The live `/llms.txt` appears older than the current in-repo version. It names - report schema v0.5 while the live `.well-known` points to v0.8. This is a GEO - drift risk even if the public release is intentionally pinned to v0.8. -- The live `.well-known/agents-shipgate.json` is thinner than the in-tree - discovery file. It lacks newer fields such as `gating_signal`, trigger - catalog, `llms_full_url`, zero-install detector, packet outputs, and agent - on-ramps. -- `/sitemap.xml` returns 404. This is not fatal because `robots.txt` points to - `sitemap-index.xml`, but adding a `/sitemap.xml` alias is low-cost and helps - older crawlers and manual SEO tools. 
-- The comparison pages have good titles, but they should include short FAQ - blocks and comparison-table structured data where possible. Queries like - "Agents Shipgate vs LangSmith" and "agent release gate vs evals" are likely - to be answer-engine queries, not only classic search queries. -- The site needs dedicated pages for the three core input wedges: - OpenAI Agents SDK, MCP, and OpenAPI. Blog posts exist, but persistent landing - pages convert better and give internal links stable targets. - -## GEO Review - -### What Is Working - -- The repo is highly agent-readable: `AGENTS.md`, `llms.txt`, `llms-full.txt`, - `.well-known/agents-shipgate.json`, schemas, trigger catalog, error catalog, - check catalog, and agent-specific guides. -- The README has a strong above-the-fold answer, a fixture command, sample - output, and a "For coding agents" section. -- The website has `/ai-summary/`, `/llms.txt`, and `/.well-known` surfaces. This - is exactly the shape ChatGPT, Claude, Perplexity, and coding agents can use. -- The repo explicitly tells agents when to recommend Shipgate and when not to. - That reduces hallucinated over-recommendation and improves citation quality. - -### Gaps - -- Public website discovery should be generated from the same release contract - as the repo. Today, live `/llms.txt` and `/.well-known` do not expose the same - field set as the in-tree surfaces. -- The AI-search answer should always include all four facts in one short block: - publisher, product, wedge, form factor. - - Publisher: Three Moons Lab. - - Product: Agents Shipgate / `agents-shipgate`. - - Wedge: Tool-Use Readiness for AI agent tool surfaces. - - Form factor: open-source CLI and GitHub Action. -- The long-term "healthcare for agents" thesis should stay in blog/thesis - content, not the primary README/product page. Primary surfaces should not make - the product sound like a broad governance platform before the wedge is owned. 
- -## Keyword Strategy - -### Primary Category Terms - -- agent release readiness -- AI agent release gate -- Tool-Use Readiness -- tool-use readiness report -- AI agent CI/CD -- agent release readiness report -- tool surface scanning -- AI agent tool surface - -### Input-Specific Terms - -- MCP tool security review -- MCP security checklist -- MCP tool surface -- OpenAPI tool scanning -- OpenAPI as agent tools -- OpenAI Agents SDK release gate -- OpenAI Agents SDK production checklist -- Anthropic tool-use release gate -- LangChain agent release gate -- Google ADK release gate - -### Output/Form-Factor Terms - -- GitHub Action for AI agents -- AI agent CI GitHub Actions -- SARIF for AI agent tools -- local-first AI agent scanner -- static analysis for AI agent tools -- release evidence packet - -### Terms To Use Carefully - -- agent governance infrastructure: use as a category/backdrop phrase, not as a - product claim. -- healthcare for agents: keep as thesis content and long-term vision. -- compliance, HIPAA, SOC, ISO: use only in "not a certification" contexts until - there is a concrete compliance product. - -## Website Copy Improvements - -This section is a recommendation against the live marketing site at the time of -review. Keep the canonical product promise in this repository, and apply page -copy changes in the marketing site source. - -### Homepage - -Recommended title: - -> Agents Shipgate - Tool-Use Readiness for AI agent releases | Three Moons Lab - -Recommended meta description: - -> Open-source CLI and GitHub Action that reviews MCP, OpenAPI, and OpenAI -> Agents SDK tool surfaces before production-like permissions. Generates local -> Tool-Use Readiness Reports for PR review. - -Recommended H1: - -> Agents Shipgate checks AI agent tool surfaces before release. 
- -Recommended subhead: - -> A local-first CLI and GitHub Action for Tool-Use Readiness: scan MCP, -> OpenAPI, OpenAI Agents SDK, and other static tool metadata before an agent -> gets production-like permissions. - -Recommended primary CTAs: - -- Run the fixture -- Add the GitHub Action -- Read a sample report -- Apply as a design partner - -### Product Page - -Add a first-screen definition block: - -> Agents Shipgate is an open-source CLI and GitHub Action from Three Moons Lab. -> It produces deterministic Tool-Use Readiness Reports for AI agent tool -> surfaces before production-like permissions are granted. - -Add a compact "Use it when..." box: - -- A PR changes MCP tools, OpenAPI operations, or SDK tool decorators. -- An agent gets new write, refund, email, deploy, cancel, or data-access tools. -- A platform team wants advisory or strict release checks in CI. - -### Quickstart - -Keep the 60-second fixture. Add a second "real repo" path above the fold: - -```bash -agents-shipgate init --workspace . --write -agents-shipgate scan -c shipgate.yaml -``` - -Add a short explanation that advisory CI is the default adoption path and strict -mode should come after a reviewed baseline. - -### Design Partners - -Current positioning is good. Add conversion filters that speak to high-intent -teams: - -- "You have an agent with real side effects: refunds, email, records, deploys, - tickets, infrastructure, or sensitive data." -- "You already use GitHub Actions or another CI system and want PR-time - release evidence." -- "You can share anonymized tool-surface metadata or a reduced reproduction." - -### Check Catalog - -Add query-targeted intro links: - -- MCP security checklist -- OpenAPI tool-surface checklist -- OpenAI Agents SDK release checklist -- Approval and idempotency checks -- Scope and blast-radius checks - -## README Improvements - -The README is already strong. 
The most important refinements are: - -- Put "agent release readiness" next to the canonical tagline without changing - the product promise. -- Keep MCP, OpenAPI, and OpenAI Agents SDK in the first scan sentence because - those are the core adoption wedges. -- Keep "CLI + GitHub Action" above the first fold. -- Keep the sample blocked fixture output visible early; it proves the scanner - produces concrete release evidence, not generic policy advice. -- Keep the "For coding agents" section and machine-readable links. These are - unusually valuable for GEO. - -## FAQ and Glossary Suggestions - -Add or keep these FAQ questions on the website and in docs: - -- What is Agents Shipgate? -- What is agent release readiness? -- What is Tool-Use Readiness? -- What is an AI agent tool surface? -- What is a Tool-Use Readiness Report? -- How is Agents Shipgate different from LLM evals? -- How is Agents Shipgate different from observability? -- How is Agents Shipgate different from runtime guardrails or MCP gateways? -- Does Agents Shipgate call my tools or connect to MCP servers? -- When should I run Agents Shipgate on a PR? -- How do I add Agents Shipgate to GitHub Actions? -- What does "blocked" mean in a report? -- Does Agents Shipgate certify my agent as safe? -- Which inputs are supported: MCP, OpenAPI, OpenAI Agents SDK, Anthropic, - Google ADK, LangChain, CrewAI, OpenAI API, Codex plugin, and n8n? 
- -Add or keep these glossary entries: - -- Agent release readiness -- Agent Release Readiness Report -- Tool-Use Readiness -- Tool-Use Readiness Report -- Tool surface -- Tool surface drift -- Manifest-first -- Approval policy -- Confirmation policy -- Idempotency evidence -- Blast radius -- Baseline -- Suppression -- Advisory mode -- Strict mode -- Release Evidence Packet - -## Content Roadmap - -### P0: Landing Pages - -- MCP tool security checklist -- OpenAI Agents SDK release gate -- OpenAPI tool-surface scanner -- GitHub Action for AI agent CI/CD - -### P1: High-Intent Tutorials - -- How to add a release gate to an OpenAI Agents SDK agent -- How to review MCP tools before production -- How to scan an OpenAPI spec before exposing it to an agent -- How to use advisory mode, baselines, and strict mode in GitHub Actions - -### P1: Comparison and "Not X" Pages - -- Agents Shipgate vs LLM evals -- Agents Shipgate vs LangSmith -- Agents Shipgate vs Braintrust -- Agents Shipgate vs promptfoo -- Agents Shipgate vs MCP gateways -- Agents Shipgate vs runtime guardrails - -### P2: Category-Creation Content - -- What is agent release readiness? -- What is Tool-Use Readiness? -- Your AI agent has a tool surface -- Why evals are not release gates -- From CI/CD to agent release readiness - -### P2: Thesis Content - -- Healthcare for agents -- Agent release evidence as infrastructure -- The agent release lifecycle: static review, baseline, runtime evidence, - drift detection - -## Prioritized Action Plan - -### P0 - -1. Deploy the current in-tree `llms.txt` and `.well-known/agents-shipgate.json` - to the website release branch when the next public release ships. If the - website is pinned to the latest public release, still update live `/llms.txt` - so its schema and input claims match that release. -2. Add a `/sitemap.xml` alias to the existing sitemap index. Keep - `robots.txt` pointing to the sitemap index. -3. 
Update homepage first-screen copy so the entity sentence appears above the - fold: "Agents Shipgate is an open-source CLI and GitHub Action from Three - Moons Lab..." -4. Add stable landing pages for MCP, OpenAPI, and OpenAI Agents SDK instead of - relying only on blog posts. -5. Keep GitHub repo description focused on: - "Static release checks for tool-using AI agents. CLI + GitHub Action. Scans - MCP, OpenAPI, OpenAI Agents SDK. Writes Tool-Use Readiness Reports." - -### P1 - -1. Add FAQ structured data to comparison pages. -2. Add `sameAs` links in Organization/SoftwareApplication schema for PyPI, - GitHub Marketplace, GitHub repo, and GitHub org. -3. Add "sample report" and "sample PR comment" CTAs from homepage and - quickstart. -4. Add `lastmod` values to sitemap entries for blog posts and docs pages. -5. Link every blog post back to the product page, quickstart, check catalog, - and design partner page with descriptive anchor text. - -### P2 - -1. Build a "report gallery" page with public sample reports by framework: - OpenAI Agents SDK, MCP-only, OpenAPI, LangChain, Anthropic, n8n. -2. Add an "agent release readiness" category hub that links to glossary, blog, - comparisons, and quickstart. -3. Add benchmark/adoption-readiness material once there is enough public data. -4. Keep the long-term "healthcare for agents" thesis on the blog and roadmap, - not as the main product claim. 
diff --git a/plugins/agents-shipgate/.codex-plugin/plugin.json b/plugins/agents-shipgate/.codex-plugin/plugin.json index 33647da0..305eacd6 100644 --- a/plugins/agents-shipgate/.codex-plugin/plugin.json +++ b/plugins/agents-shipgate/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "agents-shipgate", - "version": "1.0.0a1", + "version": "0.14.0", "description": "Run Agents Shipgate Tool-Use Readiness workflows from Codex.", "author": { "name": "Three Moons Lab", diff --git a/plugins/agents-shipgate/skills/agents-shipgate/assets/advisory-pr-comment.yml b/plugins/agents-shipgate/skills/agents-shipgate/assets/advisory-pr-comment.yml index 0bf82ff5..7c46518d 100644 --- a/plugins/agents-shipgate/skills/agents-shipgate/assets/advisory-pr-comment.yml +++ b/plugins/agents-shipgate/skills/agents-shipgate/assets/advisory-pr-comment.yml @@ -18,9 +18,9 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/prompts/add-shipgate-to-repo.md b/prompts/add-shipgate-to-repo.md index b2b7ee06..c13c707e 100644 --- a/prompts/add-shipgate-to-repo.md +++ b/prompts/add-shipgate-to-repo.md @@ -11,8 +11,8 @@ agent-related PRs should use `agents-shipgate verify` after this adoption step. 1. **Install the tool - pin the version so a stale build can't shadow it.** This flow uses the current verifier, agent-handoff, primary-command, and Codex-boundary contracts and requires **contract v9 or newer**; an older copy lingering on `PATH` may lack the command or schema fields this prompt expects. Prefer a **pinned, zero-install** runner that fetches the exact version every time instead of trusting whatever is already on `PATH`. 
**Pin it into one variable and use that for every step below**, so no single command can fall through to a stale binary: ```bash - SG="uvx agents-shipgate@1.0.0a1" # uv: ephemeral, always the pinned build - # or: SG="pipx run agents-shipgate==1.0.0a1" + SG="uvx agents-shipgate@0.14.0" # uv: ephemeral, always the pinned build + # or: SG="pipx run agents-shipgate==0.14.0" $SG --version # confirm the pinned runner resolves ``` Every step below calls `$SG …`; e.g. `$SG verify --preview --json` runs the verify preview through the pinned runner, never a `PATH` copy. @@ -20,7 +20,7 @@ agent-related PRs should use `agents-shipgate verify` after this adoption step. If you would rather install onto `PATH`, pin the floor and **fail loudly when it resolves older** — a plain `pipx install agents-shipgate` is a no-op when an older build already exists — then set `SG=agents-shipgate`: ```bash python -m pip install -U --pre agents-shipgate - agents-shipgate contract --json # STOP if this reports contract_version < 9 - re-run pinned via uvx agents-shipgate@1.0.0a1 + agents-shipgate contract --json # STOP if this reports contract_version < 9 - re-run pinned via uvx agents-shipgate@0.14.0 SG=agents-shipgate # only after the line above confirms contract v9+ ``` diff --git a/prompts/decide-shipgate-relevance.md b/prompts/decide-shipgate-relevance.md index 00cf0ef2..2e647122 100644 --- a/prompts/decide-shipgate-relevance.md +++ b/prompts/decide-shipgate-relevance.md @@ -71,13 +71,13 @@ the rules to the changed file list. 5. **Then act.** - If `run_shipgate: true`: run verify preview through a **version-pinned, zero-install** runner so a stale copy already on `PATH` can't answer — - `uvx agents-shipgate@1.0.0a1 verify --preview --json` (or - `pipx run agents-shipgate==1.0.0a1 ...`). Only fall back to a bare + `uvx agents-shipgate@0.14.0 verify --preview --json` (or + `pipx run agents-shipgate==0.14.0 ...`). 
Only fall back to a bare `agents-shipgate verify --preview --json` once `agents-shipgate --version` confirms contract v9 or newer. Then follow [`prompts/add-shipgate-to-repo.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/prompts/add-shipgate-to-repo.md) for the first-adoption helper flow, or point the user at the GitHub - Action (`ThreeMoonsLab/agents-shipgate@v1.0.0a1`) if they prefer CI. + Action (`ThreeMoonsLab/agents-shipgate@v0.14.0`) if they prefer CI. - If `run_shipgate: false` and `dry_run_recommended: true`: propose a non-mutating scan only — never propose `init --write` based on a dry-run match alone. Phrase it as "X may have shifted the tool diff --git a/prompts/stabilize-strict-mode.md b/prompts/stabilize-strict-mode.md index 4009f643..45cccd27 100644 --- a/prompts/stabilize-strict-mode.md +++ b/prompts/stabilize-strict-mode.md @@ -37,9 +37,9 @@ The user has Agents Shipgate running in **advisory** mode and wants to graduate 5. **Update the CI workflow.** Replace the existing advisory step with strict + baseline. Use [`examples/github-actions/03-strict-with-baseline.yml`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/examples/github-actions/03-strict-with-baseline.yml) as the template: ```yaml - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' ci_mode: strict fail_on: critical baseline: .agents-shipgate/baseline.json diff --git a/pyproject.toml b/pyproject.toml index 7e80e3ab..f0615150 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agents-shipgate" -version = "1.0.0a1" +version = "0.14.0" description = "The deterministic merge gate for AI-generated agent capability changes. Agent release readiness for tool-using AI agents. CLI + GitHub Action. 
Scans MCP, OpenAPI, OpenAI Agents SDK, Anthropic, Google ADK, LangChain, CrewAI, OpenAI API, Codex config, Codex plugin, n8n." readme = "README.md" requires-python = ">=3.12" @@ -155,13 +155,9 @@ exclude = [ "/.venv-py312", "/build", "/dist", - "/docs/decks", - "/docs/decks/**", "/harness", "/harness/**", "/htmlcov", - "/marketing", - "/marketing/**", ] [tool.pytest.ini_options] @@ -175,9 +171,6 @@ markers = [ [tool.ruff] target-version = "py312" line-length = 100 -exclude = [ - "docs/decks", -] [tool.ruff.lint] select = ["E4", "E7", "E9", "F", "I", "B", "UP"] diff --git a/skills/agents-shipgate/SKILL.md b/skills/agents-shipgate/SKILL.md index d6647e18..371c6479 100644 --- a/skills/agents-shipgate/SKILL.md +++ b/skills/agents-shipgate/SKILL.md @@ -72,7 +72,7 @@ For non-GitHub CI (GitLab, CircleCI, Jenkins, Azure Pipelines, Buildkite, Bitbuc ## Stable contracts (rely on these) -- **CLI surface** follows the current alpha contract line — see https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md. +- **CLI surface** follows the current 0.x contract line — see https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md. - **Installed CLI contract**: when available, run `agents-shipgate contract --json` to verify local schema versions, capability/research surfaces, `release_decision.decision`, and manual-review signal fields. Older installs should use [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md) or upgrade before automating against the local contract command. - **Verifier JSON**: `verifier_schema_version: "0.1"`. Read `merge_verdict`, `can_merge_without_human`, `first_next_action`, `fix_task`, `capability_review.top_changes`, `trust_root_touched`, and `policy_weakened` before summarizing an AI-generated PR. `merge_verdict` is a deterministic projection; the gate remains `report.json.release_decision.decision`. 
- **Verify run JSON**: `verify-run.json` uses `schema_version: "shipgate.verify_run/v1"` and records stable run identity, subject refs, input hashes, outcome, and artifact hashes. It is the reproducibility artifact for `verify`; do not treat it as a second gate. diff --git a/skills/agents-shipgate/ci-recipes/advisory-pr-comment.yml b/skills/agents-shipgate/ci-recipes/advisory-pr-comment.yml index ab921d08..fb5f5e8d 100644 --- a/skills/agents-shipgate/ci-recipes/advisory-pr-comment.yml +++ b/skills/agents-shipgate/ci-recipes/advisory-pr-comment.yml @@ -18,9 +18,9 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: ci_mode: advisory diff_base: target pr_comment: 'true' - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' diff --git a/skills/agents-shipgate/prompts/add-shipgate-to-repo.md b/skills/agents-shipgate/prompts/add-shipgate-to-repo.md index b2b7ee06..c13c707e 100644 --- a/skills/agents-shipgate/prompts/add-shipgate-to-repo.md +++ b/skills/agents-shipgate/prompts/add-shipgate-to-repo.md @@ -11,8 +11,8 @@ agent-related PRs should use `agents-shipgate verify` after this adoption step. 1. **Install the tool - pin the version so a stale build can't shadow it.** This flow uses the current verifier, agent-handoff, primary-command, and Codex-boundary contracts and requires **contract v9 or newer**; an older copy lingering on `PATH` may lack the command or schema fields this prompt expects. Prefer a **pinned, zero-install** runner that fetches the exact version every time instead of trusting whatever is already on `PATH`. 
**Pin it into one variable and use that for every step below**, so no single command can fall through to a stale binary: ```bash - SG="uvx agents-shipgate@1.0.0a1" # uv: ephemeral, always the pinned build - # or: SG="pipx run agents-shipgate==1.0.0a1" + SG="uvx agents-shipgate@0.14.0" # uv: ephemeral, always the pinned build + # or: SG="pipx run agents-shipgate==0.14.0" $SG --version # confirm the pinned runner resolves ``` Every step below calls `$SG …`; e.g. `$SG verify --preview --json` runs the verify preview through the pinned runner, never a `PATH` copy. @@ -20,7 +20,7 @@ agent-related PRs should use `agents-shipgate verify` after this adoption step. If you would rather install onto `PATH`, pin the floor and **fail loudly when it resolves older** — a plain `pipx install agents-shipgate` is a no-op when an older build already exists — then set `SG=agents-shipgate`: ```bash python -m pip install -U --pre agents-shipgate - agents-shipgate contract --json # STOP if this reports contract_version < 9 - re-run pinned via uvx agents-shipgate@1.0.0a1 + agents-shipgate contract --json # STOP if this reports contract_version < 9 - re-run pinned via uvx agents-shipgate@0.14.0 SG=agents-shipgate # only after the line above confirms contract v9+ ``` diff --git a/skills/agents-shipgate/prompts/decide-shipgate-relevance.md b/skills/agents-shipgate/prompts/decide-shipgate-relevance.md index 00cf0ef2..2e647122 100644 --- a/skills/agents-shipgate/prompts/decide-shipgate-relevance.md +++ b/skills/agents-shipgate/prompts/decide-shipgate-relevance.md @@ -71,13 +71,13 @@ the rules to the changed file list. 5. **Then act.** - If `run_shipgate: true`: run verify preview through a **version-pinned, zero-install** runner so a stale copy already on `PATH` can't answer — - `uvx agents-shipgate@1.0.0a1 verify --preview --json` (or - `pipx run agents-shipgate==1.0.0a1 ...`). 
Only fall back to a bare + `uvx agents-shipgate@0.14.0 verify --preview --json` (or + `pipx run agents-shipgate==0.14.0 ...`). Only fall back to a bare `agents-shipgate verify --preview --json` once `agents-shipgate --version` confirms contract v9 or newer. Then follow [`prompts/add-shipgate-to-repo.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/prompts/add-shipgate-to-repo.md) for the first-adoption helper flow, or point the user at the GitHub - Action (`ThreeMoonsLab/agents-shipgate@v1.0.0a1`) if they prefer CI. + Action (`ThreeMoonsLab/agents-shipgate@v0.14.0`) if they prefer CI. - If `run_shipgate: false` and `dry_run_recommended: true`: propose a non-mutating scan only — never propose `init --write` based on a dry-run match alone. Phrase it as "X may have shifted the tool diff --git a/skills/agents-shipgate/prompts/stabilize-strict-mode.md b/skills/agents-shipgate/prompts/stabilize-strict-mode.md index 4009f643..45cccd27 100644 --- a/skills/agents-shipgate/prompts/stabilize-strict-mode.md +++ b/skills/agents-shipgate/prompts/stabilize-strict-mode.md @@ -37,9 +37,9 @@ The user has Agents Shipgate running in **advisory** mode and wants to graduate 5. **Update the CI workflow.** Replace the existing advisory step with strict + baseline. 
Use [`examples/github-actions/03-strict-with-baseline.yml`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/examples/github-actions/03-strict-with-baseline.yml) as the template: ```yaml - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 with: - shipgate_version: '1.0.0a1' + shipgate_version: '0.14.0' ci_mode: strict fail_on: critical baseline: .agents-shipgate/baseline.json diff --git a/src/agents_shipgate/__init__.py b/src/agents_shipgate/__init__.py index f7b45399..733e5908 100644 --- a/src/agents_shipgate/__init__.py +++ b/src/agents_shipgate/__init__.py @@ -1,3 +1,3 @@ """Agents Shipgate package.""" -__version__ = "1.0.0a1" +__version__ = "0.14.0" diff --git a/src/agents_shipgate/cli/check.py b/src/agents_shipgate/cli/check.py index 8b82d9ed..a82ee0f4 100644 --- a/src/agents_shipgate/cli/check.py +++ b/src/agents_shipgate/cli/check.py @@ -67,7 +67,7 @@ def check( raise typer.Exit(2) if format_ == "agent-json": typer.echo( - "--format agent-json was removed in the 1.0.0-alpha contract. " + "--format agent-json was removed in the 0.14.0 contract cleanup. 
" "Use --format codex-boundary-json.", err=True, ) diff --git a/src/agents_shipgate/cli/verify/command.py b/src/agents_shipgate/cli/verify/command.py index 1b2b2bc2..63b819b3 100644 --- a/src/agents_shipgate/cli/verify/command.py +++ b/src/agents_shipgate/cli/verify/command.py @@ -375,7 +375,7 @@ def _parse_verify_format(value: str) -> str: return "json" if normalized == "agent": raise ConfigError( - "--format agent was removed in the 1.0.0-alpha contract; use --format json" + "--format agent was removed in the 0.14.0 contract cleanup; use --format json" ) raise ConfigError("--format must be text or json for verify") diff --git a/src/agents_shipgate/inputs/google_adk.py b/src/agents_shipgate/inputs/google_adk.py index 5288dcbe..d56fea7f 100644 --- a/src/agents_shipgate/inputs/google_adk.py +++ b/src/agents_shipgate/inputs/google_adk.py @@ -270,7 +270,34 @@ def _load_agent_config_path( artifacts.agent_config_files.append(_display_path(path, config_base_dir)) agent_name = str(data.get("name") or path.stem) - tools_data = data.get("tools") if isinstance(data.get("tools"), list) else [] + raw_tools = data.get("tools") + if isinstance(raw_tools, list): + tools_data = raw_tools + else: + # ``tools`` absent (None) is a genuine zero-tool agent. But ``tools`` + # present in a shape we cannot enumerate — a templated string, an + # env-var reference, a mapping — must NOT silently collapse to a + # confident ``tool_count: 0``; that reads as deliberate narrowing and + # fails open. Record it as a dynamic/unparseable surface (warning + + # dynamic toolset marker), mirroring the Python entrypoint's + # dynamic-tools-expression handling, so evidence-coverage and the ADK + # dynamic-toolset checks treat the surface as unknown. + tools_data = [] + if raw_tools is not None: + artifacts.warnings.append( + f"Google ADK agent {agent_name!r} declares a dynamic or " + f"unparseable 'tools' value in its Agent Config; its tool " + f"surface could not be enumerated." 
+ ) + artifacts.toolsets.append( + GoogleAdkToolset( + kind="dynamic", + source_id=source_id, + source_ref=source_ref, + agent_name=agent_name, + dynamic=True, + ) + ) artifacts.agents.append( { "name": agent_name, diff --git a/tests/golden/agent_protocol/claude-code-block-stop.json b/tests/golden/agent_protocol/claude-code-block-stop.json index 6e0d5fa6..65be5382 100644 --- a/tests/golden/agent_protocol/claude-code-block-stop.json +++ b/tests/golden/agent_protocol/claude-code-block-stop.json @@ -3,7 +3,7 @@ "agent": "claude-code", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "claude-code" diff --git a/tests/golden/agent_protocol/codex-block-stop.json b/tests/golden/agent_protocol/codex-block-stop.json index cfb1050e..1210d318 100644 --- a/tests/golden/agent_protocol/codex-block-stop.json +++ b/tests/golden/agent_protocol/codex-block-stop.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/agent_protocol/codex-repair-after.json b/tests/golden/agent_protocol/codex-repair-after.json index 7a87e9e6..8b95adcf 100644 --- a/tests/golden/agent_protocol/codex-repair-after.json +++ b/tests/golden/agent_protocol/codex-repair-after.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/agent_protocol/codex-repair-before.json b/tests/golden/agent_protocol/codex-repair-before.json index ff785481..55333605 100644 --- a/tests/golden/agent_protocol/codex-repair-before.json +++ b/tests/golden/agent_protocol/codex-repair-before.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/agent_protocol/cursor-block-stop.json 
b/tests/golden/agent_protocol/cursor-block-stop.json index c3a495f5..70538efc 100644 --- a/tests/golden/agent_protocol/cursor-block-stop.json +++ b/tests/golden/agent_protocol/cursor-block-stop.json @@ -3,7 +3,7 @@ "agent": "cursor", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "cursor" diff --git a/tests/golden/codex_boundary_result/agents_requirement_removed.json b/tests/golden/codex_boundary_result/agents_requirement_removed.json index 865768b2..8a280f77 100644 --- a/tests/golden/codex_boundary_result/agents_requirement_removed.json +++ b/tests/golden/codex_boundary_result/agents_requirement_removed.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/codex_boundary_result/docs_only.json b/tests/golden/codex_boundary_result/docs_only.json index b7be42a7..0cd75ddf 100644 --- a/tests/golden/codex_boundary_result/docs_only.json +++ b/tests/golden/codex_boundary_result/docs_only.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/codex_boundary_result/github_action_removed.json b/tests/golden/codex_boundary_result/github_action_removed.json index 26e78d77..3a30a0b3 100644 --- a/tests/golden/codex_boundary_result/github_action_removed.json +++ b/tests/golden/codex_boundary_result/github_action_removed.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/codex_boundary_result/malformed_toml.json b/tests/golden/codex_boundary_result/malformed_toml.json index 6a0da209..f3d3b52b 100644 --- a/tests/golden/codex_boundary_result/malformed_toml.json +++ b/tests/golden/codex_boundary_result/malformed_toml.json @@ -3,7 +3,7 @@ "agent": 
"codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/codex_boundary_result/mcp_auto_approve_write.json b/tests/golden/codex_boundary_result/mcp_auto_approve_write.json index ead55e2c..585c298d 100644 --- a/tests/golden/codex_boundary_result/mcp_auto_approve_write.json +++ b/tests/golden/codex_boundary_result/mcp_auto_approve_write.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/codex_boundary_result/network_wildcard.json b/tests/golden/codex_boundary_result/network_wildcard.json index a8095a52..beba654e 100644 --- a/tests/golden/codex_boundary_result/network_wildcard.json +++ b/tests/golden/codex_boundary_result/network_wildcard.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/codex_boundary_result/python_refactor.json b/tests/golden/codex_boundary_result/python_refactor.json index 92b82f12..fc24679e 100644 --- a/tests/golden/codex_boundary_result/python_refactor.json +++ b/tests/golden/codex_boundary_result/python_refactor.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/golden/codex_boundary_result/unknown_permission_key.json b/tests/golden/codex_boundary_result/unknown_permission_key.json index 3af8d09b..12a07288 100644 --- a/tests/golden/codex_boundary_result/unknown_permission_key.json +++ b/tests/golden/codex_boundary_result/unknown_permission_key.json @@ -3,7 +3,7 @@ "agent": "codex", "tool": { "name": "agents-shipgate", - "version": "1.0.0a1" + "version": "0.14.0" }, "subject": { "agent": "codex" diff --git a/tests/test_agent_instructions_renderers.py 
b/tests/test_agent_instructions_renderers.py index 0bfaea17..4f397d78 100644 --- a/tests/test_agent_instructions_renderers.py +++ b/tests/test_agent_instructions_renderers.py @@ -45,16 +45,16 @@ REPO_ROOT = Path(__file__).resolve().parent.parent EXPECTED_CLAUDE_CODE_SKILL_RENDER_SHA256 = { ".claude/skills/agents-shipgate/SKILL.md": ( - "354bb39884f622ed34c5616a9f616cb5a5ea5e60f0d705e971ad109830058b81" + "1b4470b625110acdc6f1e3930b7e6e6f43653c493e9a44c7ccde0dd9c88b09bb" ), ".claude/skills/agents-shipgate/ci-recipes/advisory-pr-comment.yml": ( - "99b2acfbd9dfc6653a6bbee268b83f1e2d4297829636eba662d9f4ad6fa35423" + "82957a521b5914b3e678e6b76e7088306559c8ad6bcf7c2dce7fb1e822b6bec6" ), ".claude/skills/agents-shipgate/prompts/add-shipgate-to-repo.md": ( - "7a414bb492538d05b1b76c526e8672f24bcb1a824c368c643e3be0ee48b08274" + "0e94a82ec066af57ea8c0d6d6222de906b45ed722bf7799e2fc1907004158618" ), ".claude/skills/agents-shipgate/prompts/decide-shipgate-relevance.md": ( - "80771e3f3d3ae35329929ec567cc121cf1342bd27edbd50cf9f97f29cab703ef" + "a8ee5f93cab1017c623075c39c1c5bdc639855c37e588e1c9190ab963bb50446" ), ".claude/skills/agents-shipgate/prompts/explain-finding-to-user.md": ( "18031ed870b3c937a2996173820639ef441afe0a45e8171f16468826cd389829" @@ -66,7 +66,7 @@ "162aa2fb96066535425d9cf86a247a6782b8ec7cc661a18b42dbedf394779475" ), ".claude/skills/agents-shipgate/prompts/stabilize-strict-mode.md": ( - "3a42ae0d22e46b58de3d40b300ca76bee6120f3bde3f779926ae824c16a25c65" + "db9a702784c64229ed15157ce369c90a3d75c02e82c78d2c39cc55135857dc80" ), ".claude/skills/agents-shipgate/prompts/triage-false-positive.md": ( "8cfbb0d4b6e2c36569d24260384d3a54165f966276112f4b143b4ac234b51ada" @@ -86,7 +86,7 @@ "aa511e933ff663dcd1e0d2af3da2a7101206ce2bb1bb98c4dae801bb3f4e42ef" ), ".agents/skills/agents-shipgate/assets/advisory-pr-comment.yml": ( - "16894ce679eb55c69213070775cb265f0775ad7ff1cd08091a5c57627950871b" + "7ef7ccb331a0171f0fb5580df4dad32003b230b150886a40d317a954ace0fb55" ), 
".agents/skills/agents-shipgate/references/recipes.md": ( "d8b393e61aef853105a47630b9cbfd404378b6c9e1bbed6028b357b4e38fc72c" diff --git a/tests/test_codex_plugin_launch_package.py b/tests/test_codex_plugin_launch_package.py index e038c186..fa4efa89 100644 --- a/tests/test_codex_plugin_launch_package.py +++ b/tests/test_codex_plugin_launch_package.py @@ -19,7 +19,7 @@ def test_agents_shipgate_codex_plugin_manifest_is_skill_only() -> None: ) assert manifest["name"] == "agents-shipgate" - assert manifest["version"] == "1.0.0a1" + assert manifest["version"] == "0.14.0" assert manifest["skills"] == "./skills/" assert "apps" not in manifest assert "mcpServers" not in manifest diff --git a/tests/test_google_adk.py b/tests/test_google_adk.py index c880b8fb..12bd9b48 100644 --- a/tests/test_google_adk.py +++ b/tests/test_google_adk.py @@ -194,6 +194,59 @@ def test_google_adk_agent_config_dynamic_toolset_findings(tmp_path): assert doctor["frameworks"]["google_adk"]["dynamic_toolset_count"] == 2 +def test_google_adk_agent_config_non_list_tools_fails_closed(tmp_path): + """A non-list ``tools:`` value must surface as a dynamic/unparseable + toolset, not silently collapse to a confident ``tool_count: 0``. + + Regression for the fail-open path: ``tools`` present but in a shape the + static extractor cannot enumerate (here a templated string) previously + became ``[]`` with no warning and no finding, reading as a deliberate + zero-tool agent. It must now route to the dynamic-toolset signal. + """ + project = tmp_path / "project" + project.mkdir() + (project / "agent.yaml").write_text( + """ +agent_class: LlmAgent +name: root_agent +instruction: Review support cases. 
+tools: ${RUNTIME_TOOLSET} +""", + encoding="utf-8", + ) + (project / "shipgate.yaml").write_text( + """ +version: "0.1" +project: + name: adk-config-test +agent: + name: root-agent +environment: + target: production_like +tool_sources: + - id: adk + type: google_adk + path: agent.yaml +""", + encoding="utf-8", + ) + + report, _ = run_scan( + config_path=project / "shipgate.yaml", + output_dir=tmp_path / "reports", + formats=["json"], + ci_mode="advisory", + ) + + check_ids = {finding.check_id for finding in report.findings} + assert "SHIP-ADK-DYNAMIC-TOOLSET-NOT-ENUMERABLE" in check_ids + doctor = inspect_sources(config_path=project / "shipgate.yaml") + adk = doctor["frameworks"]["google_adk"] + assert adk["dynamic_toolset_count"] == 1 + # The unparseable surface must leave an evidence trail, not a silent pass. + assert any("unparseable" in w or "dynamic" in w for w in adk["warnings"]) + + def test_google_adk_top_level_config_can_supply_inputs(tmp_path): project = tmp_path / "project" project.mkdir() diff --git a/tests/test_org_governance.py b/tests/test_org_governance.py index 8bff7f52..8f4d6fd2 100644 --- a/tests/test_org_governance.py +++ b/tests/test_org_governance.py @@ -583,7 +583,7 @@ def test_org_bundle_accepts_v03_attestation_file(tmp_path: Path) -> None: ) v03_attestation = { "attestation_schema_version": "0.3", - "cli_version": "1.0.0a1", + "cli_version": "0.14.0", "org": { "org_id": "acme", "repo": "org/support", diff --git a/tests/test_public_surface_contract.py b/tests/test_public_surface_contract.py index a2e35351..309eaaca 100644 --- a/tests/test_public_surface_contract.py +++ b/tests/test_public_surface_contract.py @@ -106,7 +106,7 @@ ), ( "docs/faq.md", - re.compile(rf"v({VERSION_RE}) is the current alpha contract version"), + re.compile(rf"v({VERSION_RE}) is the current pre-1\.0 beta contract version"), ), ( "ROADMAP.md", diff --git a/tests/test_registry_ledger.py b/tests/test_registry_ledger.py index a54b9568..3481a2c9 100644 --- 
a/tests/test_registry_ledger.py +++ b/tests/test_registry_ledger.py @@ -15,7 +15,7 @@ def _attestation(verdict: str = "blocked", change_ids: list[str] | None = None) -> dict: return { "attestation_schema_version": "0.1", - "cli_version": "1.0.0a1", + "cli_version": "0.14.0", "source_verifier": "agents-shipgate-reports/verifier.json", "redacted": True, "base_ref": "origin/main", diff --git a/tests/test_v07_metadata_roundtrip.py b/tests/test_v07_metadata_roundtrip.py index 166622ad..29ea6edc 100644 --- a/tests/test_v07_metadata_roundtrip.py +++ b/tests/test_v07_metadata_roundtrip.py @@ -240,7 +240,7 @@ def test_package_version_is_current_release(): """Guard against bumping schemas while leaving package metadata behind.""" import agents_shipgate - assert agents_shipgate.__version__ == "1.0.0a1", ( + assert agents_shipgate.__version__ == "0.14.0", ( f"package version is {agents_shipgate.__version__!r}; " - "expected 1.0.0a1 for the current release" + "expected 0.14.0 for the current release" ) diff --git a/tests/test_verifier_scenarios.py b/tests/test_verifier_scenarios.py index 24f39605..1dc0dffa 100644 --- a/tests/test_verifier_scenarios.py +++ b/tests/test_verifier_scenarios.py @@ -339,7 +339,7 @@ def test_scenario_docs_only_with_shipgate_yaml_force_runs(tmp_path: Path) -> Non - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: ThreeMoonsLab/agents-shipgate@v1.0.0a1 + - uses: ThreeMoonsLab/agents-shipgate@v0.14.0 """