From 4bcd7a6ab07aa3cc7dc8301b26b7871bb495d979 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Fri, 3 Jul 2026 14:22:47 +0000 Subject: [PATCH 01/11] chore(storage): remove duckdb/ladybug traces from all source (ADR 0019) Complete the single-file SQLite migration cleanup: delete dead code left by the two-backend removal and scrub every prior-backend name from shipping source, so `duckdb`/`ladybug`/`lbug`/`kuzu` appear only in decision history (ADRs, CHANGELOGs) and removal prose. Dead code deleted (~864 LOC): - storage/src/schema-ddl.ts: whole file, superseded by inline DDL, zero callers - cli analyze.ts: 345-LOC wide-column row decoder, superseded by payload-JSON rehydration through the store's typed listNodes/listEdges finders - storage/src/test-utils/conformance.ts: 448-LOC assertIGraphStoreConformance suite with zero callers; drop the false "SqliteStore opts into this suite" claim in interface.ts. The load-bearing assertGraphParity/rebuildFromStore parity harness is kept (sqlite-parity.test.ts uses it). Traces scrubbed to zero in live .ts source (9 files) plus 3 CI/acceptance scripts and 8 per-package READMEs/docs that misdescribed current behavior. Tests: fix one dead assertion (interface.test.ts store-path values), migrate ~40 fixture path literals and comments across 20 test files, and keep the sqlite-adapter.test.ts guard that asserts no .lbug/.duckdb sidecar ever reappears. No test file deleted. Enforcement: check-banned-strings.sh now hard-bans the four prior-backend names in packages/*/src (excluding tests + test-utils), fixes its own stale "LadybugDB is the default backend" comments, so the traces cannot regrow. Also fixes pack-determinism-audit.sh gating on the nonexistent .codehub/duck.db (acceptance gate 16 was a silent permanent SKIP; now gates on store.sqlite). 
Carries an in-flight fix: re-ingest cached scan.sarif on the analyze scan-skip fast-path so a replace-mode bulkLoad no longer wipes prior findings to zero (+ analyze-findings-survival.test.ts regression test). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../references/data-source-map.md | 29 +- .../references/document-templates.md | 2 +- .../references/mermaid-patterns.md | 12 +- .../agents/doc-diagrams-components.md | 2 +- .../agents/doc-diagrams-dependency-graph.md | 2 +- .claude/skills/codehub-onboarding/SKILL.md | 2 +- .claude/skills/opencodehub-debugging/SKILL.md | 14 +- .claude/skills/opencodehub-exploring/SKILL.md | 6 +- .claude/skills/opencodehub-guide/SKILL.md | 162 ++++--- .../skills/opencodehub-refactoring/SKILL.md | 29 +- .erpaval/INDEX.md | 1 + ...ce-fields-stay-out-of-packhash-preimage.md | 49 ++ .github/workflows/ci.yml | 4 +- .github/workflows/verify-global-install.yml | 2 +- AGENTS.md | 44 +- CLAUDE.md | 5 +- OBJECTIVES.md | 7 +- README.md | 28 +- SPECS.md | 69 +-- mise.toml | 2 +- packages/analysis/src/test-utils.ts | 2 +- packages/cli/README.md | 15 +- .../commands/analyze-carry-forward.test.ts | 8 +- .../analyze-findings-survival.test.ts | 171 +++++++ packages/cli/src/commands/analyze.test.ts | 2 +- packages/cli/src/commands/analyze.ts | 401 ++-------------- packages/cli/src/commands/api-impact.test.ts | 4 +- packages/cli/src/commands/augment.test.ts | 6 +- packages/cli/src/commands/baseline.test.ts | 2 +- packages/cli/src/commands/change-pack.test.ts | 6 +- packages/cli/src/commands/code-pack.ts | 6 +- packages/cli/src/commands/context.test.ts | 4 +- packages/cli/src/commands/dead-code.test.ts | 4 +- .../cli/src/commands/dependencies.test.ts | 4 +- packages/cli/src/commands/findings.test.ts | 4 +- .../cli/src/commands/license-audit.test.ts | 4 +- packages/cli/src/commands/open-store.ts | 4 +- packages/cli/src/commands/owners.test.ts | 4 +- .../cli/src/commands/project-profile.test.ts | 4 +- packages/cli/src/commands/query.test.ts | 6 +- 
packages/cli/src/commands/replay.test.ts | 2 +- packages/cli/src/commands/replay.ts | 4 +- packages/cli/src/commands/route-map.test.ts | 4 +- packages/cli/src/commands/status.test.ts | 2 +- packages/cli/src/commands/verdict.test.ts | 10 +- packages/cli/src/skills-gen.test.ts | 2 +- packages/cli/tsup.config.ts | 4 +- packages/docs/astro.config.mjs | 6 +- packages/docs/public/tool-catalog.json | 15 +- .../docs/agents/discovery-and-resources.mdx | 2 +- .../docs/agents/editors/claude-code.mdx | 4 +- .../content/docs/agents/editors/cursor.mdx | 4 +- .../content/docs/agents/editors/opencode.mdx | 2 +- .../content/docs/agents/editors/windsurf.mdx | 2 +- .../docs/src/content/docs/agents/index.mdx | 6 +- .../docs/src/content/docs/agents/install.mdx | 33 +- .../src/content/docs/agents/registries.mdx | 2 +- .../docs/src/content/docs/agents/why-mcp.mdx | 2 +- .../src/content/docs/architecture/adrs.md | 95 ++-- .../content/docs/architecture/determinism.md | 14 +- .../content/docs/architecture/embeddings.md | 62 ++- .../content/docs/architecture/monorepo-map.md | 31 +- .../src/content/docs/architecture/overview.md | 79 +-- .../architecture/parsing-and-resolution.md | 2 +- .../docs/architecture/storage-backend.md | 149 +++--- .../architecture/summarization-and-fusion.md | 4 +- .../content/docs/guides/indexing-a-repo.md | 21 +- .../docs/guides/using-with-claude-code.md | 2 +- .../content/docs/guides/using-with-codex.md | 2 +- .../docs/guides/using-with-opencode.md | 2 +- .../docs/guides/using-with-windsurf.md | 2 +- packages/docs/src/content/docs/index.mdx | 8 +- .../docs/src/content/docs/mcp/overview.md | 6 +- .../docs/src/content/docs/mcp/resources.md | 2 +- packages/docs/src/content/docs/mcp/tools.md | 18 +- .../docs/src/content/docs/reference/cli.md | 309 +++++++++++- .../content/docs/reference/configuration.md | 26 +- .../src/content/docs/reference/error-codes.md | 8 +- .../src/content/docs/reference/languages.md | 2 +- .../content/docs/skills/codehub-code-pack.mdx | 23 +- 
.../content/docs/start-here/first-query.md | 4 +- .../src/content/docs/start-here/install.md | 61 ++- .../content/docs/start-here/quick-start.md | 12 +- .../docs/start-here/what-is-opencodehub.md | 28 +- packages/mcp/README.md | 2 +- packages/mcp/src/connection-pool.test.ts | 44 +- packages/mcp/src/resources/repo-context.ts | 1 - packages/mcp/src/test-utils.ts | 2 +- packages/mcp/src/tools/change-pack.test.ts | 2 +- packages/mcp/src/tools/group-tools.test.ts | 2 +- packages/mcp/src/tools/list-dead-code.test.ts | 4 +- .../mcp/src/tools/list-findings-delta.test.ts | 4 +- packages/mcp/src/tools/query.test.ts | 6 +- packages/mcp/src/tools/query.ts | 6 +- packages/mcp/src/tools/run-smoke.test.ts | 8 +- packages/mcp/src/tools/sql.test.ts | 6 +- packages/pack/README.md | 2 +- packages/policy/README.md | 2 +- packages/policy/src/evaluate.test.ts | 6 +- packages/search/README.md | 4 +- packages/storage/README.md | 50 +- packages/storage/src/column-encode.test.ts | 4 +- packages/storage/src/column-encode.ts | 48 +- packages/storage/src/index.ts | 13 +- packages/storage/src/interface.test.ts | 8 +- packages/storage/src/interface.ts | 41 +- packages/storage/src/license.ts | 9 +- packages/storage/src/relations.ts | 8 +- packages/storage/src/schema-ddl.ts | 69 --- packages/storage/src/sqlite-adapter.test.ts | 2 +- packages/storage/src/sqlite-adapter.ts | 64 ++- .../storage/src/test-utils/conformance.ts | 448 ------------------ packages/storage/src/test-utils/index.ts | 16 +- packages/summarizer/README.md | 2 +- packages/wiki/src/index.test.ts | 2 +- .../skills/codehub-code-pack/SKILL.md | 14 +- .../references/determinism-contract.md | 18 +- .../skills/codehub-debugging/SKILL.md | 16 +- .../references/data-source-map.md | 4 +- .../references/document-templates.md | 2 +- .../references/mermaid-patterns.md | 14 +- .../agents/doc-diagrams-components.md | 2 +- .../agents/doc-diagrams-dependency-graph.md | 2 +- .../skills/codehub-exploring/SKILL.md | 12 +- 
.../opencodehub/skills/codehub-guide/SKILL.md | 136 +++--- .../skills/codehub-onboarding/SKILL.md | 2 +- .../skills/codehub-refactoring/SKILL.md | 26 +- scripts/acceptance.sh | 30 +- scripts/check-banned-strings.sh | 64 ++- scripts/pack-determinism-audit.sh | 12 +- scripts/verify-global-install.sh | 9 +- 131 files changed, 1643 insertions(+), 1815 deletions(-) create mode 100644 .erpaval/solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md create mode 100644 packages/cli/src/commands/analyze-findings-survival.test.ts delete mode 100644 packages/storage/src/schema-ddl.ts delete mode 100644 packages/storage/src/test-utils/conformance.ts diff --git a/.claude/skills/codehub-document/references/data-source-map.md b/.claude/skills/codehub-document/references/data-source-map.md index 44fe433b..55a05e80 100644 --- a/.claude/skills/codehub-document/references/data-source-map.md +++ b/.claude/skills/codehub-document/references/data-source-map.md @@ -11,7 +11,7 @@ graph_hash: ## Repo profile # from project_profile - languages: TypeScript 87%, Rust 11%, Python 2% -- stacks: Node 22, pnpm 10, DuckDB, Vitest +- stacks: Node 24, pnpm 10, SQLite (node:sqlite), Vitest - entry points: packages/mcp/src/index.ts, packages/cli/src/bin.ts ## Top communities (≤ 10) # from sql: SELECT name, inferred_label, cohesion, symbol_count @@ -80,17 +80,20 @@ File-level fan-out means one role may seed multiple packets (for example, `doc-a ## Schema preflight (non-optional) -**Before composing any SQL query over `nodes`, `relations`, or any other -graph table, Phase 0 MUST probe the schema once and cache the result in -`.prefetch.md`.** Subagents then consult the cached schema instead of -guessing column names, which would fail with `Binder Error: Referenced -column "X" not found in FROM clause`. 
+**Before composing any SQL query over `nodes`, `edges`, or any other +table in `store.sqlite`, Phase 0 MUST probe the schema once and cache the +result in `.prefetch.md`.** Subagents then consult the cached schema +instead of guessing column names, which would fail with a `no such column` +SQLite error. -The probe is one SQL call: +The probe is one SQL call over SQLite's schema catalog: ``` -sql("SELECT table_name, column_name FROM information_schema.columns - WHERE table_name IN ('nodes','relations') ORDER BY table_name, column_name") +sql("SELECT m.name AS table_name, c.name AS column_name + FROM sqlite_master m + JOIN pragma_table_info(m.name) c + WHERE m.type = 'table' AND m.name IN ('nodes','edges') + ORDER BY table_name, column_name") ``` Write the result as a dedicated `.context.md § Schema` subsection (top 30 @@ -100,8 +103,8 @@ rows, no cap) and as a digest line in `.prefetch.md` with Historical note: `nodes` does not have a `path` column — routes store their endpoint under `name` (as `"METHOD /path"`), and the file path is `file_path`. Observed during a 2026-04-27 dogfood when subagent prompts -blindly referenced `path` and hit a Binder Error on an otherwise fresh -graph. The preflight prevents this class of bug across every subagent. +blindly referenced `path` and hit a `no such column` error on an otherwise +fresh index. The preflight prevents this class of bug across every subagent. ## Phase 0 algorithm (pseudocode) @@ -111,7 +114,7 @@ Steps marked `# wave 0a` and `# wave 0b` each run as a single parallel tool-use # wave 0a — independent precompute (one parallel batch) 1. staleness = list_repos → entry for this repo → _meta.codehub/staleness 2. profile = project_profile({repo}) -3. schema = sql("SELECT table_name, column_name FROM information_schema.columns …") +3. schema = sql("SELECT … FROM sqlite_master JOIN pragma_table_info(name) …") 4. routes = route_map({repo}) 5. tools = tool_map({repo}) 6. 
deps = dependencies({repo}) @@ -126,7 +129,7 @@ Steps marked `# wave 0a` and `# wave 0b` each run as a single parallel tool-use # wave 0b — depends on schema + profile (one parallel batch) 11. communities = sql("SELECT … FROM nodes WHERE kind='Community' …") 12. processes = sql("SELECT … FROM nodes WHERE kind='Process' …") -13. relations = sql("SELECT … FROM relations …") # for diagrams +13. relations = sql("SELECT … FROM edges …") # for diagrams 14. top_folders = top-5 folders by file count (from profile.entryPoints + glob) 15. owners_summary = [owners({path}) for path in top_folders] 16. if --group: group_hits = group_query({group, canonical_terms}) diff --git a/.claude/skills/codehub-document/references/document-templates.md b/.claude/skills/codehub-document/references/document-templates.md index c4d22c3e..6ff0e4de 100644 --- a/.claude/skills/codehub-document/references/document-templates.md +++ b/.claude/skills/codehub-document/references/document-templates.md @@ -23,7 +23,7 @@ Cites `packages/foo/src/index.ts` (200 LOC) style file references. | Layer | Technology | Source | |---|---|---| | Runtime | Node 22 | `package.json:7` | -| Storage | DuckDB + hnsw_acorn | `packages/storage/src/index.ts:12` | +| Storage | SQLite (single-file, node:sqlite) — FTS5 + vector KNN | `packages/storage/src/index.ts:12` | | ... | ... | ... 
| ## Module map diff --git a/.claude/skills/codehub-document/references/mermaid-patterns.md b/.claude/skills/codehub-document/references/mermaid-patterns.md index ae047e3d..83837a73 100644 --- a/.claude/skills/codehub-document/references/mermaid-patterns.md +++ b/.claude/skills/codehub-document/references/mermaid-patterns.md @@ -14,11 +14,11 @@ flowchart LR core[Core types] ingestion[Ingestion DAG] storage[Storage] - duckdb[(DuckDB)]:::external + sqlite[(store.sqlite)]:::external mcp --> core ingestion --> core ingestion --> storage - storage --> duckdb + storage --> sqlite classDef external stroke-dasharray: 3 3 ``` @@ -104,14 +104,14 @@ For `architecture/data-flow.md`. flowchart TB source[Repo files] parse[tree-sitter parser] - graph[DuckDB graph] + store[(store.sqlite)] embed[ONNX embedder] query[MCP query] source --> parse - parse --> graph + parse --> store parse --> embed - embed --> graph - query --> graph + embed --> store + query --> store ``` **Rules:** diff --git a/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md b/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md index 517c3959..5a40ce19 100644 --- a/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md +++ b/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md @@ -26,7 +26,7 @@ Produce `{{ docs_root }}/diagrams/architecture/components.md`: a single Mermaid | Shared context | `Read {{ context_path }}` | always first | | Prefetch ledger | `Read {{ prefetch_path }}` | always first | | Top communities | `{{ context_path }} § Top communities` | cached | -| Community relations | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT source, target, kind FROM relations WHERE kind IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | +| Community relations | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT src, dst, type FROM 
edges WHERE type IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | | Component method list | `mcp__codehub__context({symbol: })` per top 8 | mid-run | ## 4. Process diff --git a/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md b/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md index 2b629074..653f71f9 100644 --- a/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md +++ b/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md @@ -26,7 +26,7 @@ Produce `{{ docs_root }}/diagrams/structural/dependency-graph.md`: a single Merm | Shared context | `Read {{ context_path }}` | always first | | Prefetch ledger | `Read {{ prefetch_path }}` | always first | | Top communities | `{{ context_path }} § Top communities` | cached | -| Internal edges | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT source, target, kind FROM relations WHERE kind IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | +| Internal edges | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT src, dst, type FROM edges WHERE type IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | | External dependencies | `{{ context_path }} § Stack` or `mcp__codehub__dependencies({repo: "{{ repo }}"})` | cached if digest present; mid-run otherwise | ## 4. 
Process diff --git a/.claude/skills/codehub-onboarding/SKILL.md b/.claude/skills/codehub-onboarding/SKILL.md index 8296905a..94087f7d 100644 --- a/.claude/skills/codehub-onboarding/SKILL.md +++ b/.claude/skills/codehub-onboarding/SKILL.md @@ -52,7 +52,7 @@ Produces a single ONBOARDING.md with a ranked reading order drawn from graph cen | Layer | Tech | Source | |---|---|---| | Runtime | Node 22 | `package.json:7` | -| Storage | DuckDB | `packages/storage/src/index.ts:12` | +| Storage | SQLite (single-file, node:sqlite) | `packages/storage/src/index.ts:12` | | ... | ... | ... | ## Read these 10 files first (in order) diff --git a/.claude/skills/opencodehub-debugging/SKILL.md b/.claude/skills/opencodehub-debugging/SKILL.md index 59cf0db8..1655a0a4 100644 --- a/.claude/skills/opencodehub-debugging/SKILL.md +++ b/.claude/skills/opencodehub-debugging/SKILL.md @@ -86,20 +86,20 @@ Two-hop upstream trace for every caller of `validatePayment`: ```sql WITH direct AS ( - SELECT from_id, to_id, 1 AS depth - FROM relations + SELECT src, dst, 1 AS depth + FROM edges WHERE type = 'CALLS' - AND to_id IN (SELECT id FROM nodes WHERE name = 'validatePayment' AND kind = 'Function') + AND dst IN (SELECT id FROM nodes WHERE name = 'validatePayment' AND kind = 'Function') ), indirect AS ( - SELECT r.from_id, d.to_id, 2 AS depth - FROM relations r - JOIN direct d ON d.from_id = r.to_id + SELECT r.src, d.dst, 2 AS depth + FROM edges r + JOIN direct d ON d.src = r.dst WHERE r.type = 'CALLS' ) SELECT caller.name, caller.file_path, caller.start_line, u.depth FROM (SELECT * FROM direct UNION ALL SELECT * FROM indirect) u -JOIN nodes caller ON caller.id = u.from_id +JOIN nodes caller ON caller.id = u.src ORDER BY u.depth ASC, caller.name; ``` diff --git a/.claude/skills/opencodehub-exploring/SKILL.md b/.claude/skills/opencodehub-exploring/SKILL.md index a4345788..17516442 100644 --- a/.claude/skills/opencodehub-exploring/SKILL.md +++ b/.claude/skills/opencodehub-exploring/SKILL.md @@ -75,9 
+75,9 @@ When a name is ambiguous, `context` returns a ranked candidate list instead of s ```sql SELECT r.step, callee.name, callee.file_path, callee.start_line -FROM relations r -JOIN nodes proc ON proc.id = r.from_id -JOIN nodes callee ON callee.id = r.to_id +FROM edges r +JOIN nodes proc ON proc.id = r.src +JOIN nodes callee ON callee.id = r.dst WHERE r.type = 'PROCESS_STEP' AND proc.kind = 'Process' AND proc.name = 'CheckoutFlow' diff --git a/.claude/skills/opencodehub-guide/SKILL.md b/.claude/skills/opencodehub-guide/SKILL.md index 29803824..d1dd833e 100644 --- a/.claude/skills/opencodehub-guide/SKILL.md +++ b/.claude/skills/opencodehub-guide/SKILL.md @@ -5,7 +5,7 @@ description: "Use when the user asks about OpenCodeHub itself — available MCP # OpenCodeHub Guide -Quick reference for every OpenCodeHub MCP tool, MCP resource, and the graph + temporal store schema. +Quick reference for every OpenCodeHub MCP tool, MCP resource, and the single-file `store.sqlite` schema. ## Always Start Here @@ -59,7 +59,7 @@ standalone artifact producer with its own preconditions and output path. | `mcp__codehub__context` | 360-degree symbol view + `confidenceBreakdown` + `cochanges` side-section | | `mcp__codehub__impact` | Blast radius with risk tier + `confidenceBreakdown` | | `mcp__codehub__detect_changes` | Map an uncommitted or committed diff to affected symbols and flows | -| `mcp__codehub__sql` | Read-only query: `sql` arg → temporal DuckDB (cochanges/summaries); `cypher` arg → lbug graph (5 s timeout) | +| `mcp__codehub__sql` | Read-only SQL over the single-file `store.sqlite` (all tables: nodes, edges, embeddings, cochanges, symbol_summaries, store_meta; 5 s timeout). 
`cypher` arg is reserved for community-fork adapters (unsupported by the default backend) | | `mcp__codehub__signature` | Symbol declaration + stubbed members (class/interface header + method/property signatures, bodies elided) | ### HTTP / RPC surface @@ -115,91 +115,135 @@ Lightweight reads for navigation (every URI uses the `codehub://` scheme): | `codehub://repo/{name}/context` | Stats + staleness envelope | | `codehub://repo/{name}/schema` | Live node kinds / relation types for `sql` | -> Cluster and process navigation resources (`codehub://repo/{name}/clusters`, `codehub://repo/{name}/processes`, etc.) are slated for a later wave. Until then, use the typed tools or Cypher (below) filtered to `kind = 'Community'` / `kind = 'Process'`. - -## Where the graph lives (ADR 0016) - -There are **two stores**, and they are queried differently: - -- **Graph tier — `graph.lbug`** (ladybug, Cypher dialect). Holds nodes, edges, - and embeddings. Query it via the typed tools (`query` / `context` / `impact` / - `route_map` / …) or, for bespoke questions, **Cypher** via the MCP `sql` - tool's `cypher` argument. There is NO `nodes` or `relations` SQL table. -- **Temporal tier — `temporal.duckdb`** (DuckDB SQL). Holds only the - `cochanges` and `symbol_summaries` tables. The `sql` argument of the MCP - `sql` tool (and `codehub sql` on the CLI) targets THIS store. - -Pass exactly one of `sql` (temporal DuckDB) or `cypher` (lbug graph) to the MCP -`sql` tool. - -### Graph schema (lbug / Cypher) - -One node label `CodeNode` carrying `kind` as a **property** (NOT a per-kind -label). One relationship table per relation type. Properties are **snake_case** -(`file_path`, `start_line`, `inferred_label`, `step_count`, `entry_point_id`); -a camelCase RETURN alias comes back as the alias you give it, but the stored -property names are snake_case. 
- -**Node kinds** (`n.kind` values): File, Folder, Function, Class, Method, +> Cluster and process navigation resources (`codehub://repo/{name}/clusters`, `codehub://repo/{name}/processes`, etc.) are slated for a later wave. Until then, use the typed tools or a `sql` query (below) filtered to `kind = 'Community'` / `kind = 'Process'`. + +## Where the index lives (ADR 0019) + +There is **one store**: a single-file `<repo>/.codehub/store.sqlite` +(WAL, via Node's built-in `node:sqlite`). ADR 0019 supersedes ADR 0016: +the old two-tier backend (a `graph.lbug` Ladybug graph plus a +`temporal.duckdb` DuckDB file) is gone. One `SqliteStore` class implements +both the graph and temporal surfaces over that single file. + +Everything is directly SQL-queryable through the MCP `sql` tool's `sql` +argument (and `codehub sql` on the CLI): + +- **Graph tables (`nodes` and `edges`).** `nodes` holds the typed base + columns plus a `payload` JSON overflow; `edges` is one polymorphic table + keyed by `(src, dst, type, step)`. Query them via the typed tools + (`query` / `context` / `impact` / `route_map` / …) or, for bespoke + questions, plain SQL. Multi-hop traversal is a recursive SQL CTE over + `edges`, NOT Cypher. +- **Embeddings (the `embeddings` table).** Vectors live in a BLOB column; + there is NO Parquet sidecar (it was dropped with DuckDB). +- **Temporal tables (`cochanges` and `symbol_summaries`).** Same file, no + second engine. +- **`store_meta`.** Index metadata (graph hash, timestamps). + +Full-text search is BM25 via a SQLite FTS5 virtual table (`nodes_fts`). +The `cypher` argument to the MCP `sql` tool is **reserved for community-fork +graph adapters** (AGE / Memgraph / Neo4j / Neptune) and is **NOT supported +by the default SQLite backend**, so pass `sql` for every query against the +default store. 
+ +### Graph schema (`nodes` / `edges` tables) + +The `nodes` table carries typed base columns (`id`, `kind`, `name`, +`file_path`, `start_line`, `end_line`) plus a `payload` JSON column holding +every kind-specific field. Reach payload fields with SQLite JSON1: +`payload->>'$.inferredLabel'`, `payload->>'$.stepCount'`, +`payload->>'$.entryPointId'`, `payload->>'$.cohesion'`, +`payload->>'$.symbolCount'`. + +The `edges` table is polymorphic: `src`, `dst`, `type`, `confidence`, +`step`, `reason`. The relation kind lives in the `type` column (there is no +per-type table). + +**Node kinds** (`kind` values): File, Folder, Function, Class, Method, Interface, Constructor, Struct, Enum, Macro, Typedef, Union, Namespace, Trait, Impl, TypeAlias, Const, Static, Variable, Property, Record, Delegate, Annotation, Template, Module, CodeElement, Community, Process, Route, Tool, Finding, Dependency, Contributor, Repo, ProjectProfile, Section. -**Relationship types** (each is its own edge label): CONTAINS, DEFINES, IMPORTS, +**Relationship types** (`edges.type` values): CONTAINS, DEFINES, IMPORTS, CALLS, EXTENDS, IMPLEMENTS, HAS_METHOD, HAS_PROPERTY, ACCESSES, METHOD_OVERRIDES, OVERRIDES, METHOD_IMPLEMENTS, MEMBER_OF, PROCESS_STEP, HANDLES_ROUTE, FETCHES, HANDLES_TOOL, ENTRY_POINT_OF, WRAPS, QUERIES, REFERENCES, FOUND_IN, DEPENDS_ON, OWNED_BY. -Cochanges live only in the **temporal** `cochanges` table (DuckDB SQL), never as -graph edges. +Cochanges live only in the `cochanges` table, never as graph edges. 
-## Cypher cheat-sheet (MCP `sql` tool, `cypher` arg) +## SQL cheat-sheet (MCP `sql` tool, `sql` arg) -All inbound callers of a function by name: +All inbound callers of a function by name (join `edges` to `nodes` on both +endpoints): -```cypher -MATCH (caller:CodeNode)-[r:CALLS]->(callee:CodeNode) +```sql +SELECT caller.name AS name, caller.file_path AS file, + caller.start_line AS line, e.confidence AS confidence, + e.reason AS reason +FROM edges e +JOIN nodes caller ON caller.id = e.src +JOIN nodes callee ON callee.id = e.dst WHERE callee.name = 'validateUser' AND callee.kind = 'Function' -RETURN caller.name AS name, caller.file_path AS file, caller.start_line AS line, - r.confidence AS confidence, r.reason AS reason -ORDER BY r.confidence DESC -LIMIT 50 + AND e.type = 'CALLS' +ORDER BY e.confidence DESC +LIMIT 50; ``` -Top communities by cohesion: +Top communities by cohesion (kind-specific fields via JSON1): -```cypher -MATCH (n:CodeNode) -WHERE n.kind = 'Community' -RETURN n.name AS name, n.inferred_label AS label, n.cohesion AS cohesion, - n.symbol_count AS symbols -ORDER BY n.cohesion DESC -LIMIT 20 +```sql +SELECT name, + payload->>'$.inferredLabel' AS label, + payload->>'$.cohesion' AS cohesion, + payload->>'$.symbolCount' AS symbols +FROM nodes +WHERE kind = 'Community' +ORDER BY cohesion DESC +LIMIT 20; ``` Process entry points: -```cypher -MATCH (n:CodeNode) -WHERE n.kind = 'Process' -RETURN n.name AS name, n.inferred_label AS label, n.step_count AS steps, - n.entry_point_id AS entry_point -ORDER BY n.step_count DESC +```sql +SELECT name, + payload->>'$.inferredLabel' AS label, + payload->>'$.stepCount' AS steps, + payload->>'$.entryPointId' AS entry_point +FROM nodes +WHERE kind = 'Process' +ORDER BY steps DESC; ``` SCIP-confirmed CALLS edges only (strict impact): -```cypher -MATCH ()-[r:CALLS]->() -WHERE r.confidence >= 0.95 AND r.reason STARTS WITH 'scip:' -RETURN r +```sql +SELECT * FROM edges +WHERE type = 'CALLS' + AND confidence >= 0.95 + AND 
reason LIKE 'scip:%'; +``` + +Multi-hop blast radius is a recursive CTE over `edges`. The typed `impact` +tool wraps this, so prefer it unless you need a bespoke traversal: + +```sql +WITH RECURSIVE reach(id, depth) AS ( + SELECT id, 0 FROM nodes WHERE name = 'validateUser' + UNION + SELECT e.src, r.depth + 1 + FROM edges e JOIN reach r ON e.dst = r.id + WHERE e.type IN ('CALLS', 'REFERENCES') AND r.depth < 3 +) +SELECT DISTINCT n.name, n.file_path, MIN(r.depth) AS depth +FROM reach r JOIN nodes n ON n.id = r.id +GROUP BY n.id ORDER BY depth; ``` -### Temporal SQL cheat-sheet (MCP `sql` tool, `sql` arg) +### Co-change cheat-sheet (MCP `sql` tool, `sql` arg) -Tightest co-change pairs (DuckDB SQL — temporal store): +Tightest co-change pairs (`cochanges` table): ```sql SELECT source_file, target_file, lift, cocommit_count diff --git a/.claude/skills/opencodehub-refactoring/SKILL.md b/.claude/skills/opencodehub-refactoring/SKILL.md index 8acd833f..264f12c1 100644 --- a/.claude/skills/opencodehub-refactoring/SKILL.md +++ b/.claude/skills/opencodehub-refactoring/SKILL.md @@ -149,20 +149,27 @@ mcp__codehub__shape_check({ route: "GET /users/:id", repo: "my-app" }) → mismatches: [{ consumer, expected, actual }] ``` -### `mcp__codehub__sql` — custom reference query (temporal store) - -The `sql` arg is read-only DuckDB over the temporal store (cochanges + -symbol_summaries). To enumerate every file referencing a symbol from the graph, -use the `cypher` arg of the same tool instead (the node/edge graph lives in -`graph.lbug`, not the SQL store): - -```cypher -MATCH (caller:CodeNode)-[r:REFERENCES|CALLS|IMPORTS]->(target:CodeNode) +### `mcp__codehub__sql` — custom reference query (single-file SQLite) + +The `sql` arg is read-only SQL over the single-file `store.sqlite` index +(ADR 0019). Every table is directly queryable: `nodes`, `edges`, `embeddings`, +`cochanges`, `symbol_summaries`, `store_meta`. 
To enumerate every file +referencing a symbol from the graph, join `edges` to `nodes` on both endpoints: + +```sql +SELECT DISTINCT caller.file_path AS file +FROM edges e +JOIN nodes caller ON caller.id = e.src +JOIN nodes target ON target.id = e.dst WHERE target.name = 'validateUser' -RETURN DISTINCT caller.file_path AS file + AND e.type IN ('REFERENCES', 'CALLS', 'IMPORTS') ORDER BY file ``` +The `cypher` arg of the same tool is reserved for community-fork graph +adapters (AGE / Memgraph / Neo4j / Neptune) and is not supported by the +default SQLite backend. + This catches references a textual rename might miss — useful as a manual-check list before and after you edit. @@ -172,7 +179,7 @@ list before and after you edit. | --------------------------------- | ----------------------------------------------------------------------- | | Many callers (> 5) | Use your editor's LSP rename for the mechanical work; `impact` is the checklist | | Cross-module references | Run `detect_changes` after editing; watch for missed imports | -| String / dynamic references | Use the `cypher` arg with `REFERENCES`; the graph cannot see string-keyed dispatch — read those by hand | +| String / dynamic references | Query the `edges` table for `REFERENCES` rows; the graph cannot see string-keyed dispatch, so read those by hand | | Public / exported API | Version and deprecate; mirror symbol names in a transition layer | | Heuristic edges (confirmed = 0) | Cross-check by reading source; the SCIP oracle did not weigh in | diff --git a/.erpaval/INDEX.md b/.erpaval/INDEX.md index de8fd5a4..de224b29 100644 --- a/.erpaval/INDEX.md +++ b/.erpaval/INDEX.md @@ -11,6 +11,7 @@ development sessions. Solutions are reusable; specs are per-feature. 
- [New code-pack BOM items must anchor on graph nodes, not chunker data](solutions/architecture-patterns/pack-bom-additions-anchor-on-graph-nodes-not-chunker.md) — `generatePack` only gets `chunkerFiles` from the determinism TEST fixture; production `runPackEngine` never wires it, so `ast-chunks.jsonl` is empty in real packs. Anchor new BOM items on `File` nodes (filePath/contentHash/lineCount/language, prod-populated); byte ranges/token counts are best-effort. (Latent prod bug now FIXED — see next entry.) - [Pack provenance is derived in the CLI, with hash-verified disk bytes](solutions/architecture-patterns/pack-provenance-derived-in-cli-with-hash-verified-bytes.md) — `runPackEngine` derives commit/origin from the `Repo` node, chunkerFiles from disk (each hash-verified against `FileNode.contentHash`, drifted files skipped), and grammar pins from `parse.grammarVersions()`, then threads them through `generatePack`'s `internal` seam. Derivation is the unset-path fallback so pack fixtures keep their behavior; defensive against a stubbed/empty graph. Fixes the empty-ast-chunks/hollow-manifest bug while preserving byte-identity. +- [Pack provenance/channel fields ride outside the packHash preimage unless they change the decision set](solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md) — classify every new pack field FIRST: hash-bound iff it changes the decision set (selection/sizing/BOM content), hash-free iff it only annotates delivery. tokenizer lane + CycloneDX 1.7 citation = hash-bound (legitimately flip packHash); `--cache-channel` = hash-free (kept out of the manifest preimage so `auto` stays byte-identical). Mis-binding breaks pinned fixtures; mis-unbinding breaks the re-derivability contract. - [`@opencodehub/pack` strict `tsc -b` fails on code the test runner accepts](solutions/build-errors/pack-strict-tsc-vs-loose-test-runner-exact-optional.md) — green `test` ≠ green `build`. 
`noUncheckedIndexedAccess` (iterate `for..of` + accumulator, not `arr[i]`) and `exactOptionalPropertyTypes` (explicit literals, not spread-over-readonly) bite the build only. Run `build` after pack edits; strip ANSI before grepping `error TS`. - [Collapse a publish-many TS monorepo into one bundled CLI with tsup](solutions/architecture-patterns/tsup-collapse-monorepo-to-single-cli.md) — `noExternal:[/^@scope//]` + `external:[/^[^.]/]`; workers as named entries (esbuild won't follow `new URL(...,import.meta.url)`); copy import.meta.url assets in onSuccess; tsconfig.test.json → dist-test/ because tsup drops *.test.ts; convert hidden string-imports to static. Kills the pack-all-publishables bug class. - [Make a heavy native dep optional + lazy so a default install can prune it](solutions/architecture-patterns/optional-native-dep-lazy-import.md) — onnxruntime-node 254MB: deps→optionalDependencies, top-level value-import→`import type`, dynamic `import()` at use site threading the runtime constructor in; bundler must keep it `external`. diff --git a/.erpaval/solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md b/.erpaval/solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md new file mode 100644 index 00000000..74403266 --- /dev/null +++ b/.erpaval/solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md @@ -0,0 +1,49 @@ +--- +name: Pack provenance/channel fields ride outside the packHash preimage unless they change the decision set +description: New pack metadata (tokenizer lane, cache channel, provenance citations) must be classified as either hash-bound (changes what was selected) or hash-free (only annotates), or it silently breaks determinism fixtures or the decision-equivalence contract +type: architecture-patterns +--- + +When adding a new field to the pack, decide up front whether it belongs in +the `packHash` preimage. 
The preimage (`packages/pack/src/manifest.ts` +`toSnakeCaseManifest` → `canonicalJson` → `sha256Hex`) binds the fields +that define WHAT was selected: `commit`, `tokenizerId`, `budgetTokens`, +`determinismClass`, `pins`, per-file `fileHash`, and `contextBomHash`. Under +ADR-0020 the real contract is decision-equivalence (same inputs ⇒ same +retrieval decision set); byte-identity is a cheap witness. So the test is: +does the field change the decision set? + +Three fields added across Moves 1/2/4 sorted cleanly into the two classes: + +- **Sonnet-5 tokenizer lane (Move 1)** — HASH-BOUND. `tokenizerId` is already + in the preimage; a new lane value (`anthropic:claude-sonnet-5@2026-06-30`) + legitimately flips `packHash` because it changes chunk sizing and the + `resolveDeterminism` verdict (`index.ts:287` downgrades any `anthropic:` + prefix to `best_effort`). No fixture broke because the determinism tests + assert cross-run EQUALITY (`m1 === m2`), not golden literals. +- **CycloneDX 1.7 + per-file provenance citation (Move 2)** — HASH-BOUND, by + design. `context-bom.json` is a BOM item AND its `contextBomHash` is a named + preimage field, so bumping `specVersion`/`$schema` and adding a per-file + `externalReferences[{type:"vcs",url}]` + `opencodehub:commit` property flips + the hash. That is correct: the receipt's content genuinely changed. Only the + one hard `specVersion === "1.6"` assertion needed editing; equality-based + determinism suites passed untouched. +- **`--cache-channel` (Move 4)** — HASH-FREE. The channel only shapes the + agent-facing assembled context string (`assemblePackContext` in + `variance-probe.ts`), not the BOM. It is recorded on `PackOpts.cacheChannel` + but deliberately kept OUT of `toSnakeCaseManifest`, so the default (`auto`, + marker-free) path is byte-identical to pre-Move-4 packs and every + determinism/golden fixture stays green. 
If it had leaked into the preimage, + every existing pinned pack would have broken for a field that changes nothing + about what was selected. + +Mechanism: classify the field FIRST. Hash-bound iff it changes the decision +set (selection, sizing, or the recorded content of a BOM item). Hash-free iff +it only annotates delivery/consumption. A cache/delivery/rendering knob is +hash-free; a tokenizer/budget/selection/content knob is hash-bound. Getting +this wrong is silent: a mis-bound knob breaks every pinned fixture; a +mis-unbound content field breaks the re-derivability contract. + +Related: [[collapse-parallel-switches-into-record-registry]] (the channel enum ++ `cacheChannelNeedsMarkers` switch is exhaustive over the union so a new +channel forces a compile-time decision). diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 36d0be40..36729fbc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,8 +39,8 @@ jobs: # Parsing is WASM-only on every supported Node version (ADR 0015), so the # test suite needs no native grammar build — `--ignore-scripts` is the # single install path across the matrix. The remaining native deps - # (@duckdb/node-api, @ladybugdb/core, onnxruntime-node) ship prebuilds, so - # storage/embedder tests pass without running postinstall. + # have been replaced: the WASM embedder (onnxruntime-web) and node:sqlite need no native build, so + # storage/embedder tests pass without running postinstall (ADR 0019). 
# # Build before test: every package's `test` runs `node --test` against its # built `dist/` (and the cli compiles `src` → `dist-test/`), so the dist diff --git a/.github/workflows/verify-global-install.yml b/.github/workflows/verify-global-install.yml index 4f375845..e6b93a5e 100644 --- a/.github/workflows/verify-global-install.yml +++ b/.github/workflows/verify-global-install.yml @@ -208,7 +208,7 @@ jobs: TARBALL_DIR: ${{ runner.temp }}/opencodehub-tarballs FIXTURE_DIR: tests/fixtures/multi-lang # Use the script's documented default budget. A cold-cache global - # install of the native prebuilts (ladybug + duckdb + onnxruntime) + # install of a pure-JS + WASM package (no native storage/embedder bindings) # on a loaded shared runner legitimately varies 30-90s, so the old # hardcoded 60s tripped on slow macOS cells despite a clean install # (see scripts/verify-global-install.sh header + issue #163). The diff --git a/AGENTS.md b/AGENTS.md index e86121e0..deab0320 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -13,14 +13,15 @@ tiers. - `context` — inbound/outbound refs and participating flows for one symbol. - `impact` — dependents of a target up to a configurable depth, with a risk tier. - `detect_changes` — map an uncommitted or committed diff to affected symbols. -- `sql` — read-only SQL against the local temporal store (the `cochanges` and `symbol_summaries` tables), 5 s timeout. The node/edge graph lives in `graph.lbug` (ADR 0016) and is reached via the typed tools (`query`/`context`/`impact`) or Cypher via the MCP `sql` tool's `cypher` arg — NOT via this SQL path. +- `list_findings` — browse SARIF findings from the latest scan by severity and rule. +- `sql` — read-only SQL against the single-file `store.sqlite` index (ADR 0019), 5 s timeout. `nodes`, `edges`, `embeddings`, `cochanges`, `symbol_summaries`, and `store_meta` are all directly SQL-queryable; the typed tools (`query`/`context`/`impact`) remain the high-level path, and the `cypher` arg is reserved for community-fork graph adapters (not supported by the default backend). Run `codehub analyze` after pulling new commits so the index stays aligned with the working tree. `codehub status` reports staleness. 
## Full MCP surface -The full MCP surface is **28 tools** (see `packages/mcp/src/server.ts`); +The full MCP surface is **29 tools** (see `packages/mcp/src/server.ts`); the 6 listed above are the high-frequency exploration tools. For the full inventory, use the `/opencodehub-guide` skill. @@ -81,22 +82,23 @@ This repo ships a Claude Code plugin at `plugins/opencodehub/` — it provides a `code-analyst` subagent and 11 skills. Install via `codehub init` (writes `.mcp.json` + links the plugin). -## Storage backend — lbug graph + DuckDB temporal - -The graph tier is always `@ladybugdb/core` (`graph.lbug`); the temporal -tier — cochanges, structured symbol summaries, and the -`codehub query --sql` escape hatch — is always DuckDB -(`temporal.duckdb`). Both files live under `<repo>/.codehub/`. There is -no env-var, no probe, no fallback; if the lbug binding fails to load, -`open()` throws `GraphDbBindingError` and the operation aborts. See -ADR 0016 (`docs/adr/0016-duckdb-graph-rip.md`) for the rationale and the -AGE/Memgraph/Neo4j/Neptune community-adapter contract that survives the -rip-out (the segregated `IGraphStore` / `ITemporalStore` interfaces stay -exactly because community-fork adapters are a deliberate escape hatch). - -`IGraphStore` lives only on `GraphDbStore`; `DuckDbStore` implements -`ITemporalStore` only. Embeddings live in `graph.lbug` and stream into a -per-call DuckDB temp table at pack time so the byte-identical Parquet -sidecar still works (see `packages/pack/src/embeddings-sidecar.ts`). -Future temporal swap (e.g. SQLite-WASM) only needs a new `ITemporalStore` -implementor — no graph-tier change. +## Storage backend — single-file SQLite (ADR 0019) + +The entire index lives in ONE `<repo>/.codehub/store.sqlite` file (WAL), +via Node's built-in `node:sqlite` — graph nodes, edges, embeddings, and +the temporal tables (cochanges, structured symbol summaries, and the +`codehub query --sql` escape hatch). 
One `SqliteStore` class implements +BOTH `IGraphStore` and `ITemporalStore`; `openStore()` returns that single +instance as both the `graph` and `temporal` views, so call sites use +`store.graph.X()` / `store.temporal.Y()` unchanged. There are zero native +storage bindings — `@ladybugdb/core` and `@duckdb/node-api` were both +removed. See ADR 0019 (`docs/adr/0019-single-file-sqlite-storage.md`) for +the rationale; it supersedes ADR 0016 +(`docs/adr/0016-duckdb-graph-rip.md`). + +The segregated `IGraphStore` / `ITemporalStore` interfaces remain as the +community-fork escape hatch: an AGE / Memgraph / Neo4j / Neptune adapter +implements `IGraphStore` and pairs with any SQL-shaped `ITemporalStore`. +Embeddings live in the `embeddings` table inside `store.sqlite` — the +write-only Parquet sidecar was dropped with DuckDB (nothing ever read it +back). diff --git a/CLAUDE.md b/CLAUDE.md index 07c4a795..67042268 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,14 +10,15 @@ tiers. - `context` — inbound/outbound refs and participating flows for one symbol. - `impact` — dependents of a target up to a configurable depth, with a risk tier. - `detect_changes` — map an uncommitted or committed diff to affected symbols. -- `sql` — read-only SQL against the single-file `store.sqlite` index (ADR 0019). `nodes`, `edges`, `embeddings`, `cochanges`, `symbol_summaries`, and `store_meta` are all directly SQL-queryable (e.g. `SELECT id, name FROM nodes WHERE kind = 'Function'`; reach kind-specific fields via SQLite JSON1, `payload->>'$.field'`). 5 s timeout. The typed tools (`query`/`context`/`impact`) remain the high-level path; the `cypher` arg is reserved for community-fork graph adapters and is not supported by the default backend. +- `list_findings` — browse SARIF findings from the latest scan by severity and rule. 
+- `sql` — read-only SQL against the single-file `store.sqlite` index (ADR 0019). `nodes`, `edges`, `embeddings`, `cochanges`, `symbol_summaries`, and `store_meta` are all directly SQL-queryable (e.g. `SELECT id, name FROM nodes WHERE kind = 'Function'`; reach kind-specific fields via SQLite JSON1, `payload->>'$.field'`). 5 s timeout. The typed tools (`query`/`context`/`impact`) remain the high-level path; the `cypher` arg is reserved for community-fork graph adapters and is not supported by the default backend. Run `codehub analyze` after pulling new commits so the index stays aligned with the working tree. `codehub status` reports staleness. ## Full MCP surface -The full MCP surface is **28 tools** (see `packages/mcp/src/server.ts`); +The full MCP surface is **29 tools** (see `packages/mcp/src/server.ts`); the 6 listed above are the high-frequency exploration tools. For the full inventory, use the `/codehub-guide` skill. diff --git a/OBJECTIVES.md b/OBJECTIVES.md index f6784b00..f59a400c 100644 --- a/OBJECTIVES.md +++ b/OBJECTIVES.md @@ -52,9 +52,10 @@ quality bar sits, and what is deliberately out of scope. ## Non-goals -8. **Do not operate a server or SaaS.** DuckDB is embedded. The MCP server is - a stdio process. ADR 0001 rejects any engine that would need a daemon. The - product ships as a CLI plus an MCP server, nothing hosted. +8. **Do not operate a server or SaaS.** The index is an embedded single-file + SQLite store (ADR 0019). The MCP server is a stdio process. ADR 0001 + rejects any engine that would need a daemon. The product ships as a CLI + plus an MCP server, nothing hosted. 9. **Do not port to Rust before it is needed.** ADR 0002 measured p95 single-file incremental analysis at 195-250ms on the 100-file fixture, well diff --git a/README.md b/README.md index 05e19d49..a032b61a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ npm install -g @opencodehub/cli cd /path/to/your/repo codehub init && codehub analyze -# your agent now has impact, query, context, detect_changes — 29 tools over MCP +# your agent now has impact, query, context, detect_changes — 29 tools over MCP ``` ## Why this exists @@ -63,11 +63,11 @@ ten round-trips. 
flowchart LR A[Source tree] -->|tree-sitter parse| B[Symbol graph] B -->|resolve imports / MRO| C[Typed relations] - C -->|BM25 + HNSW index| D[Hybrid graph store] + C -->|BM25 + vector KNN| D[Hybrid graph store] C -->|detect communities + flows| E[Processes / clusters] D --> F[MCP server] E --> F - F -->|28 tools| G[AI coding agent] + F -->|29 tools| G[AI coding agent] ``` ## Design choices worth knowing @@ -79,7 +79,7 @@ flowchart LR | **Deterministic indexing** | Identical inputs produce a byte-identical graph hash. Reproducible. Auditable. Cacheable in CI. | | **First-party source only** | `analyze` honors the repo's `.gitignore` (nested files included) and always skips dependency installs, virtualenvs, build output, and tool caches — `node_modules`, `.venv`/`venv`, `__pycache__`, `dist`/`build`/`target`, `.next`/`.nuxt`/`.turbo`, `.mypy_cache`/`.pytest_cache`/`.ruff_cache`, `coverage`, and similar. Exclusion is decided once at scan time (`HARDCODED_IGNORES` in `packages/ingestion/src/pipeline/gitignore.ts`), so every retrieval surface — `query`, `context`, `impact`, `sql`, `pack` — inherits it. Ambiguous names that are often real source (`vendor`, `env`, `out`, `bin`) are left to your `.gitignore`, which supports `!`-negation a hardcoded rule can't. | | **MCP-native** | Works out-of-the-box with Claude Code, Cursor, Codex, Windsurf, OpenCode. The MCP server is the primary interface; CLI exists for scripts and CI. | -| **Single-file embedded storage** | One `store.sqlite` file holds everything — symbols, edges, embeddings, BM25 (FTS5) + HNSW traversal, and the temporal views (cochanges, summaries) — via Node's built-in `node:sqlite`. No daemon, no database to operate, and **zero native storage bindings** (ADR 0019 removed both `@ladybugdb/core` and `@duckdb/node-api`). 
| +| **Single-file embedded storage** | One `store.sqlite` file holds everything — symbols, edges, embeddings, BM25 (FTS5) + brute-force vector KNN, and the temporal views (cochanges, summaries) — via Node's built-in `node:sqlite`. No daemon, no database to operate, and **zero native storage bindings** (ADR 0019 removed both `@ladybugdb/core` and `@duckdb/node-api`). | | **15 languages at GA** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, C, C++, Ruby, Kotlin, Swift, PHP, Dart, COBOL — tree-sitter for the first 14 plus a regex provider for fixed-format COBOL. | | **WASM-only parse runtime** | `web-tree-sitter` WASM is the only parse runtime. The 15 grammar `.wasm` blobs are vendored at `packages/ingestion/vendor/wasms/`, so parsing does **zero grammar/native builds and zero GitHub fetches** at install time — there is no native parser opt-in. Storage is pure `node:sqlite`; the only optional native dep is the local embedder (see Platform support). | @@ -149,7 +149,7 @@ pnpm run check # lint + typecheck + test + banned-strings mise run cli:link # puts `codehub` on your PATH ``` -## MCP tool surface (28 tools) +## MCP tool surface (29 tools) | Tool | Purpose | |---|---| @@ -170,7 +170,7 @@ skills + a code-analyst subagent — install via `codehub init`. 
## Repository layout -The monorepo is organised as 18 workspace packages under `packages/`: +The monorepo is organised as 19 workspace packages under `packages/`: | Package | Purpose | |---|---| @@ -178,24 +178,26 @@ The monorepo is organised as 18 workspace packages under `packages/`: | `cli` | `codehub` command — `init`, `analyze`, `status`, `setup`, scanners, group federation | | `cobol-proleap` | ProLeap-backed deep-parse path for free-format COBOL (regex provider handles fixed-format) | | `core-types` | Shared TypeScript types, Zod schemas, error codes, canonical `LanguageId` and node/edge kinds | +| `docs` | This Starlight documentation site (private workspace package) | | `embedder` | Embedding backends — local ONNX, HTTP, SageMaker; deterministic `embedderId` fingerprint | +| `eval` | Retrieval / graph-quality evaluation harness (private, test-time only) | | `frameworks` | HTTP route + MCP tool detectors used by `route_map` / `api_impact` / `tool_map` | | `ingestion` | Tree-sitter + WASM parsers, symbol extraction, import resolution, complexity phase | -| `mcp` | Model Context Protocol server — 28 tools, resources, structured error envelopes | +| `mcp` | Model Context Protocol server — 29 tools, resources, structured error envelopes | | `pack` | Deterministic Repomix-compatible code-pack generator (M5) | | `policy` | Allowlist + license-tier policy engine driving `license_audit` and CI gates | | `sarif` | SARIF schema validation and scanner output normalisation | | `scanners` | Subprocess wrappers for 19 scanners — OSV, Semgrep, hadolint, tflint, betterleaks, and the rest | | `scip-ingest` | SCIP indexer runners (TS, Python, Go, Rust, Java) — emits CALLS, REFERENCES, IMPLEMENTS, TYPE_OF | -| `search` | Hybrid BM25 + HNSW (ACORN-1 + RaBitQ) query layer | +| `search` | Hybrid BM25 (FTS5) + vector query layer over `store.sqlite` | | `storage` | One `SqliteStore` (`node:sqlite`) implementing both `IGraphStore` + `ITemporalStore` over a single `store.sqlite`; 
deterministic `graphHash` | | `summarizer` | Process + cluster summaries for MCP responses | | `wiki` | LLM-narrated module pages emitted by `codehub wiki --llm` | -The retrieval / graph-quality evaluation harness and the per-language F1 -regression gym used to live here as `eval` and `gym`; they were +The per-language F1 regression gym used to live here as `gym`; it was extracted into the sibling `opencodehub-testbed` repository so the -production package set ships free of test-time dependencies. +production package set ships free of that test-time dependency. The +`eval` harness stays in-tree as a private, test-time-only package. ## Embedding backends @@ -281,8 +283,8 @@ superseded. `IGraphStore` / `ITemporalStore` interface segregation), B (19-scanner fleet incl. betterleaks), C (debt sweep — embedder fingerprint, SCIP REFERENCES + TYPE_OF), and D (dogfood polish) have all merged. The -published package is `@opencodehub/cli` (currently `0.7.0`; the monorepo -root tracks `0.8.0`); `1.0.0` is cut once schema + tool-surface stability +published package is `@opencodehub/cli` (currently `0.10.6`; the monorepo +root tracks `0.10.6`); `1.0.0` is cut once schema + tool-surface stability is signed off. While on `0.x`, **any release may contain breaking changes** to the diff --git a/SPECS.md b/SPECS.md index bbba8926..e3ccac7b 100644 --- a/SPECS.md +++ b/SPECS.md @@ -4,9 +4,9 @@ OpenCodeHub is an Apache-2.0, local-first code-intelligence toolchain for AI coding agents. 
It ingests a source tree into a hybrid knowledge graph -(structural relations + semantic vectors) stored as a two-tier split — an -lbug graph (`@ladybugdb/core`, `graph.lbug`) plus a DuckDB temporal sibling -(`temporal.duckdb`), both under `/.codehub/` (ADR 0016) — and exposes +(structural relations + semantic vectors) stored in a single-file SQLite +index (`store.sqlite`, WAL, via Node's built-in `node:sqlite`) under +`/.codehub/` (ADR 0019, superseding ADR 0016) — and exposes that graph over the Model Context Protocol and a `codehub` CLI. Agents use it to answer "what breaks if I change this, what depends on it, where does this data flow" *before* they produce a diff. @@ -20,7 +20,7 @@ Communities and Processes, and optionally populates embeddings from a pinned F2LLM-v2-80M ONNX model (320-dim; fp32 ~321 MB or int8 ~81 MB) or an OpenAI-compatible HTTP endpoint. -At query time it exposes an MCP server with 28 tools (`query`, `context`, +At query time it exposes an MCP server with 29 tools (`query`, `context`, `impact`, `signature`, `detect_changes`, `sql`, scanner / finding / dependency / verdict / route tools, and cross-repo `group_*` tools), along with a CLI that mirrors the main tools plus administrative @@ -32,9 +32,8 @@ working tree. - Not a language server. It runs SCIP indexers as one-shot artifact producers and does not speak LSP to editors directly. -- Not a SaaS. There is no server to operate; the graph lives as two - embedded files under `/.codehub/` (the lbug `graph.lbug` plus the - DuckDB `temporal.duckdb`). +- Not a SaaS. There is no server to operate; the graph lives as one + embedded file under `/.codehub/` (`store.sqlite`). - Not a hosted vector DB. Embeddings are optional and local; there is no network dependency for analyze or query. - Not a ranking / recommendation product. The graph is precomputed at index @@ -130,35 +129,38 @@ indexers agree on `package{manager,name,version}`. ## 3. 
Storage & schema -3.1 The system shall persist the graph tier to an lbug graph file -(`graph.lbug`, `@ladybugdb/core`) and the temporal tier — cochanges and -structured symbol summaries — to a DuckDB file (`temporal.duckdb`), both -under `/.codehub/`. Both files are written on every analyze; there is -no `CODEHUB_STORE` env var, no backend probe, and no single-file DuckDB -graph layout (ADR 0016). - -3.2 The storage layer shall segregate `IGraphStore` (graph workload: nodes, -edges, embeddings, multi-hop traversal) from `ITemporalStore` (cochanges, -summary cache). `IGraphStore` lives only on `GraphDbStore`; `DuckDbStore` -implements `ITemporalStore` only; `openStore()` composes them. The -segregated interfaces are the v1.0 contract for community-fork adapters -(AGE / Memgraph / Neo4j / Neptune target `IGraphStore`). If the lbug -binding fails to load, `open()` throws `GraphDbBindingError`. +3.1 The system shall persist the entire index — graph nodes, edges, +embeddings, and the temporal tables (cochanges and structured symbol +summaries) — to a single `store.sqlite` file (WAL, via `node:sqlite`) under +`/.codehub/`. The file is written on every analyze; there is no +`CODEHUB_STORE` env var, no backend probe, and no separate graph/temporal +file split (ADR 0019, superseding ADR 0016). + +3.2 The storage layer shall retain the segregated `IGraphStore` (graph +workload: nodes, edges, embeddings, multi-hop traversal) and `ITemporalStore` +(cochanges, summary cache) interfaces. One `SqliteStore` class implements +BOTH over the single file, and `openStore()` returns that instance as both +views. The segregated interfaces are the v1.0 contract for community-fork +adapters (AGE / Memgraph / Neo4j / Neptune implement `IGraphStore` and pair +with any SQL-shaped `ITemporalStore`). There is no native storage binding to +load, so `open()` cannot fail on a missing binding. 
3.3 While executing the `sql` MCP tool or `codehub sql` CLI, the system shall reject non-read-only statements and apply a 5-second default timeout. -The `sql` path targets the DuckDB temporal store (`cochanges` + -`symbol_summaries`); the node/edge graph is queried via the typed tools or -via Cypher (the `sql` tool's `cypher` argument), not this SQL path. +The `sql` path targets the SQLite index directly (`nodes`, `edges`, +`embeddings`, `cochanges`, `symbol_summaries`, `store_meta` are all +SQL-queryable); the typed tools remain the high-level path. The `cypher` +argument is reserved for community-fork graph adapters and is unsupported by +the default backend. -3.4 The vector search path shall use the lbug graph's filter-aware -nearest-neighbour traversal when embeddings are populated. +3.4 The vector search path shall use filter-aware nearest-neighbour search +over the `embeddings` table when embeddings are populated. -3.5 The full-text search path shall use BM25 scoring over the indexed -symbols. +3.5 The full-text search path shall use BM25 scoring (SQLite FTS5) over the +indexed symbols. -3.6 Multi-hop graph traversal shall be expressed in the lbug graph's Cypher -dialect rather than recursive SQL CTEs. +3.6 Multi-hop graph traversal shall be expressed as recursive SQL CTEs over +the `edges` table. 3.7 The storage layer shall write metadata (schema version, graph hash, last-analyzed commit) atomically and expose it via `getMeta`. @@ -215,12 +217,12 @@ merge-safe tiers, 1 for review-required tiers, and 2 for `block`. 6.1 The MCP server shall advertise itself as `opencodehub` over stdio with an `instructions` block steering clients to call `list_repos` first. 
-6.2 The server shall register 28 tools: `list_repos`, `query`, `context`, +6.2 The server shall register 29 tools: `list_repos`, `query`, `context`, `impact`, `signature`, `detect_changes`, `sql`, `group_list`, `group_query`, `group_status`, `group_contracts`, `group_cross_repo_links`, `group_sync`, `project_profile`, `dependencies`, `license_audit`, `owners`, `list_findings`, `list_findings_delta`, `list_dead_code`, -`scan`, `verdict`, `risk_trends`, `route_map`, +`scan`, `verdict`, `change_pack`, `risk_trends`, `route_map`, `api_impact`, `shape_check`, `tool_map`, and `pack_codebase`. No registered tool mutates user source files; the MCP surface is read-only with respect to the working tree. @@ -258,7 +260,8 @@ shall reject it with `SqlGuardError`. and `sql`. 7.2 The CLI shall lazy-load every subcommand via `await import(...)` so -`codehub --help` does not transitively load DuckDB or tree-sitter. +`codehub --help` does not transitively load the WASM parser or the embedder +runtime. 7.3 The `setup` command shall write MCP configuration stanzas for claude-code, cursor, codex, windsurf, and opencode; pass `--undo` to diff --git a/mise.toml b/mise.toml index 65344279..b8cba988 100644 --- a/mise.toml +++ b/mise.toml @@ -3,7 +3,7 @@ node = "24" pnpm = "11.1.0" python = "3.12" uv = "latest" -"npm:node-gyp" = "latest" # fallback native build for @duckdb/node-api / onnxruntime-node when a platform prebuild is missing (parsing is WASM-only — ADR 0015) +"npm:node-gyp" = "latest" # defensive fallback: OCH's own deps are pure JS + WASM (no native storage binding after ADR 0019; parsing is WASM-only per ADR 0015), but a transitive optional native dep could still need a source build on a platform with no prebuild "aqua:betterleaks/betterleaks" = "1.2.0" # secret scanner — used by analyze + pre-release gate lefthook = "2.1.8" # git hooks — must satisfy lefthook.yml min_version (2.1.6); matches root devDep so a stale global mise install can't shadow it diff --git 
a/packages/analysis/src/test-utils.ts b/packages/analysis/src/test-utils.ts index da1ee086..fd696940 100644 --- a/packages/analysis/src/test-utils.ts +++ b/packages/analysis/src/test-utils.ts @@ -132,7 +132,7 @@ function sortNodesById(nodes: readonly FakeNode[]): FakeNode[] { /** * Sort edges by `(from, to, type)` so callers see the same order as - * `listEdges` returns from DuckDb/GraphDb. + * `listEdges` returns from the store. */ function sortEdges(edges: readonly FakeEdge[]): FakeEdge[] { return [...edges].sort((a, b) => { diff --git a/packages/cli/README.md b/packages/cli/README.md index d28098f0..d83a224f 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -69,7 +69,7 @@ top-level subcommands by phase of the workflow. ## Design - **Lazy loading** — each `.action()` does `await import(...)` so cold - startup is bounded by Commander, not DuckDB or the parse pool + startup is bounded by Commander, not the store or the parse pool (`packages/cli/src/index.ts:78-81`). - **No stateful daemon** — `analyze` runs to completion and exits; `mcp` is the only long-running process. @@ -81,13 +81,12 @@ top-level subcommands by phase of the workflow. - **`mcp` is launched, never embedded** — agents that need the MCP surface spawn `codehub mcp` over stdio (`packages/cli/src/commands/mcp.ts`). -See ADR 0016 for the lbug-graph + DuckDB-temporal storage layout and the +See ADR 0019 for the single-file `store.sqlite` storage layout and the root README's "MCP tool surface" section for the agent-facing tool inventory. -The bundled graph backend is `@ladybugdb/core` **0.17.1** or newer. 0.17.0 -changed empty-`STRING[]` serialization (empty lists now round-trip as a typed -empty array rather than collapsing to NULL); the adapter decodes a bare empty -array as an absent field on every supported lbug version, so `graphHash` -byte-identity — and the Parquet embeddings sidecar `packHash` that depends on -it — is preserved across the upgrade. 
+Storage is one `store.sqlite` file (WAL) via Node's built-in `node:sqlite`, +with zero native bindings (ADR 0019). Empty `keywords: []` round-trips as a +typed empty array distinct from an absent field — stored in the node's JSON +`payload` column — so `graphHash` byte-identity is preserved. Embeddings live +in the `embeddings` table (no Parquet sidecar). diff --git a/packages/cli/src/commands/analyze-carry-forward.test.ts b/packages/cli/src/commands/analyze-carry-forward.test.ts index cd2657d8..163179b4 100644 --- a/packages/cli/src/commands/analyze-carry-forward.test.ts +++ b/packages/cli/src/commands/analyze-carry-forward.test.ts @@ -3,10 +3,10 @@ * {@link loadPreviousGraph}. * * What this exercises: - * - After a prior DuckDB index + scan-state.json are on disk, + * - After a prior index (`store.sqlite`) + scan-state.json are on disk, * `loadPreviousGraph` returns a {@link pipeline.PreviousGraph} whose * `nodes` AND `edges` fields are populated (non-empty, round-tripped - * through the `rowToGraphNode` / `rowToCodeRelation` mappers). + * through the store's typed `listNodes` / `listEdges` finders). * - That shape is the exact precondition `resolveIncrementalView` * (`packages/ingestion/src/pipeline/phases/incremental-helper.ts:95-102`) * checks before it flips `active=true`. A `PreviousGraph` satisfying @@ -15,7 +15,7 @@ * run their carry-forward codepath. * - The negative case (missing DB) still returns `undefined`. * - * The test builds its own DuckDB from scratch via a synthetic + * The test builds its own `store.sqlite` from scratch via a synthetic * `KnowledgeGraph` rather than running the full `runIngestion` pipeline — * keeps the test fast (no tree-sitter / SCIP invocations) and isolates the * storage ↔ `loadPreviousGraph` round-trip being exercised. 
@@ -190,7 +190,7 @@ async function seedPriorIndex(repoPath: string): Promise<{ return { nodeCount: graph.nodeCount(), edgeCount: graph.edgeCount() }; } -test("loadPreviousGraph: returns full nodes + edges from a seeded DuckDB", async () => { +test("loadPreviousGraph: returns full nodes + edges from a seeded store", async () => { const repoPath = await mkdtemp(join(tmpdir(), "och-carry-forward-")); const seeded = await seedPriorIndex(repoPath); diff --git a/packages/cli/src/commands/analyze-findings-survival.test.ts b/packages/cli/src/commands/analyze-findings-survival.test.ts new file mode 100644 index 00000000..33927d98 --- /dev/null +++ b/packages/cli/src/commands/analyze-findings-survival.test.ts @@ -0,0 +1,171 @@ +/** + * Regression test for the incremental-analyze findings-wipe bug. + * + * `runAnalyze` rebuilds the graph with a replace-mode `bulkLoad` (ADR 0019), + * which truncates EVERY node — including the `Finding` nodes and `FOUND_IN` + * edges that a prior `codehub scan` ingested. On the scan-skip fast-path + * (fingerprint match + `scan.sarif` present) the scanners do NOT re-run, so + * before the fix nothing re-populated those findings and the freshly-rebuilt + * graph reported zero findings — `list_findings`, `list_findings_delta`, and + * `verdict` all silently saw a clean scan. + * + * The fix re-ingests the cached `scan.sarif` on the skip path. `runIngestSarif` + * is idempotent (fingerprint-stable enrichment + upsert-mode bulkLoad), so it + * restores exactly the findings the wipe removed. + * + * These tests exercise the store-level composition directly — seed findings, + * simulate the replace-mode graph wipe, then run the skip-path re-ingest — so + * the regression is caught without driving a full git+scanner analyze run + * (which the determinism suite never exercised, which is why the bug shipped). 
+ */ + +import assert from "node:assert/strict"; +import { mkdir, mkdtemp, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "node:test"; +import { KnowledgeGraph } from "@opencodehub/core-types"; +import type { SarifLog } from "@opencodehub/sarif"; +import { openStore, resolveGraphPath, resolveRepoMetaDir } from "@opencodehub/storage"; +import { runIngestSarif } from "./ingest-sarif.js"; + +/** A SARIF log with two findings on two files — the cached `scan.sarif`. */ +function scanLog(): SarifLog { + return { + version: "2.1.0", + runs: [ + { + tool: { driver: { name: "semgrep", version: "1.0.0" } }, + results: [ + { + ruleId: "r.xss", + message: { text: "XSS risk" }, + locations: [ + { + physicalLocation: { + artifactLocation: { uri: "web/a.ts" }, + region: { startLine: 10 }, + }, + }, + ], + partialFingerprints: { "opencodehub/v1": "a".repeat(32) }, + }, + { + ruleId: "r.sqli", + message: { text: "SQLi risk" }, + locations: [ + { + physicalLocation: { + artifactLocation: { uri: "api/b.ts" }, + region: { startLine: 20 }, + }, + }, + ], + partialFingerprints: { "opencodehub/v1": "b".repeat(32) }, + }, + ], + }, + ], + }; +} + +async function countFindings(repoPath: string): Promise<number> { + const store = await openStore({ path: resolveGraphPath(repoPath) }); + try { + await store.graph.open(); + let n = 0; + for (const node of await store.graph.listNodes()) { + if (node.kind === "Finding") n += 1; + } + return n; + } finally { + await store.close(); + } +} + +/** + * Write the cached `scan.sarif` and seed its findings into the graph the way + * a prior `codehub scan` would have (via the same idempotent ingest path). + * Returns the sarif path so the test can re-ingest it on the skip branch. 
+ */ +async function seedRepoWithFindings(repoPath: string): Promise<string> { + await mkdir(resolveRepoMetaDir(repoPath), { recursive: true }); + const sarifPath = join(resolveRepoMetaDir(repoPath), "scan.sarif"); + await writeFile(sarifPath, `${JSON.stringify(scanLog(), null, 2)}\n`, "utf8"); + await runIngestSarif(sarifPath, { repo: repoPath }); + return sarifPath; +} + +/** + * Reproduce the replace-mode graph rebuild that `runAnalyze` performs: a + * `bulkLoad` in the default replace mode truncates all nodes/edges. This is + * the step that wipes the seeded Finding nodes. + */ +async function simulateGraphRebuildWipe(repoPath: string): Promise<void> { + const store = await openStore({ path: resolveGraphPath(repoPath) }); + try { + await store.graph.open(); + await store.temporal.open(); + await store.graph.createSchema(); + // Empty graph in replace mode == the "rebuilt from the pipeline" graph + // that carries no Finding nodes (findings come only from the scan step). + await store.graph.bulkLoad(new KnowledgeGraph()); + } finally { + await store.close(); + } +} + +test("scan-skip path: replace-mode graph rebuild wipes seeded Finding nodes", async () => { + const repoPath = await mkdtemp(join(tmpdir(), "och-findings-wipe-")); + await seedRepoWithFindings(repoPath); + assert.equal(await countFindings(repoPath), 2, "seed should ingest two findings"); + + await simulateGraphRebuildWipe(repoPath); + + // This asserts the BUG precondition: after the replace-mode rebuild the + // findings are gone. If a future change makes the rebuild preserve findings + // this assertion flips and the guard below becomes redundant — update both. 
+ assert.equal( + await countFindings(repoPath), + 0, + "replace-mode bulkLoad must truncate the prior Finding nodes", + ); +}); + +test("scan-skip path: re-ingesting the cached SARIF restores findings after the wipe", async () => { + const repoPath = await mkdtemp(join(tmpdir(), "och-findings-restore-")); + const sarifPath = await seedRepoWithFindings(repoPath); + + // Graph rebuild wipes the findings... + await simulateGraphRebuildWipe(repoPath); + assert.equal(await countFindings(repoPath), 0); + + // ...and the fix re-ingests the reused scan.sarif on the fingerprint-match + // skip branch (exactly what analyze.ts now does instead of only logging). + const ingested = await runIngestSarif(sarifPath, { repo: repoPath }); + + assert.equal(ingested.findingsEmitted, 2, "re-ingest must emit both cached findings"); + assert.equal( + await countFindings(repoPath), + 2, + "findings must survive an incremental re-analyze that skips the scanners", + ); +}); + +test("scan-skip re-ingest is idempotent — no duplicate Finding nodes", async () => { + const repoPath = await mkdtemp(join(tmpdir(), "och-findings-idem-")); + const sarifPath = await seedRepoWithFindings(repoPath); + await simulateGraphRebuildWipe(repoPath); + + // Two consecutive skip-path re-ingests (two incremental analyze runs) must + // not double-count — the ingest bulkLoad runs in upsert mode keyed on the + // finding fingerprint. 
+ await runIngestSarif(sarifPath, { repo: repoPath }); + await runIngestSarif(sarifPath, { repo: repoPath }); + + assert.equal( + await countFindings(repoPath), + 2, + "repeated skip-path re-ingests must stay at two findings (idempotent upsert)", + ); +}); diff --git a/packages/cli/src/commands/analyze.test.ts b/packages/cli/src/commands/analyze.test.ts index 98226567..7c52612f 100644 --- a/packages/cli/src/commands/analyze.test.ts +++ b/packages/cli/src/commands/analyze.test.ts @@ -101,7 +101,7 @@ test("resolveMaxSummariesCap: auto clamps at the 500 cap for large repos", async }); test("resolveMaxSummariesCap: auto falls back to 50 on first run (no prior seed)", async () => { - // `undefined` models "no prior DuckDB store at the expected path". + // `undefined` models "no prior store at the expected path". const cap = await resolveMaxSummariesCap("/unused", "auto", true, async () => undefined); assert.equal(cap, 50); }); diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index e930aedb..669e84ee 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -20,17 +20,7 @@ import { spawn } from "node:child_process"; import { mkdir } from "node:fs/promises"; import { basename, join, resolve } from "node:path"; -import { - type CodeRelation, - type EdgeId, - type GraphNode, - NODE_KINDS, - type NodeId, - type NodeKind, - RELATION_TYPES, - type RelationType, - SCHEMA_VERSION, -} from "@opencodehub/core-types"; +import { type CodeRelation, type GraphNode, SCHEMA_VERSION } from "@opencodehub/core-types"; import { embedderModelId } from "@opencodehub/embedder"; import { pipeline } from "@opencodehub/ingestion"; import { @@ -47,7 +37,6 @@ import { type RepoEntry, readRegistry, upsertRegistry } from "../registry.js"; import { generateSkills } from "../skills-gen.js"; import { computeScanFingerprint, - countSarifFindings, readScanFingerprint, shouldSkipScan, writeScanFingerprint, @@ -556,9 +545,25 @@ 
export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi sarifExists, }) ) { - const priorCount = await countSarifFindings(sarifPath); - const countNote = priorCount !== undefined ? `, reusing ${priorCount} finding(s)` : ""; - log(`codehub analyze: scan — up to date (fingerprint match)${countNote}`); + // The graph bulkLoad above ran in replace mode (ADR 0019), which + // truncated every node — including the `Finding` nodes and + // `FOUND_IN` edges from the prior scan. When we skip re-running the + // scanners we still MUST re-ingest the reused `scan.sarif`, or the + // freshly-rebuilt graph ends up with zero findings and + // `list_findings`/`verdict`/`list_findings_delta` silently report a + // clean scan. `runIngestSarif` is idempotent (fingerprint-stable + // enrichment + upsert-mode bulkLoad), so re-ingesting the unchanged + // SARIF restores exactly the findings the wipe removed. + const { runIngestSarif } = await import("./ingest-sarif.js"); + const ingestOpts: { repo: string; home?: string } = { + repo: repoName, + ...(opts.home !== undefined ? { home: opts.home } : {}), + }; + const ingested = await runIngestSarif(sarifPath, ingestOpts); + log( + `codehub analyze: scan — up to date (fingerprint match), ` + + `re-ingested ${ingested.findingsEmitted} finding(s) from cached SARIF`, + ); } else { await runScanAndLog(); // Refresh the sidecar only after a successful scan so a thrown @@ -598,9 +603,9 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi * `.codehub/scan-state.json` (written at the tail of the prior run), * - IMPORTS + EXTENDS + IMPLEMENTS edges recovered from the `relations` * table by stripping each endpoint id back to its enclosing file path, - * - the FULL prior node and edge snapshot, mapped back into - * {@link GraphNode} / {@link CodeRelation} via {@link rowToGraphNode} - * and {@link rowToCodeRelation}. 
Shipping these two arrays is what + * - the FULL prior node and edge snapshot as {@link GraphNode} / + * {@link CodeRelation} arrays (via the store's typed `listNodes` / + * `listEdges` finders). Shipping these two arrays is what * flips `resolveIncrementalView` * (`packages/ingestion/src/pipeline/phases/incremental-helper.ts:95-102`) * from `active=false` (passive mode) to `active=true`, so the four @@ -629,10 +634,7 @@ export async function loadPreviousGraph( // Full node + edge dumps via typed finders. For a typical OCH repo // this is 10K-50K nodes and 20K-100K edges — fits in memory in one // shot. The `listNodes` / `listEdges` finders already return - // rehydrated `GraphNode` / `CodeRelation` objects, so the legacy - // `rowToGraphNode` / `rowToCodeRelation` adapters are no longer - // needed on this read path — they remain exported for external - // consumers that hand-roll over the wide-column shape. + // rehydrated `GraphNode` / `CodeRelation` objects. const nodes = [...(await store.graph.listNodes())]; const edges = [...(await store.graph.listEdges())]; // Derive the legacy file-granular projections from the full edge set so @@ -985,12 +987,11 @@ async function openEmbeddingHashCacheAdapter( adapter: { // listEmbeddingHashes is on the graph-tier interface — embeddings // travel with the graph view, not the temporal cochange table. - // Wrapped in try/catch: on a freshly-created lbug db that has no - // schema yet, the Cypher query inside listEmbeddingHashes() can - // throw "Cannot create an empty database under READ ONLY mode" - // because lbug defers some internal initialization until first - // query. Returning an empty map matches the interface contract - // ("Empty map on a fresh database or any error"). + // Wrapped in try/catch: querying a freshly-created store that has no + // schema yet (or a read-only handle on a not-yet-initialized file) can + // throw before the embeddings table exists. 
Returning an empty map + // matches the interface contract ("Empty map on a fresh database or + // any error"). list: async () => { try { return await store.graph.listEmbeddingHashes(); @@ -1018,350 +1019,6 @@ function fileFromNodeId(id: string): string | undefined { return rest.slice(0, second); } -// `PREV_NODE_SELECT_COLUMNS` was the explicit column whitelist used by the -// legacy SQL `SELECT * FROM nodes` round-trip in {@link loadPreviousGraph}. -// That read path now goes through `store.graph.listNodes()`, which already -// returns rehydrated `GraphNode` objects, so the constant is no longer -// load-bearing here. The `rowToGraphNode` / `rowToCodeRelation` adapters -// below remain exported for external consumers that hand-roll over the -// SQLite wide-column shape. - -const NODE_KIND_SET: ReadonlySet = new Set(NODE_KINDS); -const RELATION_TYPE_SET: ReadonlySet = new Set(RELATION_TYPES); - -function strField(r: Record, col: string): string | undefined { - const v = r[col]; - return typeof v === "string" && v.length > 0 ? v : undefined; -} - -function numField(r: Record, col: string): number | undefined { - const v = r[col]; - if (typeof v === "number" && Number.isFinite(v)) return v; - if (typeof v === "bigint") return Number(v); - return undefined; -} - -function boolField(r: Record, col: string): boolean | undefined { - const v = r[col]; - return typeof v === "boolean" ? v : undefined; -} - -function stringArrayField(r: Record, col: string): readonly string[] | undefined { - // Preserve `[]` distinct from absent. 
The SQLite TEXT[] binder returns - // a 0-length JS array for an empty SQL array literal and `null` for - // SQL NULL; mirror the storage adapter's `setStringArrayField` and - // return the array verbatim so a Community / Route node written as - // `{keywords: []}` (or `{responseKeys: []}`) survives the carry-forward - // load with its empty array intact — required so canonical-JSON / - // graphHash byte-identity holds across the incremental re-index. - const v = r[col]; - if (!Array.isArray(v)) return undefined; - const out: string[] = []; - for (const item of v) { - if (typeof item === "string") out.push(item); - } - return out; -} - -function parseJsonStringArrayField( - r: Record, - col: string, -): readonly string[] | undefined { - const raw = r[col]; - if (typeof raw !== "string" || raw.length === 0) return undefined; - try { - const parsed = JSON.parse(raw) as unknown; - if (!Array.isArray(parsed)) return undefined; - return parsed.filter((x): x is string => typeof x === "string"); - } catch { - return undefined; - } -} - -function parseJsonObjectField( - r: Record, - col: string, -): Record | undefined { - const raw = r[col]; - if (typeof raw !== "string" || raw.length === 0) return undefined; - try { - const parsed = JSON.parse(raw) as unknown; - if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) return undefined; - return parsed as Record; - } catch { - return undefined; - } -} - -/** - * Reverse of `nodeToRow` (`packages/storage/src/duckdb-adapter.ts:1169`): - * translate one row of the polymorphic `nodes` table back into a - * {@link GraphNode}. 
Only the `nodes`/`edges` fidelity required by the four - * incremental consumer phases (`cross-file`, `mro`, `communities`, - * `processes`) is load-bearing — Community / Process nodes are re-added - * verbatim by `communities.ts:90-94` / `processes.ts:306-310`, so their - * `name` / `filePath` / `inferredLabel` / `keywords` / `symbolCount` / - * `cohesion` / `entryPointId` / `stepCount` must round-trip. Other kinds - * survive the round trip best-effort; fields we can't recover stay - * `undefined` and the caller treats the resulting node as lossy — safe - * because the carry-forward only lives long enough to be hashed into the - * next graph. - * - * Returns `undefined` when the row carries a `kind` we don't recognise or - * when required scalar slots (`id`, `name`, `file_path`) are missing. - * - * Exported for tests; the production call site is {@link loadPreviousGraph}. - */ -export function rowToGraphNode(row: Record): GraphNode | undefined { - const idRaw = row["id"]; - const nameRaw = row["name"]; - const fileRaw = row["file_path"]; - const kindRaw = row["kind"]; - if (typeof idRaw !== "string" || idRaw.length === 0) return undefined; - if (typeof nameRaw !== "string") return undefined; - if (typeof fileRaw !== "string") return undefined; - if (typeof kindRaw !== "string" || !NODE_KIND_SET.has(kindRaw)) return undefined; - const kind = kindRaw as NodeKind; - - // Build a permissive record keyed by TS field names. The discriminated- - // union cast at the end is safe because every `GraphNode` member only - // requires `id`/`kind`/`name`/`filePath` plus optional fields beyond that; - // required fields unique to a kind (e.g. `FindingNode.propertiesBag`) are - // populated explicitly in the per-kind branches below. 
- const node: Record = { - id: idRaw as NodeId, - kind, - name: nameRaw, - filePath: fileRaw, - }; - - // LocatedNode fields — set only when non-NULL because some non-LocatedNode - // kinds (Community / Process / File / Folder) intentionally leave them - // NULL and re-hydrating a spurious zero would change the graph hash. - const startLine = numField(row, "start_line"); - if (startLine !== undefined) node["startLine"] = startLine; - const endLine = numField(row, "end_line"); - if (endLine !== undefined) node["endLine"] = endLine; - - const isExported = boolField(row, "is_exported"); - if (isExported !== undefined) node["isExported"] = isExported; - const signature = strField(row, "signature"); - if (signature !== undefined) node["signature"] = signature; - const parameterCount = numField(row, "parameter_count"); - if (parameterCount !== undefined) node["parameterCount"] = parameterCount; - const returnType = strField(row, "return_type"); - if (returnType !== undefined) node["returnType"] = returnType; - const declaredType = strField(row, "declared_type"); - if (declaredType !== undefined) node["declaredType"] = declaredType; - const owner = strField(row, "owner"); - if (owner !== undefined) node["owner"] = owner; - const description = strField(row, "description"); - if (description !== undefined) node["description"] = description; - const contentHash = strField(row, "content_hash"); - if (contentHash !== undefined) node["contentHash"] = contentHash; - const content = strField(row, "content"); - if (content !== undefined) node["content"] = content; - - // Community / Process — the two carry-forward-critical kinds. 
- const inferredLabel = strField(row, "inferred_label"); - if (inferredLabel !== undefined) node["inferredLabel"] = inferredLabel; - const symbolCount = numField(row, "symbol_count"); - if (symbolCount !== undefined) node["symbolCount"] = symbolCount; - const cohesion = numField(row, "cohesion"); - if (cohesion !== undefined) node["cohesion"] = cohesion; - const keywords = stringArrayField(row, "keywords"); - if (keywords !== undefined) node["keywords"] = keywords; - const entryPointId = strField(row, "entry_point_id"); - if (entryPointId !== undefined) node["entryPointId"] = entryPointId; - const stepCount = numField(row, "step_count"); - if (stepCount !== undefined) node["stepCount"] = stepCount; - - // Section (markdown heading) — `level` round-trips for completeness. - const level = numField(row, "level"); - if (level !== undefined) node["level"] = level; - - // Route: `url` + `responseKeys` + `method` (shared column with Tool / Operation). - const url = strField(row, "url"); - if (url !== undefined) node["url"] = url; - const responseKeys = stringArrayField(row, "response_keys"); - if (responseKeys !== undefined) node["responseKeys"] = responseKeys; - - if (kind === "Tool") { - const toolName = strField(row, "tool_name"); - if (toolName !== undefined) node["toolName"] = toolName; - const inputSchemaJson = strField(row, "input_schema_json"); - if (inputSchemaJson !== undefined) node["inputSchemaJson"] = inputSchemaJson; - } else if (kind === "Route") { - const method = strField(row, "method"); - if (method !== undefined) node["method"] = method; - } - - if (kind === "Finding") { - const ruleId = strField(row, "rule_id"); - const severity = strField(row, "severity"); - const scannerId = strField(row, "scanner_id"); - const message = strField(row, "message"); - const propertiesBag = parseJsonObjectField(row, "properties_bag"); - if (ruleId !== undefined) node["ruleId"] = ruleId; - if (severity !== undefined) node["severity"] = severity; - if (scannerId !== 
undefined) node["scannerId"] = scannerId; - if (message !== undefined) node["message"] = message; - // propertiesBag is REQUIRED on FindingNode; default to {} on lossy reads - // so the resulting object still structurally satisfies the union. - node["propertiesBag"] = propertiesBag ?? {}; - const partialFingerprint = strField(row, "partial_fingerprint"); - if (partialFingerprint !== undefined) node["partialFingerprint"] = partialFingerprint; - const baselineState = strField(row, "baseline_state"); - if (baselineState !== undefined) node["baselineState"] = baselineState; - const suppressedJson = strField(row, "suppressed_json"); - if (suppressedJson !== undefined) node["suppressedJson"] = suppressedJson; - } - - if (kind === "Dependency") { - const version = strField(row, "version"); - const ecosystem = strField(row, "ecosystem"); - const lockfileSource = strField(row, "lockfile_source"); - const license = strField(row, "license"); - // version / ecosystem / lockfileSource are REQUIRED on the type; default - // to safe values when NULL so the object still passes the structural - // union at runtime. The carry-forward path only hashes these fields. - node["version"] = version ?? ""; - node["ecosystem"] = ecosystem ?? "npm"; - node["lockfileSource"] = lockfileSource ?? ""; - if (license !== undefined) node["license"] = license; - } - - if (kind === "Operation") { - const httpMethod = strField(row, "http_method"); - const httpPath = strField(row, "http_path"); - node["method"] = httpMethod ?? "GET"; - node["path"] = httpPath ?? "/"; - const summary = strField(row, "summary"); - if (summary !== undefined) node["summary"] = summary; - const operationId = strField(row, "operation_id"); - if (operationId !== undefined) node["operationId"] = operationId; - } - - if (kind === "Contributor") { - const emailHash = strField(row, "email_hash"); - node["emailHash"] = emailHash ?? 
""; - const emailPlain = strField(row, "email_plain"); - if (emailPlain !== undefined) node["emailPlain"] = emailPlain; - } - - // ProjectProfile — JSON-encoded array columns plus a polymorphic - // `frameworks_json` (flat `string[]` OR `{ flat, detected }`). - if (kind === "ProjectProfile") { - node["languages"] = parseJsonStringArrayField(row, "languages_json") ?? []; - const frameworksRaw = strField(row, "frameworks_json"); - let frameworksFlat: readonly string[] = []; - if (frameworksRaw !== undefined) { - try { - const parsed = JSON.parse(frameworksRaw) as unknown; - if (Array.isArray(parsed)) { - frameworksFlat = parsed.filter((x): x is string => typeof x === "string"); - } else if (typeof parsed === "object" && parsed !== null) { - const rec = parsed as Record; - const flat = rec["flat"]; - if (Array.isArray(flat)) { - frameworksFlat = flat.filter((x): x is string => typeof x === "string"); - } - const detected = rec["detected"]; - if (Array.isArray(detected)) node["frameworksDetected"] = detected; - } - } catch { - /* ignore — leave frameworks as [] */ - } - } - node["frameworks"] = frameworksFlat; - node["iacTypes"] = parseJsonStringArrayField(row, "iac_types_json") ?? []; - node["apiContracts"] = parseJsonStringArrayField(row, "api_contracts_json") ?? []; - node["manifests"] = parseJsonStringArrayField(row, "manifests_json") ?? []; - node["srcDirs"] = parseJsonStringArrayField(row, "src_dirs_json") ?? []; - } - - // File ownership (H.5) + Community ownership (H.4) — shared across kinds. 
- const orphanGrade = strField(row, "orphan_grade"); - if (orphanGrade !== undefined) node["orphanGrade"] = orphanGrade; - const isOrphan = boolField(row, "is_orphan"); - if (isOrphan !== undefined) node["isOrphan"] = isOrphan; - const truckFactor = numField(row, "truck_factor"); - if (truckFactor !== undefined) node["truckFactor"] = truckFactor; - const od30 = numField(row, "ownership_drift_30d"); - if (od30 !== undefined) node["ownershipDrift30d"] = od30; - const od90 = numField(row, "ownership_drift_90d"); - if (od90 !== undefined) node["ownershipDrift90d"] = od90; - const od365 = numField(row, "ownership_drift_365d"); - if (od365 !== undefined) node["ownershipDrift365d"] = od365; - - // v1.2 extensions - const deadness = strField(row, "deadness"); - if (deadness !== undefined) node["deadness"] = deadness; - const coveragePercent = numField(row, "coverage_percent"); - if (coveragePercent !== undefined) node["coveragePercent"] = coveragePercent; - const coveredLinesJson = strField(row, "covered_lines_json"); - if (coveredLinesJson !== undefined) node["coveredLinesJson"] = coveredLinesJson; - const cyclomaticComplexity = numField(row, "cyclomatic_complexity"); - if (cyclomaticComplexity !== undefined) node["cyclomaticComplexity"] = cyclomaticComplexity; - const nestingDepth = numField(row, "nesting_depth"); - if (nestingDepth !== undefined) node["nestingDepth"] = nestingDepth; - const nloc = numField(row, "nloc"); - if (nloc !== undefined) node["nloc"] = nloc; - const halsteadVolume = numField(row, "halstead_volume"); - if (halsteadVolume !== undefined) node["halsteadVolume"] = halsteadVolume; - - return node as unknown as GraphNode; -} - -/** - * Reverse of the relations row builder at - * `packages/storage/src/duckdb-adapter.ts:299-340`. Relations round-trip - * cleanly because their schema is 7 scalar columns with no polymorphism. - * Returns `undefined` when `type` is not a known {@link RelationType} or - * when required scalars are missing. 
- * - * Exported for tests; the production call site is {@link loadPreviousGraph}. - */ -export function rowToCodeRelation(row: Record): CodeRelation | undefined { - const id = row["id"]; - const from = row["from_id"]; - const to = row["to_id"]; - const type = row["type"]; - const confidence = row["confidence"]; - if (typeof id !== "string" || id.length === 0) return undefined; - if (typeof from !== "string" || from.length === 0) return undefined; - if (typeof to !== "string" || to.length === 0) return undefined; - if (typeof type !== "string" || !RELATION_TYPE_SET.has(type)) return undefined; - const conf = - typeof confidence === "number" && Number.isFinite(confidence) ? confidence : Number(confidence); - if (!Number.isFinite(conf)) return undefined; - - const reason = row["reason"]; - const step = row["step"]; - const base = { - id: id as EdgeId, - from: from as NodeId, - to: to as NodeId, - type: type as RelationType, - confidence: conf, - }; - const stepNum: number | undefined = - typeof step === "number" && Number.isFinite(step) - ? step - : typeof step === "bigint" - ? Number(step) - : undefined; - const hasReason = typeof reason === "string" && reason.length > 0; - // Build the final record in a single statement so we match the optional- - // field discipline required by `exactOptionalPropertyTypes`. - if (hasReason && stepNum !== undefined) { - return { ...base, reason: reason as string, step: stepNum }; - } - if (hasReason) return { ...base, reason: reason as string }; - if (stepNum !== undefined) return { ...base, step: stepNum }; - return base; -} - /** Per-file record persisted to `.codehub/scan-state.json`. 
*/ interface ScanStateFile { readonly relPath: string; diff --git a/packages/cli/src/commands/api-impact.test.ts b/packages/cli/src/commands/api-impact.test.ts index 65eb875b..ba26ebf9 100644 --- a/packages/cli/src/commands/api-impact.test.ts +++ b/packages/cli/src/commands/api-impact.test.ts @@ -69,8 +69,8 @@ function makeFakeStore( const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/augment.test.ts b/packages/cli/src/commands/augment.test.ts index 77ded290..ad6e5da2 100644 --- a/packages/cli/src/commands/augment.test.ts +++ b/packages/cli/src/commands/augment.test.ts @@ -4,7 +4,7 @@ * Coverage matches the P0-1 contract: * - empty output when no repo is registered for the cwd * - empty output for sub-threshold patterns (<3 chars) - * - surface callers + processes when a real DuckDB fixture has them + * - surface callers + processes when a real store fixture has them * - never throws, regardless of registry corruption or missing index * - cold-start budget (<750ms) on a 10k-node fixture */ @@ -113,11 +113,11 @@ test("augment: returns empty when cwd maps to no registered repo", async () => { assert.equal(out, ""); }); -test("augment: returns empty when the registered repo has no DuckDB file", async () => { +test("augment: returns empty when the registered repo has no store file", async () => { const home = await scratch("no-db"); const repoPath = resolve(home, "ghost"); await mkdir(join(repoPath, ".codehub"), { recursive: true }); - // Registry entry points at a repo whose graph.duckdb does not exist. + // Registry entry points at a repo whose store.sqlite does not exist. 
await upsertRegistry( { name: "ghost", diff --git a/packages/cli/src/commands/baseline.test.ts b/packages/cli/src/commands/baseline.test.ts index ff8f2511..e6587eb3 100644 --- a/packages/cli/src/commands/baseline.test.ts +++ b/packages/cli/src/commands/baseline.test.ts @@ -3,7 +3,7 @@ * * These tests write SARIF files on disk under a scratch tmp dir, run the * command handlers directly (no commander round-trip), and assert on the - * returned summary + on-disk artifact. No registry or DuckDB is touched. + * returned summary + on-disk artifact. No registry or store is touched. */ import assert from "node:assert/strict"; diff --git a/packages/cli/src/commands/change-pack.test.ts b/packages/cli/src/commands/change-pack.test.ts index 9d851306..ad1a4286 100644 --- a/packages/cli/src/commands/change-pack.test.ts +++ b/packages/cli/src/commands/change-pack.test.ts @@ -11,7 +11,7 @@ * 4. The store is always closed (finally), even on the summary path. * * Each test injects an `_openStore` factory + an `_runChangePack` stand-in - * so nothing hits lbug/DuckDB or git. The CLI's contract under test is the + * so nothing hits the store or git. The CLI's contract under test is the * exit-code passthrough (`pack.verdict.exitCode`) and the JSON shape — not * the analysis module's compose logic, which has its own suite. 
*/ @@ -39,8 +39,8 @@ function fakeStore(): FakeStoreHandle { const store = { graph: FAKE_GRAPH, temporal: {} as unknown, - graphFile: "/tmp/fake-repo/.codehub/graph.lbug", - temporalFile: "/tmp/fake-repo/.codehub/temporal.duckdb", + graphFile: "/tmp/fake-repo/.codehub/store.sqlite", + temporalFile: "/tmp/fake-repo/.codehub/store.sqlite", close: async () => { wasClosed = true; }, diff --git a/packages/cli/src/commands/code-pack.ts b/packages/cli/src/commands/code-pack.ts index 7fcbcf8d..990b59f3 100644 --- a/packages/cli/src/commands/code-pack.ts +++ b/packages/cli/src/commands/code-pack.ts @@ -531,9 +531,9 @@ export function formatContextSummary(s: ContextSummary): string { * via `_store`; production passes the full Store envelope from * {@link openStore}. The composed envelope is the only shape carrying both * a `graph` and a `temporal` view, so the presence of both uniquely - * identifies it. (The pre-ADR-0016 envelope also carried a `backend` - * discriminator; that field was removed when the DuckDB-as-graph backend was - * ripped out, so this no longer keys off it.) + * identifies it. (An earlier envelope also carried a `backend` discriminator; + * the single-backend collapse in ADR 0019 removed it, so this no longer keys + * off it.) 
*/ function isStoreShape(s: Store | IGraphStore | undefined): s is Store { if (s === undefined) return false; diff --git a/packages/cli/src/commands/context.test.ts b/packages/cli/src/commands/context.test.ts index bd08a02e..71b088c0 100644 --- a/packages/cli/src/commands/context.test.ts +++ b/packages/cli/src/commands/context.test.ts @@ -100,8 +100,8 @@ function makeFakeStore(opts: FakeStoreOptions = {}): FakeStoreHandle { const composed: Store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.duckdb", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/dead-code.test.ts b/packages/cli/src/commands/dead-code.test.ts index 76c4a42b..c3e4847e 100644 --- a/packages/cli/src/commands/dead-code.test.ts +++ b/packages/cli/src/commands/dead-code.test.ts @@ -53,8 +53,8 @@ function makeFakeStore(syms: readonly FakeSym[]): { store: Store; closed: () => const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/dependencies.test.ts b/packages/cli/src/commands/dependencies.test.ts index d7829f9d..beb93ff4 100644 --- a/packages/cli/src/commands/dependencies.test.ts +++ b/packages/cli/src/commands/dependencies.test.ts @@ -54,8 +54,8 @@ function makeFakeStore(deps: readonly DependencyNode[]): FakeHandle { handle.store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git 
a/packages/cli/src/commands/findings.test.ts b/packages/cli/src/commands/findings.test.ts index e95324d9..43a5a1e2 100644 --- a/packages/cli/src/commands/findings.test.ts +++ b/packages/cli/src/commands/findings.test.ts @@ -56,8 +56,8 @@ function makeFakeStore(rows: readonly FindingNode[]): FakeHandle { handle.store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/license-audit.test.ts b/packages/cli/src/commands/license-audit.test.ts index 16fa2f6a..5f8ff19d 100644 --- a/packages/cli/src/commands/license-audit.test.ts +++ b/packages/cli/src/commands/license-audit.test.ts @@ -38,8 +38,8 @@ function makeFakeStore(deps: readonly DependencyNode[]): { store: Store; closed: const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/open-store.ts b/packages/cli/src/commands/open-store.ts index 37078fb6..93e31b4a 100644 --- a/packages/cli/src/commands/open-store.ts +++ b/packages/cli/src/commands/open-store.ts @@ -7,8 +7,8 @@ * so callers can route graph-tier queries through `store.graph` and * temporal-tier queries (cochanges, summaries, `--sql` escape hatch) * through `store.temporal`. Post-ADR 0019 both views are one `SqliteStore` - * over a single `/.codehub/store.sqlite`; the legacy backend selector - * was removed when the lbug + DuckDB pair was replaced (see ADR 0019). + * over a single `/.codehub/store.sqlite`; the prior two-backend + * selector was removed in that single-file migration (see ADR 0019). 
*/ import { resolve } from "node:path"; diff --git a/packages/cli/src/commands/owners.test.ts b/packages/cli/src/commands/owners.test.ts index e1387d6d..15f23bec 100644 --- a/packages/cli/src/commands/owners.test.ts +++ b/packages/cli/src/commands/owners.test.ts @@ -63,8 +63,8 @@ function makeFakeStore( const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/project-profile.test.ts b/packages/cli/src/commands/project-profile.test.ts index f994c6b2..c308b508 100644 --- a/packages/cli/src/commands/project-profile.test.ts +++ b/packages/cli/src/commands/project-profile.test.ts @@ -29,8 +29,8 @@ function makeFakeStore(profile: ProjectProfileNode | undefined): { const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/query.test.ts b/packages/cli/src/commands/query.test.ts index 14ec185b..c4666abf 100644 --- a/packages/cli/src/commands/query.test.ts +++ b/packages/cli/src/commands/query.test.ts @@ -10,7 +10,7 @@ * - `--bm25-only` skips the embedder probe entirely. * * The fake store intercepts the `embeddings` count probe so we can steer - * the hybrid-vs-BM25 branch without staging DuckDB or ONNX weights. + * the hybrid-vs-BM25 branch without staging the store or ONNX weights. 
*/ import assert from "node:assert/strict"; @@ -153,8 +153,8 @@ function makeFakeStore(opts: FakeStoreOptions = {}): FakeStoreHandle { const composed: Store = { graph: graph as unknown as IGraphStore, temporal: temporal as unknown as ITemporalStore, - graphFile: "/tmp/fake.duckdb", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/replay.test.ts b/packages/cli/src/commands/replay.test.ts index 00b7e889..3aa3c2be 100644 --- a/packages/cli/src/commands/replay.test.ts +++ b/packages/cli/src/commands/replay.test.ts @@ -215,7 +215,7 @@ describe("loadPack (real on-disk)", () => { await rm(dir, { recursive: true, force: true }); }); - it("parses manifest (schema 2, no duckdb pin), ast-chunks, and context-bom ranges", async () => { + it("parses manifest (schema 2, no legacy backend pin), ast-chunks, and context-bom ranges", async () => { const loaded = await loadPack(dir); assert.equal(loaded.manifest.packHash, "deadbeef"); assert.equal(loaded.manifest.budgetTokens, 100); diff --git a/packages/cli/src/commands/replay.ts b/packages/cli/src/commands/replay.ts index 634d8a77..afdb0b64 100644 --- a/packages/cli/src/commands/replay.ts +++ b/packages/cli/src/commands/replay.ts @@ -202,8 +202,8 @@ export async function loadPack(dir: string): Promise { /** * Parse the on-disk snake_case manifest into the fields `replay` needs. - * Corrected for schema 2 (ADR 0019): no `duckdb_version` pin, `budget_tokens` - * is read for the decision set. + * Corrected for schema 2 (ADR 0019): no legacy native-backend version pin, + * `budget_tokens` is read for the decision set. 
*/ function parseManifest(json: string): ReplayManifest { const w = JSON.parse(json) as Record; diff --git a/packages/cli/src/commands/route-map.test.ts b/packages/cli/src/commands/route-map.test.ts index 5d14710a..c9791507 100644 --- a/packages/cli/src/commands/route-map.test.ts +++ b/packages/cli/src/commands/route-map.test.ts @@ -61,8 +61,8 @@ function makeFakeStore(routes: readonly RouteNode[], edges: readonly CodeRelatio handle.store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/status.test.ts b/packages/cli/src/commands/status.test.ts index 5c899c71..ba063696 100644 --- a/packages/cli/src/commands/status.test.ts +++ b/packages/cli/src/commands/status.test.ts @@ -157,7 +157,7 @@ test("status degrades to summaries:- / vectors:unknown when the store can't open const repoPath = await seedRepo(home, "degraded"); const cap = captureStdout(); try { - // Default probe: no graph.lbug exists in the seeded repo → undefined. + // Default probe: no store.sqlite exists in the seeded repo → undefined. await runStatus(repoPath, { home, probeRetrieval: async () => undefined }); } finally { cap.restore(); diff --git a/packages/cli/src/commands/verdict.test.ts b/packages/cli/src/commands/verdict.test.ts index 7ec53ba0..87049c97 100644 --- a/packages/cli/src/commands/verdict.test.ts +++ b/packages/cli/src/commands/verdict.test.ts @@ -9,7 +9,7 @@ * 5. `--exit-code` on auto_merge tier → exit 0. * * Each test injects a stub `computeVerdictFn` + a fake store so nothing - * hits DuckDB or git. The CLI's real exit-code ladder (0/1/2/3) is what + * hits the store or git. The CLI's real exit-code ladder (0/1/2/3) is what * the assertions target, so the test pins CLI behavior — not the * analysis module's 0/1/2 mapping.
*/ @@ -88,7 +88,7 @@ function verdictFixture( communitiesTouched: ["c1", "c2", "c3"], changedFileCount: 7, changedFiles: [ - "packages/storage/src/duckdb-adapter.ts", + "packages/storage/src/sqlite-adapter.ts", "packages/cli/src/index.ts", "README.md", ], @@ -451,7 +451,7 @@ test("runVerdict: ownership_required rule passes when approvals are supplied", a ], }; // touchedPaths now comes from the verdict pipeline (verdict.changedFiles). - // The auto_merge fixture touches `packages/storage/src/duckdb-adapter.ts`, + // The auto_merge fixture touches `packages/storage/src/sqlite-adapter.ts`, // which matches the rule glob — so the rule fires, but the supplied // @storage-team approval satisfies require_approval_from → pass. const { exitCode } = await withExitCode(async () => { @@ -598,7 +598,7 @@ test("runVerdict: ownership_required blocks (exit 3) when a changed path lacks a }, ], }; - // The auto_merge fixture touches packages/storage/src/duckdb-adapter.ts, + // The auto_merge fixture touches packages/storage/src/sqlite-adapter.ts, // which matches the rule glob. No approval supplied → block, proving the // rule sees the real changedFiles threaded through touchedPaths. const { exitCode } = await withExitCode(async () => { @@ -618,7 +618,7 @@ test("runVerdict: ownership_required blocks (exit 3) when a changed path lacks a assert.match(output, /Policy: block/); assert.match( output, - /storage-owner: path "packages\/storage\/src\/duckdb-adapter.ts" requires approval from one of: @storage-team/, + /storage-owner: path "packages\/storage\/src\/sqlite-adapter.ts" requires approval from one of: @storage-team/, ); assert.equal(exitCode, 3); }); diff --git a/packages/cli/src/skills-gen.test.ts b/packages/cli/src/skills-gen.test.ts index d9041533..6215a9d8 100644 --- a/packages/cli/src/skills-gen.test.ts +++ b/packages/cli/src/skills-gen.test.ts @@ -6,7 +6,7 @@ * "listNodesByEntryPoint" | "listEdgesByType">`). 
The fake store below * implements those four methods over an in-memory fixture so the tests * exercise the real code path down to the markdown renderer and the - * filesystem writer without standing up DuckDB. + * filesystem writer without standing up the store. */ import { strict as assert } from "node:assert"; diff --git a/packages/cli/tsup.config.ts b/packages/cli/tsup.config.ts index 4331b9e9..81c95ca2 100644 --- a/packages/cli/tsup.config.ts +++ b/packages/cli/tsup.config.ts @@ -50,8 +50,8 @@ const distDir = join(here, "dist"); * bundled source are still followed. `noExternal` takes precedence for the * `@opencodehub/*` scope, so our workspace libs are still inlined. * - * This implicitly covers the native bindings (`@ladybugdb/core`, - * `@duckdb/node-api`, `onnxruntime-node`, `web-tree-sitter`), the worker host + * This implicitly covers the WASM/optional runtimes (`web-tree-sitter`, + * the `onnxruntime-web` embedder), the worker host * (`piscina`), the CJS MCP SDK, and the lazily-imported packages * (`@chonkiejs/core`, `@apidevtools/swagger-parser`, * `@aws-sdk/client-sagemaker-runtime`, `ts-morph`). diff --git a/packages/docs/astro.config.mjs b/packages/docs/astro.config.mjs index 96bb099b..dfd62e69 100644 --- a/packages/docs/astro.config.mjs +++ b/packages/docs/astro.config.mjs @@ -20,7 +20,7 @@ export default defineConfig({ starlight({ title: "OpenCodeHub", description: - "Apache-2.0 code intelligence graph + MCP server for AI coding agents. 30 tools, 15 GA languages, lbug graph + DuckDB temporal, WASM-only parsing, deterministic, offline-capable.", + "Apache-2.0 code intelligence graph + MCP server for AI coding agents. 29 tools, 15 GA languages, single-file SQLite storage, WASM-only parsing, deterministic, offline-capable.", logo: { src: "./src/assets/logo.svg", replacesTitle: false, @@ -48,7 +48,7 @@ export default defineConfig({ description: "Apache-2.0 code intelligence graph + MCP server for AI coding agents. 
Gives agents callers, callees, processes, and blast radius in one MCP tool call — local, offline-capable, deterministic.", details: - "OpenCodeHub indexes a repository into a hybrid structural + semantic knowledge graph and exposes it over the Model Context Protocol (MCP) to AI coding agents. The MCP server registers 30 tools across five families — exploration (list_repos, query, context, impact, detect_changes, rename, sql, signature), group / federation (group_list, group_query, group_status, group_contracts, group_cross_repo_links, group_sync), scan / findings / verdict (scan, list_findings, list_findings_delta, list_dead_code, remove_dead_code, license_audit, verdict, risk_trends), HTTP / routing (route_map, api_impact, shape_check, tool_map), and meta (project_profile, dependencies, owners, pack_codebase). The CLI binary is `codehub`. Runtime: Node 20, 22, or 24, pnpm 11, lbug graph store (graph.lbug) + DuckDB temporal sibling (temporal.duckdb), always both, no backend selector (ADR 0016), web-tree-sitter (WASM) is the only parse runtime with all 15 grammar `.wasm` blobs vendored at packages/ingestion/vendor/wasms/, 15 GA languages, SCIP indexers for TypeScript / TSX / JavaScript / Python / Go / Rust / Java / C# / C / C++ / Kotlin / Ruby. 19-scanner inventory. Apache-2.0 end to end. Repos are first-class graph nodes (`repo_uri`); the cross-repo `group_*` family fans out over named groups; AMBIGUOUS_REPO error envelope returns `choices[]` so a caller can retry deterministically.", + "OpenCodeHub indexes a repository into a hybrid structural + semantic knowledge graph and exposes it over the Model Context Protocol (MCP) to AI coding agents. 
The MCP server registers 29 tools across five families — exploration (list_repos, query, context, impact, detect_changes, sql, signature), group / federation (group_list, group_query, group_status, group_contracts, group_cross_repo_links, group_sync), scan / findings / verdict (scan, list_findings, list_findings_delta, list_dead_code, license_audit, verdict, change_pack, risk_trends), HTTP / routing (route_map, api_impact, shape_check, tool_map), and meta (project_profile, dependencies, owners, pack_codebase). The CLI binary is `codehub`. Runtime: Node 24.15+, pnpm 11, a single-file SQLite store (`/.codehub/store.sqlite`, via Node's built-in node:sqlite) that backs graph nodes, edges, embeddings, and the temporal tables — zero native storage bindings (ADR 0019), web-tree-sitter (WASM) is the only parse runtime with all 15 grammar `.wasm` blobs vendored at packages/ingestion/vendor/wasms/, 15 GA languages, SCIP indexers for TypeScript / TSX / JavaScript / Python / Go / Rust / Java / C# / C / C++ / Kotlin / Ruby. 19-scanner inventory. Apache-2.0 end to end. 
Repos are first-class graph nodes (`repo_uri`); the cross-repo `group_*` family fans out over named groups; AMBIGUOUS_REPO error envelope returns `choices[]` so a caller can retry deterministically.", promote: [ "start-here/**", "agents/**", @@ -80,7 +80,7 @@ export default defineConfig({ label: "agents", paths: ["agents/**", "mcp/**"], description: - "Agent-side reference: per-editor MCP setup, the 30-tool catalog, tool decision matrix, idiomatic prompts.", + "Agent-side reference: per-editor MCP setup, the 29-tool catalog, tool decision matrix, idiomatic prompts.", }, { label: "mcp", diff --git a/packages/docs/public/tool-catalog.json b/packages/docs/public/tool-catalog.json index 33d7b1b6..eac10b6a 100644 --- a/packages/docs/public/tool-catalog.json +++ b/packages/docs/public/tool-catalog.json @@ -1,14 +1,14 @@ { "$schema": "https://opencodehub.dev/schemas/tool-catalog-v1.json", "version": "1.0.0", - "description": "Machine-readable catalog of the 28 MCP tools the OpenCodeHub server registers. Every tool is read-only with respect to user source — no tool edits the working tree. Generated to be fetched by an AI coding agent that wants the catalog without scraping the docs.", + "description": "Machine-readable catalog of the 29 MCP tools the OpenCodeHub server registers. Every tool is read-only with respect to user source — no tool edits the working tree. Generated to be fetched by an AI coding agent that wants the catalog without scraping the docs.", "server": { "name": "opencodehub", "transport": "stdio", "launch_command": "codehub mcp", "capabilities": ["tools", "resources"] }, - "tool_count": 28, + "tool_count": 29, "families": { "exploration": "High-frequency code-graph tools.", "group": "Cross-repo federation tools (require a named group).", @@ -65,7 +65,7 @@ { "name": "sql", "family": "exploration", - "description": "Read-only SQL against the DuckDB temporal store (cochanges + symbol_summaries). 5-second timeout. 
The node/edge graph is queried via the typed tools or Cypher.", + "description": "Read-only SQL against the temporal store (cochanges + symbol_summaries). 5-second timeout. The node/edge graph is queried via the typed tools or Cypher.", "when_to_use": "Custom view of the temporal store (cochanges + symbol_summaries) that no other tool exposes.", "when_not_to_use": "A typed tool (context, impact, query) already covers the question, or you need the node/edge graph (reach it via the typed tools or Cypher).", "signature_sketch": "sql({query, repo?, repo_uri?}) -> {rows, row_count, next_steps}", @@ -188,6 +188,15 @@ "signature_sketch": "verdict({repo?, repo_uri?, base?, head?}) -> {tier, exit_code, reasons, signals}", "example": "verdict({base: 'main', head: 'HEAD'})" }, + { + "name": "change_pack", + "family": "scan", + "description": "Deterministic, diff-scoped context pack: changed symbols plus their upstream impacted subgraph, the 5-tier verdict, affected tests, and a token-cost estimate.", + "when_to_use": "Hand a CI agent everything a diff touches in one read-only payload.", + "when_not_to_use": "Whole-repo snapshot — call pack_codebase; plain merge gate — call verdict.", + "signature_sketch": "change_pack({repo?, repo_uri?, base?, head?, depth?, budget?}) -> {changed, impacted_subgraph, verdict, affected_tests, cost_estimate}", + "example": "change_pack({base: 'main', head: 'HEAD'})" + }, { "name": "risk_trends", "family": "scan", diff --git a/packages/docs/src/content/docs/agents/discovery-and-resources.mdx b/packages/docs/src/content/docs/agents/discovery-and-resources.mdx index 1d14c4e5..a0af99e1 100644 --- a/packages/docs/src/content/docs/agents/discovery-and-resources.mdx +++ b/packages/docs/src/content/docs/agents/discovery-and-resources.mdx @@ -57,7 +57,7 @@ submission status on the [MCP registries page](/opencodehub/agents/registries/). 
## Source of truth for tool inventory -The MCP server registers 28 tools at +The MCP server registers 29 tools at [`packages/mcp/src/server.ts`](https://github.com/theagenticguy/opencodehub/blob/main/packages/mcp/src/server.ts). Grep for `register[A-Z][a-zA-Z]+Tool\(server` to see the live list. If this site or any registry disagrees with the file, the file wins. diff --git a/packages/docs/src/content/docs/agents/editors/claude-code.mdx b/packages/docs/src/content/docs/agents/editors/claude-code.mdx index 907d7c6e..0353d36d 100644 --- a/packages/docs/src/content/docs/agents/editors/claude-code.mdx +++ b/packages/docs/src/content/docs/agents/editors/claude-code.mdx @@ -36,7 +36,7 @@ The `.mcp.json` shape: } ``` -`codehub mcp` runs the stdio MCP server. The 28 tools register under +`codehub mcp` runs the stdio MCP server. The 29 tools register under the `mcp__opencodehub__*` namespace. Every one is read-only with respect to your source — no tool edits the working tree. @@ -122,7 +122,7 @@ In a Claude Code session, ask: which OpenCodeHub tools do you see? ``` -The agent should list 28 tools, all under `mcp__opencodehub__*`. If it +The agent should list 29 tools, all under `mcp__opencodehub__*`. If it sees zero, the most common causes are: Claude Code wasn't restarted after `codehub init`, or `codehub` is not on PATH for the editor's process (try launching the editor from a shell that has `codehub` diff --git a/packages/docs/src/content/docs/agents/editors/cursor.mdx b/packages/docs/src/content/docs/agents/editors/cursor.mdx index 52709a08..010ba014 100644 --- a/packages/docs/src/content/docs/agents/editors/cursor.mdx +++ b/packages/docs/src/content/docs/agents/editors/cursor.mdx @@ -32,7 +32,7 @@ define the same server name. ``` That is the entire config. `codehub mcp` runs the stdio MCP server -and registers all 28 tools under `mcp__opencodehub__*`. +and registers all 29 tools under `mcp__opencodehub__*`. 
If `codehub` is not on your shell PATH (Cursor inherits the GUI app's environment, not your shell's), substitute the absolute path: @@ -55,7 +55,7 @@ Find the path with `which codehub` in your terminal. 1. Restart Cursor (the agent only loads MCP servers at startup). 2. Open the chat panel. 3. Ask: `which OpenCodeHub tools do you see?` -4. Expect 28 tools listed under `mcp__opencodehub__*`. +4. Expect 29 tools listed under `mcp__opencodehub__*`. If you see zero tools, check Cursor's MCP debug pane (Settings → MCP) for the server's stderr. The most common cause is `codehub` not being diff --git a/packages/docs/src/content/docs/agents/editors/opencode.mdx b/packages/docs/src/content/docs/agents/editors/opencode.mdx index 0241ff84..7fe38256 100644 --- a/packages/docs/src/content/docs/agents/editors/opencode.mdx +++ b/packages/docs/src/content/docs/agents/editors/opencode.mdx @@ -62,7 +62,7 @@ If you need env vars: 1. Restart OpenCode (or reload the workspace). 2. Open a chat session. 3. Ask: `which OpenCodeHub tools do you see?` -4. Expect 28 tools under `mcp__opencodehub__*`. +4. Expect 29 tools under `mcp__opencodehub__*`. OpenCode logs MCP server stderr to its dev console — open it if the server fails to register. diff --git a/packages/docs/src/content/docs/agents/editors/windsurf.mdx b/packages/docs/src/content/docs/agents/editors/windsurf.mdx index 0b0ffcda..07cdaec7 100644 --- a/packages/docs/src/content/docs/agents/editors/windsurf.mdx +++ b/packages/docs/src/content/docs/agents/editors/windsurf.mdx @@ -38,7 +38,7 @@ servers, add `codehub` as a sibling key under `mcpServers`. 1. Fully restart Windsurf — Cascade only loads MCP servers at boot. 2. Open Cascade in any project. 3. Ask: `which OpenCodeHub tools do you see?` -4. Expect 28 tools under `mcp__opencodehub__*`. +4. Expect 29 tools under `mcp__opencodehub__*`. If Cascade reports zero tools, check the MCP server status pane in Cascade's settings — failed servers list their stderr there. 
The diff --git a/packages/docs/src/content/docs/agents/index.mdx b/packages/docs/src/content/docs/agents/index.mdx index 323caa36..b4b326ae 100644 --- a/packages/docs/src/content/docs/agents/index.mdx +++ b/packages/docs/src/content/docs/agents/index.mdx @@ -9,7 +9,7 @@ import { Card, CardGrid, LinkCard } from "@astrojs/starlight/components"; OpenCodeHub gives an AI coding agent a code graph it can query: callers, callees, processes, blast radius, owners, scanner findings, and a 5-tier -PR verdict — all behind 28 MCP tools served by one local binary. The +PR verdict — all behind 29 MCP tools served by one local binary. The graph is built deterministically from your repo and stored next to it. Other docs sections answer "what is OCH" and "how is it built." This @@ -40,7 +40,7 @@ codehub init # writes .mcp.json + links the Claude Code plugin codehub analyze # first index — 30s to a few minutes ``` -Restart your editor. Your agent now has 28 MCP tools, all prefixed +Restart your editor. Your agent now has 29 MCP tools, all prefixed `mcp__opencodehub__*`. See [Install](/opencodehub/agents/install/) for the full path or jump to the per-editor card below. @@ -65,7 +65,7 @@ the full path or jump to the per-editor card below. =20.0.0`. `npm install -g @opencodehub/cli@latest` does zero native -builds and zero GitHub fetches. Supersedes ADR 0013 (parse runtime). +(`web-tree-sitter`) is now the only parse runtime on Node ≥24.15. All +15 grammar `.wasm` blobs are vendored at +`packages/ingestion/vendor/wasms/`. `npm install -g @opencodehub/cli@latest` +does zero native builds and zero GitHub fetches. Supersedes ADR 0013 +(parse runtime). [Read ADR 0015](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0015-wasm-only-parser-at-the-npm-distributed-boundary.md) -### ADR 0016 — DuckDB graph rip-out +### ADR 0016 — Graph-backend rip-out -Remove the DuckDB graph backend, the `CODEHUB_STORE` env var, the -backend probe, and the single-file `graph.duckdb` layout. 
The graph -tier is always `@ladybugdb/core` (`graph.lbug`); the temporal tier is -always DuckDB (`temporal.duckdb`); both files are written on every -`analyze`, with no selector. A missing graph binding hard-fails with -`GraphDbBindingError`. The segregated `IGraphStore` / `ITemporalStore` -interfaces stay as the community-fork adapter contract. +Removes the `CODEHUB_STORE` env var, the backend probe, and the +selector, settling storage on a two-file native pair with the segregated +`IGraphStore` / `ITemporalStore` interfaces preserved for community +forks. **Superseded by ADR 0019**, which collapses that pair into one +`store.sqlite` file and removes both native storage bindings. The +segregated interfaces it kept survive unchanged. [Read ADR 0016](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0016-duckdb-graph-rip.md) +### ADR 0018 — Cleanroom tool-name provenance + +Records the cleanroom provenance of the route / tool / contract tool +names, documenting the independent-derivation trail for each name. + +[Read ADR 0018](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0018-cleanroom-tool-name-provenance.md) + +### ADR 0019 — Single-file SQLite storage + +Collapses the entire index into one `/.codehub/store.sqlite` file +(WAL mode) via Node's built-in `node:sqlite` (`DatabaseSync`, enabled by +default on Node ≥24.15). One `SqliteStore` implements both `IGraphStore` +and `ITemporalStore`; `openStore()` returns that single instance as both +the `graph` and `temporal` views, so call sites use `store.graph.X()` / +`store.temporal.Y()` unchanged. Both native storage bindings are removed +and the write-only Parquet embeddings sidecar is dropped, so the +code-pack becomes an 8-item BOM and the install carries zero native +storage dependencies. Every platform is supported, including Windows +arm64 and Linux musl (Alpine). Supersedes ADR 0016 in its entirety; the +segregated interfaces stay as the community-fork escape hatch. 
+ +[Read ADR 0019](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) + +### ADR 0020 — Decision-equivalence supersedes byte-identity + +Makes decision-equivalence the pack contract and treats byte-identity as +a witness rather than the contract itself. Pairs with the pack +determinism spec. + +[Read ADR 0020](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0020-decision-equivalence-supersedes-byte-identity.md) + ### ADR 0017 — Drop detect-secrets, tune betterleaks Remove `detect-secrets` from the scanner fleet in favour of diff --git a/packages/docs/src/content/docs/architecture/determinism.md b/packages/docs/src/content/docs/architecture/determinism.md index e11d44f7..aebc64d4 100644 --- a/packages/docs/src/content/docs/architecture/determinism.md +++ b/packages/docs/src/content/docs/architecture/determinism.md @@ -33,7 +33,7 @@ Three concrete reasons: An input is: - Source tree contents at the current commit. -- Toolchain versions (Node 22 or 24, pnpm 11.x, tree-sitter grammars +- Toolchain versions (Node ≥24.15, pnpm 11.x, tree-sitter grammars pinned in `packages/ingestion/package.json`, SCIP indexer versions pinned in `.github/workflows/gym.yml` per ADR 0006). - OpenCodeHub version (the monorepo version pinned in @@ -45,9 +45,9 @@ Anything outside that list — wall-clock time, process ID, file-system inode ordering — must not influence the hash. The ingestion phases are pure: inputs in, relations out, no ambient state. -The `graphHash` invariant covers everything the graph store -(`graph.lbug`) owns; the temporal signals in the DuckDB sibling -(`temporal.duckdb`) are statistical and never enter the hash. A parity +The `graphHash` invariant covers the graph nodes and edges in +`store.sqlite`; the temporal signals in the same file (cochanges, +symbol summaries) are statistical and never enter the hash. A parity gate in CI asserts the invariant on every PR that touches the storage layer. 
@@ -120,9 +120,9 @@ bytes?" If the answer is not obviously yes, the phase is wrong. ## Related -- [ADR 0001 — Storage backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0001-storage-backend.md) — - "Deterministic writes given identical INSERT order" is a listed - positive of DuckDB vs. engines with random header UUIDs. +- [ADR 0019 — Single-file SQLite storage](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) — + the graphHash byte-identity gate (`sqlite-parity.test.ts`) that a + rebuilt `KnowledgeGraph` must hash identically to the original. - [ADR 0002 — Rust core deferred](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0002-rust-core-deferred.md) — calls out the "full vs incremental `graphHash` byte-identical" determinism CI gate explicitly. diff --git a/packages/docs/src/content/docs/architecture/embeddings.md b/packages/docs/src/content/docs/architecture/embeddings.md index 5ec0841e..10735b7e 100644 --- a/packages/docs/src/content/docs/architecture/embeddings.md +++ b/packages/docs/src/content/docs/architecture/embeddings.md @@ -1,6 +1,6 @@ --- title: Embeddings -description: Three backends in a priority cascade, three tiers keyed by a granularity discriminator, one HNSW index with filter-aware traversal. +description: Three backends in a priority cascade, three tiers keyed by a granularity discriminator, one embeddings table with exact brute-force cosine KNN. sidebar: order: 50 --- @@ -8,8 +8,8 @@ sidebar: Embeddings are optional. When enabled, the pipeline produces vectors at three granularities (symbol, file, community) from one of three backends (ONNX local, HTTP/OpenAI-compat, SageMaker) and persists -them in the graph backend's embeddings table served by one HNSW -index. This page covers the backend cascade, the tier model, the +them in the `embeddings` table in `store.sqlite`, searched by exact +brute-force cosine KNN. 
This page covers the backend cascade, the tier model, the storage shape, and why `WHERE granularity='symbol'` does not collapse recall. @@ -123,39 +123,30 @@ embedded text when an LLM summary exists for the node. See [Summarization and fusion](/opencodehub/architecture/summarization-and-fusion/) for the formula. -## Single HNSW index +## Single embeddings table -The storage shape is deliberately simple: one embeddings table, one -HNSW index over the `vector` column, one `granularity` column as a -discriminator. All three tiers share this index. Granularity filtering -is pushed as `WHERE e.granularity IN (…)` into the index predicate, so -selective filters narrow the candidate set during traversal rather -than being applied after the fact. +The storage shape is deliberately simple: one `embeddings` table inside +`store.sqlite`, with the `vector` stored as a BLOB (BLOB-exact +`Float32Array`) and one `granularity` column as a discriminator. All +three tiers share this table. Granularity filtering is pushed as +`WHERE e.granularity IN (…)` into the query predicate, so selective +filters narrow the candidate set rather than being applied after the +fact. -## Filter-aware HNSW +## Filter-aware vector search -The graph backend's HNSW index supports filter-aware traversal — the -predicate is pushed into the graph walk so filters like +Vector search runs directly over the `embeddings` table with the +predicate applied in the SQL query, so filters like `WHERE language='python'` or `WHERE granularity='community'` actually return results. A naive post-filter walks the top-k by cosine distance and drops rows that fail the predicate, which collapses to -zero recall under selective filters; the OCH index avoids that by -construction. +zero recall under selective filters; querying with the predicate +inline avoids that by construction. -On the legacy DuckDB layout, the same property holds via the -`hnsw_acorn` community extension's ACORN-1 algorithm. 
If -`hnsw_acorn` fails to install or load (first-run requires network to -pull from the DuckDB community extension repo), the adapter falls -back to `vss` with a post-filter warning. If both fail, -`vectorExtension='none'` disables vector search entirely — queries -return zero rows plus a surfaced warning rather than crashing. - -## RaBitQ quantization - -`hnsw_acorn` supports RaBitQ quantization, documented at 21-30× -memory reduction versus fp32 vectors. It is a capability of the -extension rather than a separately-configured knob in OpenCodeHub — -enabling `hnsw_acorn` enables it. +The default is a brute-force KNN over the stored BLOB vectors, which is +exact and adds zero native dependencies. `node:sqlite` exposes a +`loadExtension` seam for `sqlite-vec` if brute-force is ever outgrown; +that is a deferred fast-follow, not a shipping requirement. ## Configuration knobs @@ -177,9 +168,10 @@ enabling `hnsw_acorn` enables it. remote-env-var-set + offline=true combination throws. A missing SageMaker endpoint with no env vars just picks ONNX — that is the intended cascade, not a failure. -- **`vectorExtension='none'` is a real state.** Queries return no - rows and surface an extension warning. This is the air-gapped / - offline / extension-broken state; it is not an exception. +- **No-embeddings is a real state.** When embeddings were never + computed (the default, or an air-gapped / offline run), vector search + returns no rows and surfaces a warning. This is expected, not an + exception; lexical BM25 search still works. - **Graph-hash independence.** The embeddings phase does not contribute to `graphHash` — embeddings are optional and probabilistic across backends. Gate 10 (the embeddings determinism @@ -191,10 +183,10 @@ enabling `hnsw_acorn` enables it. ## Further reading -- [ADR 0001 — Storage backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0001-storage-backend.md) - — why DuckDB + `hnsw_acorn`. 
+- [ADR 0019 — Single-file SQLite storage](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) + — where the `embeddings` table lives and why there is no native binding. - [ADR 0004 — Hierarchical embeddings](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0004-hierarchical-embeddings.md) - — one table, three granularities, one HNSW index. + — one table, three granularities, one discriminator column. - [Summarization and fusion](/opencodehub/architecture/summarization-and-fusion/) — where the symbol-tier text comes from. - Durable lesson: `api-patterns/sagemaker-embedder-backend.md` — diff --git a/packages/docs/src/content/docs/architecture/monorepo-map.md b/packages/docs/src/content/docs/architecture/monorepo-map.md index 1bce8010..cd85193d 100644 --- a/packages/docs/src/content/docs/architecture/monorepo-map.md +++ b/packages/docs/src/content/docs/architecture/monorepo-map.md @@ -22,14 +22,14 @@ package is a library imported by `cli`, `mcp`, `ingestion`, or | `@opencodehub/embedder` | `packages/embedder` | Deterministic ONNX embedder (`F2LLM-v2-80M`, 320-dim), modelId fingerprint, three-backend cascade. | | `@opencodehub/frameworks` | `packages/frameworks` | Five-stage framework detector (manifest → lockfile → config-AST → folder → import/SCIP) over a curated registry. | | `@opencodehub/ingestion` | `packages/ingestion` | The indexing pipeline (parse, resolve, scip-index, embeddings, communities, processes, summaries, ...). | -| `@opencodehub/mcp` | `packages/mcp` | The stdio MCP server, 28 tool registrations (all read-only with respect to user source), 7 resources, the error envelope, the staleness `_meta` block. | -| `@opencodehub/pack` | `packages/pack` | Deterministic 9-item code-pack BOM (the artifact attached to every release). 
| +| `@opencodehub/mcp` | `packages/mcp` | The stdio MCP server, 29 tool registrations (all read-only with respect to user source), 7 resources, the error envelope, the staleness `_meta` block. | +| `@opencodehub/pack` | `packages/pack` | Deterministic 8-item code-pack BOM (the artifact attached to every release). | | `@opencodehub/policy` | `packages/policy` | `opencodehub.policy.yaml` loader, validator, evaluator. | | `@opencodehub/sarif` | `packages/sarif` | SARIF 2.1.0 Zod schemas, merge + enrich, suppressions, baseline diffing. | | `@opencodehub/scanners` | `packages/scanners` | Nineteen scanner wrappers (semgrep, betterleaks, osv-scanner, bandit, biome, pip-audit, npm-audit, trivy, checkov, checkov-docker-compose, hadolint, tflint, spectral, ruff, grype, vulture, radon, ty, clamav). | | `@opencodehub/scip-ingest` | `packages/scip-ingest` | `.scip` protobuf reader + per-language indexer runners (TypeScript, Python, Go, Rust, Java, .NET, clang, Kotlin, Ruby). | | `@opencodehub/search` | `packages/search` | Hybrid BM25 + RRF search. | -| `@opencodehub/storage` | `packages/storage` | The `IGraphStore` / `ITemporalStore` interface segregation, the LadybugDB graph adapter and DuckDB temporal adapter, and `openStore()` that composes them. | +| `@opencodehub/storage` | `packages/storage` | The `IGraphStore` / `ITemporalStore` interface segregation, the `SqliteStore` class that implements both over one `store.sqlite` via `node:sqlite`, and `openStore()` that returns it as both views. | | `@opencodehub/summarizer` | `packages/summarizer` | Structured per-symbol summarizer (Haiku 4.5 via Bedrock Converse + Zod 4). | | `@opencodehub/wiki` | `packages/wiki` | Markdown wiki renderer (architecture, api-surface, dependency-map, ownership-map, risk-atlas) over the graph. | | `@opencodehub/docs` | `packages/docs` | This Starlight documentation site. | @@ -55,23 +55,24 @@ TypeScript project-references graph enforces this via `tsc --noEmit`. 
## Storage — interface segregation -`@opencodehub/storage` exposes two narrow interfaces — `IGraphStore` +`@opencodehub/storage` exposes two narrow interfaces: `IGraphStore` (graph workload: nodes, edges, embeddings, multi-hop traversal) and `ITemporalStore` (temporal workload: cochanges, summary cache). The -single shipping pair implements them: +single shipping class implements both: -- **LadybugDB graph store + DuckDB temporal store** — always. Two - artifacts on disk (`graph.lbug` + `temporal.duckdb`), backed by a - Cypher-emitting dialect for the graph half and DuckDB SQL for the - temporal half. `IGraphStore` lives only on `GraphDbStore`; - `DuckDbStore` implements `ITemporalStore` only; `openStore()` - composes them. There is no backend selector and no fallback (ADR - 0016) — a missing LadybugDB binding throws `GraphDbBindingError`. +- **`SqliteStore` over one `store.sqlite`** — always. One artifact on + disk (`.codehub/store.sqlite`, WAL mode) backed by Node's built-in + `node:sqlite`, holding nodes, edges, embeddings, the FTS5 index, and + the temporal tables. One `SqliteStore` implements both `IGraphStore` + and `ITemporalStore`; `openStore()` returns that one instance as both + the `graph` and `temporal` views. There is no backend selector, no + native binding, and no fallback (ADR 0019 removed both + `@ladybugdb/core` and `@duckdb/node-api`). See [Storage backend](/opencodehub/architecture/storage-backend/) for -how `openStore()` composes the pair and the community-adapter escape -hatch (AGE / Memgraph / Neo4j / Neptune via the segregated -interfaces). +how `openStore()` returns the single store as both views and the +community-adapter escape hatch (AGE / Memgraph / Neo4j / Neptune via +the segregated interfaces). 
## Related files diff --git a/packages/docs/src/content/docs/architecture/overview.md b/packages/docs/src/content/docs/architecture/overview.md index 43deb688..3ede11e5 100644 --- a/packages/docs/src/content/docs/architecture/overview.md +++ b/packages/docs/src/content/docs/architecture/overview.md @@ -1,6 +1,6 @@ --- title: Architecture overview -description: Six-phase pipeline from source tree to MCP — parse, resolve, augment, index, cluster, serve — backed by a graph-native store with deterministic outputs. +description: Six-phase pipeline from source tree to MCP — parse, resolve, augment, index, cluster, serve — backed by a single-file SQLite store with deterministic outputs. sidebar: order: 10 --- @@ -17,7 +17,7 @@ flowchart LR tree[Source tree] --> parse[Parse] parse --> resolve[Resolve] resolve --> augment[Augment
SCIP] - augment --> index[Index
BM25 + HNSW] + augment --> index[Index
BM25 + vector KNN] index --> cluster[Cluster
communities + processes] cluster --> serve[Serve
MCP] ``` @@ -26,33 +26,32 @@ Fifteen tree-sitter grammars produce a unified `ParseCapture` stream. Per-language resolvers turn captures into typed relations. SCIP indexers (TypeScript, Python, Go, Rust, Java, C#, C/C++, Kotlin, Ruby) upgrade heuristic edges to compiler-grade references where -available. The graph persists into LadybugDB, with DuckDB -carrying the temporal sibling. Communities and -processes are precomputed. An stdio MCP server with 28 tools answers +available. The whole index persists into one `store.sqlite` file via +Node's built-in `node:sqlite`. Communities and +processes are precomputed. An stdio MCP server with 29 tools answers agent queries. ## Where the data lives -The graph tier is always **LadybugDB** (`graph.lbug`); the temporal tier -is always **DuckDB** (`temporal.duckdb`). Both files live under -`.codehub/`. There is no selection knob, no probe, and no fallback — if -the `@ladybugdb/core` binding cannot load, `open()` throws -`GraphDbBindingError` and the operation aborts. See [Storage backend](/opencodehub/architecture/storage-backend/). +The entire index lives in one **`store.sqlite`** file (WAL mode) under +`.codehub/`, via Node's built-in `node:sqlite`. It holds graph nodes, +edges, embeddings, the FTS5 search index, and the temporal tables +(cochanges, summary cache). There is no selection knob, no native +binding, and no fallback: ADR 0019 removed both `@ladybugdb/core` and +`@duckdb/node-api`, leaving zero native storage bindings. See +[Storage backend](/opencodehub/architecture/storage-backend/). ```mermaid flowchart LR - subgraph lbug[".codehub/ (default)"] - nodes[(graph.lbug
nodes + edges)] - embed[(embeddings)] - temporal[(temporal.duckdb
cochanges, summary cache)] + subgraph store[".codehub/"] + db[(store.sqlite
nodes + edges + embeddings
+ cochanges, summary cache)] end - fts["BM25 over names + summaries"] --- nodes - hnsw["filter-aware HNSW"] --- embed - nodes -. round-trip parity .- temporal + fts["BM25 (FTS5) over names + summaries"] --- db + vec["vector search over embeddings"] --- db ``` -Embeddings live in the same physical store as the graph (one -`embeddings` table, one HNSW index, three granularities keyed by a +Embeddings live in the same `store.sqlite` file as the graph (one +`embeddings` table, three granularities keyed by a `granularity` discriminator). Findings reuse the `nodes` table with `kind='Finding'`. @@ -67,8 +66,8 @@ line+col, nodeType). Lines are 1-indexed, columns 0-indexed. Fifteen languages are registered via a compile-time exhaustive `satisfies Record` table: TypeScript, TSX, JavaScript, Python, Go, Rust, Java, C#, C, C++, Ruby, Kotlin, -Swift, PHP, Dart. The runtime is `web-tree-sitter` (WASM) — the only -parse runtime on Node 20, 22, and 24. There is no native parser and no +Swift, PHP, Dart. The runtime is `web-tree-sitter` (WASM), the only +parse runtime on Node ≥24.15. There is no native parser and no opt-in (ADR 0015). See [Parsing and resolution](/opencodehub/architecture/parsing-and-resolution/). @@ -103,16 +102,17 @@ and Ruby (scip-ruby). Pins live in `.github/workflows/gym.yml`. See [SCIP reconciliation](/opencodehub/architecture/scip-reconciliation/). -### 4. Index — BM25, HNSW, and scanners +### 4. Index — BM25, vector KNN, and scanners -One job: persist the graph into LadybugDB with search indexes wired up. +One job: persist the graph into `store.sqlite` with search indexes wired up. -- **BM25** — over symbol names, signatures, and summaries. -- **HNSW** — filter-aware, with the granularity discriminator pushed - into the predicate so all three tiers (symbol / file / community) - share one index without recall collapse. -- **Multi-hop traversal** — Cypher-emitting dialect on the LadybugDB - graph store. 
+- **BM25** — over symbol names, signatures, and summaries via an FTS5 + virtual table. +- **Vector search** — filter-aware, with the granularity discriminator + pushed into the predicate so all three tiers (symbol / file / + community) share one `embeddings` table without recall collapse. +- **Multi-hop traversal** — recursive CTEs over the `edges` table for + impact and blast-radius. Embeddings are optional, gated on `PipelineOptions.embeddings`. The backend cascade is SageMaker → HTTP / OpenAI-compatible → local ONNX. @@ -157,7 +157,7 @@ cheapest configuration that hits all three: `codehub analyze --offline` opens zero sockets. - **Deterministic.** Phases are pure: same inputs → same outputs, byte-identical `graphHash`. The `graphHash` invariant holds over the - LadybugDB graph tier. See + graph nodes and edges in `store.sqlite`. See [Determinism](/opencodehub/architecture/determinism/). - **Apache-2.0, every transitive dep on the permissive allowlist.** No BSL, no AGPL, no source-available engines in the core. See @@ -167,19 +167,23 @@ cheapest configuration that hits all three: | ADR | Topic | |---|---| -| 0001 | Storage backend selection — DuckDB + `hnsw_acorn` + `fts` (the v1.0 baseline). | +| 0001 | Storage backend selection — the v1.0 embedded baseline. **Superseded by later storage ADRs.** | | 0002 | Rust core deferred — v2.0 stays pure TypeScript. | -| 0004 | Hierarchical embeddings — one table, three granularities, filter-aware HNSW. | +| 0004 | Hierarchical embeddings — one table, three granularities, filter-aware vector search. | | 0005 | SCIP replaces LSP — compiler-grade edges without long-running language servers. | | 0006 | SCIP indexer CI pins — current version table per language. | | 0007–0010 | Artifact factory, document pattern, output conventions, dogfood findings. | -| 0011 | LadybugDB (phase-1) — graph-native backend behind the `IGraphStore` seam. | +| 0011 | Graph-native backend (phase-1) behind the `IGraphStore` seam. 
| | 0012 | Repo as a first-class graph node — `repo_uri`, group registry, `AMBIGUOUS_REPO` envelope. | -| 0013 (storage) | M7 default-flip + interface segregation. **Superseded by 0016.** | +| 0013 (storage) | M7 default-flip + interface segregation. **Superseded by 0019.** | | 0013 (parse) | WASM-default parse runtime, native opt-in. **Superseded by 0015.** | | 0014 | SCIP REFERENCES + TYPE_OF emission, embedder modelId stamping. | -| 0015 | WASM-only parser — `web-tree-sitter` is the only runtime on Node 20/22/24; native opt-in removed. | -| 0016 | DuckDB graph backend ripped out — LadybugDB graph + DuckDB temporal, both always present, no selection knob. | +| 0015 | WASM-only parser — `web-tree-sitter` is the only runtime on Node ≥24.15; native opt-in removed. | +| 0016 | Graph-backend rip-out, segregated interfaces preserved. **Superseded by 0019.** | +| 0017 | Drop detect-secrets — ship a tuned betterleaks default config. | +| 0018 | Cleanroom provenance of the route / tool / contract tool names. | +| 0019 | Single-file SQLite storage — one `store.sqlite` via `node:sqlite`; both native storage bindings removed. Supersedes 0016. | +| 0020 | Decision-equivalence is the pack contract; byte-identity is a witness, not the contract. | See [ADRs](/opencodehub/architecture/adrs/) for the full list. @@ -188,7 +192,8 @@ See [ADRs](/opencodehub/architecture/adrs/) for the full list. - [Monorepo map](/opencodehub/architecture/monorepo-map/) — every workspace package and what it owns. - [Storage backend](/opencodehub/architecture/storage-backend/) — the - graph + temporal interface segregation and the resolver. + single `store.sqlite` file and the `IGraphStore` / `ITemporalStore` + interface segregation. - [Cross-repo federation](/opencodehub/architecture/cross-repo-federation/) — `repo_uri`, the group registry, and the `AMBIGUOUS_REPO` envelope. 
- [Determinism](/opencodehub/architecture/determinism/) — the diff --git a/packages/docs/src/content/docs/architecture/parsing-and-resolution.md b/packages/docs/src/content/docs/architecture/parsing-and-resolution.md index 34b3cf1c..1c903eec 100644 --- a/packages/docs/src/content/docs/architecture/parsing-and-resolution.md +++ b/packages/docs/src/content/docs/architecture/parsing-and-resolution.md @@ -20,7 +20,7 @@ threads. Each file is hashed and the resulting `ParseCapture[]` is cached keyed on `(sha256, grammarSha, SCHEMA_VERSION)`, so a subsequent analyze with the same content skips tree-sitter entirely. -The runtime is `web-tree-sitter` (WASM) on Node 20, 22, and 24 — the +The runtime is `web-tree-sitter` (WASM) on Node ≥24.15, the only supported parse runtime. All 15 grammar `.wasm` blobs are vendored at `packages/ingestion/vendor/wasms/`, built from the grammar sources pinned in `package.json`; rebuild via `bash scripts/build-vendor-wasms.sh` diff --git a/packages/docs/src/content/docs/architecture/storage-backend.md b/packages/docs/src/content/docs/architecture/storage-backend.md index a7736834..ab901455 100644 --- a/packages/docs/src/content/docs/architecture/storage-backend.md +++ b/packages/docs/src/content/docs/architecture/storage-backend.md @@ -1,85 +1,108 @@ --- title: Storage backend -description: LadybugDB graph store + DuckDB temporal sibling, the IGraphStore / ITemporalStore segregation, how openStore composes them, and the community-adapter escape hatch. +description: One store.sqlite file backs the whole index via node:sqlite, the SqliteStore class that implements both IGraphStore and ITemporalStore, how openStore composes them, and the community-adapter escape hatch. sidebar: order: 25 --- -OpenCodeHub's storage layer is two narrow interfaces composed into one -store. The graph half is always LadybugDB; the temporal half is always -DuckDB. 
There is no backend selector, no probe, and no fallback layout -— `openStore()` composes a `GraphDbStore` (graph) with a `DuckDbStore` -(temporal) and returns both. If the LadybugDB binding fails to load, -`open()` throws `GraphDbBindingError` and the operation aborts. +OpenCodeHub's storage layer is two narrow interfaces implemented by one +class over one file. The entire index lives in a single +`/.codehub/store.sqlite` (WAL mode) via Node's built-in +`node:sqlite`. A single `SqliteStore` implements both `IGraphStore` and +`ITemporalStore`, and `openStore()` returns that one instance as both +the `graph` and `temporal` views. There is no backend selector, no +native binding to probe, and no fallback layout. ADR 0019 removed both +`@ladybugdb/core` and `@duckdb/node-api`, so there are zero native +storage bindings. ## The interfaces `@opencodehub/storage` exports two interfaces: - **`IGraphStore`** — graph workload. Nodes, edges, embeddings, - multi-hop traversal. Shape: properties + Cypher / Cypher-equivalent - query surface. + multi-hop traversal. - **`ITemporalStore`** — temporal workload. Cochanges, the symbol-summary cache. Statistical signals over git history that never enter `graphHash`. -Splitting the interfaces lets community adapters implement only the -half they have an engine for. A graph-only Neo4j adapter does not have -to handle cochange queries; the in-tree DuckDB temporal store does not -have to implement Cypher. `IGraphStore` lives only on `GraphDbStore`; -`DuckDbStore` implements `ITemporalStore` only — neither adapter +The interfaces stay segregated so a community adapter can implement only +the half it has an engine for. A graph-only Neo4j adapter does not have +to handle cochange queries, and a temporal-only adapter does not have to +implement graph traversal. In the shipping build, one `SqliteStore` implements both. 
ADR 0013 records the call-site refactor that routed 108 raw-SQL call sites across `analysis/`, `mcp/`, `pack/`, `wiki/`, -and `cli/` through the typed finders on the interfaces; ADR 0016 then -ripped the DuckDB graph adapter out entirely. +and `cli/` through the typed finders on the interfaces. -## The single pair that ships +## The single store that ships -### LadybugDB graph store + DuckDB temporal store +### One store.sqlite file, backed by node:sqlite -Two artifacts on disk, both always present after `codehub analyze`: +One artifact on disk, always present after `codehub analyze`: | File | Holds | |---|---| -| `/.codehub/graph.lbug` | Nodes, edges, embeddings, BM25 + HNSW indexes — everything `IGraphStore` owns. | -| `/.codehub/temporal.duckdb` | Cochanges, symbol-summary cache — everything `ITemporalStore` owns. | - -The graph half speaks Cypher natively and stores each edge kind in -its own physical layout — the part of the motivation that DuckDB's -polymorphic `relations` table could not match. The temporal half runs -columnar SQL aggregations over git history, where DuckDB is the right -engine. - -Embeddings live in `graph.lbug`. At pack time they stream from -`store.graph.listEmbeddings()` into a per-call DuckDB temp table on -`temporal.duckdb`, so the byte-identical `embeddings.parquet` sidecar -still works without a graph-tier round trip. +| `/.codehub/store.sqlite` | Nodes, edges, embeddings, BM25 (FTS5) indexes, and the temporal tables (cochanges, symbol-summary cache). The entire index. | + +`node:sqlite` (`DatabaseSync`, enabled by default on Node ≥24.15, the +engines floor) provides every primitive the store needs: BLOB storage +for `Float32Array` embeddings, recursive CTEs for graph traversal +(impact and blast-radius), WAL for crash-safe concurrent reads, and FTS5 +for BM25 search. It is in the standard library, so the store adds zero +install weight. 
+ +WAL companions `store.sqlite-wal` and `store.sqlite-shm` appear while a +writer is open and collapse back to the single file on +`wal_checkpoint(TRUNCATE)` at close. + +Embeddings live in the `embeddings` table inside `store.sqlite` +(BLOB-exact and directly queryable). At pack time they stream from +`store.graph.listEmbeddings()` straight into the code-pack; there is no +Parquet sidecar and no separate temporal file to round-trip through. + +## Schema + +- One generic **`nodes`** table: typed columns for the universal base + (`id, kind, name, file_path, start_line, end_line`) plus a JSON + `payload` overflow for the 37 kind-specific shapes, rehydrated on + read. Findings reuse this table with `kind='Finding'`. +- One polymorphic **`edges`** table keyed by the `(from, to, type, + step)` dedup tuple. +- An **FTS5** virtual table over node names, signatures, and + descriptions for `search`. +- **Recursive CTEs** for multi-hop traversal (impact and blast-radius). + +The `embeddings` table holds all three granularities (symbol / file / +community) keyed by a `granularity` discriminator, so one table serves +every tier. ## How the store is composed -`openStore({path})` always returns -`{graph: GraphDbStore, temporal: DuckDbStore, graphFile, temporalFile, close}`. +`openStore({path})` opens one `store.sqlite` and returns +`{graph, temporal, storeFile, close}`, where `graph` and `temporal` are +the same `SqliteStore` instance viewed through each interface. All +existing call sites keep working unchanged: `store.graph.X()` reaches +the graph surface, `store.temporal.Y()` reaches the temporal surface. There is no `backend` field on the result and no `backend?` option on -the input. The graph artifact is always `graph.lbug`; the temporal -artifact is always `temporal.duckdb`. The `CODEHUB_STORE` env var, the -dynamic-import probe of `@ladybugdb/core`, and the dual-artifact mtime -arbitration are all gone — removed in ADR 0016. 
If the LadybugDB -binding cannot load, `open()` throws `GraphDbBindingError`; there is no -DuckDB-as-graph fallback. `codehub doctor` hard-fails on a missing -binding (it warned and continued in the prior auto-probe era). - -## Why the segregation, in one example - -The clean motivation: cochange detection (the temporal-store workload) -runs over git history and produces frequency / co-edit scores. The -queries are columnar SQL aggregations that DuckDB is the right -engine for. The graph workload is a different shape — multi-hop -traversal across typed edge kinds — that benefits from a graph-native -engine. Segregating the two interfaces lets each backend specialize. +the input. The `CODEHUB_STORE` env var, the dynamic-import probe of +`@ladybugdb/core`, and the dual-artifact mtime arbitration are all gone. +`codehub doctor` drops the native-binding probes and gains a +`node:sqlite` builtin check: an import plus a WAL round-trip. There is +no native storage binding left to probe. + +## Why one file + +A single embedded file removes the native binding from the install hot +path. `npm i -g @opencodehub/cli` plus Node ≥24.15 is the whole install: +no Docker, no postinstall compile, no second process. Every platform is +supported, including Windows arm64 and Linux musl (Alpine), because there +is no per-platform prebuilt to match. The graph and temporal workloads +still map to distinct primitives inside SQLite: recursive CTEs for +multi-hop traversal across typed edge kinds, and SQL aggregations +for cochange frequency and co-edit scores over git history. ## Community adapters (escape hatch) -The two interfaces are deliberately narrow so a community adapter can +The two interfaces stay deliberately narrow so a community adapter can implement either independently. Candidates for `IGraphStore` adapters include: @@ -88,27 +111,31 @@ include: - **Neo4j** (the canonical Cypher engine). - **Neptune** (AWS managed Cypher / Gremlin). 
-OCH ships only the LadybugDB + DuckDB pair; it does not ship these -adapters. The seam is a deliberate escape hatch — a team that already -operates one of these engines can supply an `IGraphStore` adapter and -pair it with the in-tree DuckDB `ITemporalStore`. The conformance -suite (`assertIGraphStoreConformance`) and the parity harness in +OCH ships one `SqliteStore` that implements both interfaces; it does not +ship these adapters. The seam is a deliberate escape hatch: a team that +already operates one of these engines can supply an `IGraphStore` +adapter and pair it with a temporal implementation, or implement both on +one class. The parity harness helpers +(`assertGraphParity`, `rebuildFromStore`) in `packages/storage/src/test-utils/` stay precisely because they are the v1.0 contract these community adapters target. ADR 0013 names the four -candidates explicitly; ADR 0016 confirms the segregated interfaces -survive the DuckDB-graph rip-out for exactly this reason. +candidates explicitly, and ADR 0019 confirms the segregated interfaces +survive the move to a single store for exactly this reason. ## Determinism The `graphHash` invariant covers everything `IGraphStore` owns and is asserted by a CI gate on every PR that touches `packages/storage`. The -temporal signals in `temporal.duckdb` (cochanges, symbol summaries) -are statistical and never enter `graphHash`. +temporal signals in `store.sqlite` (cochanges, symbol summaries) are +statistical and never enter `graphHash`. The migration's hard gate was +that a `KnowledgeGraph` rebuilt from `listNodes({})` + `listEdges({})` +must hash byte-identically to the original; +`sqlite-parity.test.ts` proves it across small and mixed-kind fixtures. 
## See also - [ADR 0011 — LadybugDB graph backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0011-graph-db-backend.md) - [ADR 0013 — Storage default + interface segregation](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0013-m7-default-flip-and-abstraction.md) -- [ADR 0016 — Rip out the DuckDB graph backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0016-duckdb-graph-rip.md) +- [ADR 0019 — Single-file SQLite storage](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) - [Configuration](/opencodehub/reference/configuration/) — env vars and on-disk layout. diff --git a/packages/docs/src/content/docs/architecture/summarization-and-fusion.md b/packages/docs/src/content/docs/architecture/summarization-and-fusion.md index ea182f5d..bea4306d 100644 --- a/packages/docs/src/content/docs/architecture/summarization-and-fusion.md +++ b/packages/docs/src/content/docs/architecture/summarization-and-fusion.md @@ -130,7 +130,7 @@ sequenceDiagram participant Summ as summarize phase participant Bedrock participant Emb as embeddings phase - participant HNSW as embeddings table + HNSW + participant Vec as embeddings table Summ->>Summ: filter by SCIP-trust Summ->>Summ: cache probe (nodeId, contentHash, promptVersion) @@ -141,7 +141,7 @@ sequenceDiagram end Summ->>Summ: persist SymbolSummaryRow Emb->>Emb: symbolText(node, summary, body) — fuse - Emb->>HNSW: upsert symbol-tier vector + Emb->>Vec: upsert symbol-tier vector ``` ## Cache-key discriminator diff --git a/packages/docs/src/content/docs/guides/indexing-a-repo.md b/packages/docs/src/content/docs/guides/indexing-a-repo.md index a5973c1d..a8163f31 100644 --- a/packages/docs/src/content/docs/guides/indexing-a-repo.md +++ b/packages/docs/src/content/docs/guides/indexing-a-repo.md @@ -9,13 +9,14 @@ sidebar: tree-sitter (and SCIP for every language with a pinned indexer — TypeScript, Python, Go, Rust, Java, C#, C/C++, Kotlin, 
Ruby), resolve imports and inheritance, detect processes and clusters, build BM25 -and HNSW indexes, and write everything to `.codehub/` under the repo +and vector indexes, and write everything to `.codehub/` under the repo root. -The graph half is always **LadybugDB** (`.codehub/graph.lbug`) and the -temporal sibling is always **DuckDB** (`.codehub/temporal.duckdb`). Both -files are written on every analyze — there is no backend knob and no -single-file fallback. See +The whole index lives in one **`store.sqlite`** file (WAL mode) under +`.codehub/`, via Node's built-in `node:sqlite`. It holds graph nodes, +edges, embeddings, and the temporal tables, and it is written on every +analyze. There is no backend knob and no native storage binding (ADR +0019). See [Storage backend](/opencodehub/architecture/storage-backend/). ## Basic indexing @@ -34,7 +35,7 @@ codehub analyze --embeddings ``` `--embeddings` computes symbol and optional file/community vectors and -writes them to the HNSW index. After this, `codehub query` fuses BM25 +writes them to the `embeddings` table. After this, `codehub query` fuses BM25 and vector results via reciprocal-rank fusion (RRF). Memory-constrained machines can use `--embeddings-int8` for quantised @@ -83,13 +84,13 @@ symbol participates in. The default granularity is `symbol`. ## What lives in `.codehub/` -Every index writes the same two-file layout — LadybugDB for the graph, -DuckDB for the temporal sibling: +Every index writes the same single-file layout: one `store.sqlite` via +Node's built-in `node:sqlite`: | Path | Purpose | |---|---| -| `graph.lbug` | LadybugDB graph store — symbols, edges, embeddings, BM25 + HNSW indexes. | -| `temporal.duckdb` | DuckDB sibling — cochanges, symbol-summary cache. | +| `store.sqlite` | The whole index (WAL mode) — symbols, edges, embeddings, the FTS5 search index, and the temporal tables (cochanges, symbol-summary cache). 
| +| `store.sqlite-wal` / `store.sqlite-shm` | WAL companions present while a writer is open; collapse into `store.sqlite` at close. | | `meta.json` | Index metadata (graph hash, node counts, CLI version, toolchain pins, embedder modelId). | | `scan.sarif` | SARIF scan output when `codehub scan` has run. | | `sbom.cyclonedx.json` / `sbom.spdx.json` | SBOMs when `codehub analyze --sbom` has run. | diff --git a/packages/docs/src/content/docs/guides/using-with-claude-code.md b/packages/docs/src/content/docs/guides/using-with-claude-code.md index 5cd6cb93..efb5e739 100644 --- a/packages/docs/src/content/docs/guides/using-with-claude-code.md +++ b/packages/docs/src/content/docs/guides/using-with-claude-code.md @@ -93,7 +93,7 @@ entries in `.mcp.json` are preserved. ## Next -- [MCP tools](/opencodehub/mcp/tools/) — the full catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the full catalogue of 29 tools Claude Code will see. - [MCP overview](/opencodehub/mcp/overview/) — server name, transport, envelope conventions. diff --git a/packages/docs/src/content/docs/guides/using-with-codex.md b/packages/docs/src/content/docs/guides/using-with-codex.md index a4e2d377..71bdc50c 100644 --- a/packages/docs/src/content/docs/guides/using-with-codex.md +++ b/packages/docs/src/content/docs/guides/using-with-codex.md @@ -65,5 +65,5 @@ MCP servers are left alone. ## Next -- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 29 tools Codex will see. diff --git a/packages/docs/src/content/docs/guides/using-with-opencode.md b/packages/docs/src/content/docs/guides/using-with-opencode.md index c1dee36f..9a7545e7 100644 --- a/packages/docs/src/content/docs/guides/using-with-opencode.md +++ b/packages/docs/src/content/docs/guides/using-with-opencode.md @@ -76,5 +76,5 @@ MCP servers configured there are left alone. 
## Next -- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 29 tools OpenCode will see. diff --git a/packages/docs/src/content/docs/guides/using-with-windsurf.md b/packages/docs/src/content/docs/guides/using-with-windsurf.md index 34bcb87e..0fa68ddc 100644 --- a/packages/docs/src/content/docs/guides/using-with-windsurf.md +++ b/packages/docs/src/content/docs/guides/using-with-windsurf.md @@ -76,5 +76,5 @@ are left alone. ## Next -- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 29 tools Windsurf will see. diff --git a/packages/docs/src/content/docs/index.mdx b/packages/docs/src/content/docs/index.mdx index 6efec178..57e25257 100644 --- a/packages/docs/src/content/docs/index.mdx +++ b/packages/docs/src/content/docs/index.mdx @@ -1,6 +1,6 @@ --- title: OpenCodeHub -description: Apache-2.0 code intelligence graph + MCP server for AI coding agents — 28 tools, 15 GA languages, deterministic, offline-capable. +description: Apache-2.0 code intelligence graph + MCP server for AI coding agents — 29 tools, 15 GA languages, deterministic, offline-capable. template: splash hero: tagline: Graph-aware impact, context, and query for an AI coding agent — local, deterministic, Apache-2.0. @@ -35,7 +35,7 @@ import { Card, CardGrid, LinkCard } from "@astrojs/starlight/components"; participating processes — without grep round-trips. - Hybrid BM25 + filter-aware HNSW search, results grouped by + Hybrid BM25 + brute-force vector KNN search, results grouped by execution-flow process. Fed by a typed graph, not a flat index. 
@@ -51,7 +51,7 @@ import { Card, CardGrid, LinkCard } from "@astrojs/starlight/components"; diff --git a/packages/docs/src/content/docs/mcp/overview.md b/packages/docs/src/content/docs/mcp/overview.md index b1f8a092..52209a8b 100644 --- a/packages/docs/src/content/docs/mcp/overview.md +++ b/packages/docs/src/content/docs/mcp/overview.md @@ -16,7 +16,7 @@ can connect to over stdio. - **Capabilities:** `tools` and `resources`. The server does not advertise `prompts` — the canned-prompts surface lives as Claude Code skills shipped by `plugins/opencodehub/` instead. -- **Tool count:** 28 (registered in `packages/mcp/src/server.ts`). Every +- **Tool count:** 29 (registered in `packages/mcp/src/server.ts`). Every tool is read-only with respect to user source — no tool edits the working tree. @@ -39,7 +39,7 @@ editor's native MCP config location. ## The four tool families -The 28 tools fall into four functional clusters plus a meta cluster. +The 29 tools fall into four functional clusters plus a meta cluster. The full per-tool catalog is in [MCP tools](/opencodehub/mcp/tools/). | Family | Tools | Count | @@ -91,7 +91,7 @@ Error responses instead carry `isError: true`, ## What the server exposes -- **28 tools** — exploration, federation, scan/findings, HTTP routing, +- **29 tools** — exploration, federation, scan/findings, HTTP routing, and metadata. All read-only with respect to user source. See [tools](/opencodehub/mcp/tools/). - **7 resources** — structured views over repos, clusters, and diff --git a/packages/docs/src/content/docs/mcp/resources.md b/packages/docs/src/content/docs/mcp/resources.md index 13632bdd..7d73f80a 100644 --- a/packages/docs/src/content/docs/mcp/resources.md +++ b/packages/docs/src/content/docs/mcp/resources.md @@ -1,6 +1,6 @@ --- title: MCP resources -description: The seven MCP resources the opencodehub server publishes alongside its 28 tools. +description: The seven MCP resources the opencodehub server publishes alongside its 29 tools. 
sidebar: order: 30 --- diff --git a/packages/docs/src/content/docs/mcp/tools.md b/packages/docs/src/content/docs/mcp/tools.md index 20110d72..7fb3eeb1 100644 --- a/packages/docs/src/content/docs/mcp/tools.md +++ b/packages/docs/src/content/docs/mcp/tools.md @@ -1,11 +1,11 @@ --- title: MCP tools -description: All 28 MCP tools the opencodehub server registers, grouped by functional family. Every tool is read-only with respect to user source. +description: All 29 MCP tools the opencodehub server registers, grouped by functional family. Every tool is read-only with respect to user source. sidebar: order: 20 --- -The `opencodehub` MCP server registers **28 tools**, imported and +The `opencodehub` MCP server registers **29 tools**, imported and invoked from `packages/mcp/src/server.ts`. The number is taken live from `buildServer()` at startup. Every tool is **read-only with respect to user source** — no tool edits the working tree. @@ -73,8 +73,8 @@ The high-frequency tools. Most agent loops live here. | | | |---|---| -| **Use when** | You need a custom view of the temporal store (`cochanges` + `symbol_summaries`) that no other tool exposes. Read-only. 5-second timeout. | -| **Avoid when** | A typed tool (`context`, `impact`, `query`) already covers the question, or you need the node/edge graph — that lives in `graph.lbug` and is reached via the typed tools or Cypher (ADR 0016), not this SQL path. | +| **Use when** | You need a custom view of `store.sqlite` that no other tool exposes. Everything is directly SQL-queryable: `nodes`, `edges`, `embeddings`, `cochanges`, `symbol_summaries`, and `store_meta` (ADR 0019); reach kind-specific fields via SQLite JSON1, `payload->>'$.field'`. Read-only. 5-second timeout. | +| **Avoid when** | A typed tool (`context`, `impact`, `query`) already covers the question. The typed tools stay the high-level path; the `cypher` arg is reserved for community-fork graph adapters and is not supported by the default backend. 
| | **Inputs** | `query` (required), `repo?`, `repo_uri?` | | **Returns** | `{ rows: [...], row_count, next_steps }` | @@ -141,7 +141,7 @@ form so a follow-up `AMBIGUOUS_REPO` retry can use it as input. | **Inputs** | `group` | | **Returns** | `{ group, contracts_written, cross_links_written, next_steps }` | -## Scan / findings / verdict (7) +## Scan / findings / verdict (8) `scan` is the only tool that spawns processes (`openWorldHint=true`). `verdict` exits 0/1/2/3 by tier — the canonical source of CI signal. @@ -194,6 +194,14 @@ form so a follow-up `AMBIGUOUS_REPO` retry can use it as input. | **Inputs** | `repo?`, `repo_uri?`, `base?` (default `main`), `head?` (default `HEAD`) | | **Returns** | `{ tier: "auto_merge" \| "single_review" \| "dual_review" \| "expert_review" \| "block", exit_code, reasons, signals }` | +### `change_pack` + +| | | +|---|---| +| **Use when** | A CI agent needs everything a diff touches in one deterministic, read-only payload: the changed symbols, their upstream impacted subgraph, the `verdict` tier, the affected tests, and a token-cost estimate. | +| **Inputs** | `repo?`, `repo_uri?`, `base?` (default `main`), `head?` (default `HEAD`), `depth?` (upstream traversal, default 4), `budget?` (context budget in heuristic tokens, default 100000) | +| **Returns** | `{ changed, impacted_subgraph, verdict, affected_tests, cost_estimate }` — the same `ChangePack` the CLI's `codehub change-pack --json` emits, snake-cased under `structuredContent`. | + ### `risk_trends` | | | diff --git a/packages/docs/src/content/docs/reference/cli.md b/packages/docs/src/content/docs/reference/cli.md index 0d267671..31291a20 100644 --- a/packages/docs/src/content/docs/reference/cli.md +++ b/packages/docs/src/content/docs/reference/cli.md @@ -13,7 +13,7 @@ unhandled throw writes `codehub: ` to stderr and sets ## `analyze` Index a repository. Runs the full pipeline: parse, resolve, cluster, -build BM25 + HNSW indexes, and write `.codehub/`. 
+build BM25 + vector indexes, and write `.codehub/`. ```bash title="usage" codehub analyze [path] @@ -23,7 +23,7 @@ codehub analyze [path] |---|---|---| | `--force` | off | Ignore the registry cache and re-run the pipeline. | | `--embeddings` | off | Compute semantic vectors. | -| `--embeddings-int8` | off | Quantise vectors to int8 (~23 MB weights). | +| `--embeddings-int8` | off | Use the int8 embedder variant (~81 MB) instead of fp32 (~321 MB). | | `--granularity ` | `symbol` | Any subset of `symbol,file,community`. | | `--embeddings-workers ` | `auto` | Size of the ONNX worker pool. | | `--embeddings-batch-size ` | 32 | Batch size per worker. | @@ -89,9 +89,11 @@ codehub setup | `--force` | off | Overwrite existing entries; re-download weights. | | `--undo` | off | Restore the most recent `.bak` next to each config. | | `--embeddings` | off | Download `F2LLM-v2-80M` ONNX weights (SHA256-pinned GitHub release asset). | -| `--int8` | off | Use the int8 weight variant (~81 MB) instead of fp32 (~321 MB). | +| `--int8` | off | Use the int8 weight variant (~92 MB) instead of fp32 (~332 MB). | | `--model-dir ` | — | Override the target directory for embedder weights. | | `--plugin` | off | Install the Claude Code plugin to `~/.claude/plugins/opencodehub/`. | +| `--scip ` | — | Install an external SCIP adapter binary: `clang`, `ruby`, `dotnet`, `kotlin`, or `all`. SHA256-pinned; `dotnet` requires .NET SDK 8+ on `PATH`. | +| `--cobol-proleap` | off | Build the `uwol/cobol-parser` library from source (`git clone` + `mvn install`) and compile the bridge wrapper. Requires `git`, `mvn`, and JDK 17+ on `PATH`. Installs under `~/.codehub/vendor/proleap/`. | ## `mcp` @@ -152,10 +154,17 @@ codehub pack [path] ## `code-pack` -Produce the deterministic 9-item code-pack BOM (manifest, skeleton, -file-tree, dependency list, top symbols, processes, routes, tools, -findings) sized to a token budget. This is the artifact attached to -every release and signed with cosign. 
+Produce the deterministic 8-item code-pack BOM sized to a token budget. +The BOM is `manifest.json` plus seven body items: skeleton, file-tree, +dependency list, ast-chunks, xrefs, findings, and licenses. A +consumer-facing `readme.md` ships alongside the BOM but is not part of +the manifest hash preimage. The pack is byte-identical given the same +`(commit, tokenizer, budget)`, and `packHash` names its on-disk +directory (`/.codehub/packs//`). + +The default engine is `pack` (the `@opencodehub/pack` BOM). `--engine +repomix` opts into the legacy single-file snapshot (a single output +file, `bomItemCount` of 1, no manifest). ```bash title="usage" codehub code-pack [path] @@ -164,6 +173,47 @@ codehub code-pack [path] | Flag | Default | Purpose | |---|---|---| | `--budget ` | 100000 | AST-chunker token budget. | +| `--tokenizer ` | `openai:o200k_base@tiktoken-0.8.0` | Tokenizer pin `:@`. | +| `--out-dir ` | `/.codehub/packs//` | Override the default output directory. | +| `--engine ` | `pack` | `pack` emits the 8-item BOM; `repomix` emits the legacy single-file snapshot. | +| `--explain-context` | off | After packing, print the context read-receipt (files indexed, lines, hash coverage, per-language breakdown) from `context-bom.json`. | +| `--json` | off | With `--explain-context` or `--variance-probe`, emit the result as JSON on stdout. | +| `--variance-probe ` | — | Measure the run-to-run answer variance an OCH pack removes from a coding agent. Loads the task file, generates the pack, runs the agent N times with vs. without the pack, and reports the dispersion delta plus token overhead. Agents run on Amazon Bedrock. On-demand only. | +| `--runs ` | 10 | With `--variance-probe`: runs per arm. | +| `--harness ` | both | With `--variance-probe`: restrict to one agent. | +| `--aws-region ` | inherited `AWS_REGION` | With `--variance-probe`: AWS region for Bedrock inference. 
| +| `--model-claude ` | `us.anthropic.claude-sonnet-4-6` | With `--variance-probe`: Claude Code Bedrock model / inference-profile id. | +| `--model-codex ` | `openai.gpt-5.5` | With `--variance-probe`: Codex Bedrock model id. | + +```bash title="example" +codehub code-pack . --budget 80000 --explain-context +``` + +## `replay` + +Assert two code-packs are decision-equivalent (spec 011 / ADR 0020): the +same files and byte ranges selected under the same budget, regardless of +incidental drift in `tokenCount`, pins, or chunk text. `packHash` equality +is the cheap witness; a `decisionHash` projection is the contract. The +verdict is one of `EQUIVALENT`, `DIVERGED`, `BUDGET_MISMATCH`, or +`CORRUPT`. On-demand, never a CI gate. + +```bash title="usage" +codehub replay --compare +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--compare ` | — | **Required.** Exactly two pack directories (`.codehub/packs//`) to compare. | +| `--json` | off | Emit the full replay record (verdict, `decisionHash`es, diff) as JSON on stdout. | +| `--budget-strict` | off | Treat a `BUDGET_MISMATCH` (different `--budget` between the packs) as a failure exit. | + +Exit codes: `EQUIVALENT` → 0, `BUDGET_MISMATCH` → 0 (or 1 with +`--budget-strict`), `DIVERGED` → 1, `CORRUPT` → 1. + +```bash title="example" +codehub replay --compare .codehub/packs/abc123 .codehub/packs/def456 --json +``` ## `query` @@ -259,6 +309,30 @@ codehub verdict Exit codes: `auto_merge=0`, `single_review=1`, `dual_review=1`, `expert_review=2`, `block=3`. +## `change-pack` + +Diff-scoped change-pack: the impacted subgraph, a PR verdict, affected +tests, and a cost estimate for one diff. CLI sibling of the `change_pack` +MCP tool, usable in CI without launching the MCP server. + +```bash title="usage" +codehub change-pack +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--base ` | `main` | Base git ref. | +| `--head ` | `HEAD` | Head git ref. 
| +| `--depth ` | 4 | Upstream traversal depth. | +| `--min-confidence ` | 0.7 | Traversal confidence floor, 0 to 1. | +| `--budget ` | 100000 | Context budget in heuristic tokens. | +| `--include-tests-in-subgraph` | off | Retain test nodes in the impacted subgraph. | +| `--json` | off | Structured envelope. | + +Exit codes follow the verdict tier, on a coarser scale than `codehub verdict`: `auto_merge` / `single_review` → 0, +`dual_review` → 1, `expert_review` / `block` → 2. + ## `group` Cross-repo group management. @@ -317,9 +391,13 @@ codehub doctor | Flag | Default | Purpose | |---|---|---| -| `--skip-native` | off | Skip checks that require native bindings (duckdb / lbug — `@duckdb/node-api` and `@ladybugdb/core`). Parsing has no native binding; it is WASM-only (`web-tree-sitter`) and unaffected by this flag. | +| `--skip-native` | off | Skip the two probes that load a runtime module: the `node:sqlite` built-in WAL round-trip and the optional `onnxruntime-web` embedder (prebuilt WASM). The store has no native bindings, so this flag retains only these two checks; it is kept for compatibility with CI sandboxes. Parsing is WASM-only (`web-tree-sitter`) and is never skipped. | +| `--strict` | off | Treat a missing SCIP indexer as a failure (exit 2), not a warning. For release / CI gates. Vendored WASM grammars fail in both modes. | | `--repoRoot ` | cwd | Repo root to probe. | +Exit codes: `0` all checks OK, `1` at least one warning, `2` at least +one failure. + ## `bench` Run the acceptance-gate bench suite and emit a dashboard. @@ -335,21 +413,28 @@ codehub bench ## `wiki` -Emit a Markdown wiki for the repo. +Emit a Markdown wiki for the repo under `--output`. Deterministic by +default; `--llm` routes top-ranked modules through the summarizer for +narrative prose. ```bash title="usage" -codehub wiki +codehub wiki --output ``` | Flag | Default | Purpose | |---|---|---| +| `--output ` | — | **Required.** Target directory for rendered pages. | | `--repo ` | current | Target repo.
| | `--json` | off | Emit a JSON summary on stdout. | | `--offline` | off | Assert no network access (incompatible with `--llm`). | | `--llm` | off | Route top-ranked modules through the summarizer. | -| `--max-llm-calls ` | 0 (dry-run) | LLM call budget. | +| `--max-llm-calls ` | 0 (dry-run) | LLM call budget when `--llm` is set. | | `--llm-model ` | — | Override the Bedrock summary model id. | +```bash title="example" +codehub wiki --output docs/wiki +``` + ## `ci-init` Emit opinionated CI workflows. @@ -380,11 +465,17 @@ codehub augment ## `sql` -Read-only SQL against the **temporal store** — the DuckDB-backed `cochanges` and -`symbol_summaries` tables. 5-second timeout by default. The node/edge graph lives -in `graph.lbug` (see ADR 0016) and is **not** reachable from this SQL path; query -it via the typed tools (`query` / `context` / `impact`) or Cypher via the MCP `sql` -tool. +Read-only SQL against the single-file store, `/.codehub/store.sqlite` +(WAL, via Node's built-in `node:sqlite`, ADR 0019). Every table lives in +this one file and is directly queryable: `nodes`, `edges`, `embeddings`, +`cochanges`, `symbol_summaries`, and `store_meta`. Reach kind-specific +fields on `nodes` via SQLite JSON1, e.g. `payload->>'$.field'`. The guard +rejects any mutation. 5-second timeout by default. + +The typed tools (`query` / `context` / `impact`) remain the high-level +path for graph traversal. A `cypher` query path exists only as a reserved +escape hatch for community-fork graph adapters (AGE / Memgraph / Neo4j / +Neptune) and is not supported by the default backend. ```bash title="usage" codehub sql @@ -395,3 +486,189 @@ codehub sql | `--repo ` | current | Target repo. | | `--timeout ` | 5000 | Statement timeout. | | `--json` | off | Structured envelope. 
| + +```bash title="example" +codehub sql "SELECT id, name FROM nodes WHERE kind = 'Function' LIMIT 10" +``` + +## Read-only graph capabilities + +Each command below is a CLI sibling of an MCP tool, reusing the same +underlying reader against the single-file store. They run in CI without +launching the MCP server. + +## `findings` + +List SARIF `Finding` nodes (sibling of the MCP `list_findings` tool). + +```bash title="usage" +codehub findings +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--severity ` | — | Restrict to one SARIF severity: `error`, `warning`, `note`, or `none`. | +| `--scanner ` | — | Restrict to a single scanner id (e.g. `semgrep`). | +| `--rule-id ` | — | Restrict to a single rule id. | +| `--file-path ` | — | Substring filter on the finding's file path. | +| `--limit ` | 500 | Maximum findings to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub findings --severity error --scanner semgrep +``` + +## `dead-code` + +List dead and unreachable-export symbols (sibling of the MCP +`list_dead_code` tool). + +```bash title="usage" +codehub dead-code +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--file-path-pattern ` | — | Substring filter on each symbol's file path. | +| `--include-unreachable-exports` | off | Also include exported-but-unreferenced symbols. | +| `--limit ` | 100 | Maximum symbols to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub dead-code --include-unreachable-exports +``` + +## `license-audit` + +Classify `Dependency` nodes by license risk tier (sibling of the MCP +`license_audit` tool). + +```bash title="usage" +codehub license-audit +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--json` | off | Structured envelope. 
| + +```bash title="example" +codehub license-audit --json +``` + +## `project-profile` + +Show the detected project profile (sibling of the MCP `project_profile` +tool). + +```bash title="usage" +codehub project-profile +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub project-profile +``` + +## `risk-trends` + +Per-community risk trend plus a 30-day projection (sibling of the MCP +`risk_trends` tool). + +```bash title="usage" +codehub risk-trends +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub risk-trends --json +``` + +## `owners` + +List ranked `OWNED_BY` contributors for a node (sibling of the MCP +`owners` tool). + +```bash title="usage" +codehub owners +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--limit ` | 20 | Maximum contributors to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub owners src/auth/session.ts +``` + +## `route-map` + +Map HTTP routes to handlers and consumers (sibling of the MCP +`route_map` tool). + +```bash title="usage" +codehub route-map +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--route ` | — | Substring match against `Route.url` (e.g. `/api/users`). | +| `--method ` | — | Exact match against `Route.method` (e.g. `GET`). | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub route-map --route /api/users --method GET +``` + +## `api-impact` + +Score the blast radius of changing a `Route`'s contract (sibling of the +MCP `api_impact` tool). + +```bash title="usage" +codehub api-impact +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--route ` | — | Substring match against `Route.url`. 
| +| `--file ` | — | Substring match against `Route.filePath`. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub api-impact --route /api/users +``` + +## `dependencies` + +List external dependencies (sibling of the MCP `dependencies` tool). + +```bash title="usage" +codehub dependencies +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--ecosystem ` | — | Restrict to one ecosystem: `npm`, `pypi`, `go`, `cargo`, `maven`, or `nuget`. | +| `--file-path ` | — | Substring filter on the manifest / lockfile path. | +| `--limit ` | 500 | Maximum dependencies to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub dependencies --ecosystem npm +``` diff --git a/packages/docs/src/content/docs/reference/configuration.md b/packages/docs/src/content/docs/reference/configuration.md index 50fe8e82..b9210ae3 100644 --- a/packages/docs/src/content/docs/reference/configuration.md +++ b/packages/docs/src/content/docs/reference/configuration.md @@ -14,25 +14,24 @@ mutate global state. ### Storage -The graph tier is always LadybugDB (`graph.lbug`) and the temporal tier -is always DuckDB (`temporal.duckdb`). There is no backend selector — the -`CODEHUB_STORE` env var was removed in ADR 0016 along with the probe and -the DuckDB-as-graph fallback. If the LadybugDB binding cannot load, -`open()` throws `GraphDbBindingError`. +The whole index lives in one `store.sqlite` file (WAL mode) via Node's +built-in `node:sqlite`. There is no backend selector: the `CODEHUB_STORE` +env var was removed and there is no native storage binding to probe (ADR +0019). Nothing fails for lack of a platform prebuilt. | Variable | Purpose | |---|---| | `CODEHUB_HOME` | Override `~/.codehub/` (where the registry, embedder weights, and global state live). 
| ADR 0013 (`docs/adr/0013-m7-default-flip-and-abstraction.md`) records -the `IGraphStore` / `ITemporalStore` interface segregation; ADR 0016 -(`docs/adr/0016-duckdb-graph-rip.md`) records the rip-out of the DuckDB -graph backend, the env var, and the resolver. +the `IGraphStore` / `ITemporalStore` interface segregation; ADR 0019 +(`docs/adr/0019-single-file-sqlite-storage.md`) records collapsing the +whole index into one `store.sqlite` and removing both native storage +bindings. ### Parse runtime -`web-tree-sitter` (WASM) is the only parse runtime on Node 20, 22, and -24. There is no env var or CLI flag to switch parsers — the native +`web-tree-sitter` (WASM) is the only parse runtime on Node ≥24.15. There is no env var or CLI flag to switch parsers — the native `tree-sitter` N-API addon was removed in 0.4.0. The CLI emits a one-shot stderr advisory if a stale legacy env var is set, then ignores it; consult the CHANGELOG and ADR 0015 for the variable name and @@ -70,13 +69,12 @@ When none of the above are set, the local ONNX backend ## On-disk layout: `.codehub/` `codehub analyze` writes everything under `/.codehub/`. The -layout is fixed: a LadybugDB graph file alongside a DuckDB temporal -file. +layout is fixed: one `store.sqlite` file backs the whole index. | Path | Purpose | |---|---| -| `graph.lbug` | LadybugDB graph store — nodes, edges, embeddings, BM25 + HNSW indexes. | -| `temporal.duckdb` | Sibling DuckDB file — temporal store (cochanges, symbol-summary cache). | +| `store.sqlite` | The whole index (WAL mode, `node:sqlite`) — nodes, edges, embeddings, the FTS5 search index, and the temporal tables (cochanges, symbol-summary cache). | +| `store.sqlite-wal` / `store.sqlite-shm` | WAL companions present while a writer is open; collapse into `store.sqlite` at close. | | `meta.json` | Index metadata: graph hash, node counts, CLI version, embedder model id. | | `scan.sarif` | SARIF output from `codehub scan`. 
| | `sbom.cyclonedx.json` / `sbom.spdx.json` | SBOMs when `codehub analyze --sbom` has run. | diff --git a/packages/docs/src/content/docs/reference/error-codes.md b/packages/docs/src/content/docs/reference/error-codes.md index c225d9c8..795e7d79 100644 --- a/packages/docs/src/content/docs/reference/error-codes.md +++ b/packages/docs/src/content/docs/reference/error-codes.md @@ -21,8 +21,7 @@ The canonical list lives at | `STALENESS` | The index lags `HEAD` far enough to mistrust results. | `codehub analyze` (or `--force`). | | `INVALID_INPUT` | A tool argument failed schema validation. | Correct the call; check required fields. | | `NOT_FOUND` | The target symbol, repo, or group does not exist. | Confirm the name; run `codehub list` for repos. | -| `DB_ERROR` | The graph store returned an error during the query. | Check `codehub doctor`; inspect `.codehub/graph.lbug`. | -| `GraphDbBindingError` | The `@ladybugdb/core` native binding could not load, so `open()` aborted. There is no DuckDB-as-graph fallback (ADR 0016). | Run `codehub doctor` (it hard-fails on a missing binding); confirm the lbug prebuilt target matches your platform, or build from source via `cmake-js`. | +| `DB_ERROR` | The store returned an error during the query. | Check `codehub doctor` (it runs a `node:sqlite` import + WAL round-trip); inspect `.codehub/store.sqlite`. | | `SCHEMA_MISMATCH` | The index was produced by a different CLI version with an incompatible schema. | `codehub analyze --force` to rebuild. | | `RATE_LIMITED` | A downstream service (embedder, summariser) rate-limited the request. | Retry with backoff; reduce concurrency. | | `INTERNAL` | Catch-all for unhandled exceptions reaching the tool boundary. | File an issue with the error `message`. | @@ -30,6 +29,11 @@ The canonical list lives at | `AMBIGUOUS_REPO` | More than one repo is indexed and neither `repo` nor `repo_uri` was supplied. | Retry with one of the `choices[].repo_uri` values. 
| | `EMBEDDER_MISMATCH` | The store was indexed by a different embedder than the one currently configured. | Re-index with the configured embedder, or pass the documented force flag. | +The historical `GraphDbBindingError` (a failed native graph-binding +load) no longer exists. ADR 0019 removed both native storage bindings +and moved the whole index into one `store.sqlite` file via the built-in +`node:sqlite`, so there is no binding to fail. + ## `AMBIGUOUS_REPO` envelope `AMBIGUOUS_REPO` is the most common error a federated client encounters. diff --git a/packages/docs/src/content/docs/reference/languages.md b/packages/docs/src/content/docs/reference/languages.md index 09d56627..6f36190e 100644 --- a/packages/docs/src/content/docs/reference/languages.md +++ b/packages/docs/src/content/docs/reference/languages.md @@ -41,7 +41,7 @@ ProLeap deep-parse. ## Parse runtime — WASM-only -The parse runtime is `web-tree-sitter` (WASM) on Node 20, 22, and 24. +The parse runtime is `web-tree-sitter` (WASM) on Node ≥24.15. WASM has no native ABI dependency, so it works on every supported Node version out of the box and `npm install -g @opencodehub/cli@latest` does zero native builds. diff --git a/packages/docs/src/content/docs/skills/codehub-code-pack.mdx b/packages/docs/src/content/docs/skills/codehub-code-pack.mdx index 73ee3809..32afbf3d 100644 --- a/packages/docs/src/content/docs/skills/codehub-code-pack.mdx +++ b/packages/docs/src/content/docs/skills/codehub-code-pack.mdx @@ -1,21 +1,21 @@ --- title: "codehub-code-pack" -description: "Deterministic 9-item code-pack BOM for a repo or group — byte-identical given the same (commit, tokenizer, budget)." +description: "Deterministic 8-item code-pack BOM for a repo or group — byte-identical given the same (commit, tokenizer, budget)." --- import { Aside } from "@astrojs/starlight/components"; Standalone skill. 
Surfaces the `pack_codebase` MCP tool to produce a -**deterministic, 9-item Bill of Materials (BOM)** at +**deterministic, 8-item Bill of Materials (BOM)** at `/.codehub/packs//` that is byte-identical given the same -`(commit, tokenizer, budget, chonkie_version, duckdb_version, -grammar_commits)`. The pack is the durable artifact agents hand to a +`(commit, tokenizer, budget, chonkie_version, grammar_commits)`. The +pack is the durable artifact agents hand to a long-context LLM, archive for later replay, or diff between commits to prove invariants did not change. @@ -42,7 +42,7 @@ model: sonnet requested in the pack. 4. `mcp__opencodehub__pack_codebase` with the default `engine: "pack"`. The tool resolves the output to `/.codehub/packs//` - and writes the 9 items plus `manifest.json`. + and writes the 8 items plus `manifest.json`. 5. Report back the `packHash`, the `determinismClass`, and the absolute output directory; name the cause when the class is `best_effort` or `degraded`. @@ -53,7 +53,7 @@ Run the single-repo flow per member of the named group, then emit a table of `(repoUri, packHash, determinismClass, outDir)`. `packHash` is per-repo, not per-group — a group pack is the union of the member BOMs. -## The 9-item BOM +## The 8-item BOM | # | File | Determinism contract | |---|------|----------------------| @@ -63,9 +63,12 @@ not per-group — a group pack is the union of the member BOMs. 
| 4 | `deps.jsonl` | `(ecosystem, name, version, id)` lexicographic ASC | | 5 | `ast-chunks.jsonl` | LF-normalized; degrades to line-split with `determinismClass: degraded` | | 6 | `xrefs.jsonl` | community rows first, then call rows | -| 7 | `embeddings.parquet` | OPTIONAL — absent entirely when no embeddings exist | -| 8 | `findings.jsonl` | severity rank then `ruleId` ASC | -| 9 | `licenses.md` + `readme.md` | alpha-sorted dependency list + manifest-derived header | +| 7 | `findings.jsonl` | severity rank then `ruleId` ASC | +| 8 | `licenses.md` + `readme.md` | alpha-sorted dependency list + manifest-derived header | + +Embeddings are not a separate BOM item. They live in the `embeddings` +table inside `store.sqlite`; the write-only Parquet sidecar that used to +be item 7 was dropped with DuckDB (ADR 0019), since nothing read it back. ## Determinism class diff --git a/packages/docs/src/content/docs/start-here/first-query.md b/packages/docs/src/content/docs/start-here/first-query.md index 9a548bce..06841e34 100644 --- a/packages/docs/src/content/docs/start-here/first-query.md +++ b/packages/docs/src/content/docs/start-here/first-query.md @@ -18,7 +18,7 @@ haven't linked the CLI, replace `codehub` with ## Hybrid search: `query` -`codehub query` fuses BM25 lexical search with HNSW vector search (when +`codehub query` fuses BM25 lexical search with brute-force vector KNN (when embeddings are present) to find symbols related to a natural-language concept. @@ -99,7 +99,7 @@ down` restricts to dependencies (who do I call), and `--direction both` ## Next - [MCP tools overview](/opencodehub/mcp/overview/) for the full server - capabilities (28 tools across exploration, federation, scan, HTTP, + capabilities (29 tools across exploration, federation, scan, HTTP, and meta). - [Using with Claude Code](/opencodehub/guides/using-with-claude-code/) to let the agent run these tools for you. 
diff --git a/packages/docs/src/content/docs/start-here/install.md b/packages/docs/src/content/docs/start-here/install.md index 901b6030..c6224139 100644 --- a/packages/docs/src/content/docs/start-here/install.md +++ b/packages/docs/src/content/docs/start-here/install.md @@ -11,16 +11,16 @@ sidebar: parity with the Linux dev path, but native Windows now works without the MSVC build chain because OpenCodeHub does no native compilation at install time. -- **Node.js:** Node 20, 22, or 24. The parse runtime is `web-tree-sitter` - (WASM) on every supported version — there is no native opt-in (ADR 0015). +- **Node.js:** Node ≥24.15. The store is Node's built-in `node:sqlite` + (`DatabaseSync`, enabled by default at that version) and the parse + runtime is `web-tree-sitter` (WASM) — there is no native opt-in (ADR 0015). ## Supported platforms -OpenCodeHub installs with **zero native compilation** — the parse runtime is -WASM, and the two native bindings (`@ladybugdb/core` for the graph store, -`@duckdb/node-api` for the temporal store) ship prebuilt per platform. The -graph store is the narrowest matrix and is **mandatory** (there is no -fallback), so its prebuilt coverage defines where OpenCodeHub runs: +OpenCodeHub installs with **zero native compilation and zero native +storage bindings** — the store is Node's built-in `node:sqlite` and the +parse runtime is WASM (ADR 0019). There is no per-platform prebuilt to +match, so **every platform that runs Node ≥24.15 is supported**: | Platform | Supported | |---|---| @@ -29,16 +29,14 @@ fallback), so its prebuilt coverage defines where OpenCodeHub runs: | Linux x64 (glibc — Debian/Ubuntu/RHEL) | ✅ | | Linux arm64 (glibc) | ✅ | | Windows x64 | ✅ | -| **Windows arm64** | ❌ no `@ladybugdb/core` prebuilt | -| **Linux musl (Alpine)** | ❌ no `@ladybugdb/core` prebuilt | - -On an unsupported platform the CLI fails fast with a `GraphDbBindingError` that -names the case.
For containers, use a **glibc** base image (`node:22`, -`node:22-slim`, `debian`, `ubuntu`) rather than an Alpine/musl image -(`node:22-alpine`). Windows-on-ARM users should run under x64 emulation or WSL2 -with an x64/arm64-glibc Linux until upstream ships the missing prebuilts -(tracked upstream in `@ladybugdb/core`). -- **pnpm:** `>=10.0.0` (the workspace lockfile is generated with 10.33.2). +| Windows arm64 | ✅ | +| Linux musl (Alpine) | ✅ | + +There is no unsupported-platform failure mode: `npm install -g +@opencodehub/cli` plus Node ≥24.15 is the whole install. Any base +image works, including Alpine/musl (`node:24-alpine`) and Windows-on-ARM, +because nothing compiles and no native binding has to load. +- **pnpm:** `>=11.0.0` (the workspace lockfile is generated with 11.1.0). - **Python 3.12:** optional, only used by auxiliary tooling (the harness packages do not ship as runtime dependencies). Not required for the CLI or MCP server. @@ -58,7 +56,7 @@ pnpm -r build mise run cli:link # puts `codehub` on your PATH ``` -`mise install` activates the Node 22, pnpm 11.1.0, and Python 3.12 pins +`mise install` activates the Node 24, pnpm 11.1.0, and Python 3.12 pins from `mise.toml`. `pnpm install --frozen-lockfile` installs exactly the lockfile-pinned dependencies. `pnpm -r build` compiles every TypeScript package so the CLI entrypoint at `packages/cli/dist/index.js` is @@ -76,10 +74,10 @@ tarball globally. If you already manage Node and pnpm another way: -1. Install Node 20, 22, or 24 (`nvm install 22`, `fnm install 22`, or - from [nodejs.org](https://nodejs.org)). Every supported version uses - the same `web-tree-sitter` (WASM) parse runtime — there is no native - parser and no opt-in (ADR 0015). +1. Install Node ≥24.15 (`nvm install 24`, `fnm install 24`, or + from [nodejs.org](https://nodejs.org)). The store uses the built-in + `node:sqlite` and parsing uses `web-tree-sitter` (WASM) — there is no + native parser and no opt-in (ADR 0015). 2. 
Install pnpm `>=11.0.0` (`corepack enable pnpm`, or `npm install -g pnpm@11`). 3. Clone, build, and link: @@ -113,10 +111,11 @@ Then probe your environment: codehub doctor ``` -`codehub doctor` checks your Node version, pnpm version, native-module -bindings (the DuckDB and LadybugDB prebuilds — parsing is WASM-only, so -there is no native parser to probe), and writable paths in `~/.codehub/` -and `.codehub/`. It exits non-zero if anything looks off. +`codehub doctor` checks your Node version, pnpm version, the built-in +`node:sqlite` module (an import plus a WAL round-trip — there is no +native storage binding to probe, and parsing is WASM-only), and writable +paths in `~/.codehub/` and `.codehub/`. It exits non-zero if anything +looks off. :::note[Fallback for unlinked checkouts] If you cannot or will not link the CLI (locked-down CI images, a @@ -132,11 +131,11 @@ node packages/cli/dist/index.js doctor ## Optional environment toggles -Storage has no toggle — the graph tier is always LadybugDB -(`.codehub/graph.lbug`) and the temporal tier is always DuckDB -(`.codehub/temporal.duckdb`); both are written on every `analyze` and -there is no backend-selection env var (ADR 0016). Parsing has no toggle -either — `web-tree-sitter` (WASM) is the only runtime (ADR 0015). +Storage has no toggle: the whole index lands in one +`.codehub/store.sqlite` file (WAL mode) via the built-in `node:sqlite`, +written on every `analyze`, with no backend-selection env var and no +native binding (ADR 0019). Parsing has no toggle either: +`web-tree-sitter` (WASM) is the only runtime (ADR 0015). 
| Variable | Default | Effect | |---|---|---| diff --git a/packages/docs/src/content/docs/start-here/quick-start.md b/packages/docs/src/content/docs/start-here/quick-start.md index 5cd0d02d..5b6c8a6e 100644 --- a/packages/docs/src/content/docs/start-here/quick-start.md +++ b/packages/docs/src/content/docs/start-here/quick-start.md @@ -77,11 +77,11 @@ codehub setup --editors claude-code codehub analyze ``` -`analyze` writes the graph to `.codehub/` under the repo root and -registers the repo in `~/.codehub/registry.json`. The graph always lands -in `.codehub/graph.lbug` (LadybugDB) with `.codehub/temporal.duckdb` -alongside it; if `@ladybugdb/core` cannot load on the current platform, -analyze aborts with a `GraphDbBindingError` rather than falling back. +`analyze` writes the index to `.codehub/` under the repo root and +registers the repo in `~/.codehub/registry.json`. The whole index always +lands in one `.codehub/store.sqlite` file (WAL mode) via Node's built-in +`node:sqlite`; there is no native storage binding to load and no +platform where analyze fails for lack of a prebuilt (ADR 0019). Add `--embeddings` to compute semantic vectors for hybrid search, or `--offline` to guarantee zero network sockets. @@ -117,7 +117,7 @@ codehub impact validateUser --depth 2 - [Your first query](/opencodehub/start-here/first-query/) walks through `query`, `context`, and `impact` with sample output. -- [MCP tools](/opencodehub/mcp/tools/) lists all 28 tools the server +- [MCP tools](/opencodehub/mcp/tools/) lists all 29 tools the server exposes. - [Using with Claude Code](/opencodehub/guides/using-with-claude-code/) covers the plugin path (PreToolUse hooks) and the MCP-only path. 
diff --git a/packages/docs/src/content/docs/start-here/what-is-opencodehub.md b/packages/docs/src/content/docs/start-here/what-is-opencodehub.md index b899de8b..ffbccf89 100644 --- a/packages/docs/src/content/docs/start-here/what-is-opencodehub.md +++ b/packages/docs/src/content/docs/start-here/what-is-opencodehub.md @@ -28,12 +28,12 @@ where does this data flow. OpenCodeHub parses your repository with tree-sitter (15 GA languages, plus SCIP indexers for TypeScript, Python, Go, Rust, and Java), resolves imports and inheritance, and materialises a **typed symbol -graph**. That graph is stored in LadybugDB, a graph-native database, -with DuckDB carrying the temporal sibling (cochanges and the -symbol-summary cache). Both tiers are always present — there is no -backend toggle, and a failure to load the `@ladybugdb/core` binding -aborts the operation rather than falling back. BM25 lexical search and -filter-aware HNSW vector search sit on the same store. A local MCP +graph**. That graph is stored in one `store.sqlite` file via Node's +built-in `node:sqlite`, which also carries the temporal tables +(cochanges and the symbol-summary cache). There is no backend toggle and +no native storage binding: ADR 0019 removed both `@ladybugdb/core` and +`@duckdb/node-api`, so the whole index is one file. BM25 lexical search +and filter-aware vector search sit on the same store. A local MCP server exposes the graph to any agent that speaks Model Context Protocol. @@ -41,11 +41,11 @@ Protocol. 
flowchart LR A[Source tree] -->|tree-sitter parse| B[Symbol graph] B -->|resolve imports and MRO| C[Typed relations] - C -->|BM25 plus HNSW index| D[Hybrid graph store] + C -->|BM25 plus vector index| D[store.sqlite] C -->|detect communities and flows| E[Processes and clusters] D --> F[MCP server] E --> F - F -->|28 tools| G[AI coding agent] + F -->|29 tools| G[AI coding agent] ``` Clustering, execution-flow tracing, and blast-radius analysis all happen @@ -54,19 +54,19 @@ call, not ten round-trips. ## What you get in v1 -- **Graph-native storage.** LadybugDB is the graph tier and a dedicated - DuckDB sibling serves the temporal store. Both files (`graph.lbug` + - `temporal.duckdb`) are written on every index — no backend knob, no - fallback layout (ADR 0016). +- **Single-file storage.** One `store.sqlite` file (WAL mode) via Node's + built-in `node:sqlite` holds the whole index: graph nodes, edges, + embeddings, and the temporal tables. There is no backend knob and no + native storage binding (ADR 0019), so every platform is supported. - **Cross-repo federation.** Group several indexed repos with `codehub group` and query them through the `group_*` MCP tools. The repo is a first-class graph node and `repo_uri` carries through every cross-repo response, including the `AMBIGUOUS_REPO` envelope. - **Deterministic code-pack.** `pack_codebase` (MCP) and `codehub - code-pack` produce a reproducible 9-item BOM signed by the release + code-pack` produce a reproducible 8-item BOM signed by the release workflow. - **WASM-only parsing.** `web-tree-sitter` is the only parse runtime on - Node 20, 22, and 24, with all 15 grammar `.wasm` blobs vendored in the + Node ≥24.15, with all 15 grammar `.wasm` blobs vendored in the `@opencodehub/ingestion` tarball. `npm install -g @opencodehub/cli@latest` does zero native builds and zero GitHub fetches (ADR 0015). 
diff --git a/packages/mcp/README.md b/packages/mcp/README.md index 53c7a482..420126fe 100644 --- a/packages/mcp/README.md +++ b/packages/mcp/README.md @@ -54,7 +54,7 @@ tree. ambiguity is reported once, consistently (`packages/mcp/src/repo-resolver.ts`). - **Connection pooling** — the graph store is held in a per-process - pool to amortise DuckDB cold starts across many tool calls + pool to amortise SQLite cold starts across many tool calls (`packages/mcp/src/connection-pool.ts`). - **Lazy analysis** — heavy work (scan, code-pack, verdict) shells out via `analysis-bridge` rather than running in the MCP process so a diff --git a/packages/mcp/src/connection-pool.test.ts b/packages/mcp/src/connection-pool.test.ts index a2211f91..c9eb8899 100644 --- a/packages/mcp/src/connection-pool.test.ts +++ b/packages/mcp/src/connection-pool.test.ts @@ -32,8 +32,8 @@ test("acquire opens once, reuses on subsequent acquires", async () => { return makeFakeStore(p).store; }); try { - const a = await pool.acquire("repoA", "/a.duckdb"); - const b = await pool.acquire("repoA", "/a.duckdb"); + const a = await pool.acquire("repoA", "/a.sqlite"); + const b = await pool.acquire("repoA", "/a.sqlite"); assert.equal(a, b); assert.equal(factoryCalls, 1); await pool.release("repoA"); @@ -53,9 +53,9 @@ test("concurrent acquires dedupe in-flight opens", async () => { }); try { const [a, b, c] = await Promise.all([ - pool.acquire("repoX", "/x.duckdb"), - pool.acquire("repoX", "/x.duckdb"), - pool.acquire("repoX", "/x.duckdb"), + pool.acquire("repoX", "/x.sqlite"), + pool.acquire("repoX", "/x.sqlite"), + pool.acquire("repoX", "/x.sqlite"), ]); assert.equal(a, b); assert.equal(b, c); @@ -76,11 +76,11 @@ test("LRU eviction on size overflow closes evicted entries", async () => { return probe.store; }); try { - await pool.acquire("a", "/a.duckdb"); + await pool.acquire("a", "/a.sqlite"); await pool.release("a"); - await pool.acquire("b", "/b.duckdb"); + await pool.acquire("b", "/b.sqlite"); await 
pool.release("b"); - await pool.acquire("c", "/c.duckdb"); + await pool.acquire("c", "/c.sqlite"); await pool.release("c"); // Give dispose a microtask to finish the async close. await new Promise((resolve) => setImmediate(resolve)); @@ -98,9 +98,9 @@ test("shutdown closes every remaining entry exactly once", async () => { probes.push(probe); return probe.store; }); - await pool.acquire("r1", "/r1.duckdb"); + await pool.acquire("r1", "/r1.sqlite"); await pool.release("r1"); - await pool.acquire("r2", "/r2.duckdb"); + await pool.acquire("r2", "/r2.sqlite"); await pool.release("r2"); await pool.shutdown(); for (const p of probes) { @@ -112,7 +112,7 @@ test("shutdown closes every remaining entry exactly once", async () => { test("acquire after shutdown throws", async () => { const pool = new ConnectionPool({ max: 2 }, async (p) => makeFakeStore(p).store); await pool.shutdown(); - await assert.rejects(() => pool.acquire("x", "/x.duckdb"), /shut down/); + await assert.rejects(() => pool.acquire("x", "/x.sqlite"), /shut down/); }); test("eviction of an in-use entry defers close to the last release", async () => { @@ -125,15 +125,15 @@ test("eviction of an in-use entry defers close to the last release", async () => try { // Hold three distinct repos in flight at once with max=2 so the LRU // evicts the least-recently-used ("a") WHILE it is still referenced. - await pool.acquire("a", "/a.duckdb"); - await pool.acquire("b", "/b.duckdb"); - await pool.acquire("c", "/c.duckdb"); // evicts "a" (refCount 1) + await pool.acquire("a", "/a.sqlite"); + await pool.acquire("b", "/b.sqlite"); + await pool.acquire("c", "/c.sqlite"); // evicts "a" (refCount 1) await new Promise((resolve) => setImmediate(resolve)); // The evicted entry is still in use — it MUST NOT be closed yet, or the // tool still holding the handle would see a closed store mid-call. 
assert.equal( - probes.get("/a.duckdb")?.isClosed(), + probes.get("/a.sqlite")?.isClosed(), false, "evicted-but-in-use store must stay open until its last release", ); @@ -143,11 +143,11 @@ test("eviction of an in-use entry defers close to the last release", async () => // undefined), leaking the handle. await pool.release("a"); assert.equal( - probes.get("/a.duckdb")?.isClosed(), + probes.get("/a.sqlite")?.isClosed(), true, "last release of an evicted entry must close the store", ); - assert.equal(probes.get("/a.duckdb")?.closeCount(), 1, "store must close exactly once"); + assert.equal(probes.get("/a.sqlite")?.closeCount(), 1, "store must close exactly once"); await pool.release("b"); await pool.release("c"); @@ -165,15 +165,15 @@ test("shutdown closes entries evicted while still in use", async () => { }); // Overflow so "a" is evicted while refCount > 0, then shut down before any // release. The parked side-table entry must still be drained. - await pool.acquire("a", "/a.duckdb"); - await pool.acquire("b", "/b.duckdb"); - await pool.acquire("c", "/c.duckdb"); // evicts "a" (refCount 1, parked) + await pool.acquire("a", "/a.sqlite"); + await pool.acquire("b", "/b.sqlite"); + await pool.acquire("c", "/c.sqlite"); // evicts "a" (refCount 1, parked) await pool.shutdown(); assert.equal( - probes.get("/a.duckdb")?.isClosed(), + probes.get("/a.sqlite")?.isClosed(), true, "shutdown must close a still-referenced evicted entry", ); - assert.equal(probes.get("/a.duckdb")?.closeCount(), 1, "store must close exactly once"); + assert.equal(probes.get("/a.sqlite")?.closeCount(), 1, "store must close exactly once"); }); diff --git a/packages/mcp/src/resources/repo-context.ts b/packages/mcp/src/resources/repo-context.ts index 56b217e3..698020b4 100644 --- a/packages/mcp/src/resources/repo-context.ts +++ b/packages/mcp/src/resources/repo-context.ts @@ -22,7 +22,6 @@ const AVAILABLE_TOOLS = [ "context", "impact", "detect_changes", - "rename", "sql", ] as const; diff --git 
a/packages/mcp/src/test-utils.ts b/packages/mcp/src/test-utils.ts index b64f0824..b2ec49b1 100644 --- a/packages/mcp/src/test-utils.ts +++ b/packages/mcp/src/test-utils.ts @@ -208,7 +208,7 @@ export type StoreOverrides = Partial<{ bulkLoadCochanges: ITemporalStore["bulkLoadCochanges"]; bulkLoadSymbolSummaries: ITemporalStore["bulkLoadSymbolSummaries"]; exec: ITemporalStore["exec"]; - // Optional escape hatch — present on lbug adapter. + // Optional escape hatch — reserved for a community graph adapter. execCypher: NonNullable; // Legacy raw-SQL escape — only sql.test.ts calls this, but we keep // the override slot so the test can plug in a custom dispatcher. diff --git a/packages/mcp/src/tools/change-pack.test.ts b/packages/mcp/src/tools/change-pack.test.ts index 6a1a39c8..e99a147c 100644 --- a/packages/mcp/src/tools/change-pack.test.ts +++ b/packages/mcp/src/tools/change-pack.test.ts @@ -9,7 +9,7 @@ import type { ToolContext } from "./shared.js"; * The analysis `runChangePack` never throws — it fails open to an empty diff * when git is unavailable (which it is in the temp harness repo). That gives * the MCP tool a coherent, deterministic ChangePack to wrap without needing a - * real lbug graph DB or a git checkout. These tests assert the snake_case + * real graph store or a git checkout. These tests assert the snake_case * `structuredContent` shape the CLI parity test keys against. */ async function withHarness( diff --git a/packages/mcp/src/tools/group-tools.test.ts b/packages/mcp/src/tools/group-tools.test.ts index 2404bc1b..c491b412 100644 --- a/packages/mcp/src/tools/group-tools.test.ts +++ b/packages/mcp/src/tools/group-tools.test.ts @@ -184,7 +184,7 @@ async function withTestHarness( // Fake store pool: hand back a fake for every repo path. const pool = new ConnectionPool({ max: 4, ttlMs: 60_000 }, async (dbPath) => { - // dbPath looks like /.codehub/graph.lbug — match by repo name. + // dbPath looks like /.codehub/store.sqlite — match by repo name. 
for (const r of repos) { const rp = repoPaths.get(r.name); if (rp && dbPath.startsWith(rp)) { diff --git a/packages/mcp/src/tools/list-dead-code.test.ts b/packages/mcp/src/tools/list-dead-code.test.ts index 83ef684e..9544892f 100644 --- a/packages/mcp/src/tools/list-dead-code.test.ts +++ b/packages/mcp/src/tools/list-dead-code.test.ts @@ -49,8 +49,8 @@ function wrapAsStore(fake: unknown): import("@opencodehub/storage").Store { return { graph: fake as import("@opencodehub/storage").IGraphStore, temporal: fake as import("@opencodehub/storage").ITemporalStore, - graphFile: "/in-memory/graph.lbug", - temporalFile: "/in-memory/temporal.duckdb", + graphFile: "/in-memory/store.sqlite", + temporalFile: "/in-memory/store.sqlite", close: async () => { const closer = (fake as { close?: () => Promise }).close; if (typeof closer === "function") await closer.call(fake); diff --git a/packages/mcp/src/tools/list-findings-delta.test.ts b/packages/mcp/src/tools/list-findings-delta.test.ts index 2b1a0c36..7341d799 100644 --- a/packages/mcp/src/tools/list-findings-delta.test.ts +++ b/packages/mcp/src/tools/list-findings-delta.test.ts @@ -40,8 +40,8 @@ function wrapAsStore(fake: unknown): import("@opencodehub/storage").Store { return { graph: fake as import("@opencodehub/storage").IGraphStore, temporal: fake as import("@opencodehub/storage").ITemporalStore, - graphFile: "/in-memory/graph.lbug", - temporalFile: "/in-memory/temporal.duckdb", + graphFile: "/in-memory/store.sqlite", + temporalFile: "/in-memory/store.sqlite", close: async () => { const closer = (fake as { close?: () => Promise }).close; if (typeof closer === "function") await closer.call(fake); diff --git a/packages/mcp/src/tools/query.test.ts b/packages/mcp/src/tools/query.test.ts index ca8de4e1..5d9bfb73 100644 --- a/packages/mcp/src/tools/query.test.ts +++ b/packages/mcp/src/tools/query.test.ts @@ -62,8 +62,8 @@ function wrapAsStore(fake: unknown): import("@opencodehub/storage").Store { return { graph: fake as 
import("@opencodehub/storage").IGraphStore, temporal: fake as import("@opencodehub/storage").ITemporalStore, - graphFile: "/in-memory/graph.lbug", - temporalFile: "/in-memory/temporal.duckdb", + graphFile: "/in-memory/store.sqlite", + temporalFile: "/in-memory/store.sqlite", close: async () => { const closer = (fake as { close?: () => Promise }).close; if (typeof closer === "function") await closer.call(fake); @@ -84,7 +84,7 @@ interface FakeNode { * a two-phase PROCESS_STEP walk; the fake short-circuits that with a * pre-built lookup: for each top-K hit id that falls under a known process, * emit one (process_id, node_id, step) triple. This is enough to exercise - * the grouping/sort/score logic without replicating DuckDB's recursive CTE + * the grouping/sort/score logic without replicating the store's recursive CTE * engine. */ interface FakeProcessMember { diff --git a/packages/mcp/src/tools/query.ts b/packages/mcp/src/tools/query.ts index 4d9f6cda..3aff8c34 100644 --- a/packages/mcp/src/tools/query.ts +++ b/packages/mcp/src/tools/query.ts @@ -247,9 +247,9 @@ async function lookupSummariesForHits( */ async function bm25CorpusHasSummaries(temporal: ITemporalStore): Promise { // Table-existence introspection via SQLite's `sqlite_master` catalog, - // routed through the temporal-tier `exec` escape hatch. (Pre-ADR-0019 - // this probed DuckDB's `information_schema.tables`, which node:sqlite does - // not expose.) A future graph-only adapter pairing with a non-SQLite + // routed through the temporal-tier `exec` escape hatch. (An earlier + // backend probed `information_schema.tables`, which node:sqlite does not + // expose.) A future graph-only adapter pairing with a non-SQLite // temporal store can override this probe. 
try { const rows = await temporal.exec( diff --git a/packages/mcp/src/tools/run-smoke.test.ts b/packages/mcp/src/tools/run-smoke.test.ts index dca0ff0e..b31c756c 100644 --- a/packages/mcp/src/tools/run-smoke.test.ts +++ b/packages/mcp/src/tools/run-smoke.test.ts @@ -81,8 +81,8 @@ function wrapAsStore(fake: unknown): import("@opencodehub/storage").Store { return { graph: fake as import("@opencodehub/storage").IGraphStore, temporal: fake as import("@opencodehub/storage").ITemporalStore, - graphFile: "/in-memory/graph.lbug", - temporalFile: "/in-memory/temporal.duckdb", + graphFile: "/in-memory/store.sqlite", + temporalFile: "/in-memory/store.sqlite", close: async () => { const closer = (fake as { close?: () => Promise }).close; if (typeof closer === "function") await closer.call(fake); @@ -91,7 +91,7 @@ function wrapAsStore(fake: unknown): import("@opencodehub/storage").Store { } /** - * Minimal DuckDB-compatible fake — every `store.query` that a tool runs + * Minimal store-compatible fake — every `store.query` that a tool runs * against it returns an empty row set. That is enough to exercise the * `run` call path through `withStore` without a real index. Tools * handle empty results gracefully and return a "nothing matched" message @@ -131,7 +131,7 @@ function makeFakeStore(): IGraphStore { /** * Spin up a fake `~/.codehub` with one registered repo so `withStore` can - * resolve it. The connection pool is wired to return our fake DuckDB. + * resolve it. The connection pool is wired to return our fake store. 
*/ async function withHarness(fn: (ctx: ToolContext) => Promise): Promise { const home = await mkdtemp(resolve(tmpdir(), "codehub-runsmoke-")); diff --git a/packages/mcp/src/tools/sql.test.ts b/packages/mcp/src/tools/sql.test.ts index 72463802..4e1a5531 100644 --- a/packages/mcp/src/tools/sql.test.ts +++ b/packages/mcp/src/tools/sql.test.ts @@ -117,7 +117,7 @@ async function withHarness( const ctx: ToolContext = { pool, home }; // Acquire once just to seed handle.store for spy-based tests. const repoPath = `${home}/fakerepo`; - const dbPath = `${repoPath}/.codehub/graph.lbug`; + const dbPath = `${repoPath}/.codehub/store.sqlite`; try { handle.store = await pool.acquire(repoPath, dbPath); } finally { @@ -386,7 +386,7 @@ test("sql: cypher timeout_ms is forwarded to store.query opts", async () => { // so `nodes` / `edges` / `embeddings` ARE directly SQL-queryable. The tool // description must advertise them as SQL tables under `sql:` and mark // `cypher:` as the community-fork-only escape hatch. (This inverts the prior -// finding-R2 contract, which assumed a Cypher-only lbug graph tier.) +// finding-R2 contract, which assumed a Cypher-only graph tier.) // --------------------------------------------------------------------------- test("sql: tool description advertises the graph tables as SQL-queryable and marks cypher fork-only", async () => { @@ -414,7 +414,7 @@ test("sql: tool description advertises the graph tables as SQL-queryable and mar ); // The inverted bug: the description must NOT claim the graph is - // unqueryable by SQL — that was true only under the old lbug backend. + // unqueryable by SQL — that was true only under an earlier graph backend. 
assert.ok( !/not SQL-queryable/i.test(desc), "description must NOT claim the graph is non-SQL-queryable (ADR 0019)", diff --git a/packages/pack/README.md b/packages/pack/README.md index fdd6a96b..e023a790 100644 --- a/packages/pack/README.md +++ b/packages/pack/README.md @@ -42,7 +42,7 @@ The `manifest.json` (`PackManifest`) lists every written BOM body in ## Determinism contract Same `(commit, tokenizer_id, budget_tokens, chonkie_version, -duckdb_version, grammar_commits)` produces a byte-identical pack and the +grammar_commits)` produces a byte-identical pack and the same `pack_hash`. All file bytes use LF line endings; CRLF and lone-CR inputs are normalized to LF before chunking and hashing, so two repos differing only in line-ending style produce the same `pack_hash`. diff --git a/packages/policy/README.md b/packages/policy/README.md index c6541e45..8cbba3ab 100644 --- a/packages/policy/README.md +++ b/packages/policy/README.md @@ -36,7 +36,7 @@ Violations are sorted by `ruleId` for deterministic CI output. ## Design -- **Pure evaluator** — no DuckDB, no filesystem beyond the one YAML read. +- **Pure evaluator** — no store, no filesystem beyond the one YAML read. Inputs (`PolicyContext`) are pre-computed by the caller. - **Zod-only** validation, matching `packages/sarif`. - **Self-hosted OSS** — no calls to any OpenCodeHub-operated service. 
diff --git a/packages/policy/src/evaluate.test.ts b/packages/policy/src/evaluate.test.ts index 971aa51c..e46c91dc 100644 --- a/packages/policy/src/evaluate.test.ts +++ b/packages/policy/src/evaluate.test.ts @@ -133,7 +133,7 @@ test("ownership_required: passes when approval comes from require_approval_from" const decision = evaluatePolicy( policy, emptyCtx({ - touchedPaths: ["packages/storage/src/duckdb.ts"], + touchedPaths: ["packages/storage/src/sqlite.ts"], approvals: ["@storage-team"], }), ); @@ -178,7 +178,7 @@ test("ownership_required: blocks when no acceptable approval is present", () => const decision = evaluatePolicy( policy, emptyCtx({ - touchedPaths: ["packages/storage/src/duckdb.ts"], + touchedPaths: ["packages/storage/src/sqlite.ts"], approvals: ["@not-storage"], }), ); @@ -344,7 +344,7 @@ test("evaluatePolicy: violations are sorted by ruleId across mixed rule types", emptyCtx({ blastRadiusTier: 3, licenseViolations: [{ license: "GPL-3.0", package: "readline-gpl" }], - touchedPaths: ["packages/storage/src/duckdb.ts"], + touchedPaths: ["packages/storage/src/sqlite.ts"], approvals: [], }), ); diff --git a/packages/search/README.md b/packages/search/README.md index 062552ab..ba2c3ec8 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -27,9 +27,9 @@ const fused = await hybridSearch( // fused: readonly FusedHit[] — { nodeId, score, sources: ("bm25" | "vector")[] } ``` -- **BM25** — full-text search via DuckDB FTS, run directly in the graph +- **BM25** — full-text search via SQLite FTS5, run directly in the SQLite store (`packages/storage`). -- **Dense vector** — cosine ANN search via DuckDB's `hnsw_acorn` extension. +- **Dense vector** — exact cosine KNN search via a brute-force scan over the embeddings table. - **RRF fusion** — the BM25 and vector rank lists are merged with Reciprocal Rank Fusion (`DEFAULT_RRF_K = 60`) before the final `limit` is applied.
Each `FusedHit.sources` records which runs (`"bm25"`, `"vector"`) diff --git a/packages/storage/README.md b/packages/storage/README.md index 90e33ea5..9cc55ada 100644 --- a/packages/storage/README.md +++ b/packages/storage/README.md @@ -1,36 +1,38 @@ # @opencodehub/storage -Storage abstraction for OpenCodeHub. The graph tier is always -`@ladybugdb/core` (`graph.lbug`) — symbols, edges, embeddings, HNSW ANN -search, and BM25 full-text search. The temporal tier is always DuckDB -(`temporal.duckdb`) — cochanges and the symbol-summary cache. +Storage abstraction for OpenCodeHub. The whole index lives in one +`<repo>/.codehub/store.sqlite` file (WAL) via Node's built-in `node:sqlite` +— graph nodes, edges, embeddings, BM25 full-text search, and the temporal +tables (cochanges, symbol-summary cache). One `SqliteStore` class +implements both the graph tier (`IGraphStore`) and the temporal tier +(`ITemporalStore`); there are zero native storage bindings. ## Surface ```ts -import { openStore, StorageAdapter } from "@opencodehub/storage"; +import { openStore, type Store } from "@opencodehub/storage"; -const store = await openStore({ repoRoot: "/path/to/repo" }); -// store: StorageAdapter — read/write graph nodes and edges +const store = await openStore({ path: "/path/to/repo/.codehub" }); +await store.graph.open(); +// store.graph.X() / store.temporal.Y() — both hit the one SqliteStore ``` -- **`openStore`** — opens both tiers and returns `{ graph: GraphDbStore, - temporal: DuckDbStore, graphFile, temporalFile, close }`. No `backend` - field, no probe, no fallback; if `@ladybugdb/core` cannot load, - `open()` throws `GraphDbBindingError` and the operation aborts. -- **`GraphDbStore` / `DuckDbStore`** — `IGraphStore` lives only on - `GraphDbStore`; `DuckDbStore` implements `ITemporalStore` only. The - segregated interfaces are the v1.0 contract for community-fork adapters - (AGE / Memgraph / Neo4j / Neptune target `IGraphStore`).
-- **`test-utils`** — exported as `@opencodehub/storage/test-utils` for - in-memory stores in tests (`packages/storage/src/test-utils/index.ts`). - The `assertIGraphStoreConformance` conformance suite stays as the - community-adapter contract. - -There is no backend selection: lbug owns the graph, DuckDB owns the -temporal store, both files are always written. See -[ADR 0016](../../docs/adr/0016-duckdb-graph-rip.md) for the rationale -behind ripping out the DuckDB graph backend. +- **`openStore`** — constructs one `SqliteStore` and returns it as both + views: `{ graph, temporal, graphFile, temporalFile, close }`. `graph` + and `temporal` are the same instance; `graphFile` and `temporalFile` + are the same `store.sqlite` path (retained so callers keep compiling). + No `backend` field, no probe, no fallback, nothing to compile at install. +- **`SqliteStore`** — the single concrete adapter, implementing + `IGraphStore` + `ITemporalStore`. The two interfaces stay segregated as + the contract for a community SQL-shaped fork that wants to swap the + temporal tier. +- **`test-utils`** — exported as `@opencodehub/storage/test-utils` + (`packages/storage/src/test-utils/index.ts`). Ships `assertGraphParity` + + `rebuildFromStore`, the graphHash byte-identity parity primitives the + in-tree `sqlite-parity.test.ts` runs across every node/edge kind. + +See [ADR 0019](../../docs/adr/0019-single-file-sqlite-storage.md) for the +single-file SQLite migration (supersedes ADR 0016's DuckDB-graph rip). 
## Design diff --git a/packages/storage/src/column-encode.test.ts b/packages/storage/src/column-encode.test.ts index 3f8027d6..f36ed796 100644 --- a/packages/storage/src/column-encode.test.ts +++ b/packages/storage/src/column-encode.test.ts @@ -89,7 +89,7 @@ test("stringArrayOrNull: preserves [] vs absent for round-trip symmetry", () => // Explicit empty array survives the writer side as a typed 0-length // array (NOT null) so the native TEXT[] / STRING[] column can // distinguish `keywords: []` from absent. The symmetric reader is in - // duckdb-adapter.ts:setStringArrayField + analyze.ts:stringArrayField. + // the SqliteStore payload-JSON read path (rehydrateNode). assert.deepEqual(stringArrayOrNull([]), []); assert.equal(stringArrayOrNull("a"), null); assert.equal(stringArrayOrNull(null), null); @@ -329,7 +329,7 @@ test("stepZeroSentinel: drops 0 / null / undefined; passes through positive inte test("coerceLanguageStats: parse string / coerce empty / drop garbage", () => { assert.deepEqual(coerceLanguageStats('{"ts":0.83,"py":0.14}'), { ts: 0.83, py: 0.14 }); // Empty string sentinel — the writer collapsed an empty stats object to - // SQL NULL, which DuckDB reads back as null and the graph-db reads as + // SQL NULL, which a columnar graph adapter reads back as // null/undefined depending on the binding; all paths converge to {}. assert.deepEqual(coerceLanguageStats(null), {}); assert.deepEqual(coerceLanguageStats(undefined), {}); diff --git a/packages/storage/src/column-encode.ts b/packages/storage/src/column-encode.ts index a71faf45..0929ffd7 100644 --- a/packages/storage/src/column-encode.ts +++ b/packages/storage/src/column-encode.ts @@ -1,15 +1,13 @@ /** * Shared column-encoder helpers for the polymorphic CodeNode table. * - * `GraphDbStore` (`./graphdb-adapter.ts`, lbug) is the in-tree - * {@link IGraphStore} writer: it emits a 73-column row per node where every - * column matches the canonical {@link NODE_COLUMNS} order. 
These are the - * canonical encode helpers for that contract, kept here (rather than inline - * in the adapter) so a community `IGraphStore` adapter (AGE / Memgraph / - * Neo4j / Neptune) can consume the identical implementation and stay - * byte-identical under `graphHash`. (`DuckDbStore` is - * {@link ITemporalStore}-only post-ADR-0016 and does NOT write CodeNode - * rows; before the rip-out both adapters shared these helpers.) + * `SqliteStore` (`./sqlite-adapter.ts`) is the in-tree {@link IGraphStore} + * writer post-ADR-0019: it emits one row per node where every column matches + * the canonical {@link NODE_COLUMNS} order. These are the canonical encode + * helpers for that contract, kept here (rather than inline in the adapter) so + * a community `IGraphStore` adapter (AGE / Memgraph / Neo4j / Neptune) can + * consume the identical implementation and stay byte-identical under + * `graphHash`. * * The module is `internal-only` — it is NOT re-exported from * `packages/storage/src/index.ts`. Adapters import directly from @@ -38,17 +36,14 @@ * **`stringArrayOrNull` round-trip note** — an explicit empty `[]` and an * absent field are kept distinct on the wire. {@link stringArrayOrNull} * returns a typed 0-length array for an empty-array input and `null` for a - * non-array input. The lbug graph adapter preserves the distinction with a - * version-agnostic marker scheme (`encodeNodeCol` + `setStringArrayFieldGd` - * in graphdb-adapter.ts): - * - an explicit empty array is written as a single-element marker and - * decoded back to `[]` on read; - * - an absent field is written as a bare `[]`, which decodes as absent — - * whether lbug stored it as SQL NULL (≤ v0.16.1, where a 0-length - * `STRING[]` collapsed to NULL on write) or as a typed empty `STRING[]` - * (≥ v0.17.0, PR #471, where empty lists round-trip). - * A SQL-backed community adapter with a native array column (e.g. 
SQLite - * `TEXT[]`) can instead store the 0-length literal directly; either scheme + * non-array input. `SqliteStore` stores the JSON array literal in the node + * `payload` column, so the empty-vs-absent distinction survives directly + * (`[]` stays `[]`, an absent field stays absent). A community graph adapter + * whose backend cannot store a 0-length list natively — e.g. a typed + * `STRING[]` column that collapses `[]` to NULL on write — must preserve the + * distinction with a marker scheme instead: write an explicit empty array as a + * single-element marker decoded back to `[]` on read, and an absent field as + * a bare value that decodes as absent. Either scheme * satisfies the contract. Net effect: `{keywords: []}` round-trips * byte-identically to itself instead of collapsing to `{}` (canonical-JSON / * graphHash distinction preserved on every backend). Enforced end-to-end by @@ -333,13 +328,12 @@ export function booleanOrNull(v: unknown): boolean | null { * * **Preserve `[]` distinct from absent.** Returning a typed `[]` on an * empty-array input (rather than `null`) carries the "explicit empty" - * signal into each adapter's writer. SQLite `TEXT[]` stores a 0-length - * literal natively; lbug `STRING[]` cannot (it collapses `[]` to NULL on - * write), so the graph-db adapter substitutes an empty-array marker on the - * way in and decodes it back on the way out — see `encodeNodeCol` + - * `setStringArrayFieldGd` in `graphdb-adapter.ts`. The symmetric reader - * change in `duckdb-adapter.ts:setStringArrayField` and - * `analyze.ts:stringArrayField` re-attaches `[]` instead of dropping the + * signal into each adapter's writer. `SqliteStore` stores the array in the + * JSON `payload` column, so a 0-length list round-trips natively; a community + * graph adapter whose column type cannot hold an empty list (e.g. 
a typed + * `STRING[]` column that collapses `[]` to NULL on write) must substitute an + * empty-array marker on the way in and decode it back on the way out. The + * `payload`-JSON read path re-attaches `[]` instead of dropping the * field when the read-back array has length zero. Combined, this preserves * the canonical-JSON shape difference between `{keywords: []}` and `{}` * (graphHash content-shape change — see the empty-keywords fixture in diff --git a/packages/storage/src/index.ts b/packages/storage/src/index.ts index 411d8301..a23f16f3 100644 --- a/packages/storage/src/index.ts +++ b/packages/storage/src/index.ts @@ -47,7 +47,6 @@ export { resolveRepoMetaDir, } from "./paths.js"; export { getAllRelationTypes } from "./relations.js"; -export { generateSchemaDDL, type SchemaOptions } from "./schema-ddl.js"; export { assertReadOnlySql, SqlGuardError } from "./sql-guard.js"; export { SqliteStore, type SqliteStoreOptions } from "./sqlite-adapter.js"; export { installSqliteRuntimeGuard } from "./sqlite-runtime.js"; @@ -59,8 +58,8 @@ import { SqliteStore, type SqliteStoreOptions } from "./sqlite-adapter.js"; /** * Combined options accepted by {@link openStore}. Superset of the spec-level * {@link ApiOpenStoreOptions} that adds the SQLite-adapter tuning bag. The - * single-file store replaced the lbug + DuckDB pair (ADR 0019), so the former - * `duckOptions` / `graphDbOptions` per-backend bags are gone. + * single-file store (ADR 0019) has one backend, so the former per-backend + * option bags are gone. */ export interface OpenStoreOptions extends ApiOpenStoreOptions { /** SQLite-adapter tuning (journal mode, busy timeout). */ @@ -68,10 +67,10 @@ export interface OpenStoreOptions extends ApiOpenStoreOptions { } /** - * Resolve the single store file. The whole index now lives in ONE - * `/store.sqlite` (WAL) — there is no graph.lbug / temporal.duckdb - * split. 
The input `path` is the directory anchor (its dirname is the - * `/.codehub/` parent); `:memory:` short-circuits for tests. + * Resolve the single store file. The whole index lives in ONE + * `/store.sqlite` (WAL) — one file, one backend (ADR 0019). The input + * `path` is the directory anchor (its dirname is the `/.codehub/` + * parent); `:memory:` short-circuits for tests. */ function resolveStoreFile(path: string): string { if (path === ":memory:") return ":memory:"; diff --git a/packages/storage/src/interface.test.ts b/packages/storage/src/interface.test.ts index 96d3698e..8435959a 100644 --- a/packages/storage/src/interface.test.ts +++ b/packages/storage/src/interface.test.ts @@ -139,11 +139,11 @@ test("Store alias matches OpenStoreResult composition", () => { const dummy: Store = { graph: undefined as unknown as IGraphStore, temporal: undefined as unknown as ITemporalStore, - graphFile: "/tmp/.codehub/graph.lbug", - temporalFile: "/tmp/.codehub/temporal.duckdb", + graphFile: "/tmp/.codehub/store.sqlite", + temporalFile: "/tmp/.codehub/store.sqlite", close: async () => {}, }; - assert.equal(dummy.graphFile, "/tmp/.codehub/graph.lbug"); - assert.equal(dummy.temporalFile, "/tmp/.codehub/temporal.duckdb"); + assert.equal(dummy.graphFile, "/tmp/.codehub/store.sqlite"); + assert.equal(dummy.temporalFile, "/tmp/.codehub/store.sqlite"); assert.equal(typeof dummy.close, "function"); }); diff --git a/packages/storage/src/interface.ts b/packages/storage/src/interface.ts index ec56d8a3..8b51722c 100644 --- a/packages/storage/src/interface.ts +++ b/packages/storage/src/interface.ts @@ -101,38 +101,15 @@ export type GraphDialect = "cypher"; * interface only. They pair with an {@link ITemporalStore} (always * SQLite-backed by default) for tabular concerns. 
* - * ## v1.0 conformance contract + * ## graphHash parity contract * - * `assertIGraphStoreConformance(name, factory)` from - * `@opencodehub/storage/test-utils` is the formal v1.0 conformance test - * suite for community adapters. A third-party adapter author imports it - * from their own test file: - * - * ```ts - * import { test } from "node:test"; - * import { assertIGraphStoreConformance } from "@opencodehub/storage/test-utils"; - * import { AgeGraphStore } from "../src/age-store.js"; - * - * assertIGraphStoreConformance("Apache AGE", async () => { - * const store = new AgeGraphStore({ pgUrl: "postgresql://..." }); - * await store.open(); - * await store.createSchema(); - * return store; - * }); - * ``` - * - * The suite proves the adapter has byte-identical {@link KnowledgeGraph} - * round-trip via `graphHash`, that `listEdgesByType` agrees with - * `listEdges({types})`, that `traverseAncestors` is a subset of the BFS - * over `listEdges` truncated at the depth bound, that `listNodes` is - * `id ASC` and pages stably, and that `healthCheck` returns `{ok: true}` - * after `open + createSchema`. Vector search is treated as an optional - * capability and skipped cleanly when the adapter throws "not implemented" - * or returns `[]` for a known-non-empty query. - * - * The in-tree adapter (`SqliteStore`) opts into this suite from its own - * test file — any future signature change here MUST keep the conformance - * suite green before landing. + * `assertGraphParity` + `rebuildFromStore` from + * `@opencodehub/storage/test-utils` are the parity primitives an adapter + * must satisfy: a {@link KnowledgeGraph} written and read back through the + * typed finders must reproduce a byte-identical `graphHash`. The in-tree + * adapter (`SqliteStore`) exercises them in `sqlite-parity.test.ts` across + * every node/edge kind + the step-zero sentinel; any future signature + * change here MUST keep that parity suite green before landing. 
*/ export interface IGraphStore { /** @@ -203,7 +180,7 @@ export interface IGraphStore { * discriminator. Unknown kinds yield 0 rows. * - Results are ORDER BY id ASC at the storage layer for cross-adapter * determinism. Adapters apply a lex-stable JS-side tiebreak so the - * output matches byte-for-byte across DuckStore and GraphDbStore. + * output matches byte-for-byte across any conforming adapter. * - Wider polymorphic columns (Dependency `version`/`license`/ * `lockfile_source`/`ecosystem`, ProjectProfile JSON arrays, Repo * fields, etc.) are mapped back onto the typed shape via per-kind diff --git a/packages/storage/src/license.ts b/packages/storage/src/license.ts index 39a57f4b..d69188c0 100644 --- a/packages/storage/src/license.ts +++ b/packages/storage/src/license.ts @@ -1,11 +1,10 @@ /** * License-tier classification — pure, dependency-free. * - * Extracted out of `duckdb-adapter.ts` so consumers (the single-file - * `SqliteStore`, `listDependencies`, the license-audit surface) can use it - * WITHOUT transitively importing `@duckdb/node-api`. That top-level native - * import is exactly what would defeat the lazy-SQLite contract — importing a - * pure helper must never load a native binding. + * Lives in its own pure module so consumers (`SqliteStore`, + * `listDependencies`, the license-audit surface) can use it without pulling + * in any storage-adapter internals. Keeping the classifier dependency-free is + * deliberate — a pure helper must never drag in the store's runtime deps. */ /** diff --git a/packages/storage/src/relations.ts b/packages/storage/src/relations.ts index f1f836a7..e7a2714a 100644 --- a/packages/storage/src/relations.ts +++ b/packages/storage/src/relations.ts @@ -3,10 +3,10 @@ * * The single source of truth for which edge relation types exist, in their * load-bearing order (append new kinds, NEVER reorder — commit diffs and any - * schema emitter depend on the order). 
Lived in `graphdb-schema.ts`; extracted - * here so the single-file `SqliteStore` and the parity tests can reach it - * without importing the lbug-era schema module (deleted in the single-file - * migration). + * schema emitter depend on the order). Extracted into this pure module so the + * single-file `SqliteStore` and the parity tests can reach it directly (the + * prior schema module that once held it was removed in the single-file + * migration, ADR 0019). */ const RELATION_KINDS: readonly string[] = [ "CONTAINS", diff --git a/packages/storage/src/schema-ddl.ts b/packages/storage/src/schema-ddl.ts deleted file mode 100644 index 2122a51c..00000000 --- a/packages/storage/src/schema-ddl.ts +++ /dev/null @@ -1,69 +0,0 @@ -/** - * DDL emitter for the SQLite-backed temporal store. - * - * Two tables only: - * - `cochanges` — file-level association statistics from git history. - * - `symbol_summaries` — structured per-symbol summaries from the - * ingestion summarize phase, keyed by - * `(node_id, content_hash, prompt_version)`. - * - * The graph tier (nodes/edges/embeddings/store_meta) lives in the lbug - * graph artifact; this DDL is intentionally narrow. - */ - -export interface SchemaOptions { - /** - * Retained for API symmetry with the prior multi-tier schema; the - * temporal-only DDL never references it. Callers that supply it pay - * one validation check; omitting it is also accepted. - */ - readonly embeddingDim?: number; -} - -/** - * Returns a sequence of DDL statements that must be executed in order. - */ -export function generateSchemaDDL(_opts: SchemaOptions = {}): readonly string[] { - return [ - // File-level co-change table. The signal is statistical (not deterministic), - // file-granular, and rewrites on every commit. 
- `CREATE TABLE IF NOT EXISTS cochanges ( - source_file TEXT NOT NULL, - target_file TEXT NOT NULL, - cocommit_count INTEGER NOT NULL, - total_commits_source INTEGER NOT NULL, - total_commits_target INTEGER NOT NULL, - last_cocommit_at TIMESTAMP NOT NULL, - lift DOUBLE NOT NULL, - PRIMARY KEY (source_file, target_file) - )`, - - `CREATE INDEX IF NOT EXISTS idx_cochanges_source ON cochanges (source_file)`, - `CREATE INDEX IF NOT EXISTS idx_cochanges_target ON cochanges (target_file)`, - - // Symbol-level structured summaries. Keyed by (node_id, content_hash, - // prompt_version) so prompt iteration and source-text drift don't - // collide. Summaries are side-channel content — they do NOT participate - // in the graph edge set. `structured_json` carries the validated - // structured payload (citations + side_effects + invariants + per-input - // descriptions + returns.details) as a canonical-JSON blob so the - // citation-grounded fields the summarizer validates survive ingestion - // instead of being discarded after `summaryText` / `signatureSummary` / - // `returnsTypeSummary` are extracted. NULL when the producing prompt - // emitted no structured payload (e.g. a pre-structured-summaries row). 
- `CREATE TABLE IF NOT EXISTS symbol_summaries ( - node_id TEXT NOT NULL, - content_hash TEXT NOT NULL, - prompt_version TEXT NOT NULL, - model_id TEXT NOT NULL, - summary_text TEXT NOT NULL, - signature_summary TEXT, - returns_type_summary TEXT, - structured_json TEXT, - created_at TIMESTAMP NOT NULL, - PRIMARY KEY (node_id, content_hash, prompt_version) - )`, - - `CREATE INDEX IF NOT EXISTS idx_summaries_node ON symbol_summaries (node_id)`, - ]; -} diff --git a/packages/storage/src/sqlite-adapter.test.ts b/packages/storage/src/sqlite-adapter.test.ts index 9d9d6e0c..2a012a0f 100644 --- a/packages/storage/src/sqlite-adapter.test.ts +++ b/packages/storage/src/sqlite-adapter.test.ts @@ -5,7 +5,7 @@ * ONE `*.sqlite` file in WAL mode, opened through Node's built-in * `node:sqlite` with zero native dependencies, can back the graph tier * (nodes, edges, traversal), the embedding tier (Float32Array vectors, - * cosine KNN), and the temporal tier — replacing the lbug + DuckDB pair. + * cosine KNN), and the temporal tier — one file, one backend (ADR 0019). * * The acceptance bar: * - a real KnowledgeGraph bulk-loads and round-trips (nodes + edges) from diff --git a/packages/storage/src/sqlite-adapter.ts b/packages/storage/src/sqlite-adapter.ts index 17f7731b..f0fa3471 100644 --- a/packages/storage/src/sqlite-adapter.ts +++ b/packages/storage/src/sqlite-adapter.ts @@ -3,11 +3,10 @@ * * THESIS. One `store.sqlite` file in WAL mode backs EVERYTHING: graph nodes, * edges, embeddings, and the temporal/non-graph tables (cochanges, symbol - * summaries). It replaced the two native-binding engines this project used - * before — `graph.lbug` via @ladybugdb/core + `temporal.duckdb` via - * @duckdb/node-api. Collapsing both onto Node 24's built-in `node:sqlite` - * removed the last two native storage dependencies, which is what unlocked - * the real goal: a zero-dep, one-command, no-Docker install + * summaries). 
It replaced the two native-binding storage engines this project + * used before (see ADR 0019). Collapsing both onto Node 24's built-in + * `node:sqlite` removed the last two native storage dependencies, which is + * what unlocked the real goal: a zero-dep, one-command, no-Docker install * (`npm i -g @opencodehub/cli` and nothing else). * * STATUS. This file implements the FULL {@link IGraphStore} + @@ -20,16 +19,15 @@ * `graphHash`. The node write/read path round-trips the full node object * through a JSON `payload` column (so arbitrary kind-specific fields — and the * `keywords: []`-vs-absent and `languageStats: {}` distinctions canonicalJson - * cares about — survive verbatim). The edge read path mirrors - * `GraphDbStore.listEdgesInternalGd` exactly, including the + * cares about — survive verbatim). The edge read path reproduces the + * canonical edge contract (ADR 0019), including the * {@link stepZeroSentinel} drop, the empty-reason drop, and the * `(from, to, type, id)` sort. Filter-only columns (severity, rule_id, * ecosystem, method, entry_point_id, repo_uri, …) live INSIDE the payload and * are reached via SQLite JSON1 `payload->>'$.field'` extracts. * * NON-GOAL. No backwards compatibility. Clean slate: this adapter assumes a - * fresh index, not a migration of existing `graph.lbug` / `temporal.duckdb` - * artifacts (per the spike brief). + * fresh index, not a migration of any prior-backend artifacts (ADR 0019). */ // Install the experimental-warning guard BEFORE the node:sqlite binding loads. @@ -240,8 +238,8 @@ export class SqliteStore implements IGraphStore, ITemporalStore { PRIMARY KEY (granularity, node_id, chunk_index) ); `); - // BM25 search: an FTS5 virtual table mirroring the THREE columns lbug's - // QUERY_FTS_INDEX indexes — name + signature + description. node_id is + // BM25 search: an FTS5 virtual table over the THREE columns the former + // FTS index covered — name + signature + description. 
node_id is // UNINDEXED (carried for the join back to `nodes`). Populated at bulkLoad // from nodes.name + payload.signature/description. db.exec(` @@ -254,7 +252,7 @@ export class SqliteStore implements IGraphStore, ITemporalStore { ); `); // ── Temporal / non-graph tier — same file, no second engine ── - // Canonical 7-column cochanges shape (matches schema-ddl.ts:30-42). + // Canonical 7-column cochanges shape. // last_cocommit_at is stored as a TEXT ISO-8601 string (SQLite has no // native TIMESTAMP type; the affinity is irrelevant for a TEXT round-trip). db.exec(` @@ -271,7 +269,7 @@ export class SqliteStore implements IGraphStore, ITemporalStore { CREATE INDEX IF NOT EXISTS idx_cochanges_source ON cochanges (source_file); CREATE INDEX IF NOT EXISTS idx_cochanges_target ON cochanges (target_file); `); - // Canonical 9-column symbol_summaries shape (matches schema-ddl.ts:54-67). + // Canonical 9-column symbol_summaries shape. db.exec(` CREATE TABLE IF NOT EXISTS symbol_summaries ( node_id TEXT NOT NULL, @@ -287,8 +285,8 @@ export class SqliteStore implements IGraphStore, ITemporalStore { ); CREATE INDEX IF NOT EXISTS idx_summaries_node ON symbol_summaries (node_id); `); - // Single-row meta table keyed by id=1 (mirrors GraphDbStore's StoreMeta - // {id:1} MERGE pattern). Typed columns so getMeta can re-attach optional + // Single-row meta table keyed by id=1 (keeps the former GraphDbStore + // StoreMeta {id:1} MERGE pattern). Typed columns so getMeta can re-attach optional // fields only when the column is non-null (exactOptional readback). db.exec(` CREATE TABLE IF NOT EXISTS store_meta ( @@ -419,8 +417,8 @@ export class SqliteStore implements IGraphStore, ITemporalStore { } async listNodes(opts: ListNodesOptions = {}): Promise { - // Empty-array short-circuits BEFORE touching the connection (matches - // GraphDbStore.listNodes:1115-1117 — pure-JS contract). 
+ // Empty-array short-circuits BEFORE touching the connection (the + // pure-JS contract carried over from the former GraphDbStore.listNodes). const kinds = opts.kinds; if (kinds !== undefined && kinds.length === 0) return []; const idsRaw = opts.ids; @@ -458,7 +456,7 @@ export class SqliteStore implements IGraphStore, ITemporalStore { const offset = clampNonNegativeInt(opts.offset); const wheres: string[] = ["kind = ?"]; const params: SqlParam[] = [kind]; - // NOTE: GraphDbStore ANDs filePath + filePathLike (impl 1201-1210) even + // NOTE: the former GraphDbStore impl ANDed filePath + filePathLike even // though the interface doc says "exact takes priority" — mirror the IMPL. if (opts.filePath !== undefined) { wheres.push("file_path = ?"); @@ -671,7 +669,8 @@ export class SqliteStore implements IGraphStore, ITemporalStore { counts.set(r.type, typeof r.n === "bigint" ? Number(r.n) : Number(r.n ?? 0)); } // Emit a 0 entry for every requested/all type with no rows (the - // GraphDbStore per-type loop guarantees every input type appears). + // per-type loop guarantees every input type appears — same contract the + // former GraphDbStore honored). for (const t of requested) out.set(t, counts.get(t) ?? 0); return out; } @@ -722,7 +721,7 @@ export class SqliteStore implements IGraphStore, ITemporalStore { ...(step !== undefined ? { step } : {}), }); } - // Final ordering: (from, to, type, id) — byte-for-byte the GraphDbStore key. + // Final ordering: (from, to, type, id) — byte-for-byte the canonical edge key. collected.sort((x, y) => { if (x.from !== y.from) return x.from < y.from ? -1 : 1; if (x.to !== y.to) return x.to < y.to ? -1 : 1; @@ -916,8 +915,8 @@ export class SqliteStore implements IGraphStore, ITemporalStore { vector: Uint8Array; }[]; // VectorResult.distance is a DISTANCE (lower = closer). Cosine distance - // = 1 - cosine similarity, so ranking ascending matches the lbug HNSW - // contract (ORDER BY distance ASC). 
+ // = 1 - cosine similarity, so ranking ascending matches the canonical + // vector-search contract (ORDER BY distance ASC). const scored: VectorResult[] = rows.map((r) => ({ nodeId: r.node_id, distance: 1 - cosine(query, blobToF32(r.vector)), @@ -939,9 +938,9 @@ export class SqliteStore implements IGraphStore, ITemporalStore { } // CRITICAL: SQLite bm25() returns a NEGATIVE number (more-negative = // more-relevant). To expose SearchResult.score as "higher = better" - // (matching lbug's score DESC), set score = -bm25(...) and ORDER BY - // bm25(...) ASC (== score DESC). Tiebreak (id, file_path, name) ASC - // mirrors DuckDbStore.search. + // (the "score DESC" contract the former backends exposed), set + // score = -bm25(...) and ORDER BY bm25(...) ASC (== score DESC). + // Tiebreak (id, file_path, name) ASC keeps ordering deterministic. const sql = "SELECT n.id AS node_id, n.file_path AS file_path, n.name AS name, n.kind AS kind, " + "-bm25(nodes_fts) AS score, bm25(nodes_fts) AS rank " + @@ -973,9 +972,9 @@ export class SqliteStore implements IGraphStore, ITemporalStore { * Reachability traversal as a single recursive CTE. `direction:"down"` * follows outgoing edges (callees / dependencies); `"up"` follows incoming * edges (callers / dependents — the blast-radius direction). Bounded by - * maxDepth so a cyclic graph terminates. This is the LadybugDB-Cypher - * replacement, and the whole reason traversal is feasible without a - * graph engine. + * maxDepth so a cyclic graph terminates. This recursive CTE is what makes + * traversal feasible on plain SQLite — it replaced the Cypher traversal the + * former graph-engine backend relied on. */ async traverse(q: TraverseQuery): Promise { const maxDepth = Math.max(0, Math.floor(q.maxDepth)); @@ -1258,7 +1257,7 @@ export class SqliteStore implements IGraphStore, ITemporalStore { }); db.exec("BEGIN"); try { - // DELETE+INSERT upsert per composite key (mirrors DuckDb's approach). 
+ // DELETE+INSERT upsert per composite key. const del = db.prepare( "DELETE FROM symbol_summaries WHERE node_id = ? AND content_hash = ? AND prompt_version = ?", ); @@ -1446,10 +1445,9 @@ function summaryRowFromRecord(row: Record): SymbolSummaryRow { } /** - * Clamp a number to a non-negative integer. Semantics match - * `clampNonNegativeIntGd` (graphdb-adapter.ts:2202-2207): `undefined` / `null` - * / negative / non-finite → `undefined` (no clause); `0` preserved; else - * `Math.floor`. + * Clamp a number to a non-negative integer. Semantics carried over from the + * former `clampNonNegativeIntGd` helper: `undefined` / `null` / negative / + * non-finite → `undefined` (no clause); `0` preserved; else `Math.floor`. */ function clampNonNegativeInt(v: number | undefined): number | undefined { if (v === undefined || v === null) return undefined; diff --git a/packages/storage/src/test-utils/conformance.ts b/packages/storage/src/test-utils/conformance.ts deleted file mode 100644 index 1114ae1e..00000000 --- a/packages/storage/src/test-utils/conformance.ts +++ /dev/null @@ -1,448 +0,0 @@ -/** - * v1.0 community-adapter conformance suite (architecture-revised.md §AC-A-11). - * - * `assertIGraphStoreConformance(name, factory)` registers a pre-baked set - * of `node:test` blocks that exercise the v1.0 {@link IGraphStore} contract - * end-to-end. A community AGE / Memgraph / Neo4j / Neptune adapter author - * imports this from `@opencodehub/storage/test-utils` and runs it against - * their own implementation: - * - * ```ts - * import { test } from "node:test"; - * import { assertIGraphStoreConformance } from "@opencodehub/storage/test-utils"; - * import { AgeGraphStore } from "../src/age-store.js"; - * - * assertIGraphStoreConformance("Apache AGE", async () => { - * const store = new AgeGraphStore({ pgUrl: "postgresql://..." 
}); - * await store.open(); - * await store.createSchema(); - * return store; - * }); - * ``` - * - * Pass = the adapter has byte-identical {@link graphHash} output AND the - * typed-finder semantics required by every in-tree caller (skeleton/xref - * packs, MCP tools, analysis pipelines). - * - * The suite owns its own minimal fixtures so a community fork does NOT - * inherit a moving target every time the in-tree adapter test files change. - * - * ## Registered tests - * - * 1. `lifecycle: bulkLoad fills counts + healthCheck=ok` — sanity that - * `open` + `createSchema` + `bulkLoad` each return without throwing - * and the resulting store reports `{ok: true}`. - * 2. `parity: rebuildFromStore graphHash byte-identical to fixture` — - * the Liskov contract from {@link rebuildFromStore}. Any adapter that - * passes here is byte-equivalent on the wire to DuckDb + GraphDb. - * 3. `listEdgesByType("CALLS") ≡ listEdges({types:["CALLS"]})` — typed - * shorthand must match the general filter. Catches adapter bugs - * where the two paths diverge on ordering or projection. - * 4. `traverseAncestors invariants` — the result of - * `traverseAncestors({maxDepth: N})` must be a subset of the BFS over - * `listEdges({types})` truncated at depth N, plus the start node is - * excluded and depth/path fields are well-formed. - * 5. `listNodes ordering + paging` — `id ASC` order across two writes, - * and `limit + offset` pages line up with the full-list slice. - * 6. `vectorSearch (optional)` — if the adapter implements vector search, - * assert ordered results; cleanly skipped via `t.skip()` when the - * adapter throws "vectorSearch not implemented", returns an empty - * array for a known-non-empty input, or the in-tree HNSW extension - * is unavailable. See {@link assertIGraphStoreConformance} JSDoc on - * skip semantics. - * - * Every block opens a fresh adapter via `factory()`. 
The factory is - * expected to return an `IGraphStore` that has already had `open()` and - * `createSchema()` called — the suite only owns the bulk-load → assert → - * close sequence so adapters with bespoke open requirements (custom - * connection strings, auth tokens, schema namespaces) stay decoupled - * from this file. - */ - -import assert from "node:assert/strict"; -import { test } from "node:test"; -import { - type CodeRelation, - type GraphNode, - graphHash, - KnowledgeGraph, - makeNodeId, - type NodeId, -} from "@opencodehub/core-types"; -import type { IGraphStore } from "../interface.js"; -import { rebuildFromStore } from "./parity-harness.js"; - -/** - * Minimal File + Function + CALLS chain fixture used by every conformance - * test block. Kept small (8 functions, two files) so an adapter under test - * does not pay a heavy ingestion cost; large enough to exercise paging, - * ordering, and a non-trivial CALLS chain for traversal. - * - * The ids are content-derived via {@link makeNodeId} so two independent - * builds produce byte-identical id strings — required for the parity - * round-trip + `listNodes id ASC` determinism asserts. - */ -function buildConformanceFixture(): KnowledgeGraph { - const g = new KnowledgeGraph(); - - const fileA = makeNodeId("File", "src/a.ts", "a.ts"); - const fileB = makeNodeId("File", "src/b.ts", "b.ts"); - g.addNode({ id: fileA, kind: "File", name: "a.ts", filePath: "src/a.ts" }); - g.addNode({ id: fileB, kind: "File", name: "b.ts", filePath: "src/b.ts" }); - - const funcs: NodeId[] = []; - for (let i = 0; i < 8; i += 1) { - const file = i % 2 === 0 ? 
"src/a.ts" : "src/b.ts"; - const id = makeNodeId("Function", file, `fn_${i}`, { parameterCount: i % 3 }); - funcs.push(id); - g.addNode({ - id, - kind: "Function", - name: `fn_${i}`, - filePath: file, - startLine: 10 + i, - endLine: 20 + i, - signature: `function fn_${i}()`, - parameterCount: i % 3, - isExported: i % 2 === 0, - }); - } - - // DEFINES from each file to its functions. - for (let i = 0; i < funcs.length; i += 1) { - const from = i % 2 === 0 ? fileA : fileB; - g.addEdge({ from, to: funcs[i] as NodeId, type: "DEFINES", confidence: 1.0 }); - } - // CALLS chain fn_0 -> fn_1 -> ... -> fn_7. Used by traverseAncestors. - for (let i = 0; i + 1 < funcs.length; i += 1) { - g.addEdge({ - from: funcs[i] as NodeId, - to: funcs[i + 1] as NodeId, - type: "CALLS", - confidence: 0.9, - }); - } - - return g; -} - -/** - * Detect adapters that can't run the vector-search test under the suite's - * default 4-dim probe. Any of these signals is honoured: - * - * - throw an error whose message contains "not implemented" (the AGE - * reference fork uses `"vectorSearch not implemented"`); OR - * - throw an error whose message contains "dimension mismatch" — the - * adapter is healthy but configured for a different embedding width - * (the in-tree default is 768) and the conformance suite uses a 4-dim - * probe vector to avoid pulling in real embeddings; OR - * - return an empty result set for a known-non-empty query (this is the - * in-tree DuckDb behaviour when the optional `hnsw_acorn` extension - * is absent — `getExtensionWarning()` reports `"No HNSW…"` and - * `vectorSearch` returns `[]`). - * - * All three signals fall through into a clean `t.skip(...)` so the - * conformance suite stays green across dev-box / container / CI matrices - * that may or may not ship the HNSW extension binaries — and across - * adapter authors who configure embedding width at construction time. 
- */ -const VECTOR_SEARCH_UNAVAILABLE_HINT = - "skipping: adapter reports vectorSearch is not implemented, its embedding width " + - "differs from the 4-dim probe, or the HNSW backend is unavailable"; - -function isVectorSkipError(err: unknown): boolean { - const message = (err as { message?: unknown } | null)?.message; - if (typeof message !== "string") return false; - return /not implemented/i.test(message) || /dimension mismatch/i.test(message); -} - -/** - * v1.0 community-adapter conformance suite (architecture-revised.md - * §AC-A-11). Registers `node:test` blocks that prove a third-party - * `IGraphStore` adapter satisfies the v1.0 contract under a shared - * fixture set. - * - * The suite calls `factory()` per test block so each block owns a fresh - * adapter and there is no test-ordering coupling. The factory is expected - * to return an adapter that has already had `open() + createSchema()` - * called — the suite owns the bulk-load → assert → close sequence only. - * - * ## Skip semantics (vector search) - * - * The optional vector-search test cleanly skips when the adapter: - * - * - throws an error whose message contains "not implemented"; OR - * - returns an empty array for a known-non-empty query (matches the - * in-tree DuckDb behaviour when the optional HNSW extension binaries - * are unavailable — see `DuckDbStore.getExtensionWarning`). - * - * Adapter authors with no vector capability at all can throw - * `new Error("vectorSearch not implemented")` from their stub and the - * suite passes without intervention. - * - * @param name - Human-readable adapter name (used as test prefix). - * @param factory - Async factory returning a fresh, opened adapter - * (post `open() + createSchema()`). - */ -export function assertIGraphStoreConformance( - name: string, - factory: () => Promise, -): void { - // --------------------------------------------------------------------- - // 1. 
Lifecycle — bulkLoad + healthCheck - // --------------------------------------------------------------------- - test(`[conformance:${name}] lifecycle: bulkLoad reports counts and healthCheck is ok`, async () => { - const store = await factory(); - try { - const fixture = buildConformanceFixture(); - const stats = await store.bulkLoad(fixture); - assert.equal( - stats.nodeCount, - fixture.nodeCount(), - "bulkLoad.nodeCount must equal the source graph nodeCount()", - ); - assert.equal( - stats.edgeCount, - fixture.edgeCount(), - "bulkLoad.edgeCount must equal the source graph edgeCount()", - ); - const health = await store.healthCheck(); - assert.equal(health.ok, true, "healthCheck must report ok=true after bulkLoad"); - } finally { - await store.close(); - } - }); - - // --------------------------------------------------------------------- - // 2. Parity — rebuildFromStore graphHash byte-identity (Liskov contract) - // --------------------------------------------------------------------- - test(`[conformance:${name}] parity: rebuildFromStore graphHash byte-identical to fixture`, async () => { - const store = await factory(); - try { - const fixture = buildConformanceFixture(); - const original = graphHash(fixture); - await store.bulkLoad(fixture); - const rebuilt = await rebuildFromStore(store); - const got = graphHash(rebuilt); - assert.equal( - got, - original, - `[${name}] round-trip broke graphHash\n original: ${original}\n rebuilt: ${got}`, - ); - } finally { - await store.close(); - } - }); - - // --------------------------------------------------------------------- - // 3. 
listEdgesByType ≡ listEdges({types: [t]}) - // --------------------------------------------------------------------- - test(`[conformance:${name}] listEdgesByType("CALLS") matches listEdges({types:["CALLS"]})`, async () => { - const store = await factory(); - try { - await store.bulkLoad(buildConformanceFixture()); - const viaShorthand = await store.listEdgesByType("CALLS"); - const viaFilter = await store.listEdges({ types: ["CALLS"] }); - assert.equal( - viaShorthand.length, - viaFilter.length, - `[${name}] listEdgesByType count must equal listEdges({types}) count`, - ); - // Compare canonical id-tuples to avoid coupling to undefined-vs-absent - // field differences in the wider edge shape — the contract is "same - // edges, same order". - const tuple = (e: CodeRelation): string => `${e.from}${e.to}${e.type}`; - assert.deepEqual( - viaShorthand.map(tuple), - viaFilter.map(tuple), - `[${name}] listEdgesByType must agree with listEdges({types}) on order + identity`, - ); - // Sanity: every returned edge actually has type=CALLS — guards against - // an adapter that ignores the filter and returns the full edge set. - for (const e of viaShorthand) { - assert.equal(e.type, "CALLS", `[${name}] listEdgesByType returned non-CALLS edge`); - } - } finally { - await store.close(); - } - }); - - // --------------------------------------------------------------------- - // 4. traverseAncestors — invariants vs hand-rolled BFS over listEdges - // --------------------------------------------------------------------- - test(`[conformance:${name}] traverseAncestors matches BFS over listEdges`, async () => { - const store = await factory(); - try { - await store.bulkLoad(buildConformanceFixture()); - - // The CALLS chain is fn_0 -> fn_1 -> ... -> fn_7. Pick fn_3 as the - // start id; ancestors at maxDepth=2 should be fn_2 (depth 1) and - // fn_1 (depth 2). fn_0 must NOT appear at depth=2. 
- const fn3Id = makeNodeId("Function", "src/b.ts", "fn_3", { parameterCount: 0 }); - - const result = await store.traverseAncestors({ - fromId: fn3Id, - edgeTypes: ["CALLS"], - maxDepth: 2, - }); - - // Hand-rolled BFS over listEdges so we are not coupled to the - // adapter's recursive query implementation. - const allCalls = await store.listEdges({ types: ["CALLS"] }); - const reverseAdj = new Map(); - for (const e of allCalls) { - const bucket = reverseAdj.get(e.to) ?? []; - bucket.push(e.from); - reverseAdj.set(e.to, bucket); - } - const expected = new Map(); - const queue: { id: string; depth: number }[] = [{ id: fn3Id, depth: 0 }]; - while (queue.length > 0) { - const head = queue.shift(); - if (!head) break; - if (head.depth >= 2) continue; - for (const ancestor of reverseAdj.get(head.id) ?? []) { - if (expected.has(ancestor)) continue; - expected.set(ancestor, head.depth + 1); - queue.push({ id: ancestor, depth: head.depth + 1 }); - } - } - - // Start node must be excluded. - for (const r of result) { - assert.notEqual(r.nodeId, fn3Id, `[${name}] start node leaked into traverseAncestors`); - } - // Every result row must appear in `expected` at the same depth bound. - const got = new Map(); - for (const r of result) got.set(r.nodeId, r.depth); - assert.equal( - got.size, - expected.size, - `[${name}] traverseAncestors size mismatch: got=${got.size}, expected=${expected.size}`, - ); - for (const [id, depth] of expected) { - assert.equal( - got.get(id), - depth, - `[${name}] traverseAncestors depth mismatch for ${id}: got=${got.get(id)}, expected=${depth}`, - ); - } - // depth + path fields well-formed (depth >= 1, path non-empty array). - for (const r of result) { - assert.ok(r.depth >= 1, `[${name}] traverseAncestors depth must be >=1`); - assert.ok(Array.isArray(r.path), `[${name}] traverseAncestors path must be an array`); - } - } finally { - await store.close(); - } - }); - - // --------------------------------------------------------------------- - // 5. 
listNodes — ordering + paging - // --------------------------------------------------------------------- - test(`[conformance:${name}] listNodes id-ASC ordering and limit/offset paging`, async () => { - const store = await factory(); - try { - await store.bulkLoad(buildConformanceFixture()); - const all = await store.listNodes(); - const ids = all.map((n: GraphNode) => n.id); - const sorted = [...ids].sort(); - assert.deepEqual(ids, sorted, `[${name}] listNodes must return rows ordered by id ASC`); - assert.ok(ids.length >= 4, `[${name}] fixture must have >=4 nodes for paging assertion`); - - const firstPage = await store.listNodes({ limit: 2 }); - const secondPage = await store.listNodes({ limit: 2, offset: 2 }); - assert.deepEqual( - firstPage.map((n: GraphNode) => n.id), - ids.slice(0, 2), - `[${name}] listNodes(limit=2) must equal first two rows of full list`, - ); - assert.deepEqual( - secondPage.map((n: GraphNode) => n.id), - ids.slice(2, 4), - `[${name}] listNodes(limit=2, offset=2) must equal rows [2,4) of full list`, - ); - } finally { - await store.close(); - } - }); - - // --------------------------------------------------------------------- - // 6. vectorSearch — optional capability - // --------------------------------------------------------------------- - test(`[conformance:${name}] vectorSearch returns ordered results when capability is present`, async (t) => { - const store = await factory(); - try { - const g = new KnowledgeGraph(); - const ids: NodeId[] = []; - const vectors: readonly (readonly number[])[] = [ - [1.0, 0.0, 0.0, 0.0], - [0.9, 0.1, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - ]; - for (let i = 0; i < vectors.length; i += 1) { - const id = makeNodeId("File", `src/f${i}.ts`, `f${i}`); - ids.push(id); - g.addNode({ id, kind: "File", name: `f${i}`, filePath: `src/f${i}.ts` }); - } - await store.bulkLoad(g); - - // Adapters that don't implement vector search may throw on upsert OR - // on the search call itself. 
Both pathways funnel into the same skip. - try { - await store.upsertEmbeddings( - ids.map((id, i) => ({ - nodeId: id, - chunkIndex: 0, - vector: new Float32Array(vectors[i] ?? []), - contentHash: `h${i}`, - })), - ); - } catch (err) { - if (isVectorSkipError(err)) { - t.skip(VECTOR_SEARCH_UNAVAILABLE_HINT); - return; - } - throw err; - } - - let hits: readonly { readonly nodeId: string; readonly distance: number }[]; - try { - hits = await store.vectorSearch({ - vector: new Float32Array([1.0, 0.0, 0.0, 0.0]), - limit: 2, - }); - } catch (err) { - if (isVectorSkipError(err)) { - t.skip(VECTOR_SEARCH_UNAVAILABLE_HINT); - return; - } - throw err; - } - - // Empty result on a known-non-empty input means the optional HNSW - // extension is disabled — skip rather than fail. This is the in-tree - // DuckDb behaviour when neither hnsw_acorn nor vss is available. - if (hits.length === 0) { - t.skip(VECTOR_SEARCH_UNAVAILABLE_HINT); - return; - } - - assert.ok(hits.length >= 1, `[${name}] vectorSearch must return at least one row`); - // Nearest first — the identical vector at index 0 is expected to be - // the top hit, but adapters with approximate-only HNSW may flip - // ties. Assert ordering by distance ASC instead. - for (let i = 1; i < hits.length; i += 1) { - const prev = hits[i - 1]; - const curr = hits[i]; - if (!prev || !curr) continue; - assert.ok( - prev.distance <= curr.distance, - `[${name}] vectorSearch results must be ordered by distance ASC: ${prev.distance} > ${curr.distance}`, - ); - } - } finally { - await store.close(); - } - }); -} diff --git a/packages/storage/src/test-utils/index.ts b/packages/storage/src/test-utils/index.ts index 2c2db3ba..0a662eb9 100644 --- a/packages/storage/src/test-utils/index.ts +++ b/packages/storage/src/test-utils/index.ts @@ -1,18 +1,14 @@ /** * `@opencodehub/storage/test-utils` barrel. * - * Public entry point for adapter conformance testing. 
Third-party - * `IGraphStore` adapter authors (community AGE / Memgraph / Neo4j / - * Neptune forks) import {@link assertIGraphStoreConformance} from here and - * run it against their own implementation to prove they satisfy the v1.0 - * graphHash byte-identity + typed-finder contract. - * - * {@link assertGraphParity} + {@link rebuildFromStore} are the lower-level - * primitives that the conformance suite is built on; they are re-exported - * for adapter authors who want to compose their own bespoke checks. + * Public entry point for adapter parity testing. {@link assertGraphParity} + * proves an adapter round-trips a `KnowledgeGraph` to a byte-identical + * `graphHash`; {@link rebuildFromStore} reconstructs the graph from the + * stored rows via the typed finders. In-tree, `sqlite-parity.test.ts` uses + * both to pin the graphHash determinism invariant; a third-party SQL-shaped + * adapter fork can compose the same checks against its own implementation. */ -export { assertIGraphStoreConformance } from "./conformance.js"; export { applyRepoNullables, assertGraphParity, diff --git a/packages/summarizer/README.md b/packages/summarizer/README.md index 5946f723..97f2164e 100644 --- a/packages/summarizer/README.md +++ b/packages/summarizer/README.md @@ -47,7 +47,7 @@ console.log(`attempts=${result.attempts} cacheRead=${result.usageByAttempt[0].ca - **Ingestion call site:** `packages/ingestion/src/pipeline/phases/summarize.ts` invokes `summarizeSymbol` once per high-confidence callable (SCIP-backed Function / Method / Class), gated by `--summaries` + budget + offline flags. - Results land in the `symbol_summaries` DuckDB table (see + Results land in the `symbol_summaries` table in store.sqlite (see `packages/storage/src/schema-ddl.ts`); they never mutate graph nodes or edges. 
- **Retrieval site:** `packages/ingestion/src/pipeline/phases/embeddings.ts` fuses each summary into the symbol-tier embedding (`signature + summary + diff --git a/packages/wiki/src/index.test.ts b/packages/wiki/src/index.test.ts index 95254c77..4456ad98 100644 --- a/packages/wiki/src/index.test.ts +++ b/packages/wiki/src/index.test.ts @@ -1,6 +1,6 @@ /** * Wiki generation tests — confirm the deterministic-output + success-criteria - * contract without spinning up DuckDB. + * contract without spinning up the store. * * `WikiFakeStore` implements `IGraphStore` finder methods directly * over in-memory `nodes` + `edges` arrays. Every helper in diff --git a/plugins/opencodehub/skills/codehub-code-pack/SKILL.md b/plugins/opencodehub/skills/codehub-code-pack/SKILL.md index 9ccfea7f..2e28ee88 100644 --- a/plugins/opencodehub/skills/codehub-code-pack/SKILL.md +++ b/plugins/opencodehub/skills/codehub-code-pack/SKILL.md @@ -21,7 +21,7 @@ model: sonnet Surface the `pack_codebase` MCP tool to a Claude Code agent. Produces a **deterministic, 9-item Bill of Materials (BOM)** at `/.codehub/packs//` that is byte-identical given the same `(commit, tokenizer, budget, -chonkie_version, duckdb_version, grammar_commits)`. The pack is the +chonkie_version, grammar_commits)`. The pack is the durable artifact agents hand to long-context LLMs, archive in S3 for later replay, or diff between commits to prove invariants did not change. @@ -80,8 +80,8 @@ input that the pack emitter looked at is identical. The manifest schema is fixed at `schemaVersion: 1`. Required fields: `commit`, `repoOriginUrl`, `tokenizerId`, `determinismClass`, -`budgetTokens`, `pins` (`chonkieVersion`, `duckdbVersion`, -`grammarCommits`), `files[]`, `packHash`, `schemaVersion`. +`budgetTokens`, `pins` (`chonkieVersion`, `grammarCommits`), `files[]`, +`packHash`, `schemaVersion`. ## Group mode @@ -108,7 +108,7 @@ verbatim when surfacing the pack to the user. 
| Class | Meaning | When emitted | |-------|---------|--------------| -| `strict` | Same `(commit, tokenizer, budget, chonkieVersion, duckdbVersion, grammarCommits)` → same `packHash`. The full reproducibility contract holds. | Default path: chonkie native binding loaded, deterministic tokenizer (e.g. local HF tokenizer with pinned hash). | +| `strict` | Same `(commit, tokenizer, budget, chonkieVersion, grammarCommits)` → same `packHash`. The full reproducibility contract holds. | Default path: chonkie native binding loaded, deterministic tokenizer (e.g. local HF tokenizer with pinned hash). | | `best_effort` | The tokenizer is an Anthropic API tokenizer (Claude family) — Anthropic may rotate the tokenizer pin behind the model name. Other inputs are still strictly pinned, but a future tokenizer rotation can change the output. | When `tokenizerId` resolves to a Claude model. The BOM verifier MUST warn callers checking byte-identity. | | `degraded` | A primitive fallback was used (e.g. line-split chunker because `@chonkiejs/core` failed to load). The pack is still self-consistent and re-runs match locally, but **does not** match a `strict` pack on a different machine. | When chonkie native binding is unavailable on CI platform. | @@ -136,9 +136,9 @@ file. A caller proves byte-identity by re-running and diffing: ```bash -# 1. Pin the environment so chonkie/duckdb match. +# 1. Pin the environment so chonkie matches. node --version -cat packages/pack/package.json | jq '.dependencies."@chonkiejs/core", .dependencies."@duckdb/node-api"' +cat packages/pack/package.json | jq '.dependencies."@chonkiejs/core"' # 2. Run the pack twice with identical args. codehub code-pack --budget 200000 --tokenizer cl100k_base --out /tmp/packA @@ -172,7 +172,7 @@ identical: been re-analyzed under you (an `analyze` invalidates the previous pack's `commit` field). 3. If `pins` differs, the local toolchain has changed — pin - `@chonkiejs/core` and `@duckdb/node-api` in `package.json`. 
+ `@chonkiejs/core` in `package.json`. 4. If only `files[i].fileHash` differs for a single BOM item, that item's emitter has a determinism bug; raise it in the determinism suite under `packages/pack/src/`. diff --git a/plugins/opencodehub/skills/codehub-code-pack/references/determinism-contract.md b/plugins/opencodehub/skills/codehub-code-pack/references/determinism-contract.md index 3da92089..577487e8 100644 --- a/plugins/opencodehub/skills/codehub-code-pack/references/determinism-contract.md +++ b/plugins/opencodehub/skills/codehub-code-pack/references/determinism-contract.md @@ -26,12 +26,12 @@ identical output: ## Invariants - **graphHash byte-identity** holds before and after every pack- - affecting commit — the `DuckDbStore` / `GraphDbStore` parity suite - stays green. + affecting commit — the cross-adapter graphHash parity suite stays + green. - **packHash byte-identity** — same - `(commit, tokenizer, budget, chonkie_version, duckdb_version, - grammar_commits)` → same `packHash`. Verified by the determinism - suite at `packages/pack/src/pack-determinism.test.ts`. + `(commit, tokenizer, budget, chonkie_version, grammar_commits)` → + same `packHash`. Verified by the determinism suite at + `packages/pack/src/pack-determinism.test.ts`. - **No banned literals** in tracked source — `bash scripts/check-banned-strings.sh` exits 0 post-commit. - **`mise run check`** exits 0 after every commit. @@ -56,7 +56,7 @@ identical output: on every file under the output directory). - `manifest.json` carries `{commit, repo_origin_url, tokenizer_id, determinism_class, - budget_tokens, grammar_commits, chonkie_version, duckdb_version, + budget_tokens, grammar_commits, chonkie_version, files[], pack_hash}` with `pack_hash = sha256(canonicalJson(all-other-fields))`. 
- PageRank is computed at request time from the loaded @@ -81,7 +81,7 @@ identical output: - No LLM calls in `@opencodehub/pack` (enforced by `scripts/check-banned-strings.sh`-style audit + a `no-bedrock-outside-summarizer` test). -- No writer metadata (DuckDB `created_by`, chonkie writer tags) as +- No backend writer metadata (e.g. chonkie writer tags) as top-level fields in `manifest.json` — all tool-version pins live in a single nested `pins: {}` object so the BOM schema is stable across tool upgrades. @@ -125,7 +125,7 @@ in TS) takes one of three values: | Class | Trigger | Implication | |-------|---------|-------------| -| `strict` | None of the degraded triggers fire | The byte-identity invariant holds in full: same `(commit, tokenizer, budget, chonkie_version, duckdb_version, grammar_commits)` → same `pack_hash`. | +| `strict` | None of the degraded triggers fire | The byte-identity invariant holds in full: same `(commit, tokenizer, budget, chonkie_version, grammar_commits)` → same `pack_hash`. | | `best_effort` | `tokenizer_id` resolves to a Claude model | The verifier MUST warn callers checking byte-identity. | | `degraded` | `@chonkiejs/core` native binding fails to load | Line-split fallback used; pack still self-consistent locally but not portable. | @@ -140,7 +140,7 @@ suite. When debugging a `pack_hash` drift: 1. Re-run with `engine: "pack"` and capture both manifests. -2. Compare `pins` first — a chonkie or duckdb upgrade in node_modules +2. Compare `pins` first — a chonkie upgrade in node_modules is the most common cause. 3. Compare `files[i].file_hash` row-by-row — the first mismatch identifies which BOM emitter is non-deterministic. 
diff --git a/plugins/opencodehub/skills/codehub-debugging/SKILL.md b/plugins/opencodehub/skills/codehub-debugging/SKILL.md index 3959ad79..a99f76c8 100644 --- a/plugins/opencodehub/skills/codehub-debugging/SKILL.md +++ b/plugins/opencodehub/skills/codehub-debugging/SKILL.md @@ -86,20 +86,20 @@ Two-hop upstream trace for every caller of `validatePayment`: ```sql WITH direct AS ( - SELECT from_id, to_id, 1 AS depth - FROM relations + SELECT src, dst, 1 AS depth + FROM edges WHERE type = 'CALLS' - AND to_id IN (SELECT id FROM nodes WHERE name = 'validatePayment' AND kind = 'Function') + AND dst IN (SELECT id FROM nodes WHERE name = 'validatePayment' AND kind = 'Function') ), indirect AS ( - SELECT r.from_id, d.to_id, 2 AS depth - FROM relations r - JOIN direct d ON d.from_id = r.to_id - WHERE r.type = 'CALLS' + SELECT e.src, d.dst, 2 AS depth + FROM edges e + JOIN direct d ON d.src = e.dst + WHERE e.type = 'CALLS' ) SELECT caller.name, caller.file_path, caller.start_line, u.depth FROM (SELECT * FROM direct UNION ALL SELECT * FROM indirect) u -JOIN nodes caller ON caller.id = u.from_id +JOIN nodes caller ON caller.id = u.src ORDER BY u.depth ASC, caller.name; ``` diff --git a/plugins/opencodehub/skills/codehub-document/references/data-source-map.md b/plugins/opencodehub/skills/codehub-document/references/data-source-map.md index 44fe433b..ca141a88 100644 --- a/plugins/opencodehub/skills/codehub-document/references/data-source-map.md +++ b/plugins/opencodehub/skills/codehub-document/references/data-source-map.md @@ -11,7 +11,7 @@ graph_hash: ## Repo profile # from project_profile - languages: TypeScript 87%, Rust 11%, Python 2% -- stacks: Node 22, pnpm 10, DuckDB, Vitest +- stacks: Node 24, pnpm 10, SQLite (node:sqlite), Vitest - entry points: packages/mcp/src/index.ts, packages/cli/src/bin.ts ## Top communities (≤ 10) # from sql: SELECT name, inferred_label, cohesion, symbol_count @@ -126,7 +126,7 @@ Steps marked `# wave 0a` and `# wave 0b` each run as a single 
parallel tool-use # wave 0b — depends on schema + profile (one parallel batch) 11. communities = sql("SELECT … FROM nodes WHERE kind='Community' …") 12. processes = sql("SELECT … FROM nodes WHERE kind='Process' …") -13. relations = sql("SELECT … FROM relations …") # for diagrams +13. edges = sql("SELECT … FROM edges …") # for diagrams 14. top_folders = top-5 folders by file count (from profile.entryPoints + glob) 15. owners_summary = [owners({path}) for path in top_folders] 16. if --group: group_hits = group_query({group, canonical_terms}) diff --git a/plugins/opencodehub/skills/codehub-document/references/document-templates.md b/plugins/opencodehub/skills/codehub-document/references/document-templates.md index c4d22c3e..6ff0e4de 100644 --- a/plugins/opencodehub/skills/codehub-document/references/document-templates.md +++ b/plugins/opencodehub/skills/codehub-document/references/document-templates.md @@ -23,7 +23,7 @@ Cites `packages/foo/src/index.ts` (200 LOC) style file references. | Layer | Technology | Source | |---|---|---| | Runtime | Node 22 | `package.json:7` | -| Storage | DuckDB + hnsw_acorn | `packages/storage/src/index.ts:12` | +| Storage | SQLite (single-file, node:sqlite) — FTS5 + vector KNN | `packages/storage/src/index.ts:12` | | ... | ... | ... 
| ## Module map diff --git a/plugins/opencodehub/skills/codehub-document/references/mermaid-patterns.md b/plugins/opencodehub/skills/codehub-document/references/mermaid-patterns.md index ae047e3d..128c2059 100644 --- a/plugins/opencodehub/skills/codehub-document/references/mermaid-patterns.md +++ b/plugins/opencodehub/skills/codehub-document/references/mermaid-patterns.md @@ -14,11 +14,11 @@ flowchart LR core[Core types] ingestion[Ingestion DAG] storage[Storage] - duckdb[(DuckDB)]:::external + sqlite[(store.sqlite)]:::external mcp --> core ingestion --> core ingestion --> storage - storage --> duckdb + storage --> sqlite classDef external stroke-dasharray: 3 3 ``` @@ -104,19 +104,19 @@ For `architecture/data-flow.md`. flowchart TB source[Repo files] parse[tree-sitter parser] - graph[DuckDB graph] + store[(store.sqlite)] embed[ONNX embedder] query[MCP query] source --> parse - parse --> graph + parse --> store parse --> embed - embed --> graph - query --> graph + embed --> store + query --> store ``` **Rules:** - Top-to-bottom flow (`TB`). -- Stores as rectangular bracket nodes; processes as simple bracket nodes; external interfaces as parenthesized nodes. +- Stores as cylinder nodes `store[(name)]`; processes as simple bracket nodes; external interfaces as parenthesized nodes. 
## Cross-repo portfolio — `flowchart LR` diff --git a/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-components.md b/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-components.md index 517c3959..86a432dd 100644 --- a/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-components.md +++ b/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-components.md @@ -26,7 +26,7 @@ Produce `{{ docs_root }}/diagrams/architecture/components.md`: a single Mermaid | Shared context | `Read {{ context_path }}` | always first | | Prefetch ledger | `Read {{ prefetch_path }}` | always first | | Top communities | `{{ context_path }} § Top communities` | cached | -| Community relations | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT source, target, kind FROM relations WHERE kind IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | +| Community relations | `{{ prefetch_path }} § sql edges` or `mcp__codehub__sql({query: "SELECT src, dst, type FROM edges WHERE type IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | | Component method list | `mcp__codehub__context({symbol: })` per top 8 | mid-run | ## 4. 
Process diff --git a/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md b/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md index 2b629074..c930aad9 100644 --- a/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md +++ b/plugins/opencodehub/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md @@ -26,7 +26,7 @@ Produce `{{ docs_root }}/diagrams/structural/dependency-graph.md`: a single Merm | Shared context | `Read {{ context_path }}` | always first | | Prefetch ledger | `Read {{ prefetch_path }}` | always first | | Top communities | `{{ context_path }} § Top communities` | cached | -| Internal edges | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT source, target, kind FROM relations WHERE kind IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | +| Internal edges | `{{ prefetch_path }} § sql edges` or `mcp__codehub__sql({query: "SELECT src, dst, type FROM edges WHERE type IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | | External dependencies | `{{ context_path }} § Stack` or `mcp__codehub__dependencies({repo: "{{ repo }}"})` | cached if digest present; mid-run otherwise | ## 4. 
Process diff --git a/plugins/opencodehub/skills/codehub-exploring/SKILL.md b/plugins/opencodehub/skills/codehub-exploring/SKILL.md index f1e8f7d5..0adda161 100644 --- a/plugins/opencodehub/skills/codehub-exploring/SKILL.md +++ b/plugins/opencodehub/skills/codehub-exploring/SKILL.md @@ -74,14 +74,14 @@ When a name is ambiguous, `context` returns a ranked candidate list instead of s ### `mcp__codehub__sql` — trace a named process end-to-end ```sql -SELECT r.step, callee.name, callee.file_path, callee.start_line -FROM relations r -JOIN nodes proc ON proc.id = r.from_id -JOIN nodes callee ON callee.id = r.to_id -WHERE r.type = 'PROCESS_STEP' +SELECT e.step, callee.name, callee.file_path, callee.start_line +FROM edges e +JOIN nodes proc ON proc.id = e.src +JOIN nodes callee ON callee.id = e.dst +WHERE e.type = 'PROCESS_STEP' AND proc.kind = 'Process' AND proc.name = 'CheckoutFlow' -ORDER BY r.step ASC; +ORDER BY e.step ASC; ``` ## Cross-repo exploration diff --git a/plugins/opencodehub/skills/codehub-guide/SKILL.md b/plugins/opencodehub/skills/codehub-guide/SKILL.md index e35df607..625bef32 100644 --- a/plugins/opencodehub/skills/codehub-guide/SKILL.md +++ b/plugins/opencodehub/skills/codehub-guide/SKILL.md @@ -5,7 +5,7 @@ description: "Use when the user asks about OpenCodeHub itself — available MCP # OpenCodeHub Guide -Quick reference for every OpenCodeHub MCP tool, MCP resource, and the graph + temporal store schema. +Quick reference for every OpenCodeHub MCP tool, MCP resource, and the single-file `store.sqlite` schema. ## Always Start Here @@ -59,7 +59,7 @@ standalone artifact producer with its own preconditions and output path. 
| `mcp__codehub__context` | 360-degree symbol view + `confidenceBreakdown` + `cochanges` side-section | | `mcp__codehub__impact` | Blast radius with risk tier + `confidenceBreakdown` | | `mcp__codehub__detect_changes` | Map an uncommitted or committed diff to affected symbols and flows | -| `mcp__codehub__sql` | Read-only query: `sql` arg → temporal DuckDB (cochanges/summaries); `cypher` arg → lbug graph (5 s timeout) | +| `mcp__codehub__sql` | Read-only SQL over the single-file `store.sqlite`: all tables queryable (`nodes`, `edges`, `embeddings`, `cochanges`, `symbol_summaries`, `store_meta`), 5 s timeout. `cypher` arg is reserved for community-fork graph adapters and is unsupported by the default backend | | `mcp__codehub__signature` | Symbol declaration + stubbed members (class/interface header + method/property signatures, bodies elided) | ### HTTP / RPC surface @@ -115,91 +115,121 @@ Lightweight reads for navigation (every URI uses the `codehub://` scheme): | `codehub://repo/{name}/context` | Stats + staleness envelope | | `codehub://repo/{name}/schema` | Live node kinds / relation types for `sql` | -> Cluster and process navigation resources (`codehub://repo/{name}/clusters`, `codehub://repo/{name}/processes`, etc.) are slated for a later wave. Until then, use the typed tools or Cypher (below) filtered to `kind = 'Community'` / `kind = 'Process'`. +> Cluster and process navigation resources (`codehub://repo/{name}/clusters`, `codehub://repo/{name}/processes`, etc.) are slated for a later wave. Until then, use the typed tools or `sql` (below) filtered to `kind = 'Community'` / `kind = 'Process'`. -## Where the graph lives (ADR 0016) +## Where the graph lives (ADR 0019) -There are **two stores**, and they are queried differently: +There is **one store**: a single-file `/.codehub/store.sqlite` (WAL, via +Node's built-in `node:sqlite`). ADR 0019 supersedes ADR 0016: the old +two-tier layout (a `graph.lbug` graph file plus a `temporal.duckdb` file) is +gone. 
One `SqliteStore` class implements both the graph and temporal views over +that single file. -- **Graph tier — `graph.lbug`** (ladybug, Cypher dialect). Holds nodes, edges, - and embeddings. Query it via the typed tools (`query` / `context` / `impact` / - `route_map` / …) or, for bespoke questions, **Cypher** via the MCP `sql` - tool's `cypher` argument. There is NO `nodes` or `relations` SQL table. -- **Temporal tier — `temporal.duckdb`** (DuckDB SQL). Holds only the - `cochanges` and `symbol_summaries` tables. The `sql` argument of the MCP - `sql` tool (and `codehub sql` on the CLI) targets THIS store. +Every table is directly SQL-queryable through the MCP `sql` tool's `sql` +argument (and `codehub sql` on the CLI): `nodes`, `edges`, `embeddings`, +`cochanges`, `symbol_summaries`, and `store_meta`. Query the graph via the typed +tools (`query` / `context` / `impact` / `route_map` / …) for the high-level path, +or write SQL directly against these tables for bespoke questions. Multi-hop graph +traversal is a **recursive SQL CTE over the `edges` table**, not Cypher. +Full-text search is BM25 via SQLite FTS5. -Pass exactly one of `sql` (temporal DuckDB) or `cypher` (lbug graph) to the MCP -`sql` tool. +The `cypher` argument of the `sql` tool is **reserved for community-fork graph +adapters** (AGE / Memgraph / Neo4j / Neptune) and is **not supported by the +default backend**. On the default single-file SQLite backend, always pass `sql`. -### Graph schema (lbug / Cypher) +### Store schema (single-file SQLite) -One node label `CodeNode` carrying `kind` as a **property** (NOT a per-kind -label). One relationship table per relation type. Properties are **snake_case** -(`file_path`, `start_line`, `inferred_label`, `step_count`, `entry_point_id`); -a camelCase RETURN alias comes back as the alias you give it, but the stored -property names are snake_case. +Two universal tables carry the graph. 
`nodes` has base columns +(`id`, `kind`, `name`, `file_path`, `start_line`, `end_line`) plus a `payload` +JSON column holding the kind-specific fields; reach those via SQLite JSON1, +`payload->>'$.field'` (e.g. `payload->>'$.inferredLabel'`, +`payload->>'$.stepCount'`, `payload->>'$.entryPointId'`). `edges` is one +polymorphic table keyed by the `(src, dst, type, step)` dedup tuple, with columns +`id`, `src`, `dst`, `type`, `confidence`, `step`, `reason`. -**Node kinds** (`n.kind` values): File, Folder, Function, Class, Method, +**Node kinds** (`kind` values): File, Folder, Function, Class, Method, Interface, Constructor, Struct, Enum, Macro, Typedef, Union, Namespace, Trait, Impl, TypeAlias, Const, Static, Variable, Property, Record, Delegate, Annotation, Template, Module, CodeElement, Community, Process, Route, Tool, Finding, Dependency, Contributor, Repo, ProjectProfile, Section. -**Relationship types** (each is its own edge label): CONTAINS, DEFINES, IMPORTS, +**Edge types** (`edges.type` values): CONTAINS, DEFINES, IMPORTS, CALLS, EXTENDS, IMPLEMENTS, HAS_METHOD, HAS_PROPERTY, ACCESSES, METHOD_OVERRIDES, OVERRIDES, METHOD_IMPLEMENTS, MEMBER_OF, PROCESS_STEP, HANDLES_ROUTE, FETCHES, HANDLES_TOOL, ENTRY_POINT_OF, WRAPS, QUERIES, REFERENCES, FOUND_IN, DEPENDS_ON, OWNED_BY. -Cochanges live only in the **temporal** `cochanges` table (DuckDB SQL), never as -graph edges. +Cochanges live in the `cochanges` table, never as graph edges. 
-## Cypher cheat-sheet (MCP `sql` tool, `cypher` arg) +## SQL cheat-sheet (MCP `sql` tool, `sql` arg) -All inbound callers of a function by name: +All inbound callers of a function by name (join `edges` back to `nodes`): -```cypher -MATCH (caller:CodeNode)-[r:CALLS]->(callee:CodeNode) -WHERE callee.name = 'validateUser' AND callee.kind = 'Function' -RETURN caller.name AS name, caller.file_path AS file, caller.start_line AS line, - r.confidence AS confidence, r.reason AS reason -ORDER BY r.confidence DESC -LIMIT 50 +```sql +SELECT caller.name AS name, caller.file_path AS file, caller.start_line AS line, + e.confidence AS confidence, e.reason AS reason +FROM edges e +JOIN nodes callee ON callee.id = e.dst +JOIN nodes caller ON caller.id = e.src +WHERE e.type = 'CALLS' + AND callee.name = 'validateUser' AND callee.kind = 'Function' +ORDER BY e.confidence DESC +LIMIT 50; ``` -Top communities by cohesion: +Top communities by cohesion (kind-specific fields via JSON1): -```cypher -MATCH (n:CodeNode) -WHERE n.kind = 'Community' -RETURN n.name AS name, n.inferred_label AS label, n.cohesion AS cohesion, - n.symbol_count AS symbols -ORDER BY n.cohesion DESC -LIMIT 20 +```sql +SELECT name, + payload->>'$.inferredLabel' AS label, + payload->>'$.cohesion' AS cohesion, + payload->>'$.symbolCount' AS symbols +FROM nodes +WHERE kind = 'Community' +ORDER BY cohesion DESC +LIMIT 20; ``` Process entry points: -```cypher -MATCH (n:CodeNode) -WHERE n.kind = 'Process' -RETURN n.name AS name, n.inferred_label AS label, n.step_count AS steps, - n.entry_point_id AS entry_point -ORDER BY n.step_count DESC +```sql +SELECT name, + payload->>'$.inferredLabel' AS label, + payload->>'$.stepCount' AS steps, + payload->>'$.entryPointId' AS entry_point +FROM nodes +WHERE kind = 'Process' +ORDER BY steps DESC; ``` SCIP-confirmed CALLS edges only (strict impact): -```cypher -MATCH ()-[r:CALLS]->() -WHERE r.confidence >= 0.95 AND r.reason STARTS WITH 'scip:' -RETURN r +```sql +SELECT id, src, dst, 
confidence, reason +FROM edges +WHERE type = 'CALLS' + AND confidence >= 0.95 + AND reason LIKE 'scip:%'; ``` -### Temporal SQL cheat-sheet (MCP `sql` tool, `sql` arg) +Multi-hop traversal is a recursive CTE over `edges` (this is what `impact` +runs under the hood): + +```sql +WITH RECURSIVE reach(id, depth) AS ( + SELECT :start_id, 0 + UNION + SELECT e.src, r.depth + 1 + FROM edges e + JOIN reach r ON e.dst = r.id + WHERE r.depth < 2 AND e.confidence >= 0.5 +) +SELECT DISTINCT n.id, n.name, n.file_path +FROM reach r JOIN nodes n ON n.id = r.id +WHERE r.depth > 0; +``` -Tightest co-change pairs (DuckDB SQL — temporal store): +Tightest co-change pairs (from the `cochanges` table): ```sql SELECT source_file, target_file, lift, cocommit_count diff --git a/plugins/opencodehub/skills/codehub-onboarding/SKILL.md b/plugins/opencodehub/skills/codehub-onboarding/SKILL.md index ceec520d..c68b9a7d 100644 --- a/plugins/opencodehub/skills/codehub-onboarding/SKILL.md +++ b/plugins/opencodehub/skills/codehub-onboarding/SKILL.md @@ -52,7 +52,7 @@ Produces a single ONBOARDING.md with a ranked reading order drawn from graph cen | Layer | Tech | Source | |---|---|---| | Runtime | Node 22 | `package.json:7` | -| Storage | DuckDB | `packages/storage/src/index.ts:12` | +| Storage | SQLite (single-file, node:sqlite) | `packages/storage/src/index.ts:12` | | ... | ... | ... 
| ## Read these 10 files first (in order) diff --git a/plugins/opencodehub/skills/codehub-refactoring/SKILL.md b/plugins/opencodehub/skills/codehub-refactoring/SKILL.md index 693d55c1..04a752d8 100644 --- a/plugins/opencodehub/skills/codehub-refactoring/SKILL.md +++ b/plugins/opencodehub/skills/codehub-refactoring/SKILL.md @@ -149,18 +149,22 @@ mcp__codehub__shape_check({ route: "GET /users/:id", repo: "my-app" }) → mismatches: [{ consumer, expected, actual }] ``` -### `mcp__codehub__sql` — custom reference query (temporal store) +### `mcp__codehub__sql` — custom reference query (single-file store) -The `sql` arg is read-only DuckDB over the temporal store (cochanges + -symbol_summaries). To enumerate every file referencing a symbol from the graph, -use the `cypher` arg of the same tool instead (the node/edge graph lives in -`graph.lbug`, not the SQL store): +The `sql` arg is read-only SQL over the single-file `store.sqlite`, and every +table is queryable, including `nodes` and `edges`. To enumerate every file +referencing a symbol from the graph, join `edges` back to `nodes` (the `cypher` +arg is reserved for community-fork graph adapters and is unsupported by the +default backend): -```cypher -MATCH (caller:CodeNode)-[r:REFERENCES|CALLS|IMPORTS]->(target:CodeNode) -WHERE target.name = 'validateUser' -RETURN DISTINCT caller.file_path AS file -ORDER BY file +```sql +SELECT DISTINCT caller.file_path AS file +FROM edges e +JOIN nodes target ON target.id = e.dst +JOIN nodes caller ON caller.id = e.src +WHERE e.type IN ('REFERENCES', 'CALLS', 'IMPORTS') + AND target.name = 'validateUser' +ORDER BY file; ``` This catches references a textual rename might miss — useful as a manual-check @@ -172,7 +176,7 @@ list before and after you edit. 
| --------------------------------- | ----------------------------------------------------------------------- | | Many callers (> 5) | Use your editor's LSP rename for the mechanical work; `impact` is the checklist | | Cross-module references | Run `detect_changes` after editing; watch for missed imports | -| String / dynamic references | Use the `cypher` arg with `REFERENCES`; the graph cannot see string-keyed dispatch — read those by hand | +| String / dynamic references | Use the `sql` arg to query `REFERENCES` edges; the graph cannot see string-keyed dispatch — read those by hand | | Public / exported API | Version and deprecate; mirror symbol names in a transition layer | | Heuristic edges (confirmed = 0) | Cross-check by reading source; the SCIP oracle did not weigh in | diff --git a/scripts/acceptance.sh b/scripts/acceptance.sh index a4d5f5ae..1a528a5e 100755 --- a/scripts/acceptance.sh +++ b/scripts/acceptance.sh @@ -25,11 +25,11 @@ # 14. license-audit-smoke (analyze + license_audit tool) [NEW v1.0] # 15. verdict-smoke (2-commit fixture → tier) [NEW v1.0] # 16. pack-determinism (code-pack ×2 → diff -r) [NEW v1.0] -# 17. m7-parity-audit (retired — lbug-only backend, ADR 0016; always SKIP) +# 17. m7-parity-audit (retired — single SQLite backend, ADR 0019; always SKIP) # # Gates 10-17 MUST degrade gracefully: when their dependency binary is not -# available (semgrep, embedder weights, codehub verdict command, -# @ladybugdb/core binding), they print `[SKIP]` with a reason and +# available (semgrep, embedder weights, codehub verdict command), they +# print `[SKIP]` with a reason and # do not change the exit code. This lets the acceptance run complete on any # developer laptop and in CI, while still enforcing gates when those # dependencies are present. 
@@ -561,8 +561,8 @@ echo # --------------------------------------------------------------------------- echo "16/${TOTAL_GATES}: pack-determinism (code-pack ×2 → diff -r)" # The audit script SKIPs cleanly when the CLI isn't built or the repo lacks -# a populated `.codehub/duck.db` graph (worktree native-binding lesson). Pipe -# its output through and translate PASS/SKIP/FAIL into our gate vocabulary. +# a populated `.codehub/store.sqlite` index. Pipe its output through and +# translate PASS/SKIP/FAIL into our gate vocabulary. PACK_LOG="$tmpdir/pack-determinism.log" if bash "$ROOT/scripts/pack-determinism-audit.sh" > "$PACK_LOG" 2>&1; then PACK_LINE=$(head -1 "$PACK_LOG" || true) @@ -578,18 +578,18 @@ fi echo # --------------------------------------------------------------------------- -# 17. M7 parity audit: retired (lbug is the only graph backend post-ADR 0016) +# 17. M7 parity audit: retired (single SQLite backend post-ADR 0019) # --------------------------------------------------------------------------- echo "17/${TOTAL_GATES}: m7-parity-audit (analyze ×2 backends → graphHash)" -# ADR 0016 made `@ladybugdb/core` the only graph backend. The cross-backend -# `CODEHUB_STORE=duck` vs `CODEHUB_STORE=lbug` parity audit no longer has two -# backends to compare — `CODEHUB_STORE` is a no-op — so the underlying audit -# script was removed. The banner stays at slot 17 so the `codehub bench` -# dashboard contract (packages/cli/src/commands/bench.ts MVP_GATES) keeps its -# verbatim banner match; the gate is now a permanent SKIP. In-memory graphHash -# byte-identity is still pinned by gate 6 (determinism) and the parity harness -# at packages/storage/src/test-utils/parity-harness.ts. -skip "m7-parity-audit: retired — lbug is the only graph backend (ADR 0016); nothing to compare" +# ADR 0019 collapsed storage onto a single `store.sqlite` backend. 
The +# cross-backend parity audit no longer has two backends to compare +# (`CODEHUB_STORE` is a no-op) so the underlying audit script was removed. The +# banner stays at slot 17 so the `codehub bench` dashboard contract +# (packages/cli/src/commands/bench.ts MVP_GATES) keeps its verbatim banner +# match; the gate is now a permanent SKIP. In-memory graphHash byte-identity is +# still pinned by gate 6 (determinism) and the parity harness at +# packages/storage/src/test-utils/parity-harness.ts. +skip "m7-parity-audit: retired — single SQLite backend (ADR 0019); nothing to compare" echo # --------------------------------------------------------------------------- diff --git a/scripts/check-banned-strings.sh b/scripts/check-banned-strings.sh index 634fa81c..e0871fba 100755 --- a/scripts/check-banned-strings.sh +++ b/scripts/check-banned-strings.sh @@ -13,12 +13,14 @@ set -euo pipefail # Literal strings we reject outright. Case-insensitive. # -# Removed at v1: `ladybug` and `kuzu`. LadybugDB is now the default graph -# backend (M7, ADR 0013); the bare product name is critical prose surface -# for end-user docs, slash-command help, and the public site. `kuzu` is -# retained as historical lineage in cross-link prose ("LadybugDB is the -# open-source successor to the pre-1.0 Kuzu codebase") and ADRs already -# cite it for provenance. +# The prior-backend names (`ladybug`, `kuzu`, `duckdb`, `lbug`) are NOT in +# this global list: ADR 0019 removed both native storage backends, and the +# names remain legitimate REMOVAL PROSE in end-user docs, ADRs, CHANGELOGs, +# and the public site ("ADR 0019 removed @ladybugdb/core and @duckdb/node-api"). +# Banning them globally would corrupt correct historical documentation. They +# are instead hard-banned ONLY in live source (`packages/**/src`, excluding +# tests) via the SOURCE_BANNED_REGEX sweep below, and their dead on-disk +# artifact filenames are banned in the docs surface via DOC_STALE_LITERALS. 
BANNED_LITERALS=( 'STEP_IN_PROCESS' 'heuristicLabel' @@ -112,6 +114,56 @@ for pat in "${BANNED_REGEX[@]}"; do fi done +# Published-docs staleness sweep — scoped to the user-facing docs surface +# (the Starlight site, README, and the two generated agent-facing files). +# ADR 0019 replaced the lbug + DuckDB two-file store with a single +# `store.sqlite`, and the MCP surface is 29 tools. These literals are the +# unambiguous drift signals: the dead on-disk filenames, and the stale tool +# counts. Removal/supersession PROSE ("ADR 0019 removed @ladybugdb/core") is +# NOT banned — only the concrete dead artifacts and wrong numbers are. Scoped +# so architectural-history ADRs and internal planning notes stay free to name +# the old backend. +DOC_STALE_LITERALS=( + 'graph.lbug' # dead on-disk graph file (was LadybugDB) + 'temporal.duckdb' # dead on-disk temporal file (was DuckDB) + '28 tools' # stale MCP tool count (now 29) + '30 tools' # stale MCP tool count (now 29) + '28 MCP tool' # stale MCP tool count (now 29) + '30 MCP tool' # stale MCP tool count (now 29) +) +DOC_PATHSPEC=( + 'packages/docs/src' + 'packages/docs/public/tool-catalog.json' + 'packages/docs/astro.config.mjs' + 'README.md' +) +for pat in "${DOC_STALE_LITERALS[@]}"; do + if matches=$(git grep -I -n -i -e "$pat" --untracked -- "${DOC_PATHSPEC[@]}" 2>/dev/null); then + echo "FAIL: stale docs literal '$pat' found (ADR 0019 storage / 29-tool drift):" >&2 + printf '%s\n' "$matches" >&2 + fail=1 + fi +done + +# ── Prior-backend names hard-banned in LIVE SOURCE ──────────────────────────── +# ADR 0019 removed the two native storage backends. Their names must never +# reappear in shipping source code (only in removal prose / ADRs / CHANGELOGs). +# Scoped to `packages/**/src`, EXCLUDING `*.test.ts` — one test deliberately +# keeps the tokens: `sqlite-adapter.test.ts` asserts NO `.lbug`/`.duckdb` +# sidecar file is ever created, which is the regression guard that the removal +# stays removed. 
That assertion IS the enforcement, so its source is exempt (`packages/storage/src/test-utils` is excluded as well). +SOURCE_BANNED_REGEX='duckdb|ladybug|lbug|kuzu' +SOURCE_PATHSPEC=( + ':(glob)packages/*/src/**/*.ts' + ':(exclude,glob)packages/*/src/**/*.test.ts' + ':(exclude)packages/storage/src/test-utils' +) +if matches=$(git grep -I -n -i -E -e "$SOURCE_BANNED_REGEX" -- "${SOURCE_PATHSPEC[@]}" 2>/dev/null); then + echo "FAIL: prior-backend name (duckdb/ladybug/lbug/kuzu) found in live source (ADR 0019 removed both backends — use 'store.sqlite' / 'the store'):" >&2 + printf '%s\n' "$matches" >&2 + fail=1 +fi + if [ "$fail" -ne 0 ]; then echo "Banned-strings check failed." >&2 exit 1 diff --git a/scripts/pack-determinism-audit.sh b/scripts/pack-determinism-audit.sh index fa61c931..802be09b 100755 --- a/scripts/pack-determinism-audit.sh +++ b/scripts/pack-determinism-audit.sh @@ -17,11 +17,9 @@ # SKIP behavior: # The script exits 0 with a SKIP message when: # - The CLI binary at packages/cli/dist/index.js is absent (build first). -# - The repo lacks a `/.codehub/duck.db` graph (run `codehub -# analyze` first). DuckDB native bindings may not load on every host -# (worktree native-binding lesson) so we degrade gracefully. -# These are not failures — they let the script run safely as part of -# `scripts/acceptance.sh` on developer laptops without a populated index. +# - The repo lacks a `/.codehub/store.sqlite` index (run `codehub +# analyze` first). This lets the script run safely as part of +# `scripts/acceptance.sh` on developer laptops without a populated index. set -euo pipefail @@ -34,8 +32,8 @@ if [ ! -f "$CLI" ]; then exit 0 fi -if [ ! -f "$REPO/.codehub/duck.db" ]; then - echo "SKIP: pack-determinism — no DuckStore at $REPO/.codehub/duck.db (run 'codehub analyze' first)" +if [ !
-f "$REPO/.codehub/store.sqlite" ]; then + echo "SKIP: pack-determinism — no index at $REPO/.codehub/store.sqlite (run 'codehub analyze' first)" exit 0 fi diff --git a/scripts/verify-global-install.sh b/scripts/verify-global-install.sh index 060f05fb..088595d1 100755 --- a/scripts/verify-global-install.sh +++ b/scripts/verify-global-install.sh @@ -30,10 +30,11 @@ # (default: 120). The budget guards against a # regression that makes install HANG or refetch (the # old native tree-sitter-cli GHCR fetch); it is not a -# perf benchmark. A cold-cache `npm install -g` of the -# native prebuilts (ladybug + duckdb + onnxruntime) on a -# loaded shared runner legitimately varies 30–90s, so a -# tight 60s tripped on slow cells despite a clean install. +# perf benchmark. A cold-cache `npm install -g` of a +# pure-JS + WASM package (ADR 0019: no native storage or +# embedder bindings) on a loaded shared runner legitimately +# varies 30–90s, so a tight 60s tripped on slow cells +# despite a clean install. # # Exit codes: # 0 every gate passed From 48253838d00fc84587f78aa180050fdbfc0e3533 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Fri, 3 Jul 2026 14:38:56 +0000 Subject: [PATCH 02/11] fix(pack): wire pack provenance across CLI+MCP and fix traversal cycle guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two correctness bugs from the tech-debt audit (both "green tests, wrong production behavior"): B1 [P0] — MCP pack_codebase shipped a hollow pack + divergent packHash. callRealPackEngine called generatePack({...}, { store }), omitting the provenance bundle (commit, repoOriginUrl, chunkerFiles, grammarCommits) the CLI wires. The manifest preimage binds commit/repoOriginUrl/fileHash/ chonkie_version/grammar_commits, so an MCP-triggered pack produced an empty ast-chunks.jsonl, a byte-range-free context-bom, commit="", and a packHash that silently diverged from the CLI's for the same repo+commit. 
Fix: move resolvePackProvenance from cli/commands/code-pack.ts into @opencodehub/pack (new provenance.ts) so both entry points call it; MCP now passes ...provenance. provenance.test.ts pins that omitting provenance changes the packHash and that equal provenance yields an equal packHash across entry points. B2 [P1] — traverse recursive CTE used an unanchored instr(path, id) cycle guard. Node ids have no disambiguating suffix, so one id being a SUBSTRING of another already on the path (Class:a.ts:Foo in Class:a.ts:FooBar) was read as a revisit and pruned, dropping the node and its whole subtree — silently under-reporting blast radius in impact/api_impact/verdict. Fix: anchor both operands on comma delimiters so only a whole id counts as a revisit. B2b [P1] — the final SELECT grouped by node_id with a bare path column, so SQLite picked an arbitrary tied row on equal-depth paths (diamond graphs), making the reported predecessor/path nondeterministic. Fix: rank by (depth, path) via ROW_NUMBER and keep rank 1 (lexicographically smallest). traverse-substring-id.test.ts covers both; verified it fails against the pre-fix query and passes after. B5 (pack-determinism-audit.sh gating on the nonexistent duck.db) was fixed in the prior commit. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/cli/src/commands/code-pack.ts | 109 +-------- packages/mcp/src/tools/pack-codebase.ts | 9 +- packages/pack/src/index.ts | 2 + packages/pack/src/provenance.test.ts | 220 ++++++++++++++++++ packages/pack/src/provenance.ts | 126 ++++++++++ packages/storage/src/sqlite-adapter.ts | 29 ++- .../storage/src/traverse-substring-id.test.ts | 134 +++++++++++ 7 files changed, 514 insertions(+), 115 deletions(-) create mode 100644 packages/pack/src/provenance.test.ts create mode 100644 packages/pack/src/provenance.ts create mode 100644 packages/storage/src/traverse-substring-id.test.ts diff --git a/packages/cli/src/commands/code-pack.ts b/packages/cli/src/commands/code-pack.ts index 990b59f3..4442bb2d 100644 --- a/packages/cli/src/commands/code-pack.ts +++ b/packages/cli/src/commands/code-pack.ts @@ -35,15 +35,13 @@ import { existsSync, statSync } from "node:fs"; import { mkdir, mkdtemp, readFile, rename, rm, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join, resolve } from "node:path"; -import type { FileNode, GraphNode, RepoNode } from "@opencodehub/core-types"; -import { sha256Hex } from "@opencodehub/core-types"; -import { parse as ingestionParse } from "@opencodehub/ingestion"; import { buildContextAttestation, type CacheChannel, DEFAULT_CACHE_CHANNEL, generatePack, type PackManifest, + resolvePackProvenance, serializeAttestation, } from "@opencodehub/pack"; import { type IGraphStore, openStore, resolveGraphPath, type Store } from "@opencodehub/storage"; @@ -285,111 +283,6 @@ async function runRepomixEngine(repoPath: string, args: CodePackArgs): Promise; - readonly grammarCommits: Readonly>; -} - -/** - * Derive {@link PackProvenance} from the opened graph and the repo working - * tree. - * - * - commit / repoOriginUrl: read from the singleton `Repo` node, so the - * pack stays a pure read of the indexed state (no `git` spawn here). 
- * - chunkerFiles: every indexed `File` node's bytes, read from disk and - * **hash-verified against the node's `contentHash`**. A file whose - * working-tree bytes drifted from the index is skipped, so the pack never - * chunks content that disagrees with what was analyzed — preserving the - * "pack reflects the indexed commit" contract. - * - grammarCommits: the vendored grammar version pins. - * - * A `graph` of `undefined` (no store) or one lacking `listNodes` (a bare test - * stub) yields empty file/commit provenance but still returns grammar pins. - */ -async function resolvePackProvenance( - graph: IGraphStore | undefined, - repoPath: string, -): Promise { - const grammarCommits = await loadGrammarCommits(); - - const canList = typeof graph?.listNodes === "function"; - if (graph === undefined || !canList) { - return { commit: "", repoOriginUrl: null, chunkerFiles: [], grammarCommits }; - } - - const [repoNodes, fileNodes] = await Promise.all([ - graph.listNodes({ kinds: ["Repo"] }), - graph.listNodes({ kinds: ["File"] }), - ]); - - const repo = repoNodes.find((n): n is RepoNode => n.kind === "Repo"); - const commit = repo?.commitSha ?? ""; - const repoOriginUrl = repo?.originUrl ?? null; - - const chunkerFiles = await collectChunkerFiles(fileNodes, repoPath); - return { commit, repoOriginUrl, chunkerFiles, grammarCommits }; -} - -/** - * Read + hash-verify the bytes of every indexed `File` node. Only files whose - * on-disk sha256 matches the indexed `contentHash` are returned, so a pack run - * against a dirty working tree silently drops drifted files rather than - * chunking stale bytes. Files with no recorded `contentHash` are read as-is - * (the index never claimed a hash to verify against). 
- */ -async function collectChunkerFiles( - fileNodes: readonly GraphNode[], - repoPath: string, -): Promise { - const out: Array<{ path: string; bytes: Uint8Array; language?: string }> = []; - for (const node of fileNodes) { - if (node.kind !== "File") continue; - const file = node as FileNode; - let buf: Buffer; - try { - buf = await readFile(resolve(repoPath, file.filePath)); - } catch { - continue; // file vanished from the tree since indexing — skip it - } - const bytes = new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength); - if (file.contentHash !== undefined && sha256Hex(bytes) !== file.contentHash) { - continue; // working-tree bytes drifted from the indexed state — skip - } - out.push({ - path: file.filePath, - bytes, - ...(file.language !== undefined ? { language: file.language } : {}), - }); - } - return out; -} - -/** - * Load the vendored grammar version pins for the manifest. Best-effort: an - * unreadable manifest yields `{}` rather than failing the pack. - */ -async function loadGrammarCommits(): Promise>> { - try { - return await ingestionParse.grammarVersions(); - } catch { - return {}; - } -} - /** * Read the on-disk size of `path`. Exported so the CLI's user-facing * recap can format byte counts without re-walking the dir tree. 
diff --git a/packages/mcp/src/tools/pack-codebase.ts b/packages/mcp/src/tools/pack-codebase.ts index b922fa4c..ab95eda1 100644 --- a/packages/mcp/src/tools/pack-codebase.ts +++ b/packages/mcp/src/tools/pack-codebase.ts @@ -23,7 +23,7 @@ import { existsSync, statSync } from "node:fs"; import { mkdir } from "node:fs/promises"; import { dirname, join } from "node:path"; import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import { generatePack as defaultGeneratePack } from "@opencodehub/pack"; +import { generatePack as defaultGeneratePack, resolvePackProvenance } from "@opencodehub/pack"; import { z } from "zod"; import { toolAmbiguousRepoError, toolError, toolErrorFromUnknown } from "../error-envelope.js"; import { withNextSteps } from "../next-step-hints.js"; @@ -282,6 +282,11 @@ async function callRealPackEngine(args: { const store = await openStore({ path: dbPath, readOnly: true }); const stagingDir = await mkdtemp(join(tmpdir(), "codehub-pack-mcp-")); try { + // Resolve the SAME provenance bundle the CLI wires (commit, repoOriginUrl, + // per-file chunker bytes, grammar pins). Without it the MCP pack ships an + // empty ast-chunks.jsonl + byte-range-free context-bom and a packHash that + // diverges from the CLI's for the identical repo+commit (B1 / injection-seam). 
+ const provenance = await resolvePackProvenance(store.graph, args.repo); const manifest = await defaultGeneratePack( { repoPath: args.repo, @@ -289,7 +294,7 @@ async function callRealPackEngine(args: { budgetTokens: args.budget, tokenizerId: args.tokenizer, }, - { store }, + { store, ...provenance }, ); const finalOutDir = resolve(args.repo, ".codehub", "packs", manifest.packHash); await mkdir(dirname(finalOutDir), { recursive: true }); diff --git a/packages/pack/src/index.ts b/packages/pack/src/index.ts index 51ba05cc..128c580d 100644 --- a/packages/pack/src/index.ts +++ b/packages/pack/src/index.ts @@ -90,6 +90,8 @@ export type { LicensesContent, LicensesOpts } from "./licenses.js"; export { buildLicenses } from "./licenses.js"; export type { BuildManifestOpts } from "./manifest.js"; export { buildManifest, serializeManifest } from "./manifest.js"; +export type { PackProvenance } from "./provenance.js"; +export { resolvePackProvenance } from "./provenance.js"; export type { ReadmeOpts } from "./readme.js"; export { buildReadme } from "./readme.js"; export type { SkeletonOpts, SkeletonRow } from "./skeleton.js"; diff --git a/packages/pack/src/provenance.test.ts b/packages/pack/src/provenance.test.ts new file mode 100644 index 00000000..352d3c61 --- /dev/null +++ b/packages/pack/src/provenance.test.ts @@ -0,0 +1,220 @@ +/** + * Regression test for B1 — the MCP-vs-CLI packHash divergence + * (injection-seam / silent-hollow class). + * + * The MCP `pack_codebase` tool used to call `generatePack({...}, { store })`, + * omitting the provenance bundle (commit, repoOriginUrl, chunkerFiles, + * grammarCommits) that the CLI wires via `resolvePackProvenance`. 
Because the + * manifest preimage binds `commit` + per-file `fileHash` + `grammar_commits`, + * the two entry points produced DIFFERENT `packHash` values for the identical + * repo + commit, and the MCP pack shipped an empty `ast-chunks.jsonl` and a + * byte-range-free `context-bom.json` — a hollow pack that still exited 0. + * + * `resolvePackProvenance` now lives in `@opencodehub/pack` so BOTH entry points + * call it. These tests pin the contract at the source: + * 1. resolvePackProvenance reads real commit/originUrl/files from the graph. + * 2. a pack built WITH provenance has a different packHash than one built + * WITHOUT it (proving the omission was observable, not cosmetic). + * 3. two packs built WITH the same provenance share a packHash (the CLI-path + * == MCP-path invariant the fix guarantees). + * + * The chonkie loader is a deterministic stub so the test never depends on the + * real `@chonkiejs/core` install. + */ + +import { strict as assert } from "node:assert"; +import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import { test } from "node:test"; +import { type GraphNode, sha256Hex } from "@opencodehub/core-types"; +import type { IGraphStore, ListNodesOptions, Store } from "@opencodehub/storage"; +import { type GeneratePackInternalOpts, generatePack } from "./index.js"; +import { resolvePackProvenance } from "./provenance.js"; + +const COMMIT = "a".repeat(40); +const ORIGIN = "https://github.com/example/repo"; + +/** + * A minimal graph stub exposing the `listNodes({kinds})` finder + * `resolvePackProvenance` uses. Returns one `Repo` node (commit + origin) and + * two `File` nodes whose `contentHash` matches the bytes written to disk. 
+ */ +function makeProvenanceStore(fileHashes: Record): IGraphStore { + const nodes: GraphNode[] = [ + { + id: "repo:example" as GraphNode["id"], + kind: "Repo", + name: "repo", + filePath: ".", + commitSha: COMMIT, + originUrl: ORIGIN, + defaultBranch: "main", + group: null, + } as unknown as GraphNode, + { + id: "file:src/a.ts" as GraphNode["id"], + kind: "File", + name: "a.ts", + filePath: "src/a.ts", + language: "typescript", + contentHash: fileHashes["src/a.ts"], + } as unknown as GraphNode, + { + id: "file:src/b.ts" as GraphNode["id"], + kind: "File", + name: "b.ts", + filePath: "src/b.ts", + language: "typescript", + contentHash: fileHashes["src/b.ts"], + } as unknown as GraphNode, + ]; + + const listNodes = async (opts?: ListNodesOptions): Promise => { + const kinds = opts?.kinds; + if (kinds === undefined) return nodes; + return nodes.filter((n) => kinds.includes(n.kind)); + }; + + // Only the finders resolvePackProvenance + the BOM bodies touch are real; + // everything else throws so an accidental new dependency is caught loudly. 
+ return new Proxy({ listNodes } as unknown as IGraphStore, { + get(target, prop, receiver) { + if (prop in target) return Reflect.get(target, prop, receiver); + if (prop === "listNodesByKind") { + return async (kind: string) => (await listNodes({ kinds: [kind] })) as unknown; + } + if ( + prop === "listEdges" || + prop === "listEdgesByType" || + prop === "listFindings" || + prop === "listDependencies" || + prop === "listRoutes" + ) { + return async () => []; + } + return () => { + throw new Error(`unexpected IGraphStore.${String(prop)} call in provenance test`); + }; + }, + }); +} + +const CHONKIE_STUB: GeneratePackInternalOpts["chonkieLoader"] = async () => ({ + version: "0.0.9", + CodeChunker: { + create: async () => ({ + chunk(text: string) { + return [{ text, startIndex: 0, endIndex: text.length, tokenCount: 1 }]; + }, + }), + }, +}); + +const COMMON_OPTS = { + repoPath: "", // filled per-run with the temp repo dir + budgetTokens: 20_000, + tokenizerId: "openai:o200k_base@tiktoken-0.8.0", +} as const; + +async function seedRepo(): Promise<{ repoPath: string; hashes: Record }> { + const repoPath = await mkdtemp(path.join(tmpdir(), "pack-prov-repo-")); + const bytesA = new TextEncoder().encode("export const a = 1;\n"); + const bytesB = new TextEncoder().encode("export const b = 2;\n"); + await mkdir(path.join(repoPath, "src"), { recursive: true }); + await writeFile(path.join(repoPath, "src/a.ts"), bytesA); + await writeFile(path.join(repoPath, "src/b.ts"), bytesB); + return { + repoPath, + hashes: { "src/a.ts": sha256Hex(bytesA), "src/b.ts": sha256Hex(bytesB) }, + }; +} + +function composedStore(graph: IGraphStore): Store { + return { + graph, + temporal: graph as unknown as Store["temporal"], + graphFile: ":memory:", + temporalFile: ":memory:", + close: async () => {}, + }; +} + +test("resolvePackProvenance reads real commit + origin + files from the graph", async () => { + const { repoPath, hashes } = await seedRepo(); + try { + const graph = 
makeProvenanceStore(hashes); + const prov = await resolvePackProvenance(graph, repoPath); + assert.equal(prov.commit, COMMIT, "commit must come from the Repo node"); + assert.equal(prov.repoOriginUrl, ORIGIN, "origin must come from the Repo node"); + assert.equal(prov.chunkerFiles.length, 2, "both hash-verified files are collected"); + const paths = prov.chunkerFiles.map((f) => f.path).sort(); + assert.deepEqual(paths, ["src/a.ts", "src/b.ts"]); + } finally { + await rm(repoPath, { recursive: true, force: true }); + } +}); + +test("B1: pack WITH provenance diverges from pack WITHOUT it (the MCP hollow-pack bug)", async () => { + const { repoPath, hashes } = await seedRepo(); + const outWith = await mkdtemp(path.join(tmpdir(), "pack-prov-with-")); + const outWithout = await mkdtemp(path.join(tmpdir(), "pack-prov-without-")); + try { + const graph = makeProvenanceStore(hashes); + const prov = await resolvePackProvenance(graph, repoPath); + + // WITH provenance — what the CLI does, and what the MCP tool does after the fix. + const withManifest = await generatePack( + { ...COMMON_OPTS, repoPath, outDir: outWith }, + { store: composedStore(graph), chonkieLoader: CHONKIE_STUB, ...prov }, + ); + + // WITHOUT provenance — the old MCP `{ store }`-only call. commit="" and + // chunkerFiles=[] default in, so ast-chunks is empty and the hash differs. + const withoutManifest = await generatePack( + { ...COMMON_OPTS, repoPath, outDir: outWithout }, + { store: composedStore(graph), chonkieLoader: CHONKIE_STUB }, + ); + + assert.notEqual( + withManifest.packHash, + withoutManifest.packHash, + "omitting provenance MUST change the packHash — otherwise the MCP hollow pack would be indistinguishable from a real one", + ); + + // The hollow pack carries no commit; the real one carries the indexed commit. 
+ assert.equal(withManifest.commit, COMMIT); + assert.equal(withoutManifest.commit, ""); + } finally { + await rm(repoPath, { recursive: true, force: true }); + await rm(outWith, { recursive: true, force: true }); + await rm(outWithout, { recursive: true, force: true }); + } +}); + +test("B1: two packs built with the same provenance share a packHash (CLI-path == MCP-path)", async () => { + const { repoPath, hashes } = await seedRepo(); + const outA = await mkdtemp(path.join(tmpdir(), "pack-prov-a-")); + const outB = await mkdtemp(path.join(tmpdir(), "pack-prov-b-")); + try { + const graph = makeProvenanceStore(hashes); + // Resolve provenance twice, independently, to mimic two separate entry points. + const provCli = await resolvePackProvenance(graph, repoPath); + const provMcp = await resolvePackProvenance(graph, repoPath); + + const a = await generatePack( + { ...COMMON_OPTS, repoPath, outDir: outA }, + { store: composedStore(graph), chonkieLoader: CHONKIE_STUB, ...provCli }, + ); + const b = await generatePack( + { ...COMMON_OPTS, repoPath, outDir: outB }, + { store: composedStore(graph), chonkieLoader: CHONKIE_STUB, ...provMcp }, + ); + + assert.equal(a.packHash, b.packHash, "same provenance ⇒ same packHash across entry points"); + } finally { + await rm(repoPath, { recursive: true, force: true }); + await rm(outA, { recursive: true, force: true }); + await rm(outB, { recursive: true, force: true }); + } +}); diff --git a/packages/pack/src/provenance.ts b/packages/pack/src/provenance.ts new file mode 100644 index 00000000..58da10fe --- /dev/null +++ b/packages/pack/src/provenance.ts @@ -0,0 +1,126 @@ +/** + * Pack provenance resolution — shared by the CLI `code-pack` command and the + * MCP `pack_codebase` tool so BOTH production entry points feed `generatePack` + * the same `internal` inputs and therefore produce the SAME `packHash` for a + * given repo + commit. 
+ * + * This lived privately in `packages/cli/src/commands/code-pack.ts`, which the + * MCP tool cannot import (it would invert the `cli → mcp` dependency and cycle). + * Hosting it in `@opencodehub/pack` — which both `cli` and `mcp` already depend + * on — lets the MCP path wire real provenance instead of shipping a hollow pack + * (empty ast-chunks, `commit=""`, `chonkieVersion="unknown"`) with a packHash + * that silently diverges from the CLI's for the identical input. + * + * Every field is best-effort: a graph missing the data (or a stubbed store in + * tests) yields safe empties, never a throw, so packing never fails on absent + * provenance. + */ + +import { readFile } from "node:fs/promises"; +import { resolve } from "node:path"; +import type { FileNode, GraphNode, RepoNode } from "@opencodehub/core-types"; +import { sha256Hex } from "@opencodehub/core-types"; +import { parse as ingestionParse } from "@opencodehub/ingestion"; +import type { IGraphStore } from "@opencodehub/storage"; + +/** + * Production provenance the pack manifest records, derived from the indexed + * graph + the working tree. Each field maps to a `generatePack` `internal` + * input. + */ +export interface PackProvenance { + readonly commit: string; + readonly repoOriginUrl: string | null; + readonly chunkerFiles: ReadonlyArray<{ + readonly path: string; + readonly bytes: Uint8Array; + readonly language?: string; + }>; + readonly grammarCommits: Readonly>; +} + +/** + * Derive {@link PackProvenance} from the opened graph and the repo working + * tree. + * + * - commit / repoOriginUrl: read from the singleton `Repo` node, so the + * pack stays a pure read of the indexed state (no `git` spawn here). + * - chunkerFiles: every indexed `File` node's bytes, read from disk and + * **hash-verified against the node's `contentHash`**. 
A file whose + * working-tree bytes drifted from the index is skipped, so the pack never + * chunks content that disagrees with what was analyzed — preserving the + * "pack reflects the indexed commit" contract. + * - grammarCommits: the vendored grammar version pins. + * + * A `graph` of `undefined` (no store) or one lacking `listNodes` (a bare test + * stub) yields empty file/commit provenance but still returns grammar pins. + */ +export async function resolvePackProvenance( + graph: IGraphStore | undefined, + repoPath: string, +): Promise { + const grammarCommits = await loadGrammarCommits(); + + const canList = typeof graph?.listNodes === "function"; + if (graph === undefined || !canList) { + return { commit: "", repoOriginUrl: null, chunkerFiles: [], grammarCommits }; + } + + const [repoNodes, fileNodes] = await Promise.all([ + graph.listNodes({ kinds: ["Repo"] }), + graph.listNodes({ kinds: ["File"] }), + ]); + + const repo = repoNodes.find((n): n is RepoNode => n.kind === "Repo"); + const commit = repo?.commitSha ?? ""; + const repoOriginUrl = repo?.originUrl ?? null; + + const chunkerFiles = await collectChunkerFiles(fileNodes, repoPath); + return { commit, repoOriginUrl, chunkerFiles, grammarCommits }; +} + +/** + * Read + hash-verify the bytes of every indexed `File` node. Only files whose + * on-disk sha256 matches the indexed `contentHash` are returned, so a pack run + * against a dirty working tree silently drops drifted files rather than + * chunking stale bytes. Files with no recorded `contentHash` are read as-is + * (the index never claimed a hash to verify against). 
+ */ +async function collectChunkerFiles( + fileNodes: readonly GraphNode[], + repoPath: string, +): Promise { + const out: Array<{ path: string; bytes: Uint8Array; language?: string }> = []; + for (const node of fileNodes) { + if (node.kind !== "File") continue; + const file = node as FileNode; + let buf: Buffer; + try { + buf = await readFile(resolve(repoPath, file.filePath)); + } catch { + continue; // file vanished from the tree since indexing — skip it + } + const bytes = new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength); + if (file.contentHash !== undefined && sha256Hex(bytes) !== file.contentHash) { + continue; // working-tree bytes drifted from the indexed state — skip + } + out.push({ + path: file.filePath, + bytes, + ...(file.language !== undefined ? { language: file.language } : {}), + }); + } + return out; +} + +/** + * Load the vendored grammar version pins for the manifest. Best-effort: an + * unreadable manifest yields `{}` rather than failing the pack. + */ +async function loadGrammarCommits(): Promise>> { + try { + return await ingestionParse.grammarVersions(); + } catch { + return {}; + } +} diff --git a/packages/storage/src/sqlite-adapter.ts b/packages/storage/src/sqlite-adapter.ts index f0fa3471..ba781da6 100644 --- a/packages/storage/src/sqlite-adapter.ts +++ b/packages/storage/src/sqlite-adapter.ts @@ -990,14 +990,23 @@ export class SqliteStore implements IGraphStore, ITemporalStore { typePredDown = ` AND edges.type IN (${phs})`; typePredUp = ` AND edges.type IN (${phs})`; } + // Cycle guard: the path is a comma-joined id list, so we must anchor the + // membership test on comma delimiters — a raw `instr(path, id)` would + // falsely match when one node id is a SUBSTRING of another already on the + // path (e.g. `Class:a.ts:Foo` ⊂ `Class:a.ts:FooBar`, `File:a.ts` ⊂ + // `File:a.ts.bak`), pruning the real edge and its whole subtree and + // silently under-reporting blast radius. 
Wrapping both operands in commas + // (`,path,` vs `,id,`) makes the match whole-id-only. (B2) const downStep = "SELECT edges.dst, reach.depth + 1, reach.path || ',' || edges.dst " + "FROM edges JOIN reach ON edges.src = reach.node_id " + - `WHERE reach.depth < ? AND edges.confidence >= ? AND instr(reach.path, edges.dst) = 0${typePredDown}`; + "WHERE reach.depth < ? AND edges.confidence >= ? AND " + + `instr(',' || reach.path || ',', ',' || edges.dst || ',') = 0${typePredDown}`; const upStep = "SELECT edges.src, reach.depth + 1, reach.path || ',' || edges.src " + "FROM edges JOIN reach ON edges.dst = reach.node_id " + - `WHERE reach.depth < ? AND edges.confidence >= ? AND instr(reach.path, edges.src) = 0${typePredUp}`; + "WHERE reach.depth < ? AND edges.confidence >= ? AND " + + `instr(',' || reach.path || ',', ',' || edges.src || ',') = 0${typePredUp}`; let recursive: string; const stepParams: SqlParam[] = []; @@ -1017,15 +1026,25 @@ export class SqliteStore implements IGraphStore, ITemporalStore { pushStep(true); pushStep(false); } + // Per node we want the shallowest reach, and on a depth tie a DETERMINISTIC + // path (multiple equally-short paths can reach the same node on a diamond + // graph). A bare `SELECT ..., path ... GROUP BY node_id` lets SQLite pick an + // ARBITRARY tied row for `path`, so the reported predecessor / viaRelation + // varies across runs and SQLite builds. Rank each row by (depth, path) and + // keep rank 1 — the lexicographically-smallest path at the min depth. (B2b) const sql = ` WITH RECURSIVE reach(node_id, depth, path) AS ( SELECT ?, 0, ? UNION ${recursive} + ), + ranked AS ( + SELECT node_id, depth, path, + ROW_NUMBER() OVER (PARTITION BY node_id ORDER BY depth ASC, path ASC) AS rn + FROM reach WHERE node_id != ? ) - SELECT node_id, MIN(depth) AS depth, path - FROM reach WHERE node_id != ? 
- GROUP BY node_id ORDER BY depth ASC, node_id ASC`; + SELECT node_id, depth, path FROM ranked WHERE rn = 1 + ORDER BY depth ASC, node_id ASC`; const allParams: SqlParam[] = [ String(q.startId), String(q.startId), diff --git a/packages/storage/src/traverse-substring-id.test.ts b/packages/storage/src/traverse-substring-id.test.ts new file mode 100644 index 00000000..c7d5c33f --- /dev/null +++ b/packages/storage/src/traverse-substring-id.test.ts @@ -0,0 +1,134 @@ +/** + * Regression test for B2 + B2b — the recursive-CTE traversal cycle guard. + * + * B2: the cycle guard used a RAW `instr(reach.path, edges.dst) = 0` substring + * test. Node ids are `${kind}:${filePath}:${qualifiedName}` with no + * disambiguating suffix, so one id can be a SUBSTRING of another + * (`Function:src/app.ts:Foo` ⊂ `Function:src/app.ts:FooBar`). When the longer + * id was already on the path, the edge to the shorter id was falsely pruned as + * a "cycle", dropping that node AND its whole subtree — silently + * under-reporting blast radius in impact / api_impact / verdict. The fix + * anchors the membership test on comma delimiters so only a WHOLE id counts as + * a revisit. + * + * B2b: on a diamond graph two equally-short paths can reach the same node; the + * old `GROUP BY node_id` with a bare `path` column let SQLite pick an arbitrary + * tied row, so the reported predecessor/path varied across runs. The fix ranks + * by (depth, path) and keeps the lexicographically-smallest — deterministic. + * + * These tests would both FAIL against the pre-fix query. 
+ */ + +import assert from "node:assert/strict"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "node:test"; +import { + type GraphNode, + KnowledgeGraph, + makeNodeId, + type NodeId, + type RelationType, +} from "@opencodehub/core-types"; +import { SqliteStore } from "./sqlite-adapter.js"; + +function fn(g: KnowledgeGraph, name: string): NodeId { + const id = makeNodeId("Function", "src/app.ts", name); + g.addNode({ + id, + kind: "Function", + name, + filePath: "src/app.ts", + startLine: 1, + signature: `function ${name}()`, + } as GraphNode); + return id; +} + +function calls(g: KnowledgeGraph, from: NodeId, to: NodeId): void { + g.addEdge({ from, to, type: "CALLS" as RelationType, confidence: 1.0 }); +} + +async function withStore( + graph: KnowledgeGraph, + body: (store: SqliteStore) => Promise, +): Promise { + const dir = await mkdtemp(join(tmpdir(), "och-traverse-b2-")); + const dbPath = join(dir, "store.sqlite"); + const store = new SqliteStore(dbPath); + try { + await store.open(); + await store.createSchema(); + await store.bulkLoad(graph); + await body(store); + } finally { + await store.close(); + await rm(dir, { recursive: true, force: true }); + } +} + +test("B2: prefix-substring node ids do not falsely prune the traversal subtree", async () => { + const g = new KnowledgeGraph(); + g.addNode({ + id: makeNodeId("File", "src/app.ts", "src/app.ts"), + kind: "File", + name: "app.ts", + filePath: "src/app.ts", + } as GraphNode); + // `Foo`'s id is a strict substring of `FooBar`'s id — the exact B2 trigger. + const start = fn(g, "start"); + const fooBar = fn(g, "FooBar"); + const foo = fn(g, "Foo"); + const end = fn(g, "end"); + // Chain: start → FooBar → Foo → end. Walking down from start, once `FooBar` + // is on the path the pre-fix guard saw `Foo` (a substring) as a revisit and + // dropped it plus `end`. 
+ calls(g, start, fooBar); + calls(g, fooBar, foo); + calls(g, foo, end); + + await withStore(g, async (store) => { + const down = await store.traverse({ startId: start, direction: "down", maxDepth: 10 }); + const reached = new Set(down.map((r) => r.nodeId)); + assert.ok(reached.has(fooBar), "FooBar reached"); + assert.ok(reached.has(foo), "Foo reached — pre-fix this was pruned as a false substring cycle"); + assert.ok(reached.has(end), "end reached — the subtree below Foo survives"); + assert.equal(reached.size, 3, "all three downstream nodes reached, none dropped"); + }); +}); + +test("B2b: diamond graph yields a deterministic path across repeated traversals", async () => { + const g = new KnowledgeGraph(); + g.addNode({ + id: makeNodeId("File", "src/app.ts", "src/app.ts"), + kind: "File", + name: "app.ts", + filePath: "src/app.ts", + } as GraphNode); + // Diamond: root → left → sink, root → right → sink. Two equally-short (depth + // 2) paths reach `sink`; the reported path must be stable across runs. 
+ const root = fn(g, "root"); + const left = fn(g, "left"); + const right = fn(g, "right"); + const sink = fn(g, "sink"); + calls(g, root, left); + calls(g, root, right); + calls(g, left, sink); + calls(g, right, sink); + + await withStore(g, async (store) => { + const paths = new Set(); + for (let i = 0; i < 5; i++) { + const down = await store.traverse({ startId: root, direction: "down", maxDepth: 10 }); + const sinkRow = down.find((r) => r.nodeId === sink); + assert.ok(sinkRow, "sink reached"); + paths.add(sinkRow.path.join(",")); + } + assert.equal( + paths.size, + 1, + `sink path must be deterministic across runs, saw: ${[...paths].join(" | ")}`, + ); + }); +}); From 51ba06e5b269994e82ea831f0a64d2492b474a83 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Fri, 3 Jul 2026 18:16:10 +0000 Subject: [PATCH 03/11] feat(core-ops): shared capability core, convert findings as the proof-of-concept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce @opencodehub/core-ops — a transport-free "capability" layer the MCP tools and CLI commands share, so the byte-identical resolve/finder/filter/ projection logic (audit findings D4/D7) lives once and each surface is a thin adapter that only maps the plain Output into its own transport. New package @opencodehub/core-ops (deps: analysis + storage + core-types; leaf, no cycle — both cli and mcp already depend on all three): - capability.ts: Capability = { id, execute(input, ctx) }, where ctx carries the already-open { graph, temporal } store + resolved repoName. Input validation and repo/store lifecycle stay at each surface's boundary for now (the resolvers differ — MCP carries AMBIGUOUS_REPO the CLI does not); unifying them behind a StoreProvider + defineTool/defineCommand factories is the next step. CapabilityStore is the single seam the deferred A1 accessor collapse will flip. - string-or.ts: the one canonical stringOr (kills the D7 copy in both surfaces). 
- caps/findings.ts: findingsCapability — the shared reader (listFindings push-down of severity+ruleId), TS post-finder (severity="none", scanner, filePath substring), and row projection. Zod-free: each surface keeps its own schema (MCP raw-shape for the SDK, CLI commander flags), which was never the duplicated part. - caps/findings.test.ts: 6 unit tests over a fake CapabilityStore — the first isolated coverage of that shared logic. Rewire both surfaces to call findingsCapability, unchanged public behavior: - cli/commands/findings.ts 111->63 LOC; keeps runFindings(opts) + storeFactory seam; findings.test.ts stays green (3/3). - mcp/tools/list-findings.ts 191->147 LOC; keeps runListFindings / registerListFindingsTool signatures + the full next_steps/staleness envelope. Verified: core-ops + cli findings tests green; every edited file typecheck-clean and biome-clean. (The mcp package cannot be test-run in this sandbox — its zod install is corrupted, unrelated to this change; list-findings.ts itself typechecks clean.) 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/cli/package.json | 5 +- packages/cli/src/commands/findings.ts | 88 +++--------- packages/cli/tsconfig.json | 1 + packages/core-ops/package.json | 64 +++++++++ packages/core-ops/src/capability.ts | 73 ++++++++++ packages/core-ops/src/caps/findings.test.ts | 150 ++++++++++++++++++++ packages/core-ops/src/caps/findings.ts | 116 +++++++++++++++ packages/core-ops/src/index.ts | 9 ++ packages/core-ops/src/string-or.ts | 13 ++ packages/core-ops/tsconfig.json | 14 ++ packages/mcp/package.json | 3 +- packages/mcp/src/tools/list-findings.ts | 86 +++-------- packages/mcp/tsconfig.json | 1 + tsconfig.json | 1 + 14 files changed, 488 insertions(+), 136 deletions(-) create mode 100644 packages/core-ops/package.json create mode 100644 packages/core-ops/src/capability.ts create mode 100644 packages/core-ops/src/caps/findings.test.ts create mode 100644 packages/core-ops/src/caps/findings.ts create mode 100644 packages/core-ops/src/index.ts create mode 100644 packages/core-ops/src/string-or.ts create mode 100644 packages/core-ops/tsconfig.json diff --git a/packages/cli/package.json b/packages/cli/package.json index 269db729..e640c349 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -1,7 +1,7 @@ { "name": "@opencodehub/cli", "version": "0.10.7", - "description": "OpenCodeHub — codehub CLI (analyze, setup, mcp, list, status, clean, query, context, impact, sql)", + "description": "OpenCodeHub \u2014 codehub CLI (analyze, setup, mcp, list, status, clean, query, context, impact, sql)", "license": "Apache-2.0", "repository": { "type": "git", @@ -39,7 +39,7 @@ "test": "pnpm run build:test && node --test \"./dist-test/**/*.test.js\"", "clean": "rm -rf dist dist-test *.tsbuildinfo" }, - "//deps": "The 15 @opencodehub/* workspace libs are INLINED into the bundle at build time (tsup noExternal) — they are devDependencies, not runtime deps. 
`dependencies` below is exactly the third-party set the bundle imports at runtime (kept `external`), plus the two @sourcegraph/scip-* indexers the parse pipeline spawns as subprocesses. onnxruntime-web (prebuilt WASM, no native binding) is optional (lazy-loaded only when embeddings are enabled).", + "//deps": "The 15 @opencodehub/* workspace libs are INLINED into the bundle at build time (tsup noExternal) \u2014 they are devDependencies, not runtime deps. `dependencies` below is exactly the third-party set the bundle imports at runtime (kept `external`), plus the two @sourcegraph/scip-* indexers the parse pipeline spawns as subprocesses. onnxruntime-web (prebuilt WASM, no native binding) is optional (lazy-loaded only when embeddings are enabled).", "dependencies": { "@apidevtools/swagger-parser": "12.1.0", "@aws-sdk/client-bedrock-runtime": "3.1076.0", @@ -68,6 +68,7 @@ }, "devDependencies": { "@opencodehub/analysis": "workspace:*", + "@opencodehub/core-ops": "workspace:*", "@opencodehub/core-types": "workspace:*", "@opencodehub/embedder": "workspace:*", "@opencodehub/eval": "workspace:*", diff --git a/packages/cli/src/commands/findings.ts b/packages/cli/src/commands/findings.ts index 4bc19749..ef816f08 100644 --- a/packages/cli/src/commands/findings.ts +++ b/packages/cli/src/commands/findings.ts @@ -1,18 +1,14 @@ /** * `codehub findings` — enumerate SARIF Finding nodes for an indexed repo. * - * CLI sibling of the MCP `list_findings` tool. Reuses the same storage - * reader (`store.graph.listFindings`) plus the identical TS post-finder for - * `scanner` / `filePath` substring narrowing and the `severity==="none"` - * filter. Only `note|warning|error` are pushed into `listFindings`; the - * `none` severity is handled entirely in the TS post-finder (both halves — - * we never pass it to the storage tier and we drop rows whose severity is - * not `none` when the caller asked for `none`). - * - * Mirrors `packages/mcp/src/tools/list-findings.ts:runListFindings`. 
Does NOT - * emit the MCP next_steps / staleness envelope — that is MCP-only. + * CLI sibling of the MCP `list_findings` tool. The shared reader/filter/ + * projection now lives in `@opencodehub/core-ops` `findingsCapability` — this + * command is the thin CLI adapter: open the store, run the capability, render + * to stdout (text or `--json`). Does NOT emit the MCP next_steps / staleness + * envelope — that is MCP-only. */ +import { type FindingsInput, findingsCapability } from "@opencodehub/core-ops"; import type { Store } from "@opencodehub/storage"; import { openStoreForCommand } from "./open-store.js"; @@ -29,72 +25,34 @@ export interface FindingsOptions { readonly storeFactory?: () => Promise<{ store: Store; repoPath: string }>; } -interface FindingRow { - readonly id: string; - readonly scanner: string; - readonly ruleId: string; - readonly severity: string; - readonly message: string; - readonly filePath: string; - readonly startLine?: number; - readonly endLine?: number; - readonly properties: Record; -} - export async function runFindings(opts: FindingsOptions = {}): Promise { - const limit = opts.limit ?? 500; const factory = opts.storeFactory ?? 
(() => openStoreForCommand({ ...opts, readOnly: true })); - const { store } = await factory(); + const { store, repoPath } = await factory(); try { - const findingsOpts: { - severity?: readonly ("note" | "warning" | "error")[]; - ruleId?: string; - limit?: number; - } = { limit }; - if ( - opts.severity !== undefined && - (opts.severity === "note" || opts.severity === "warning" || opts.severity === "error") - ) { - findingsOpts.severity = [opts.severity]; - } - if (opts.ruleId !== undefined) findingsOpts.ruleId = opts.ruleId; - const all = await store.graph.listFindings(findingsOpts); - - const filtered = all.filter((f) => { - if (opts.severity === "none" && f.severity !== "none") return false; - if (opts.scanner !== undefined && f.scannerId !== opts.scanner) return false; - if (opts.filePath !== undefined && !f.filePath.includes(opts.filePath)) return false; - return true; + const input: FindingsInput = { + ...(opts.severity !== undefined ? { severity: opts.severity } : {}), + ...(opts.scanner !== undefined ? { scanner: opts.scanner } : {}), + ...(opts.ruleId !== undefined ? { ruleId: opts.ruleId } : {}), + ...(opts.filePath !== undefined ? { filePath: opts.filePath } : {}), + ...(opts.limit !== undefined ? { limit: opts.limit } : {}), + }; + const out = await findingsCapability.execute(input, { + store, + repoName: opts.repo ?? repoPath, }); - const rows: FindingRow[] = filtered.map((f) => ({ - id: f.id, - scanner: stringOr(f.scannerId, "unknown"), - ruleId: stringOr(f.ruleId, ""), - severity: stringOr(f.severity, "note"), - message: stringOr(f.message, ""), - filePath: stringOr(f.filePath, ""), - properties: f.propertiesBag, - ...(typeof f.startLine === "number" && Number.isFinite(f.startLine) - ? { startLine: f.startLine } - : {}), - ...(typeof f.endLine === "number" && Number.isFinite(f.endLine) - ? 
{ endLine: f.endLine } - : {}), - })); - if (opts.json) { - console.log(JSON.stringify({ findings: rows, total: rows.length }, null, 2)); + console.log(JSON.stringify({ findings: out.findings, total: out.total }, null, 2)); return; } - if (rows.length === 0) { + if (out.total === 0) { console.warn( "findings: no findings matched — run `codehub scan` or `codehub ingest-sarif ` to populate Finding nodes", ); return; } - for (const f of rows) { + for (const f of out.findings) { const loc = f.startLine !== undefined ? `:${f.startLine}` : ""; const msg = f.message ? ` — ${f.message}` : ""; console.log(`[${f.severity}] ${f.scanner}:${f.ruleId} at ${f.filePath}${loc}${msg}`); @@ -103,9 +61,3 @@ export async function runFindings(opts: FindingsOptions = {}): Promise { await store.close(); } } - -function stringOr(v: unknown, fallback: string): string { - if (typeof v === "string") return v; - if (typeof v === "number" || typeof v === "boolean") return String(v); - return fallback; -} diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index fb355d33..1fb506ef 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -8,6 +8,7 @@ "include": ["src/**/*", "test/**/*"], "references": [ { "path": "../analysis" }, + { "path": "../core-ops" }, { "path": "../core-types" }, { "path": "../embedder" }, { "path": "../eval" }, diff --git a/packages/core-ops/package.json b/packages/core-ops/package.json new file mode 100644 index 00000000..61c66f9c --- /dev/null +++ b/packages/core-ops/package.json @@ -0,0 +1,64 @@ +{ + "name": "@opencodehub/core-ops", + "version": "0.1.0", + "private": true, + "description": "OpenCodeHub — transport-free capability core shared by the CLI and MCP surfaces", + "license": "Apache-2.0", + "repository": { + "type": "git", + "url": "git+https://github.com/theagenticguy/opencodehub.git", + "directory": "packages/core-ops" + }, + "homepage": "https://github.com/theagenticguy/opencodehub#readme", + "bugs": { + "url": 
"https://github.com/theagenticguy/opencodehub/issues" + }, + "type": "module", + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "files": [ + "dist/**/*.js", + "!dist/**/*.test.js", + "dist/**/*.d.ts", + "!dist/**/*.test.d.ts", + "dist/**/*.js.map", + "!dist/**/*.test.js.map", + "dist/**/*.d.ts.map", + "!dist/**/*.test.d.ts.map" + ], + "scripts": { + "build": "tsc -b", + "test": "node --test \"./dist/**/*.test.js\"", + "clean": "rm -rf dist *.tsbuildinfo" + }, + "dependencies": { + "@opencodehub/analysis": "workspace:*", + "@opencodehub/core-types": "workspace:*", + "@opencodehub/storage": "workspace:*" + }, + "devDependencies": { + "@types/node": "26.0.1", + "typescript": "6.0.3" + }, + "publishConfig": { + "access": "public" + }, + "keywords": [ + "opencodehub", + "code-intelligence", + "mcp", + "model-context-protocol", + "cli", + "capability", + "typescript" + ], + "engines": { + "node": ">=24.15.0" + } +} diff --git a/packages/core-ops/src/capability.ts b/packages/core-ops/src/capability.ts new file mode 100644 index 00000000..c0d20e13 --- /dev/null +++ b/packages/core-ops/src/capability.ts @@ -0,0 +1,73 @@ +/** + * The `Capability` contract — a transport-free unit of work shared by the MCP + * tool and the CLI command for one code-intelligence operation. + * + * WHY THIS EXISTS. Today the MCP tool (`packages/mcp/src/tools/.ts`) and the + * CLI command (`packages/cli/src/commands/.ts`) for the same operation run + * byte-identical resolve → open-store → typed-finder → filter → row-projection + * logic and diverge ONLY at output (an MCP `CallToolResult` envelope vs the + * CLI's `console.log` / `--json`). A `Capability` owns the shared middle and + * returns a PLAIN typed `Output`; each surface keeps a thin adapter that maps + * that `Output` into its own transport. A filter fix then lands once, not twice. 
+ * + * SCOPE (v1, the findings proof-of-concept). `execute` receives an ALREADY-OPEN + * store view plus the resolved repo's display name — both of which each surface + * already has at its call site (MCP via `withStore`, CLI via + * `openStoreForCommand`). Repo resolution and store lifecycle stay in the two + * surfaces for now because their resolvers differ meaningfully (the MCP side + * carries `AMBIGUOUS_REPO` semantics the CLI does not). Unifying resolution + + * lifecycle behind a `StoreProvider`, and folding the register/try-catch + * boilerplate into `defineTool`/`defineCommand` factories, is the natural + * follow-up once this seam is proven — see `artifacts/och-shared-core/`. + * + * A capability NEVER touches `console`, NEVER builds a `CallToolResult`, and + * NEVER renders. + * + * INPUT VALIDATION stays at each surface's boundary, deliberately. The MCP + * tool validates via the SDK's zod `inputSchema` (raw-shape idiom); the CLI + * validates + coerces commander flags. Both then hand `execute` a plain, + * already-validated `Input` object. Keeping the zod schema out of the + * capability keeps this core package dependency-light and lets each surface + * own the schema shape its transport requires — the shared, duplicated part + * was always the `execute` body (finder → filter → projection), never the + * schema. (A future revision may thread a shared schema through once the two + * surfaces' validation needs are unified; not required for the dedup win.) + */ + +import type { IGraphStore, ITemporalStore } from "@opencodehub/storage"; + +/** + * The already-open store views a capability's `execute` reads. Mirrors the + * `store.graph` / `store.temporal` split every call site uses today, so an + * `execute` body reads exactly like the inline code it replaces. (When the + * deferred A1 accessor-collapse lands, this interface is its single flip + * point: change it to one `store` and update the `execute` bodies, not the + * ~28 adapter files.) 
+ */ +export interface CapabilityStore { + readonly graph: IGraphStore; + readonly temporal: ITemporalStore; +} + +/** Everything an `execute` needs beyond the validated input. */ +export interface CapabilityContext { + /** The open store views. */ + readonly store: CapabilityStore; + /** The resolved repo's display name, for `Output` headers/labels. */ + readonly repoName: string; +} + +/** + * A transport-free operation shared by the MCP tool and CLI command. + * + * - `id` is a stable identifier (e.g. "findings"), used for logging and as + * the default tool/command name. + * - `execute` receives an already-validated, plain `Input` (each surface + * validates at its own boundary), does finder → filter → project, and + * returns a PLAIN `Output`. It must not import commander, the MCP SDK, or + * `console`. + */ +export interface Capability { + readonly id: string; + readonly execute: (input: Input, ctx: CapabilityContext) => Promise; +} diff --git a/packages/core-ops/src/caps/findings.test.ts b/packages/core-ops/src/caps/findings.test.ts new file mode 100644 index 00000000..6ee747b2 --- /dev/null +++ b/packages/core-ops/src/caps/findings.test.ts @@ -0,0 +1,150 @@ +/** + * Unit tests for `findingsCapability.execute` — the shared reader/filter/ + * projection lifted from the (byte-identical) MCP `list_findings` tool and CLI + * `codehub findings` command. Exercises `execute` directly against a fake + * `CapabilityStore`, so it needs no real store, no repo resolution, and no + * transport. This is the one place the shared logic is now tested. 
+ */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { FindingNode, NodeId } from "@opencodehub/core-types"; +import type { IGraphStore, ListFindingsOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { type FindingsInput, findingsCapability } from "./findings.js"; + +/** + * Build a Finding fixture from a plain string id. The id is kept verbatim (not + * a real `${kind}:${path}:${name}` node id) so assertions can compare against + * the literal; the cast satisfies the `NodeId` brand without altering the value. + */ +function finding(over: Omit<Partial<FindingNode>, "id"> & { id: string }): FindingNode { + return { + kind: "Finding", + name: over.id, + filePath: "src/a.ts", + ruleId: "rule-x", + severity: "warning", + scannerId: "semgrep", + message: "msg", + propertiesBag: {}, + ...over, + id: over.id as NodeId, + } as FindingNode; +} + +/** + * A fake store whose `listFindings` records the opts it was called with and + * returns a fixed corpus filtered by the storage-tier predicates the capability + * pushes down (severity + ruleId + limit). Everything else on IGraphStore + * throws so an accidental new read is caught loudly. 
+ */ +function fakeStore(corpus: readonly FindingNode[]): { + store: CapabilityStore; + lastOpts: () => ListFindingsOptions | undefined; +} { + let captured: ListFindingsOptions | undefined; + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listFindings") { + return async (opts?: ListFindingsOptions): Promise<readonly FindingNode[]> => { + captured = opts; + let rows = corpus; + if (opts?.severity !== undefined) { + const set = new Set(opts.severity); + rows = rows.filter((f) => set.has(f.severity as "note" | "warning" | "error")); + } + if (opts?.ruleId !== undefined) rows = rows.filter((f) => f.ruleId === opts.ruleId); + if (opts?.limit !== undefined) rows = rows.slice(0, opts.limit); + return rows; + }; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in findings capability test`); + }, + }); + const store: CapabilityStore = { + graph, + temporal: {} as CapabilityStore["temporal"], + }; + return { store, lastOpts: () => captured }; +} + +function ctxFor(corpus: readonly FindingNode[]): { + ctx: CapabilityContext; + lastOpts: () => ListFindingsOptions | undefined; +} { + const { store, lastOpts } = fakeStore(corpus); + return { ctx: { store, repoName: "demo-repo" }, lastOpts }; +} + +async function run(input: FindingsInput, corpus: readonly FindingNode[]) { + const { ctx, lastOpts } = ctxFor(corpus); + const out = await findingsCapability.execute(input, ctx); + return { out, lastOpts }; +} + +test("findings: projects rows, echoes repoName, defaults limit to 500", async () => { + const { out, lastOpts } = await run({}, [finding({ id: "f1" }), finding({ id: "f2" })]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.total, 2); + assert.equal(out.findings.length, 2); + assert.equal(lastOpts()?.limit, 500, "default limit pushed to the storage tier"); + const r = out.findings[0]; + assert.equal(r?.id, "f1"); + assert.equal(r?.scanner, "semgrep"); + assert.equal(r?.severity, "warning"); +}); + +test("findings: severity + ruleId are 
pushed to the storage tier", async () => { + const { out, lastOpts } = await run({ severity: "error", ruleId: "rule-x" }, [ + finding({ id: "e1", severity: "error", ruleId: "rule-x" }), + finding({ id: "w1", severity: "warning", ruleId: "rule-x" }), + finding({ id: "e2", severity: "error", ruleId: "rule-y" }), + ]); + assert.deepEqual(lastOpts()?.severity, ["error"], "severity pushed down"); + assert.equal(lastOpts()?.ruleId, "rule-x", "ruleId pushed down"); + assert.equal(out.total, 1); + assert.equal(out.findings[0]?.id, "e1"); + assert.deepEqual(out.filters, { severity: "error", ruleId: "rule-x" }); +}); + +test("findings: severity='none' is NOT pushed down; filtered in TS to none-only", async () => { + const { out, lastOpts } = await run({ severity: "none" }, [ + finding({ id: "n1", severity: "none" }), + finding({ id: "w1", severity: "warning" }), + ]); + assert.equal(lastOpts()?.severity, undefined, "'none' must not reach the storage tier"); + assert.equal(out.total, 1); + assert.equal(out.findings[0]?.id, "n1"); +}); + +test("findings: scanner + filePath substring are applied in the TS post-finder", async () => { + const { out } = await run({ scanner: "osv-scanner", filePath: "pkg/" }, [ + finding({ id: "a", scannerId: "osv-scanner", filePath: "pkg/dep.ts" }), + finding({ id: "b", scannerId: "semgrep", filePath: "pkg/dep.ts" }), // wrong scanner + finding({ id: "c", scannerId: "osv-scanner", filePath: "src/app.ts" }), // path miss + ]); + assert.equal(out.total, 1); + assert.equal(out.findings[0]?.id, "a"); + assert.deepEqual(out.filters, { scanner: "osv-scanner", filePath: "pkg/" }); +}); + +test("findings: startLine/endLine included only when finite; missing fields fall back", async () => { + const { out } = await run({}, [ + finding({ id: "withLines", startLine: 3, endLine: 7 }), + finding({ id: "noLines" }), + ]); + const withLines = out.findings.find((f) => f.id === "withLines"); + const noLines = out.findings.find((f) => f.id === "noLines"); + 
assert.equal(withLines?.startLine, 3); + assert.equal(withLines?.endLine, 7); + assert.equal(noLines?.startLine, undefined, "absent startLine stays absent"); + assert.equal(noLines?.endLine, undefined); +}); + +test("findings: empty corpus yields total 0 and empty filters when unfiltered", async () => { + const { out } = await run({}, []); + assert.equal(out.total, 0); + assert.equal(out.findings.length, 0); + assert.deepEqual(out.filters, {}); +}); diff --git a/packages/core-ops/src/caps/findings.ts b/packages/core-ops/src/caps/findings.ts new file mode 100644 index 00000000..f8aeadb2 --- /dev/null +++ b/packages/core-ops/src/caps/findings.ts @@ -0,0 +1,116 @@ +/** + * `findingsCapability` — the shared reader/filter/projection behind the MCP + * `list_findings` tool and the CLI `codehub findings` command. + * + * Lifted verbatim from the byte-identical bodies of + * `mcp/src/tools/list-findings.ts` and `cli/src/commands/findings.ts` (audit + * findings D4/D7): the `listFindings` push-down (severity + ruleId narrowed at + * the storage tier), the TS post-finder (`severity==="none"`, `scanner`, and + * `filePath` substring), and the row projection through the one canonical + * `stringOr`. Each surface now maps `FindingsOutput` into its own transport. + */ + +import type { Capability, CapabilityContext } from "../capability.js"; +import { stringOr } from "../string-or.js"; + +/** + * The validated, plain input `findingsCapability.execute` consumes. Each + * surface validates its own transport shape into this: the MCP tool via its + * SDK zod `inputSchema`, the CLI via coerced commander flags. `repo`/`repo_uri` + * are resolved to a concrete store by the surface BEFORE `execute` runs, so + * they are not read here — they live on the input only so a surface can pass + * its parsed args object through unchanged. 
+ */ +export interface FindingsInput { + readonly repo?: string; + readonly repo_uri?: string; + readonly severity?: "error" | "warning" | "note" | "none"; + readonly scanner?: string; + readonly ruleId?: string; + readonly filePath?: string; + readonly limit?: number; +} + +/** One projected finding row — the plain shape both surfaces render. */ +export interface FindingRow { + readonly id: string; + readonly scanner: string; + readonly ruleId: string; + readonly severity: string; + readonly message: string; + readonly filePath: string; + readonly startLine?: number; + readonly endLine?: number; + readonly properties: Record<string, unknown>; +} + +/** The applied filters, echoed back so presenters can label the output. */ +export interface FindingsFilters { + readonly severity?: string; + readonly scanner?: string; + readonly ruleId?: string; + readonly filePath?: string; +} + +export interface FindingsOutput { + readonly repoName: string; + readonly findings: readonly FindingRow[]; + readonly total: number; + readonly filters: FindingsFilters; +} + +export const findingsCapability: Capability<FindingsInput, FindingsOutput> = { + id: "findings", + async execute(input: FindingsInput, ctx: CapabilityContext): Promise<FindingsOutput> { + const limit = input.limit ?? 500; + + // Push severity + ruleId into the storage tier; scanner + filePath + // substring + the `severity==="none"` case are applied in the TS + // post-finder below (we never pass `none` to listFindings). 
+ const findingsOpts: { + severity?: readonly ("note" | "warning" | "error")[]; + ruleId?: string; + limit?: number; + } = { limit }; + if ( + input.severity !== undefined && + (input.severity === "note" || input.severity === "warning" || input.severity === "error") + ) { + findingsOpts.severity = [input.severity]; + } + if (input.ruleId !== undefined) findingsOpts.ruleId = input.ruleId; + const all = await ctx.store.graph.listFindings(findingsOpts); + + const filtered = all.filter((f) => { + if (input.severity === "none" && f.severity !== "none") return false; + if (input.scanner !== undefined && f.scannerId !== input.scanner) return false; + if (input.filePath !== undefined && !f.filePath.includes(input.filePath)) return false; + return true; + }); + + const findings: FindingRow[] = filtered.map((f) => ({ + id: f.id, + scanner: stringOr(f.scannerId, "unknown"), + ruleId: stringOr(f.ruleId, ""), + severity: stringOr(f.severity, "note"), + message: stringOr(f.message, ""), + filePath: stringOr(f.filePath, ""), + properties: f.propertiesBag, + ...(typeof f.startLine === "number" && Number.isFinite(f.startLine) + ? { startLine: f.startLine } + : {}), + ...(typeof f.endLine === "number" && Number.isFinite(f.endLine) + ? { endLine: f.endLine } + : {}), + })); + + const filters: FindingsFilters = { + ...(input.severity !== undefined ? { severity: input.severity } : {}), + ...(input.scanner !== undefined ? { scanner: input.scanner } : {}), + ...(input.ruleId !== undefined ? { ruleId: input.ruleId } : {}), + ...(input.filePath !== undefined ? 
{ filePath: input.filePath } : {}), + }; + + return { repoName: ctx.repoName, findings, total: findings.length, filters }; + }, +}; diff --git a/packages/core-ops/src/index.ts b/packages/core-ops/src/index.ts new file mode 100644 index 00000000..9bf0e2b4 --- /dev/null +++ b/packages/core-ops/src/index.ts @@ -0,0 +1,9 @@ +export type { Capability, CapabilityContext, CapabilityStore } from "./capability.js"; +export { + type FindingRow, + type FindingsFilters, + type FindingsInput, + type FindingsOutput, + findingsCapability, +} from "./caps/findings.js"; +export { stringOr } from "./string-or.js"; diff --git a/packages/core-ops/src/string-or.ts b/packages/core-ops/src/string-or.ts new file mode 100644 index 00000000..564e4a10 --- /dev/null +++ b/packages/core-ops/src/string-or.ts @@ -0,0 +1,13 @@ +/** + * The one canonical `stringOr`. Coerces a value to a string: passes strings + * through, stringifies numbers/booleans, and falls back otherwise. + * + * This was copy-pasted byte-identically across the MCP tools and CLI commands + * (tech-debt audit finding D7 — 7 files). Capabilities and their adapters + * import it from here so a change lands once. 
+ */ +export function stringOr(v: unknown, fallback: string): string { + if (typeof v === "string") return v; + if (typeof v === "number" || typeof v === "boolean") return String(v); + return fallback; +} diff --git a/packages/core-ops/tsconfig.json b/packages/core-ops/tsconfig.json new file mode 100644 index 00000000..293f2428 --- /dev/null +++ b/packages/core-ops/tsconfig.json @@ -0,0 +1,14 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "composite": true + }, + "references": [ + { "path": "../core-types" }, + { "path": "../storage" }, + { "path": "../analysis" } + ], + "include": ["src/**/*", "test/**/*"] +} diff --git a/packages/mcp/package.json b/packages/mcp/package.json index 1e7a808a..b2786d0a 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -2,7 +2,7 @@ "name": "@opencodehub/mcp", "version": "0.5.0", "private": true, - "description": "OpenCodeHub — stdio MCP server exposing code-graph + group tools", + "description": "OpenCodeHub \u2014 stdio MCP server exposing code-graph + group tools", "license": "Apache-2.0", "repository": { "type": "git", @@ -40,6 +40,7 @@ "dependencies": { "@modelcontextprotocol/sdk": "1.29.0", "@opencodehub/analysis": "workspace:*", + "@opencodehub/core-ops": "workspace:*", "@opencodehub/core-types": "workspace:*", "@opencodehub/embedder": "workspace:*", "@opencodehub/pack": "workspace:*", diff --git a/packages/mcp/src/tools/list-findings.ts b/packages/mcp/src/tools/list-findings.ts index bd85ce08..4be157c8 100644 --- a/packages/mcp/src/tools/list-findings.ts +++ b/packages/mcp/src/tools/list-findings.ts @@ -6,6 +6,11 @@ * `rule_id`, `severity`, and `message` plus a flat `properties_bag` * JSON string for the scanner's custom properties. 
* + * The shared reader/filter/projection lives in `@opencodehub/core-ops` + * `findingsCapability` — this tool is the thin MCP adapter: resolve + open the + * store via `withStore`, run the capability, and render its `FindingsOutput` + * into the MCP text body + `next_steps` + staleness envelope. + * * Filters (all optional): * - `severity` — restrict to one SARIF level. * - `scanner` — restrict to a single scanner id. @@ -13,9 +18,9 @@ * - `filePath` — substring match against `file_path`. * - `limit` — row cap (default 500, max 10_000). */ -// biome-ignore-all lint/complexity/useLiteralKeys: dot-access disallowed on Record index signatures import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { type FindingsInput, findingsCapability } from "@opencodehub/core-ops"; import { z } from "zod"; import { toolErrorFromUnknown } from "../error-envelope.js"; import { withNextSteps } from "../next-step-hints.js"; @@ -53,18 +58,6 @@ const ListFindingsInput = { .describe("Maximum number of findings to return (default 500, max 10000)."), }; -interface FindingRow { - readonly id: string; - readonly scanner: string; - readonly ruleId: string; - readonly severity: string; - readonly message: string; - readonly filePath: string; - readonly startLine?: number; - readonly endLine?: number; - readonly properties: Record; -} - interface ListFindingsArgs { readonly repo?: string | undefined; readonly repo_uri?: string | undefined; @@ -79,60 +72,29 @@ export async function runListFindings( ctx: ToolContext, args: ListFindingsArgs, ): Promise { - const limit = args.limit ?? 500; const call = await withStore(ctx, args, async (store, resolved) => { try { - // listFindings narrows by severity / ruleId at the storage tier. - // scanner / filePath substring are applied in TS post-finder. 
- const findingsOpts: { - severity?: readonly ("note" | "warning" | "error")[]; - ruleId?: string; - limit?: number; - } = { limit }; - if ( - args.severity !== undefined && - (args.severity === "note" || args.severity === "warning" || args.severity === "error") - ) { - findingsOpts.severity = [args.severity]; - } - if (args.ruleId !== undefined) findingsOpts.ruleId = args.ruleId; - const all = await store.graph.listFindings(findingsOpts); - - const filtered = all.filter((f) => { - if (args.severity === "none" && f.severity !== "none") return false; - if (args.scanner !== undefined && f.scannerId !== args.scanner) return false; - if (args.filePath !== undefined && !f.filePath.includes(args.filePath)) return false; - return true; + const input: FindingsInput = { + ...(args.severity !== undefined ? { severity: args.severity } : {}), + ...(args.scanner !== undefined ? { scanner: args.scanner } : {}), + ...(args.ruleId !== undefined ? { ruleId: args.ruleId } : {}), + ...(args.filePath !== undefined ? { filePath: args.filePath } : {}), + ...(args.limit !== undefined ? { limit: args.limit } : {}), + }; + const out = await findingsCapability.execute(input, { + store, + repoName: resolved.name, }); - const rows: FindingRow[] = filtered.map((f) => { - const base: FindingRow = { - id: f.id, - scanner: stringOr(f.scannerId, "unknown"), - ruleId: stringOr(f.ruleId, ""), - severity: stringOr(f.severity, "note"), - message: stringOr(f.message, ""), - filePath: stringOr(f.filePath, ""), - properties: f.propertiesBag, - ...(typeof f.startLine === "number" && Number.isFinite(f.startLine) - ? { startLine: f.startLine } - : {}), - ...(typeof f.endLine === "number" && Number.isFinite(f.endLine) - ? { endLine: f.endLine } - : {}), - }; - return base; - }); - - const header = `Findings (${rows.length}) for ${resolved.name}${ + const header = `Findings (${out.total}) for ${out.repoName}${ args.severity ? ` · severity=${args.severity}` : "" }${args.scanner ? 
` · scanner=${args.scanner}` : ""}${ args.ruleId ? ` · rule=${args.ruleId}` : "" }${args.filePath ? ` · filePath~${args.filePath}` : ""}:`; const body = - rows.length === 0 + out.total === 0 ? "(no findings matched — run `codehub scan` or `codehub ingest-sarif ` to populate Finding nodes)" - : rows + : out.findings .map( (f) => `- [${f.severity}] ${f.scanner}:${f.ruleId} at ${f.filePath}${ @@ -142,7 +104,7 @@ export async function runListFindings( .join("\n"); const next = - rows.length === 0 + out.total === 0 ? [ "run `codehub scan` in the target repo to generate findings", "call `list_repos` to confirm the repo is indexed", @@ -154,7 +116,7 @@ export async function runListFindings( return withNextSteps( `${header}\n${body}`, - { findings: rows, total: rows.length }, + { findings: out.findings, total: out.total }, next, stalenessFromMeta(resolved.meta), ); @@ -183,9 +145,3 @@ export function registerListFindingsTool(server: McpServer, ctx: ToolContext): v async (args) => fromToolResult(await runListFindings(ctx, args)), ); } - -function stringOr(v: unknown, fallback: string): string { - if (typeof v === "string") return v; - if (typeof v === "number" || typeof v === "boolean") return String(v); - return fallback; -} diff --git a/packages/mcp/tsconfig.json b/packages/mcp/tsconfig.json index 38084feb..7a1e011d 100644 --- a/packages/mcp/tsconfig.json +++ b/packages/mcp/tsconfig.json @@ -7,6 +7,7 @@ }, "include": ["src/**/*"], "references": [ + { "path": "../core-ops" }, { "path": "../core-types" }, { "path": "../storage" }, { "path": "../search" }, diff --git a/tsconfig.json b/tsconfig.json index 93408419..5d152e18 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -9,6 +9,7 @@ { "path": "./packages/search" }, { "path": "./packages/embedder" }, { "path": "./packages/analysis" }, + { "path": "./packages/core-ops" }, { "path": "./packages/pack" }, { "path": "./packages/policy" }, { "path": "./packages/eval" }, From fcf02ad45b3d732eb513ad0639f0516d828e877a Mon Sep 17 
00:00:00 2001 From: Laith Al-Saadoon Date: Fri, 3 Jul 2026 21:21:12 +0000 Subject: [PATCH 04/11] chore(core-ops): add @opencodehub/core-ops to the lockfile Records the new workspace package (its analysis/storage/core-types workspace links + typescript/@types/node dev deps) in pnpm-lock.yaml. Companion to 51ba06e, which added the package but not its lockfile entry. Run via `pnpm install --no-frozen-lockfile`; no other dependency changed. Co-Authored-By: Claude Opus 4.8 (1M context) --- pnpm-lock.yaml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 10588e17..421613a5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -167,6 +167,9 @@ importers: '@opencodehub/analysis': specifier: workspace:* version: link:../analysis + '@opencodehub/core-ops': + specifier: workspace:* + version: link:../core-ops '@opencodehub/core-types': specifier: workspace:* version: link:../core-types @@ -239,6 +242,25 @@ importers: specifier: 6.0.3 version: 6.0.3 + packages/core-ops: + dependencies: + '@opencodehub/analysis': + specifier: workspace:* + version: link:../analysis + '@opencodehub/core-types': + specifier: workspace:* + version: link:../core-types + '@opencodehub/storage': + specifier: workspace:* + version: link:../storage + devDependencies: + '@types/node': + specifier: 26.0.1 + version: 26.0.1 + typescript: + specifier: 6.0.3 + version: 6.0.3 + packages/core-types: devDependencies: '@types/node': @@ -428,6 +450,9 @@ importers: '@opencodehub/analysis': specifier: workspace:* version: link:../analysis + '@opencodehub/core-ops': + specifier: workspace:* + version: link:../core-ops '@opencodehub/core-types': specifier: workspace:* version: link:../core-types From b532ffa6621438d89565bb8511248abe3ec7ad03 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Sat, 4 Jul 2026 01:41:23 +0000 Subject: [PATCH 05/11] refactor(core-types): single-source NODE_COLUMNS + relation roster; fix D15 schema staleness Lift 
NODE_COLUMNS + RELATION_COLUMNS to a new core-types/node-columns.ts and have storage/column-encode.ts and mcp/resources/repo-schema.ts import the one copy; point storage/relations.ts at core-types RELATION_TYPES. Fixes the D15 staleness bug where the MCP schema resource advertised only 26 of 73 logical node columns to SQL-authoring agents (now 73). Adds drift-guard tests asserting storage rosters deep-equal core-types. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/core-types/src/index.ts | 1 + packages/core-types/src/node-columns.ts | 145 ++++++++++++++++++ .../mcp/src/resources/repo-schema.test.ts | 66 ++++++++ packages/mcp/src/resources/repo-schema.ts | 47 +----- packages/storage/src/column-encode.ts | 111 +------------- packages/storage/src/relations.test.ts | 29 ++++ packages/storage/src/relations.ts | 42 +---- 7 files changed, 264 insertions(+), 177 deletions(-) create mode 100644 packages/core-types/src/node-columns.ts create mode 100644 packages/mcp/src/resources/repo-schema.test.ts create mode 100644 packages/storage/src/relations.test.ts diff --git a/packages/core-types/src/index.ts b/packages/core-types/src/index.ts index 638f72d2..be351fc7 100644 --- a/packages/core-types/src/index.ts +++ b/packages/core-types/src/index.ts @@ -7,6 +7,7 @@ export type { EdgeId, MakeNodeIdOptions, NodeId, ParsedNodeId } from "./id.js"; export { makeEdgeId, makeNodeId, parseNodeId } from "./id.js"; export type { LanguageId } from "./language-id.js"; export { PROVENANCE_PREFIXES, SCIP_PROVENANCE_PREFIXES } from "./lsp-provenance.js"; +export { NODE_COLUMNS, RELATION_COLUMNS } from "./node-columns.js"; export type { AnnotationNode, ClassNode, diff --git a/packages/core-types/src/node-columns.ts b/packages/core-types/src/node-columns.ts new file mode 100644 index 00000000..82bf59c6 --- /dev/null +++ b/packages/core-types/src/node-columns.ts @@ -0,0 +1,145 @@ +/** + * Canonical column rosters for the polymorphic graph store — pure, + * dependency-free. 
+ * + * These two lists are the SINGLE SOURCE OF TRUTH for the logical field + * vocabulary of the graph. They live in `@opencodehub/core-types` (the + * deepest, zero-runtime-dep package) so every downstream consumer stays in + * lock-step: + * - `@opencodehub/storage` re-exports {@link NODE_COLUMNS} from + * `column-encode.ts` as the encoder's canonical field ordering, which a + * community-fork `IGraphStore` adapter (AGE / Memgraph / Neo4j / Neptune) + * consumes when it stores the universal base as typed columns. + * - `@opencodehub/mcp` advertises both lists in the + * `codehub://repo/{name}/schema` resource so SQL-authoring agents see the + * full logical field vocabulary they may filter on. + * + * Keeping one copy here fixes the staleness class where a hand-maintained + * duplicate silently truncated the advertised roster. + */ + +/** + * Canonical field ordering for the polymorphic `nodes` table (73 entries). + * The shared reference a community-fork adapter (AGE / Memgraph / Neo4j / + * Neptune) consumes when it stores the universal base as typed columns. + * + * The in-tree `SqliteStore` (ADR 0019) stores only the universal base + * (`id, kind, name, file_path, start_line, end_line`) as typed columns and + * folds every remaining kind-specific field into a single canonical-JSON + * `payload` column, so adding a kind-specific field needs NO schema change + * there — it round-trips through `payload` automatically. The `[]`-vs-absent + * and `{}`-vs-absent distinctions are preserved by `canonicalJson` over + * `payload`, not by per-column encoding. + * + * Rules for a fork that DOES store a new field as a typed column: + * 1. Append to the END of this list — reordering rewrites every prepared + * statement parameter slot and breaks already-persisted graphs. + * 2. Append the writer in `nodeToColumns` (`@opencodehub/storage`). + * 3. Append the reader in the adapter's row decoder. + * 4. 
Update that adapter's CREATE TABLE DDL to keep the on-disk schema in + * lock step with this list. + * + * ORDER IS APPEND-ONLY AND LOAD-BEARING — never reorder. + */ +export const NODE_COLUMNS: readonly string[] = [ + "id", + "kind", + "name", + "file_path", + "start_line", + "end_line", + "is_exported", + "signature", + "parameter_count", + "return_type", + "declared_type", + "owner", + "url", + "method", + "tool_name", + "content", + "content_hash", + "inferred_label", + "symbol_count", + "cohesion", + "keywords", + "entry_point_id", + "step_count", + "level", + "response_keys", + "description", + // Finding + "severity", + "rule_id", + "scanner_id", + "message", + "properties_bag", + // Dependency + "version", + "license", + "lockfile_source", + "ecosystem", + // Operation + "http_method", + "http_path", + "summary", + "operation_id", + // Contributor + "email_hash", + "email_plain", + // ProjectProfile + "languages_json", + "frameworks_json", + "iac_types_json", + "api_contracts_json", + "manifests_json", + "src_dirs_json", + // File ownership (H.5) + Community ownership (H.4) + "orphan_grade", + "is_orphan", + "truck_factor", + "ownership_drift_30d", + "ownership_drift_90d", + "ownership_drift_365d", + // v1.2 extensions (append-only). + "deadness", + "coverage_percent", + "covered_lines_json", + "cyclomatic_complexity", + "nesting_depth", + "nloc", + "halstead_volume", + "input_schema_json", + "partial_fingerprint", + "baseline_state", + "suppressed_json", + // Repo. + "origin_url", + "repo_uri", + "default_branch", + "commit_sha", + "index_time", + "repo_group", + "visibility", + "indexer", + "language_stats_json", +]; + +/** + * Logical column roster for the polymorphic `relations` (edges) table + * (7 entries) as advertised to SQL-authoring agents. + * + * These are LOGICAL names. 
The physical SQLite DDL names the endpoint columns + * `src`/`dst`, but the advertised/logical roster uses `from_id`/`to_id` — do + * not "fix" these to `src`/`dst`; that would change the schema resource's + * advertised output and break the honest logical vocabulary. + */ +export const RELATION_COLUMNS: readonly string[] = [ + "id", + "from_id", + "to_id", + "type", + "confidence", + "reason", + "step", +]; diff --git a/packages/mcp/src/resources/repo-schema.test.ts b/packages/mcp/src/resources/repo-schema.test.ts new file mode 100644 index 00000000..df058795 --- /dev/null +++ b/packages/mcp/src/resources/repo-schema.test.ts @@ -0,0 +1,66 @@ +/** + * Behavioural test for the `codehub://repo/{name}/schema` MCP resource. + * + * Guards the D15 staleness fix: the `nodes:` roster is now single-sourced from + * `@opencodehub/core-types` (73 logical columns) instead of the truncated + * 26-entry local literal that silently rotted. Asserts the emitted YAML + * advertises the full 73 node columns and 7 relation columns, and that the + * count matches the canonical `NODE_COLUMNS` length so the two can never drift. 
+ */ + +import { strict as assert } from "node:assert"; +import { test } from "node:test"; +import { NODE_COLUMNS, RELATION_COLUMNS } from "@opencodehub/core-types"; +import { getResourceHandler, makeFakeGraphStore, withMcpHarness } from "../test-utils.js"; +import { registerRepoSchemaResource } from "./repo-schema.js"; +import type { ResourceContext } from "./repos.js"; + +test("repo-schema: advertises the full 73 node columns and 7 relation columns", async () => { + await withMcpHarness( + { + tmpPrefix: "codehub-schema-test-", + serverCapabilities: { resources: {} }, + storeFactory: () => makeFakeGraphStore({ nodes: [], edges: [] }), + }, + async ({ server, pool, home, repoName }) => { + const ctx: ResourceContext = { pool, home }; + registerRepoSchemaResource(server, ctx); + const handler = getResourceHandler(server, "repo-schema"); + const uri = new URL(`codehub://repo/${encodeURIComponent(repoName)}/schema`); + const result = await handler(uri, { name: repoName }, {}); + const text = (result.contents[0] as { text: string }).text; + + // Count the `nodes:` list items: they are the ` - ` lines that + // sit under the ` nodes:` block and before ` relations:`. + const nodesBlock = text.slice( + text.indexOf(" nodes:") + " nodes:".length, + text.indexOf(" relations:"), + ); + const nodeEntries = nodesBlock.split("\n").filter((l) => l.startsWith(" - ")); + assert.equal( + nodeEntries.length, + 73, + "schema resource must advertise all 73 logical node columns (was 26 before the D15 fix)", + ); + assert.equal(nodeEntries.length, NODE_COLUMNS.length); + + // First and last node columns pin the append-only order. + assert.match(text, /^ {4}- id$/m); + assert.match(text, /^ {4}- language_stats_json$/m); + // Columns that were missing from the stale 26-entry roster now appear. 
+ assert.match(text, /^ {4}- severity$/m); + assert.match(text, /^ {4}- repo_uri$/m); + + const relationsBlock = text.slice( + text.indexOf(" relations:") + " relations:".length, + text.indexOf("nodeKinds:"), + ); + const relationEntries = relationsBlock.split("\n").filter((l) => l.startsWith(" - ")); + assert.equal(relationEntries.length, RELATION_COLUMNS.length); + assert.equal(relationEntries.length, 7); + // Logical endpoint names (NOT the physical src/dst). + assert.match(text, /^ {4}- from_id$/m); + assert.match(text, /^ {4}- to_id$/m); + }, + ); +}); diff --git a/packages/mcp/src/resources/repo-schema.ts b/packages/mcp/src/resources/repo-schema.ts index 90cf0059..334d3395 100644 --- a/packages/mcp/src/resources/repo-schema.ts +++ b/packages/mcp/src/resources/repo-schema.ts @@ -11,51 +11,18 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; import { ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js"; import type { ListResourcesResult, ReadResourceResult } from "@modelcontextprotocol/sdk/types.js"; -import { NODE_KINDS, RELATION_TYPES, SCHEMA_VERSION } from "@opencodehub/core-types"; +import { + NODE_COLUMNS, + NODE_KINDS, + RELATION_COLUMNS, + RELATION_TYPES, + SCHEMA_VERSION, +} from "@opencodehub/core-types"; import { readRegistry } from "../repo-resolver.js"; import type { ResourceContext } from "./repos.js"; const PATTERN = "codehub://repo/{name}/schema"; -const NODE_COLUMNS = [ - "id", - "kind", - "name", - "file_path", - "start_line", - "end_line", - "is_exported", - "signature", - "parameter_count", - "return_type", - "declared_type", - "owner", - "url", - "method", - "tool_name", - "content", - "content_hash", - "inferred_label", - "symbol_count", - "cohesion", - "keywords", - "entry_point_id", - "step_count", - "level", - "response_keys", - "description", -] as const; - -const RELATION_COLUMNS = [ - "id", - "from_id", - "to_id", - "type", - "confidence", - "reason", - "step", -] as const; - export function 
registerRepoSchemaResource(server: McpServer, ctx: ResourceContext): void { const template = new ResourceTemplate(PATTERN, { list: async (): Promise<ListResourcesResult> => { diff --git a/packages/storage/src/column-encode.ts b/packages/storage/src/column-encode.ts index 0929ffd7..d57bb8f1 100644 --- a/packages/storage/src/column-encode.ts +++ b/packages/storage/src/column-encode.ts @@ -62,112 +62,17 @@ * never carried `frameworksDetected` round-trip byte-identically. */ -import { canonicalJson, type GraphNode } from "@opencodehub/core-types"; +import { canonicalJson, type GraphNode, NODE_COLUMNS } from "@opencodehub/core-types"; /** - * Canonical field ordering for the polymorphic `nodes` table. Retained as - * the shared reference a community-fork adapter (AGE / Memgraph / Neo4j / - * Neptune) consumes when it stores the universal base as typed columns. - * - * The in-tree `SqliteStore` (ADR 0019) stores only the universal base - * (`id, kind, name, file_path, start_line, end_line`) as typed columns and - * folds every remaining kind-specific field into a single canonical-JSON - * `payload` column (`sqlite-adapter.ts`), so adding a kind-specific field - * needs NO schema change there — it round-trips through `payload` - * automatically. The `[]`-vs-absent and `{}`-vs-absent distinctions are - * preserved by `canonicalJson` over `payload`, not by per-column encoding. - * - * Rules for a fork that DOES store a new field as a typed column: - * 1. Append to the END of this list — reordering rewrites every prepared - * statement parameter slot and breaks already-persisted graphs. - * 2. Append the writer in {@link nodeToColumns}. - * 3. Append the reader in the adapter's row decoder. - * 4. Update that adapter's CREATE TABLE DDL to keep the on-disk schema in - * lock step with this list. + * Canonical field ordering for the polymorphic `nodes` table. 
Single-sourced + * from `@opencodehub/core-types` (the zero-runtime-dep package) and re-exported + * here unchanged so `nodeToColumns` and every community-fork `IGraphStore` + * adapter (AGE / Memgraph / Neo4j / Neptune) that imports `./column-encode.js` + * keep consuming it under the same name. See `core-types/src/node-columns.ts` + * for the append-only-order rules (order is load-bearing — never reorder). */ -export const NODE_COLUMNS: readonly string[] = [ - "id", - "kind", - "name", - "file_path", - "start_line", - "end_line", - "is_exported", - "signature", - "parameter_count", - "return_type", - "declared_type", - "owner", - "url", - "method", - "tool_name", - "content", - "content_hash", - "inferred_label", - "symbol_count", - "cohesion", - "keywords", - "entry_point_id", - "step_count", - "level", - "response_keys", - "description", - // Finding - "severity", - "rule_id", - "scanner_id", - "message", - "properties_bag", - // Dependency - "version", - "license", - "lockfile_source", - "ecosystem", - // Operation - "http_method", - "http_path", - "summary", - "operation_id", - // Contributor - "email_hash", - "email_plain", - // ProjectProfile - "languages_json", - "frameworks_json", - "iac_types_json", - "api_contracts_json", - "manifests_json", - "src_dirs_json", - // File ownership (H.5) + Community ownership (H.4) - "orphan_grade", - "is_orphan", - "truck_factor", - "ownership_drift_30d", - "ownership_drift_90d", - "ownership_drift_365d", - // v1.2 extensions (append-only). - "deadness", - "coverage_percent", - "covered_lines_json", - "cyclomatic_complexity", - "nesting_depth", - "nloc", - "halstead_volume", - "input_schema_json", - "partial_fingerprint", - "baseline_state", - "suppressed_json", - // Repo. 
- "origin_url", - "repo_uri", - "default_branch", - "commit_sha", - "index_time", - "repo_group", - "visibility", - "indexer", - "language_stats_json", -]; +export { NODE_COLUMNS }; /** * Encode a GraphNode into a `column → value` map indexed by the canonical diff --git a/packages/storage/src/relations.test.ts b/packages/storage/src/relations.test.ts new file mode 100644 index 00000000..7392019e --- /dev/null +++ b/packages/storage/src/relations.test.ts @@ -0,0 +1,29 @@ +/** + * Drift guard for the two single-sourced rosters lifted to + * `@opencodehub/core-types`. + * + * `getAllRelationTypes()` (this package) and the storage `NODE_COLUMNS` + * re-export MUST stay deep-equal to their core-types originals + * (`RELATION_TYPES`, `NODE_COLUMNS`). Before the hoist, `relations.ts` kept a + * hand-maintained `RELATION_KINDS` twin and the MCP schema resource kept a + * truncated `NODE_COLUMNS` twin; both could rot silently. These assertions are + * the guard that was missing — a future edit to either roster in core-types is + * automatically reflected here (imports), and any accidental re-introduction of + * a local literal that diverges trips this test. 
+ */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import { NODE_COLUMNS as CORE_NODE_COLUMNS, RELATION_TYPES } from "@opencodehub/core-types"; +import { NODE_COLUMNS as STORAGE_NODE_COLUMNS } from "./column-encode.js"; +import { getAllRelationTypes } from "./relations.js"; + +test("getAllRelationTypes() deep-equals core-types RELATION_TYPES", () => { + assert.deepEqual(getAllRelationTypes(), RELATION_TYPES); + assert.equal(getAllRelationTypes().length, 25); +}); + +test("storage NODE_COLUMNS deep-equals core-types NODE_COLUMNS", () => { + assert.deepEqual(STORAGE_NODE_COLUMNS, CORE_NODE_COLUMNS); + assert.equal(STORAGE_NODE_COLUMNS.length, 73); +}); diff --git a/packages/storage/src/relations.ts b/packages/storage/src/relations.ts index e7a2714a..74d403e0 100644 --- a/packages/storage/src/relations.ts +++ b/packages/storage/src/relations.ts @@ -1,42 +1,16 @@ /** - * Canonical relation-kind roster — pure, dependency-free. + * Canonical relation-kind roster accessor. * * The single source of truth for which edge relation types exist, in their - * load-bearing order (append new kinds, NEVER reorder — commit diffs and any - * schema emitter depend on the order). Extracted into this pure module so the - * single-file `SqliteStore` and the parity tests can reach it directly (the - * prior schema module that once held it was removed in the single-file - * migration, ADR 0019). + * load-bearing order, is `RELATION_TYPES` in `@opencodehub/core-types` + * (`edges.ts`) — append new kinds there, NEVER reorder (commit diffs and any + * schema emitter depend on the order). This module previously held a hand-kept + * `RELATION_KINDS` duplicate; it now delegates to core-types so the two can + * never drift. 
*/ -const RELATION_KINDS: readonly string[] = [ - "CONTAINS", - "DEFINES", - "IMPORTS", - "CALLS", - "EXTENDS", - "IMPLEMENTS", - "HAS_METHOD", - "HAS_PROPERTY", - "ACCESSES", - "METHOD_OVERRIDES", - "OVERRIDES", - "METHOD_IMPLEMENTS", - "MEMBER_OF", - "PROCESS_STEP", - "HANDLES_ROUTE", - "FETCHES", - "HANDLES_TOOL", - "ENTRY_POINT_OF", - "WRAPS", - "QUERIES", - "REFERENCES", - "FOUND_IN", - "DEPENDS_ON", - "OWNED_BY", - "TYPE_OF", -]; +import { RELATION_TYPES } from "@opencodehub/core-types"; /** Every relation kind, in canonical order. Source of truth for finders + tests. */ export function getAllRelationTypes(): readonly string[] { - return RELATION_KINDS; + return RELATION_TYPES; } From 4a7bd565ae04e9de88528f783e5755037d532048 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Sat, 4 Jul 2026 01:41:51 +0000 Subject: [PATCH 06/11] refactor(mcp): add defineTool factory; convert 3 reader tools to shared capabilities Fold the register-tool + withStore + try/catch + next-steps/staleness envelope boilerplate (audit A8) into one defineTool factory, parameterized by a capability + args-projector + presenter. Convert dependencies/license_audit/project_profile to thin adapters over new core-ops capabilities; retire their local stringOr copies. Wire names, tool count (29), and annotations unchanged, so the server contract test stays green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../core-ops/src/caps/dependencies.test.ts | 114 +++++++++ packages/core-ops/src/caps/dependencies.ts | 92 ++++++++ .../core-ops/src/caps/license-audit.test.ts | 80 +++++++ packages/core-ops/src/caps/license-audit.ts | 52 +++++ .../core-ops/src/caps/project-profile.test.ts | 94 ++++++++ packages/core-ops/src/caps/project-profile.ts | 65 ++++++ packages/core-ops/src/index.ts | 24 ++ packages/mcp/src/tools/define-tool.test.ts | 145 ++++++++++++ packages/mcp/src/tools/define-tool.ts | 131 +++++++++++ packages/mcp/src/tools/dependencies.ts | 177 ++++++-------- packages/mcp/src/tools/license-audit.ts | 206 ++++++++--------- packages/mcp/src/tools/project-profile.ts | 216 ++++++++---------- 12 files changed, 1048 insertions(+), 348 deletions(-) create mode 100644 packages/core-ops/src/caps/dependencies.test.ts create mode 100644 packages/core-ops/src/caps/dependencies.ts create mode 100644 packages/core-ops/src/caps/license-audit.test.ts create mode 100644 packages/core-ops/src/caps/license-audit.ts create mode 100644 packages/core-ops/src/caps/project-profile.test.ts create mode 100644 packages/core-ops/src/caps/project-profile.ts create mode 100644 packages/mcp/src/tools/define-tool.test.ts create mode 100644 packages/mcp/src/tools/define-tool.ts diff --git a/packages/core-ops/src/caps/dependencies.test.ts b/packages/core-ops/src/caps/dependencies.test.ts new file mode 100644 index 00000000..7db49f70 --- /dev/null +++ b/packages/core-ops/src/caps/dependencies.test.ts @@ -0,0 +1,114 @@ +/** + * Unit tests for `dependenciesCapability.execute` — the shared reader/filter/ + * projection lifted from the MCP `dependencies` tool. Exercises `execute` + * directly against a fake `CapabilityStore`, so it needs no real store, no repo + * resolution, and no transport. 
+ */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { DependencyNode, NodeId } from "@opencodehub/core-types"; +import type { IGraphStore, ListDependenciesOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { type DependenciesInput, dependenciesCapability } from "./dependencies.js"; + +/** Build a Dependency fixture from a plain string id (kept verbatim). */ +function dep(over: Omit, "id"> & { id: string }): DependencyNode { + return { + kind: "Dependency", + name: over.id, + filePath: "package.json", + version: "1.0.0", + ecosystem: "npm", + lockfileSource: "package-lock.json", + ...over, + id: over.id as NodeId, + } as DependencyNode; +} + +function fakeStore(corpus: readonly DependencyNode[]): { + store: CapabilityStore; + lastOpts: () => ListDependenciesOptions | undefined; +} { + let captured: ListDependenciesOptions | undefined; + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listDependencies") { + return async (opts?: ListDependenciesOptions): Promise => { + captured = opts; + let rows = corpus; + if (opts?.ecosystem !== undefined) + rows = rows.filter((d) => d.ecosystem === opts.ecosystem); + if (opts?.limit !== undefined) rows = rows.slice(0, opts.limit); + return rows; + }; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in dependencies capability test`); + }, + }); + const store: CapabilityStore = { graph, temporal: {} as CapabilityStore["temporal"] }; + return { store, lastOpts: () => captured }; +} + +async function run(input: DependenciesInput, corpus: readonly DependencyNode[]) { + const { store, lastOpts } = fakeStore(corpus); + const ctx: CapabilityContext = { store, repoName: "demo-repo" }; + const out = await dependenciesCapability.execute(input, ctx); + return { out, lastOpts }; +} + +test("dependencies: projects rows, echoes repoName, defaults limit to 500", async () => { + const { 
out, lastOpts } = await run({}, [dep({ id: "a" }), dep({ id: "b" })]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.total, 2); + assert.equal(out.dependencies.length, 2); + assert.equal(lastOpts()?.limit, 500, "default limit pushed to the storage tier"); + const r = out.dependencies[0]; + assert.equal(r?.id, "a"); + assert.equal(r?.ecosystem, "npm"); + assert.equal(r?.version, "1.0.0"); +}); + +test("dependencies: ecosystem is pushed to the storage tier", async () => { + const { out, lastOpts } = await run({ ecosystem: "cargo" }, [ + dep({ id: "rust", ecosystem: "cargo" }), + dep({ id: "node", ecosystem: "npm" }), + ]); + assert.equal(lastOpts()?.ecosystem, "cargo", "ecosystem pushed down"); + assert.equal(out.total, 1); + assert.equal(out.dependencies[0]?.id, "rust"); +}); + +test("dependencies: filePath substring is applied in the TS post-finder over lockfileSource", async () => { + const { out } = await run({ filePath: "apps/web/" }, [ + dep({ id: "hit", lockfileSource: "apps/web/package-lock.json" }), + dep({ id: "miss", lockfileSource: "apps/api/package-lock.json" }), + ]); + assert.equal(out.total, 1); + assert.equal(out.dependencies[0]?.id, "hit"); +}); + +test("dependencies: missing/loose fields fall back through stringOr; lockfile falls back to filePath", async () => { + // A deliberately loose runtime row built WITHOUT the `dep()` defaults: no + // lockfileSource, no license. Production rehydration can produce rows looser + // than the typed shape, which is exactly why the projection uses `stringOr` + + // the `lockfileSource ?? filePath` guard. 
+ const loose = { + kind: "Dependency", + id: "loose" as NodeId, + name: "loose", + filePath: "pkg.json", + version: "1.0.0", + ecosystem: "npm", + } as unknown as DependencyNode; + const { out } = await run({}, [loose]); + const r = out.dependencies[0]; + assert.equal(r?.license, "UNKNOWN", "missing license → UNKNOWN sentinel"); + assert.equal(r?.lockfileSource, "pkg.json", "missing lockfileSource → filePath fallback"); +}); + +test("dependencies: empty corpus yields total 0", async () => { + const { out } = await run({}, []); + assert.equal(out.total, 0); + assert.equal(out.dependencies.length, 0); +}); diff --git a/packages/core-ops/src/caps/dependencies.ts b/packages/core-ops/src/caps/dependencies.ts new file mode 100644 index 00000000..171aec15 --- /dev/null +++ b/packages/core-ops/src/caps/dependencies.ts @@ -0,0 +1,92 @@ +/** + * `dependenciesCapability` — the shared reader/filter/projection behind the MCP + * `dependencies` tool (and, once the CLI adopts it, `codehub dependencies`). + * + * Lifted verbatim from the body of `mcp/src/tools/dependencies.ts`: the typed + * `listDependencies({ecosystem?, limit})` finder, the TS `filePath` substring + * post-filter over `lockfileSource ?? filePath`, and the row projection through + * the one canonical `stringOr`. The surface maps `DependenciesOutput` into its + * own transport (text body + next_steps + staleness envelope). + */ + +import type { Capability, CapabilityContext } from "../capability.js"; +import { stringOr } from "../string-or.js"; + +/** + * The validated, plain input `dependenciesCapability.execute` consumes. + * `repo`/`repo_uri` are resolved to a concrete store by the surface BEFORE + * `execute` runs; they live on the input only so a surface can pass its parsed + * args object through unchanged. 
+ */ +export interface DependenciesInput { + readonly repo?: string; + readonly repo_uri?: string; + readonly ecosystem?: "npm" | "pypi" | "go" | "cargo" | "maven" | "nuget"; + readonly filePath?: string; + readonly limit?: number; +} + +/** + * One projected dependency row — the flat shape the surface renders. Kept as a + * flat object so clients that only inspect `structuredContent` can grok it + * without crawling the graph. + */ +export interface DependencyRow { + readonly id: string; + readonly name: string; + readonly version: string; + readonly ecosystem: string; + readonly license: string; + readonly lockfileSource: string; +} + +/** The applied filters, echoed back so presenters can label the output. */ +export interface DependenciesFilters { + readonly ecosystem?: string; + readonly filePath?: string; +} + +export interface DependenciesOutput { + readonly repoName: string; + readonly dependencies: readonly DependencyRow[]; + readonly total: number; + readonly filters: DependenciesFilters; +} + +export const dependenciesCapability: Capability = { + id: "dependencies", + async execute(input: DependenciesInput, ctx: CapabilityContext): Promise { + const limit = input.limit ?? 500; + + // Typed `listDependencies` finder reads the Dependency rows directly, + // already rehydrated into the typed shape. The `filePath` substring + // filter is applied in TS because the finder doesn't expose a LIKE + // option — dependencies are bounded per repo so a TS filter is fine. + const opts: { ecosystem?: string; limit?: number } = { limit }; + if (input.ecosystem !== undefined) opts.ecosystem = input.ecosystem; + const all = await ctx.store.graph.listDependencies(opts); + const filtered = + input.filePath === undefined + ? all + : all.filter((d) => { + const lf = d.lockfileSource ?? 
d.filePath; + return lf.includes(input.filePath as string); + }); + + const dependencies: DependencyRow[] = filtered.map((d) => ({ + id: d.id, + name: d.name, + version: stringOr(d.version, "UNKNOWN"), + ecosystem: stringOr(d.ecosystem, "unknown"), + license: stringOr(d.license, "UNKNOWN"), + lockfileSource: stringOr(d.lockfileSource, d.filePath), + })); + + const filters: DependenciesFilters = { + ...(input.ecosystem !== undefined ? { ecosystem: input.ecosystem } : {}), + ...(input.filePath !== undefined ? { filePath: input.filePath } : {}), + }; + + return { repoName: ctx.repoName, dependencies, total: dependencies.length, filters }; + }, +}; diff --git a/packages/core-ops/src/caps/license-audit.test.ts b/packages/core-ops/src/caps/license-audit.test.ts new file mode 100644 index 00000000..4262f4ac --- /dev/null +++ b/packages/core-ops/src/caps/license-audit.test.ts @@ -0,0 +1,80 @@ +/** + * Unit tests for `licenseAuditCapability.execute` — the shared reader/classifier + * lifted from the MCP `license_audit` tool. Exercises `execute` directly against + * a fake `CapabilityStore`; the tier logic itself is covered exhaustively in + * `@opencodehub/analysis` `license-classify.test.ts`, so here we assert only the + * read + projection + hand-off to `classifyDependencies`. 
+ */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { DependencyNode, NodeId } from "@opencodehub/core-types"; +import type { IGraphStore, ListDependenciesOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { type LicenseAuditInput, licenseAuditCapability } from "./license-audit.js"; + +function dep(over: Omit, "id"> & { id: string }): DependencyNode { + return { + kind: "Dependency", + name: over.id, + filePath: "package.json", + version: "1.0.0", + ecosystem: "npm", + lockfileSource: "package-lock.json", + ...over, + id: over.id as NodeId, + } as DependencyNode; +} + +function fakeStore(corpus: readonly DependencyNode[]): CapabilityStore { + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listDependencies") { + return async (_opts?: ListDependenciesOptions): Promise => + corpus; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in license-audit capability test`); + }, + }); + return { graph, temporal: {} as CapabilityStore["temporal"] }; +} + +async function run(corpus: readonly DependencyNode[], input: LicenseAuditInput = {}) { + const ctx: CapabilityContext = { store: fakeStore(corpus), repoName: "demo-repo" }; + return licenseAuditCapability.execute(input, ctx); +} + +test("license-audit: echoes repoName and classifies an all-clear set as OK", async () => { + const out = await run([ + dep({ id: "lodash", license: "MIT" }), + dep({ id: "axios", license: "Apache-2.0" }), + ]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.result.tier, "OK"); + assert.equal(out.result.summary.total, 2); + assert.equal(out.result.summary.flaggedCount, 0); +}); + +test("license-audit: missing license → UNKNOWN sentinel → WARN tier", async () => { + // `dep()` sets no default license, so `mystery` arrives with license absent — + // the projection's `stringOr(d.license, "UNKNOWN")` yields the UNKNOWN sentinel. 
+ const out = await run([dep({ id: "mystery" }), dep({ id: "good", license: "MIT" })]); + assert.equal(out.result.tier, "WARN"); + assert.equal(out.result.flagged.unknown.length, 1); + assert.equal(out.result.flagged.unknown[0]?.name, "mystery"); +}); + +test("license-audit: a copyleft dep drives BLOCK", async () => { + const out = await run([ + dep({ id: "readline", license: "GPL-3.0" }), + dep({ id: "good", license: "MIT" }), + ]); + assert.equal(out.result.tier, "BLOCK"); + assert.equal(out.result.flagged.copyleft.length, 1); +}); + +test("license-audit: empty corpus classifies as OK with zero total", async () => { + const out = await run([]); + assert.equal(out.result.tier, "OK"); + assert.equal(out.result.summary.total, 0); +}); diff --git a/packages/core-ops/src/caps/license-audit.ts b/packages/core-ops/src/caps/license-audit.ts new file mode 100644 index 00000000..266389c0 --- /dev/null +++ b/packages/core-ops/src/caps/license-audit.ts @@ -0,0 +1,52 @@ +/** + * `licenseAuditCapability` — the shared reader/classifier behind the MCP + * `license_audit` tool (and, once the CLI adopts it, `codehub license-audit`). + * + * Lifted verbatim from the body of `mcp/src/tools/license-audit.ts`: read every + * Dependency node, project each into a `DependencyRef` through the one canonical + * `stringOr`, then hand the set to `classifyDependencies` (the pure tier logic in + * `@opencodehub/analysis`). The surface maps `LicenseAuditOutput` into its own + * transport (text body + next_steps + staleness envelope). + */ + +import { + classifyDependencies, + type DependencyRef, + type LicenseAuditResult, +} from "@opencodehub/analysis"; +import type { Capability, CapabilityContext } from "../capability.js"; +import { stringOr } from "../string-or.js"; + +/** + * The validated, plain input `licenseAuditCapability.execute` consumes. 
+ * `repo`/`repo_uri` are resolved to a concrete store by the surface BEFORE + * `execute` runs; they live on the input only so a surface can pass its parsed + * args object through unchanged. + */ +export interface LicenseAuditInput { + readonly repo?: string; + readonly repo_uri?: string; +} + +export interface LicenseAuditOutput { + readonly repoName: string; + readonly result: LicenseAuditResult; +} + +export const licenseAuditCapability: Capability = { + id: "license_audit", + async execute(_input: LicenseAuditInput, ctx: CapabilityContext): Promise { + const all = await ctx.store.graph.listDependencies(); + const deps: DependencyRef[] = all.map((d) => ({ + id: d.id, + name: d.name, + version: stringOr(d.version, "UNKNOWN"), + ecosystem: stringOr(d.ecosystem, "unknown"), + license: stringOr(d.license, "UNKNOWN"), + lockfileSource: stringOr(d.lockfileSource, d.filePath), + })); + + const result = classifyDependencies(deps); + return { repoName: ctx.repoName, result }; + }, +}; diff --git a/packages/core-ops/src/caps/project-profile.test.ts b/packages/core-ops/src/caps/project-profile.test.ts new file mode 100644 index 00000000..9d10519c --- /dev/null +++ b/packages/core-ops/src/caps/project-profile.test.ts @@ -0,0 +1,94 @@ +/** + * Unit tests for `projectProfileCapability.execute` — the shared singleton + * reader/decoder lifted from the MCP `project_profile` tool. Exercises `execute` + * directly against a fake `CapabilityStore`. 
+ */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { NodeId, ProjectProfileNode } from "@opencodehub/core-types"; +import type { IGraphStore, ListNodesByKindOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { projectProfileCapability } from "./project-profile.js"; + +function profileNode(over: Partial): ProjectProfileNode { + return { + kind: "ProjectProfile", + id: "ProjectProfile:.:profile" as NodeId, + name: "profile", + filePath: ".", + languages: [], + frameworks: [], + iacTypes: [], + apiContracts: [], + manifests: [], + srcDirs: [], + ...over, + } as ProjectProfileNode; +} + +function fakeStore(nodes: readonly ProjectProfileNode[]): CapabilityStore { + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listNodesByKind") { + return async ( + _kind: string, + _opts?: ListNodesByKindOptions, + ): Promise => nodes; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in project-profile capability test`); + }, + }); + return { graph, temporal: {} as CapabilityStore["temporal"] }; +} + +async function run(nodes: readonly ProjectProfileNode[]) { + const ctx: CapabilityContext = { store: fakeStore(nodes), repoName: "demo-repo" }; + return projectProfileCapability.execute({}, ctx); +} + +test("project-profile: decodes arrays, echoes repoName, flags profileExists", async () => { + const out = await run([ + profileNode({ + languages: ["typescript", "python"], + frameworks: ["nextjs"], + iacTypes: ["terraform"], + apiContracts: ["openapi"], + manifests: ["package.json"], + srcDirs: ["src"], + }), + ]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.profileExists, true); + assert.deepEqual([...out.profile.languages], ["typescript", "python"]); + assert.deepEqual([...out.profile.frameworks], ["nextjs"]); + assert.deepEqual([...out.profile.iacTypes], ["terraform"]); + 
assert.deepEqual([...out.profile.apiContracts], ["openapi"]); + assert.equal(out.profile.frameworksDetected.length, 0, "absent frameworksDetected → empty array"); +}); + +test("project-profile: carries structured frameworksDetected when present", async () => { + const out = await run([ + profileNode({ + frameworks: ["nextjs"], + frameworksDetected: [ + { + name: "nextjs", + category: "meta", + variant: "app-router", + confidence: "deterministic", + evidence: [], + }, + ], + }), + ]); + assert.equal(out.profile.frameworksDetected.length, 1); + assert.equal(out.profile.frameworksDetected[0]?.variant, "app-router"); +}); + +test("project-profile: no node → profileExists false with empty arrays", async () => { + const out = await run([]); + assert.equal(out.profileExists, false); + assert.equal(out.profile.languages.length, 0); + assert.equal(out.profile.srcDirs.length, 0); +}); diff --git a/packages/core-ops/src/caps/project-profile.ts b/packages/core-ops/src/caps/project-profile.ts new file mode 100644 index 00000000..d798101e --- /dev/null +++ b/packages/core-ops/src/caps/project-profile.ts @@ -0,0 +1,65 @@ +/** + * `projectProfileCapability` — the shared reader behind the MCP + * `project_profile` tool (and, once the CLI adopts it, `codehub profile`). + * + * Lifted verbatim from the body of `mcp/src/tools/project-profile.ts`: read the + * singleton ProjectProfile node, decode every array column back into a plain + * array, and report whether the node existed at all (so the surface can nudge + * toward `codehub analyze --force`). The surface maps `ProjectProfileOutput` + * into its own transport (text body + next_steps + staleness envelope). + */ + +import type { FrameworkDetection } from "@opencodehub/core-types"; +import type { Capability, CapabilityContext } from "../capability.js"; + +/** + * The validated, plain input `projectProfileCapability.execute` consumes. 
+ * `repo`/`repo_uri` are resolved to a concrete store by the surface BEFORE + * `execute` runs; they live on the input only so a surface can pass its parsed + * args object through unchanged. + */ +export interface ProjectProfileInput { + readonly repo?: string; + readonly repo_uri?: string; +} + +export interface ProjectProfilePayload { + readonly languages: readonly string[]; + /** Flat-string framework view (backward-compat). */ + readonly frameworks: readonly string[]; + /** Structured framework detections with variant / version / confidence / parent. */ + readonly frameworksDetected: readonly FrameworkDetection[]; + readonly iacTypes: readonly string[]; + readonly apiContracts: readonly string[]; + readonly manifests: readonly string[]; + readonly srcDirs: readonly string[]; +} + +export interface ProjectProfileOutput { + readonly repoName: string; + /** Whether a ProjectProfile node was present (drives the surface's hint). */ + readonly profileExists: boolean; + readonly profile: ProjectProfilePayload; +} + +export const projectProfileCapability: Capability = { + id: "project_profile", + async execute( + _input: ProjectProfileInput, + ctx: CapabilityContext, + ): Promise { + const nodes = await ctx.store.graph.listNodesByKind("ProjectProfile", { limit: 1 }); + const profile = nodes[0]; + const payload: ProjectProfilePayload = { + languages: profile?.languages ? [...profile.languages] : [], + frameworks: profile?.frameworks ? [...profile.frameworks] : [], + frameworksDetected: profile?.frameworksDetected ? [...profile.frameworksDetected] : [], + iacTypes: profile?.iacTypes ? [...profile.iacTypes] : [], + apiContracts: profile?.apiContracts ? [...profile.apiContracts] : [], + manifests: profile?.manifests ? [...profile.manifests] : [], + srcDirs: profile?.srcDirs ? 
[...profile.srcDirs] : [], + }; + + return { repoName: ctx.repoName, profileExists: profile !== undefined, profile: payload }; + }, +}; diff --git a/packages/core-ops/src/index.ts b/packages/core-ops/src/index.ts index 9bf0e2b4..f4a35424 100644 --- a/packages/core-ops/src/index.ts +++ b/packages/core-ops/src/index.ts @@ -1,4 +1,17 @@ export type { Capability, CapabilityContext, CapabilityStore } from "./capability.js"; +export { + type ContextInput, + type ContextOutput, + type ContextProcessParticipation, + contextCapability, +} from "./caps/context.js"; +export { + type DependenciesFilters, + type DependenciesInput, + type DependenciesOutput, + type DependencyRow, + dependenciesCapability, +} from "./caps/dependencies.js"; export { type FindingRow, type FindingsFilters, @@ -6,4 +19,15 @@ export { type FindingsOutput, findingsCapability, } from "./caps/findings.js"; +export { + type LicenseAuditInput, + type LicenseAuditOutput, + licenseAuditCapability, +} from "./caps/license-audit.js"; +export { + type ProjectProfileInput, + type ProjectProfileOutput, + type ProjectProfilePayload, + projectProfileCapability, +} from "./caps/project-profile.js"; export { stringOr } from "./string-or.js"; diff --git a/packages/mcp/src/tools/define-tool.test.ts b/packages/mcp/src/tools/define-tool.test.ts new file mode 100644 index 00000000..3b4d120e --- /dev/null +++ b/packages/mcp/src/tools/define-tool.test.ts @@ -0,0 +1,145 @@ +/** + * Unit tests for the `defineTool` factory — the register + `withStore` + + * try/catch + `withNextSteps` envelope + `toToolResult` boilerplate the reader + * tools share. Exercises one fake capability through the real MCP harness so + * the register wiring, the success envelope, and the error path are all covered + * in one place (a table-driven test beats N per-tool copies). 
+ */ + +import { strict as assert } from "node:assert"; +import { test } from "node:test"; +import type { Capability } from "@opencodehub/core-ops"; +import { getToolHandler, makeFakeGraphStore, withMcpHarness } from "../test-utils.js"; +import { defineTool } from "./define-tool.js"; +import { repoArgShape, type ToolContext } from "./shared.js"; + +interface EchoInput { + readonly repo?: string; + readonly repo_uri?: string; + readonly label?: string; +} +interface EchoOutput { + readonly repoName: string; + readonly label: string; +} + +/** A fake capability that echoes the resolved repo name + a label. */ +const echoCapability: Capability = { + id: "echo", + async execute(input, ctx) { + return { repoName: ctx.repoName, label: input.label ?? "(none)" }; + }, +}; + +/** A fake capability that always throws, to drive the error path. */ +const boomCapability: Capability = { + id: "boom", + async execute() { + throw new Error("kaboom"); + }, +}; + +const echoTool = defineTool({ + name: "echo", + title: "Echo", + description: "Echo a label back with the resolved repo name.", + inputSchema: { ...repoArgShape }, + annotations: { + readOnlyHint: true, + destructiveHint: false, + openWorldHint: false, + idempotentHint: true, + }, + capability: echoCapability, + toInput: (args) => ({ ...(args.label !== undefined ? 
{ label: args.label } : {}) }), + present: (out) => ({ + text: `echo ${out.repoName}: ${out.label}`, + structured: { repoName: out.repoName, label: out.label }, + nextSteps: ["call `echo` again"], + }), +}); + +const boomTool = defineTool({ + name: "boom", + title: "Boom", + description: "Always throws.", + inputSchema: { ...repoArgShape }, + annotations: { + readOnlyHint: true, + destructiveHint: false, + openWorldHint: false, + idempotentHint: true, + }, + capability: boomCapability, + toInput: () => ({}), + present: (out) => ({ text: "unreachable", structured: { out }, nextSteps: [] }), +}); + +async function withHarness( + fn: ( + ctx: ToolContext, + server: import("@modelcontextprotocol/sdk/server/mcp.js").McpServer, + ) => Promise, +): Promise { + await withMcpHarness( + { tmpPrefix: "codehub-define-tool-", storeFactory: () => makeFakeGraphStore({}) }, + async ({ server, pool, home }) => { + await fn({ pool, home }, server); + }, + ); +} + +test("defineTool: register + run wires the SDK handler under the wire name", async () => { + await withHarness(async (ctx, server) => { + echoTool.register(server, ctx); + // A handler must exist under the exact wire name. + const handler = getToolHandler(server, "echo"); + assert.ok(handler, "handler registered under wire name 'echo'"); + }); +}); + +test("defineTool: success path renders present() into the withNextSteps envelope", async () => { + await withHarness(async (ctx, server) => { + echoTool.register(server, ctx); + const handler = getToolHandler(server, "echo"); + const result = await handler({ repo: "fakerepo", label: "hello" }, {}); + const first = result.content[0]; + assert.ok(first && first.type === "text"); + // Presenter body + the withNextSteps "Suggested next tools" block. 
+ assert.match(first.text, /echo fakerepo: hello/); + assert.match(first.text, /Suggested next tools:/); + assert.match(first.text, /call `echo` again/); + const sc = result.structuredContent as { + repoName: string; + label: string; + next_steps: string[]; + _meta?: Record; + }; + assert.equal(sc.repoName, "fakerepo"); + assert.equal(sc.label, "hello"); + assert.deepEqual(sc.next_steps, ["call `echo` again"]); + assert.notEqual(result.isError, true); + }); +}); + +test("defineTool: run() returns the same structuredContent as the SDK handler", async () => { + await withHarness(async (ctx) => { + const viaRun = await echoTool.run(ctx, { repo: "fakerepo", label: "x" }); + const sc = viaRun.structuredContent as { repoName: string; label: string }; + assert.equal(sc.repoName, "fakerepo"); + assert.equal(sc.label, "x"); + assert.match(viaRun.text, /echo fakerepo: x/); + }); +}); + +test("defineTool: a throwing capability is mapped to an INTERNAL error envelope", async () => { + await withHarness(async (ctx, server) => { + boomTool.register(server, ctx); + const handler = getToolHandler(server, "boom"); + const result = await handler({ repo: "fakerepo" }, {}); + assert.equal(result.isError, true); + const sc = result.structuredContent as { error: { code: string; message: string } }; + assert.equal(sc.error.code, "INTERNAL"); + assert.match(sc.error.message, /kaboom/); + }); +}); diff --git a/packages/mcp/src/tools/define-tool.ts b/packages/mcp/src/tools/define-tool.ts new file mode 100644 index 00000000..3a4c5d32 --- /dev/null +++ b/packages/mcp/src/tools/define-tool.ts @@ -0,0 +1,131 @@ +/** + * `defineTool` — the factory that folds the repeated MCP register-tool + + * `withStore` + try/catch + envelope + `toToolResult` boilerplate into ONE + * place, so a read-only tool file collapses to four declarations: the + * `core-ops` capability that does the finder → filter → projection, an + * `args → Input` projector (the undefined-strip idiom), a `present` + * function that 
renders the capability's plain `Output` into the tool's + * text body + `next_steps`, and the register metadata (wire name, title, + * description, input schema, annotations). + * + * WHY THIS EXISTS. Every read-only tool — `list_findings`, `dependencies`, + * `license_audit`, `project_profile`, … — ran the identical wrapper: + * + * run(ctx, args) = withStore(ctx, args, (store, resolved) => try { + * ...capability body... ; return withNextSteps(text, structured, next, + * stalenessFromMeta(resolved.meta)) } catch (err) { + * return toolErrorFromUnknown(err) }) ; return toToolResult(call) + * register(server, ctx) = server.registerTool(name, {...}, args => + * fromToolResult(run(ctx, args))) + * + * Only the middle (the finder/filter/projection, now a `Capability`) and the + * rendering (now `present`) differ per tool. This factory owns the rest. The + * factory is transport-bound (it imports the MCP SDK and the mcp-side helpers), + * so it lives here in `@opencodehub/mcp`, NOT in the dependency-light + * `@opencodehub/core-ops` where the capabilities live. + * + * The zod `inputSchema` stays per-tool (each transport owns its validation + * shape) and so does `present` (text/next-steps rendering is the deliberate + * per-surface part). Everything else is uniform, and this is where it lives. 
+ */ + +import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import type { ToolAnnotations } from "@modelcontextprotocol/sdk/types.js"; +import type { Capability } from "@opencodehub/core-ops"; +import type { z } from "zod"; +import { toolErrorFromUnknown } from "../error-envelope.js"; +import { withNextSteps } from "../next-step-hints.js"; +import type { ResolvedRepo } from "../repo-resolver.js"; +import { stalenessFromMeta } from "../staleness.js"; +import { + fromToolResult, + type RepoArgs, + type ToolContext, + type ToolResult, + toToolResult, + withStore, +} from "./shared.js"; + +/** + * What a tool's `present` returns for one capability `Output` — the exact + * rendered text body, the machine-readable structured payload, and the + * next-step hints. `structured` becomes the `structuredContent` (minus the + * `next_steps` + `_meta` the factory layers on via {@link withNextSteps}). + */ +export interface ToolPresentation { + readonly text: string; + readonly structured: Record; + readonly nextSteps: readonly string[]; +} + +/** + * The declarative spec for one read-only, per-repo MCP tool. `Args` is the + * SDK-validated arg object (spreads `repoArgShape`); `Input` is the plain + * shape the capability consumes; `Output` is the plain shape it returns. + */ +export interface DefineToolSpec { + /** Wire name — also the SDK tool id (usually equals `capability.id`). */ + readonly name: string; + readonly title: string; + readonly description: string; + /** Raw-shape zod schema (spreads `repoArgShape`). Stays per-tool. */ + readonly inputSchema: z.ZodRawShape; + readonly annotations: ToolAnnotations; + readonly capability: Capability; + /** The undefined-strip projection from validated args to capability input. */ + readonly toInput: (args: Args) => Input; + /** Render the capability's plain output into text + structured + next-steps. 
*/ + readonly present: (output: Output, resolved: ResolvedRepo) => ToolPresentation; +} + +/** + * A defined tool: `run` is the transport-agnostic handler (the one place tests + * can call to assert `structuredContent`), and `register` wires it onto an + * `McpServer`. Each tool file keeps its `registerXxxTool` export as a one-line + * delegate to `register`, so `server.ts`'s existing call sites are unchanged. + */ +export interface DefinedTool { + readonly name: string; + readonly run: (ctx: ToolContext, args: Args) => Promise; + readonly register: (server: McpServer, ctx: ToolContext) => void; +} + +export function defineTool( + spec: DefineToolSpec, +): DefinedTool { + async function run(ctx: ToolContext, args: Args): Promise { + const call = await withStore(ctx, args, async (store, resolved) => { + try { + const output = await spec.capability.execute(spec.toInput(args), { + store, + repoName: resolved.name, + }); + const view = spec.present(output, resolved); + return withNextSteps( + view.text, + view.structured, + view.nextSteps, + stalenessFromMeta(resolved.meta), + ); + } catch (err) { + return toolErrorFromUnknown(err); + } + }); + return toToolResult(call); + } + + function register(server: McpServer, ctx: ToolContext): void { + server.registerTool( + spec.name, + { + title: spec.title, + description: spec.description, + inputSchema: spec.inputSchema, + annotations: spec.annotations, + }, + async (args) => fromToolResult(await run(ctx, args as Args)), + ); + } + + return { name: spec.name, run, register }; +} diff --git a/packages/mcp/src/tools/dependencies.ts b/packages/mcp/src/tools/dependencies.ts index b5f42843..e68d5ecb 100644 --- a/packages/mcp/src/tools/dependencies.ts +++ b/packages/mcp/src/tools/dependencies.ts @@ -5,6 +5,12 @@ * from per-ecosystem manifest parsers. Every node carries * `ecosystem`, `name`, `version`, and a `lockfileSource` relpath. 
* + * The shared reader/filter/projection lives in `@opencodehub/core-ops` + * `dependenciesCapability`; this file is the thin MCP adapter built with + * `defineTool` — it declares the input schema, the args→Input projection, + * and the presenter that renders `DependenciesOutput` into the MCP text + * body + `next_steps` + staleness envelope. + * * Filters: * - `ecosystem` — restrict to one ecosystem (npm, pypi, go, cargo, * maven, nuget). Server does no validation beyond @@ -14,23 +20,18 @@ * lockfile source). Useful when a repo has multiple * workspaces with their own manifests. */ -// biome-ignore-all lint/complexity/useLiteralKeys: dot-access disallowed on Record index signatures import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import { z } from "zod"; -import { toolErrorFromUnknown } from "../error-envelope.js"; -import { withNextSteps } from "../next-step-hints.js"; -import { stalenessFromMeta } from "../staleness.js"; import { - fromToolResult, - repoArgShape, - type ToolContext, - type ToolResult, - toToolResult, - withStore, -} from "./shared.js"; + type DependenciesInput, + type DependenciesOutput, + dependenciesCapability, +} from "@opencodehub/core-ops"; +import { z } from "zod"; +import { defineTool } from "./define-tool.js"; +import { repoArgShape, type ToolContext, type ToolResult } from "./shared.js"; -const DependenciesInput = { +const DependenciesInputSchema = { ...repoArgShape, filePath: z .string() @@ -49,19 +50,6 @@ const DependenciesInput = { .describe("Maximum number of dependencies to return (default 500, max 10000)."), }; -/** - * A row returned to agents. Kept as a flat object so clients that only - * inspect `structuredContent` can grok it without crawling the graph. 
- */ -interface DependencyRow { - readonly id: string; - readonly name: string; - readonly version: string; - readonly ecosystem: string; - readonly license: string; - readonly lockfileSource: string; -} - interface DependenciesArgs { readonly repo?: string | undefined; readonly repo_uri?: string | undefined; @@ -70,95 +58,64 @@ interface DependenciesArgs { readonly limit?: number | undefined; } +const dependenciesTool = defineTool({ + name: "dependencies", + title: "List external dependencies", + description: + "Enumerate external package dependencies of the indexed repo, sourced from lockfiles and manifests (package-lock.json, pnpm-lock.yaml, pyproject.toml, requirements.txt, uv.lock, go.mod, go.sum, Cargo.lock, Cargo.toml, pom.xml, *.csproj, packages.lock.json). Optionally filter by ecosystem or lockfile path substring. License field is 'UNKNOWN' at v1.0; real license detection lands in a later release.", + inputSchema: DependenciesInputSchema, + annotations: { + readOnlyHint: true, + destructiveHint: false, + openWorldHint: false, + idempotentHint: true, + }, + capability: dependenciesCapability, + toInput: (args) => ({ + ...(args.ecosystem !== undefined ? { ecosystem: args.ecosystem } : {}), + ...(args.filePath !== undefined ? { filePath: args.filePath } : {}), + ...(args.limit !== undefined ? { limit: args.limit } : {}), + }), + present: (out) => { + const header = `Dependencies (${out.total}) for ${out.repoName}${ + out.filters.ecosystem ? ` · ecosystem=${out.filters.ecosystem}` : "" + }${out.filters.filePath ? ` · filePath~${out.filters.filePath}` : ""}:`; + const body = + out.total === 0 + ? "(no dependencies found — index the repo with `codehub analyze` and verify the pipeline ran the `dependencies` phase)" + : out.dependencies + .map( + (d) => + `- [${d.ecosystem}] ${d.name}@${d.version} (${d.lockfileSource}, license=${d.license})`, + ) + .join("\n"); + + const nextSteps = + out.total === 0 + ? 
[ + "call `list_repos` to confirm the repo is indexed", + "re-index with `codehub analyze` to populate Dependency nodes", + ] + : [ + "call `query` with one of the names above to find import sites", + "call `sql` with cypher 'MATCH ()-[r:DEPENDS_ON]->() RETURN r' for the raw edges", + ]; + + return { + text: `${header}\n${body}`, + structured: { dependencies: out.dependencies, total: out.total }, + nextSteps, + }; + }, +}); + export async function runDependencies( ctx: ToolContext, args: DependenciesArgs, ): Promise { - const limit = args.limit ?? 500; - const call = await withStore(ctx, args, async (store, resolved) => { - try { - // Typed `listDependencies` finder reads the Dependency rows directly, - // already rehydrated into the typed shape. The `filePath` substring - // filter is applied in TS because the finder doesn't expose a LIKE - // option — dependencies are bounded per repo so a TS filter is fine. - const opts: { ecosystem?: string; limit?: number } = { limit }; - if (args.ecosystem !== undefined) opts.ecosystem = args.ecosystem; - const all = await store.graph.listDependencies(opts); - const filtered = - args.filePath === undefined - ? all - : all.filter((d) => { - const lf = d.lockfileSource ?? d.filePath; - return lf.includes(args.filePath as string); - }); - - const rows: DependencyRow[] = filtered.map((d) => ({ - id: d.id, - name: d.name, - version: stringOr(d.version, "UNKNOWN"), - ecosystem: stringOr(d.ecosystem, "unknown"), - license: stringOr(d.license, "UNKNOWN"), - lockfileSource: stringOr(d.lockfileSource, d.filePath), - })); - - const header = `Dependencies (${rows.length}) for ${resolved.name}${ - args.ecosystem ? ` · ecosystem=${args.ecosystem}` : "" - }${args.filePath ? ` · filePath~${args.filePath}` : ""}:`; - const body = - rows.length === 0 - ? 
"(no dependencies found — index the repo with `codehub analyze` and verify the pipeline ran the `dependencies` phase)" - : rows - .map( - (d) => - `- [${d.ecosystem}] ${d.name}@${d.version} (${d.lockfileSource}, license=${d.license})`, - ) - .join("\n"); - - const next = - rows.length === 0 - ? [ - "call `list_repos` to confirm the repo is indexed", - "re-index with `codehub analyze` to populate Dependency nodes", - ] - : [ - "call `query` with one of the names above to find import sites", - "call `sql` with cypher 'MATCH ()-[r:DEPENDS_ON]->() RETURN r' for the raw edges", - ]; - - return withNextSteps( - `${header}\n${body}`, - { dependencies: rows, total: rows.length }, - next, - stalenessFromMeta(resolved.meta), - ); - } catch (err) { - return toolErrorFromUnknown(err); - } - }); - return toToolResult(call); + return dependenciesTool.run(ctx, args); } export function registerDependenciesTool(server: McpServer, ctx: ToolContext): void { - server.registerTool( - "dependencies", - { - title: "List external dependencies", - description: - "Enumerate external package dependencies of the indexed repo, sourced from lockfiles and manifests (package-lock.json, pnpm-lock.yaml, pyproject.toml, requirements.txt, uv.lock, go.mod, go.sum, Cargo.lock, Cargo.toml, pom.xml, *.csproj, packages.lock.json). Optionally filter by ecosystem or lockfile path substring. 
License field is 'UNKNOWN' at v1.0; real license detection lands in a later release.", - inputSchema: DependenciesInput, - annotations: { - readOnlyHint: true, - destructiveHint: false, - openWorldHint: false, - idempotentHint: true, - }, - }, - async (args) => fromToolResult(await runDependencies(ctx, args)), - ); -} - -function stringOr(v: unknown, fallback: string): string { - if (typeof v === "string") return v; - if (typeof v === "number" || typeof v === "boolean") return String(v); - return fallback; + dependenciesTool.register(server, ctx); } diff --git a/packages/mcp/src/tools/license-audit.ts b/packages/mcp/src/tools/license-audit.ts index 5661f7f0..b7e8c5a9 100644 --- a/packages/mcp/src/tools/license-audit.ts +++ b/packages/mcp/src/tools/license-audit.ts @@ -19,26 +19,22 @@ * WARN — no copyleft/proprietary, at least one unknown. * OK — nothing flagged. * - * Annotations are {readOnly, closedWorld, idempotent} — the tool only - * queries the graph. + * The shared read + classify lives in `@opencodehub/core-ops` + * `licenseAuditCapability`; this file is the thin MCP adapter built with + * `defineTool` — the presenter renders the `LicenseAuditResult` into the + * three tier branches, and annotations are {readOnly, closedWorld, idempotent}. 
*/ -// biome-ignore-all lint/complexity/useLiteralKeys: dot-access disallowed on Record index signatures import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import { classifyDependencies, type DependencyRef } from "@opencodehub/analysis"; -import { toolErrorFromUnknown } from "../error-envelope.js"; -import { withNextSteps } from "../next-step-hints.js"; -import { stalenessFromMeta } from "../staleness.js"; import { - fromToolResult, - repoArgShape, - type ToolContext, - type ToolResult, - toToolResult, - withStore, -} from "./shared.js"; + type LicenseAuditInput, + type LicenseAuditOutput, + licenseAuditCapability, +} from "@opencodehub/core-ops"; +import { defineTool } from "./define-tool.js"; +import { repoArgShape, type ToolContext, type ToolResult } from "./shared.js"; -const LicenseAuditInput = { +const LicenseAuditInputSchema = { ...repoArgShape, }; @@ -47,114 +43,94 @@ interface LicenseAuditArgs { readonly repo_uri?: string | undefined; } -export async function runLicenseAudit( - ctx: ToolContext, - args: LicenseAuditArgs, -): Promise { - const call = await withStore(ctx, args, async (store, resolved) => { - try { - const all = await store.graph.listDependencies(); - const deps: DependencyRef[] = all.map((d) => ({ - id: d.id, - name: d.name, - version: stringOr(d.version, "UNKNOWN"), - ecosystem: stringOr(d.ecosystem, "unknown"), - license: stringOr(d.license, "UNKNOWN"), - lockfileSource: stringOr(d.lockfileSource, d.filePath), - })); - - const result = classifyDependencies(deps); - const header = `License audit for ${resolved.name}: tier=${result.tier} (${result.summary.okCount}/${result.summary.total} ok, ${result.summary.flaggedCount} flagged)`; - const bodyLines: string[] = []; - if (result.flagged.copyleft.length > 0) { - bodyLines.push( - `Copyleft (${result.flagged.copyleft.length}):`, - ...result.flagged.copyleft.map( - (d) => ` - [${d.ecosystem}] ${d.name}@${d.version} — ${d.license}`, - ), - ); - } - if 
(result.flagged.proprietary.length > 0) { - bodyLines.push( - `Proprietary (${result.flagged.proprietary.length}):`, - ...result.flagged.proprietary.map( - (d) => ` - [${d.ecosystem}] ${d.name}@${d.version} — ${d.license}`, - ), - ); - } - if (result.flagged.unknown.length > 0) { +const licenseAuditTool = defineTool({ + name: "license_audit", + title: "Audit dependency licenses", + description: + "Classify every Dependency node by license risk: copyleft (GPL/AGPL/SSPL/EUPL/CPAL/OSL/RPL), proprietary, unknown. Returns tier=BLOCK if any copyleft or proprietary dep, WARN if only unknowns, OK otherwise. Note: until per-ecosystem license detection lands, most Dependency nodes carry license='UNKNOWN', so most audits will return tier=WARN until that follow-up ships.", + inputSchema: LicenseAuditInputSchema, + annotations: { + readOnlyHint: true, + destructiveHint: false, + openWorldHint: false, + idempotentHint: true, + }, + capability: licenseAuditCapability, + toInput: () => ({}), + present: (out) => { + const { result } = out; + const header = `License audit for ${out.repoName}: tier=${result.tier} (${result.summary.okCount}/${result.summary.total} ok, ${result.summary.flaggedCount} flagged)`; + const bodyLines: string[] = []; + if (result.flagged.copyleft.length > 0) { + bodyLines.push( + `Copyleft (${result.flagged.copyleft.length}):`, + ...result.flagged.copyleft.map( + (d) => ` - [${d.ecosystem}] ${d.name}@${d.version} — ${d.license}`, + ), + ); + } + if (result.flagged.proprietary.length > 0) { + bodyLines.push( + `Proprietary (${result.flagged.proprietary.length}):`, + ...result.flagged.proprietary.map( + (d) => ` - [${d.ecosystem}] ${d.name}@${d.version} — ${d.license}`, + ), + ); + } + if (result.flagged.unknown.length > 0) { + bodyLines.push( + `Unknown/missing (${result.flagged.unknown.length}):`, + ...result.flagged.unknown + .slice(0, 25) + .map((d) => ` - [${d.ecosystem}] ${d.name}@${d.version}`), + ); + if (result.flagged.unknown.length > 25) { 
bodyLines.push( - `Unknown/missing (${result.flagged.unknown.length}):`, - ...result.flagged.unknown - .slice(0, 25) - .map((d) => ` - [${d.ecosystem}] ${d.name}@${d.version}`), - ); - if (result.flagged.unknown.length > 25) { - bodyLines.push( - ` ... ${result.flagged.unknown.length - 25} more (see structuredContent.flagged.unknown)`, - ); - } - } - if (bodyLines.length === 0) { - bodyLines.push("All licenses cleared."); - } - - const nextSteps: string[] = []; - if (result.tier === "BLOCK") { - nextSteps.push( - "review the copyleft/proprietary deps above — each must be replaced or explicitly approved by legal", - "call `dependencies` with the offending ecosystem filter to see the full record", - ); - } else if (result.tier === "WARN") { - nextSteps.push( - "populate missing licenses: re-index with `codehub analyze --force` once the license-detection follow-up lands", - "call `dependencies` to inspect the raw Dependency rows", - ); - } else { - nextSteps.push( - "no action required — re-run after bumping any dependency", - "call `dependencies` to inspect the full list", + ` ... 
${result.flagged.unknown.length - 25} more (see structuredContent.flagged.unknown)`, ); } + } + if (bodyLines.length === 0) { + bodyLines.push("All licenses cleared."); + } - return withNextSteps( - [header, ...bodyLines].join("\n"), - { - tier: result.tier, - flagged: result.flagged, - summary: result.summary, - }, - nextSteps, - stalenessFromMeta(resolved.meta), + const nextSteps: string[] = []; + if (result.tier === "BLOCK") { + nextSteps.push( + "review the copyleft/proprietary deps above — each must be replaced or explicitly approved by legal", + "call `dependencies` with the offending ecosystem filter to see the full record", + ); + } else if (result.tier === "WARN") { + nextSteps.push( + "populate missing licenses: re-index with `codehub analyze --force` once the license-detection follow-up lands", + "call `dependencies` to inspect the raw Dependency rows", + ); + } else { + nextSteps.push( + "no action required — re-run after bumping any dependency", + "call `dependencies` to inspect the full list", ); - } catch (err) { - return toolErrorFromUnknown(err); } - }); - return toToolResult(call); -} -export function registerLicenseAuditTool(server: McpServer, ctx: ToolContext): void { - server.registerTool( - "license_audit", - { - title: "Audit dependency licenses", - description: - "Classify every Dependency node by license risk: copyleft (GPL/AGPL/SSPL/EUPL/CPAL/OSL/RPL), proprietary, unknown. Returns tier=BLOCK if any copyleft or proprietary dep, WARN if only unknowns, OK otherwise. 
Note: until per-ecosystem license detection lands, most Dependency nodes carry license='UNKNOWN', so most audits will return tier=WARN until that follow-up ships.", - inputSchema: LicenseAuditInput, - annotations: { - readOnlyHint: true, - destructiveHint: false, - openWorldHint: false, - idempotentHint: true, + return { + text: [header, ...bodyLines].join("\n"), + structured: { + tier: result.tier, + flagged: result.flagged, + summary: result.summary, }, - }, - async (args) => fromToolResult(await runLicenseAudit(ctx, args)), - ); + nextSteps, + }; + }, +}); + +export async function runLicenseAudit( + ctx: ToolContext, + args: LicenseAuditArgs, +): Promise { + return licenseAuditTool.run(ctx, args); } -function stringOr(v: unknown, fallback: string): string { - if (typeof v === "string") return v; - if (typeof v === "number" || typeof v === "boolean") return String(v); - return fallback; +export function registerLicenseAuditTool(server: McpServer, ctx: ToolContext): void { + licenseAuditTool.register(server, ctx); } diff --git a/packages/mcp/src/tools/project-profile.ts b/packages/mcp/src/tools/project-profile.ts index 3c5277ef..2a13ca0b 100644 --- a/packages/mcp/src/tools/project-profile.ts +++ b/packages/mcp/src/tools/project-profile.ts @@ -3,10 +3,10 @@ * * Profile is a singleton per repo, emitted by the ingestion `profile` phase. * Each array field is stored in SQLite as a JSON-encoded TEXT column - * (`languages_json`, `frameworks_json`, etc.) so we decode every column - * back into a `string[]` before returning. If the repo was indexed before - * the profile phase shipped (or the phase failed to write the node), we - * return empty arrays and a hint nudging the caller toward `codehub + * (`languages_json`, `frameworks_json`, etc.) so the capability decodes every + * column back into a `string[]`. 
If the repo was indexed before the profile + * phase shipped (or the phase failed to write the node), the capability reports + * `profileExists: false` and this presenter nudges the caller toward `codehub * analyze --force`. * * `frameworks_json` is polymorphic across two generations: @@ -15,148 +15,118 @@ * so variant / version / confidence / parent metadata survives the * round-trip. Both are read transparently; callers receive both a * flat form (backward-compat) and the structured form in the payload. + * + * The shared reader/decoder lives in `@opencodehub/core-ops` + * `projectProfileCapability`; this file is the thin MCP adapter built with + * `defineTool` — the presenter builds the line list + conditional next-steps. */ -// biome-ignore-all lint/complexity/useLiteralKeys: dot-access disallowed on Record index signatures import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import type { FrameworkDetection } from "@opencodehub/core-types"; -import { toolErrorFromUnknown } from "../error-envelope.js"; -import { withNextSteps } from "../next-step-hints.js"; -import { stalenessFromMeta } from "../staleness.js"; import { - fromToolResult, - repoArgShape, - type ToolContext, - type ToolResult, - toToolResult, - withStore, -} from "./shared.js"; + type ProjectProfileInput, + type ProjectProfileOutput, + projectProfileCapability, +} from "@opencodehub/core-ops"; +import { defineTool } from "./define-tool.js"; +import { repoArgShape, type ToolContext, type ToolResult } from "./shared.js"; -const ProjectProfileInput = { +const ProjectProfileInputSchema = { ...repoArgShape, }; -interface ProjectProfilePayload { - readonly languages: readonly string[]; - /** Flat-string framework view (backward-compat). */ - readonly frameworks: readonly string[]; - /** Structured framework detections with variant / version / confidence / parent. 
*/ - readonly frameworksDetected: readonly FrameworkDetection[]; - readonly iacTypes: readonly string[]; - readonly apiContracts: readonly string[]; - readonly manifests: readonly string[]; - readonly srcDirs: readonly string[]; -} - interface ProjectProfileArgs { readonly repo?: string | undefined; readonly repo_uri?: string | undefined; } -export async function runProjectProfile( - ctx: ToolContext, - args: ProjectProfileArgs, -): Promise { - const call = await withStore(ctx, args, async (store, resolved) => { - try { - const nodes = await store.graph.listNodesByKind("ProjectProfile", { limit: 1 }); - const profile = nodes[0]; - const payload: ProjectProfilePayload = { - languages: profile?.languages ? [...profile.languages] : [], - frameworks: profile?.frameworks ? [...profile.frameworks] : [], - frameworksDetected: profile?.frameworksDetected ? [...profile.frameworksDetected] : [], - iacTypes: profile?.iacTypes ? [...profile.iacTypes] : [], - apiContracts: profile?.apiContracts ? [...profile.apiContracts] : [], - manifests: profile?.manifests ? [...profile.manifests] : [], - srcDirs: profile?.srcDirs ? [...profile.srcDirs] : [], - }; +const projectProfileTool = defineTool< + ProjectProfileArgs, + ProjectProfileInput, + ProjectProfileOutput +>({ + name: "project_profile", + title: "Project Profile", + description: + "Returns the detected project profile: languages, frameworks (flat + structured), IaC types, API contracts, manifests, source directories.", + inputSchema: ProjectProfileInputSchema, + annotations: { + readOnlyHint: true, + destructiveHint: false, + openWorldHint: false, + idempotentHint: true, + }, + capability: projectProfileCapability, + toInput: () => ({}), + present: (out) => { + const { profile: payload, profileExists } = out; + const header = profileExists + ? `Project profile for ${out.repoName}:` + : `No ProjectProfile node in ${out.repoName}. 
Re-index with \`codehub analyze --force\` to populate.`; - const profileExists = profile !== undefined; - const header = profileExists - ? `Project profile for ${resolved.name}:` - : `No ProjectProfile node in ${resolved.name}. Re-index with \`codehub analyze --force\` to populate.`; + const lines: string[] = [header]; + if (payload.languages.length > 0) { + lines.push(` languages (${payload.languages.length}): ${payload.languages.join(", ")}`); + } + if (payload.frameworks.length > 0) { + // Prefer the structured form for display when available — render + // each framework with its variant so operators see "nextjs:app-router" + // rather than a bare "nextjs". Fall back to flat names. + const display = + payload.frameworksDetected.length > 0 + ? payload.frameworksDetected.map((d) => (d.variant ? `${d.name}:${d.variant}` : d.name)) + : payload.frameworks; + lines.push(` frameworks (${display.length}): ${display.join(", ")}`); + } + if (payload.iacTypes.length > 0) { + lines.push(` iacTypes (${payload.iacTypes.length}): ${payload.iacTypes.join(", ")}`); + } + if (payload.apiContracts.length > 0) { + lines.push( + ` apiContracts (${payload.apiContracts.length}): ${payload.apiContracts.join(", ")}`, + ); + } + if (payload.manifests.length > 0) { + lines.push(` manifests (${payload.manifests.length}): ${payload.manifests.join(", ")}`); + } + if (payload.srcDirs.length > 0) { + lines.push(` srcDirs (${payload.srcDirs.length}): ${payload.srcDirs.join(", ")}`); + } - const lines: string[] = [header]; - if (payload.languages.length > 0) { - lines.push( - ` languages (${payload.languages.length}): ${payload.languages.join(", ")}`, - ); - } + const nextSteps: string[] = []; + if (!profileExists) { + nextSteps.push("run `codehub analyze --force` to emit the ProjectProfile node"); + } else { if (payload.frameworks.length > 0) { - // Prefer the structured form for display when available — render - // each framework with its variant so operators see "nextjs:app-router" - // 
rather than a bare "nextjs". Fall back to flat names. - const display = - payload.frameworksDetected.length > 0 - ? payload.frameworksDetected.map((d) => (d.variant ? `${d.name}:${d.variant}` : d.name)) - : payload.frameworks; - lines.push(` frameworks (${display.length}): ${display.join(", ")}`); - } - if (payload.iacTypes.length > 0) { - lines.push(` iacTypes (${payload.iacTypes.length}): ${payload.iacTypes.join(", ")}`); - } - if (payload.apiContracts.length > 0) { - lines.push( - ` apiContracts (${payload.apiContracts.length}): ${payload.apiContracts.join(", ")}`, + nextSteps.push( + `call \`query\` with the framework name (e.g. "${payload.frameworks[0] ?? ""}") to find entry points`, ); } - if (payload.manifests.length > 0) { - lines.push( - ` manifests (${payload.manifests.length}): ${payload.manifests.join(", ")}`, - ); + if (payload.apiContracts.includes("openapi")) { + nextSteps.push("call `query` with kinds=['Operation'] to list OpenAPI operations"); } - if (payload.srcDirs.length > 0) { - lines.push(` srcDirs (${payload.srcDirs.length}): ${payload.srcDirs.join(", ")}`); + if (payload.iacTypes.includes("terraform")) { + nextSteps.push("call `list_findings` (once scanners are wired) for tfsec/checkov results"); } - - const next: string[] = []; - if (!profileExists) { - next.push("run `codehub analyze --force` to emit the ProjectProfile node"); - } else { - if (payload.frameworks.length > 0) { - next.push( - `call \`query\` with the framework name (e.g. "${payload.frameworks[0] ?? 
""}") to find entry points`, - ); - } - if (payload.apiContracts.includes("openapi")) { - next.push("call `query` with kinds=['Operation'] to list OpenAPI operations"); - } - if (payload.iacTypes.includes("terraform")) { - next.push("call `list_findings` (once scanners are wired) for tfsec/checkov results"); - } - if (next.length === 0) { - next.push("call `list_repos` to pick a different repo"); - } + if (nextSteps.length === 0) { + nextSteps.push("call `list_repos` to pick a different repo"); } - - return withNextSteps( - lines.join("\n"), - { profile: payload }, - next, - stalenessFromMeta(resolved.meta), - ); - } catch (err) { - return toolErrorFromUnknown(err); } - }); - return toToolResult(call); + + return { + text: lines.join("\n"), + structured: { profile: payload }, + nextSteps, + }; + }, +}); + +export async function runProjectProfile( + ctx: ToolContext, + args: ProjectProfileArgs, +): Promise { + return projectProfileTool.run(ctx, args); } export function registerProjectProfileTool(server: McpServer, ctx: ToolContext): void { - server.registerTool( - "project_profile", - { - title: "Project Profile", - description: - "Returns the detected project profile: languages, frameworks (flat + structured), IaC types, API contracts, manifests, source directories.", - inputSchema: ProjectProfileInput, - annotations: { - readOnlyHint: true, - destructiveHint: false, - openWorldHint: false, - idempotentHint: true, - }, - }, - async (args) => fromToolResult(await runProjectProfile(ctx, args)), - ); + projectProfileTool.register(server, ctx); } From dfb81344d7b20f4500e780ef7b422a965d0e50cf Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Sat, 4 Jul 2026 01:41:57 +0000 Subject: [PATCH 07/11] refactor(ingestion): collapse provider extractors into shared generics Extract the near-identical extractCalls/extractDefinitions/extractHeritage loop shells into extractCallsGeneric + extractDefinitionsGeneric + extractHeritageRefBased in extract-helpers.ts, each 
config-driven per provider (receiver strategies, kind resolvers, promote/exported rules, heritage rules). Hoist findNameInside + qualifiedForCapture (13 copies). python defs + bespoke heritage (csharp/go/ts/java) kept custom; extractImports left per-language. Adds a characterization harness (16 providers x 4 extractors, value-locking golden) so graphHash byte-identity is enforced on every conversion. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/ingestion/src/providers/c.ts | 100 +-- .../src/providers/characterization-golden.ts | 130 +++ .../src/providers/characterization.test.ts | 780 ++++++++++++++++++ packages/ingestion/src/providers/cpp.ts | 120 +-- packages/ingestion/src/providers/csharp.ts | 111 +-- packages/ingestion/src/providers/dart.ts | 204 +---- .../src/providers/extract-helpers.ts | 416 ++++++++++ packages/ingestion/src/providers/go.ts | 137 +-- packages/ingestion/src/providers/java.ts | 120 +-- .../ingestion/src/providers/javascript.ts | 57 +- packages/ingestion/src/providers/kotlin.ts | 123 +-- packages/ingestion/src/providers/php.ts | 198 +---- packages/ingestion/src/providers/python.ts | 78 +- packages/ingestion/src/providers/ruby.ts | 207 ++--- packages/ingestion/src/providers/rust.ts | 76 +- packages/ingestion/src/providers/swift.ts | 179 +--- packages/ingestion/src/providers/ts-shared.ts | 141 +--- 17 files changed, 1696 insertions(+), 1481 deletions(-) create mode 100644 packages/ingestion/src/providers/characterization-golden.ts create mode 100644 packages/ingestion/src/providers/characterization.test.ts diff --git a/packages/ingestion/src/providers/c.ts b/packages/ingestion/src/providers/c.ts index 3bc9a31a..ed73625b 100644 --- a/packages/ingestion/src/providers/c.ts +++ b/packages/ingestion/src/providers/c.ts @@ -1,10 +1,10 @@ import type { NodeKind } from "@opencodehub/core-types"; -import type { ParseCapture } from "../parse/types.js"; import { + type DefinitionsConfig, + extractCallsGeneric, + extractDefinitionsGeneric, 
getLine, - innermostEnclosingDef, - isInside, - pairDefinitionsWithNames, + kindFromMap, } from "./extract-helpers.js"; import type { ExtractedCall, @@ -51,90 +51,22 @@ const C_DEF_KIND_MAP: Readonly> = { "definition.macro": "Macro", }; -function extractCDefinitions(input: ExtractDefinitionsInput): readonly ExtractedDefinition[] { - const { filePath, captures, sourceText } = input; - const paired = pairDefinitionsWithNames(captures); - const defCaptures = captures.filter((c) => c.tag.startsWith("definition.")); - const out: ExtractedDefinition[] = []; - - for (const { def, name } of paired) { - const kind = C_DEF_KIND_MAP[def.tag]; - if (kind === undefined) continue; - - let owner: string | undefined; - const ownerDef = innermostEnclosingDef(def, defCaptures); - if (ownerDef !== undefined) { - const ownerPaired = paired.find((p) => p.def === ownerDef); - if (ownerPaired !== undefined) owner = ownerPaired.name.text; - } - - const qualifiedName = owner !== undefined ? `${owner}.${name.text}` : name.text; - const headerLine = getLine(sourceText, def.startLine); - const isStatic = /\bstatic\b/.test(headerLine); - // C convention: identifiers prefixed with `_` tend to be internal; we - // respect both `static` and leading-underscore as non-exported. - const isExported = !isStatic && !name.text.startsWith("_"); +const C_DEFS_CONFIG: DefinitionsConfig = { + kindFor: kindFromMap(C_DEF_KIND_MAP), + // C convention: identifiers prefixed with `_` tend to be internal; we + // respect both `static` (file-scoped) and leading-underscore as non-exported. + isExported: ({ name, def, sourceText }) => + !/\bstatic\b/.test(getLine(sourceText, def.startLine)) && !name.startsWith("_"), +}; - out.push({ - kind, - name: name.text, - qualifiedName, - filePath, - startLine: def.startLine, - endLine: def.endLine, - isExported, - ...(owner !== undefined ? 
{ owner } : {}), - }); - } - return out; +function extractCDefinitions(input: ExtractDefinitionsInput): readonly ExtractedDefinition[] { + return extractDefinitionsGeneric(input, C_DEFS_CONFIG); } function extractCCalls(input: ExtractCallsInput): readonly ExtractedCall[] { - const { filePath, captures, definitions } = input; - const defCaptures = captures.filter((c) => c.tag.startsWith("definition.")); - const callRefs = captures.filter((c) => c.tag === "reference.call"); - const out: ExtractedCall[] = []; - - for (const ref of callRefs) { - const innerName = findNameInside(captures, ref); - const calleeName = innerName?.text ?? ref.text; - - const enclosingDef = innermostEnclosingDef(ref, defCaptures); - const callerQualifiedName = enclosingDef - ? qualifiedForCapture(enclosingDef, definitions) - : ""; - - out.push({ - callerQualifiedName, - calleeName, - filePath, - startLine: ref.startLine, - }); - } - return out; -} - -function findNameInside( - captures: readonly ParseCapture[], - outer: ParseCapture, -): ParseCapture | undefined { - let best: ParseCapture | undefined; - for (const c of captures) { - if (c.tag !== "name") continue; - if (!isInside(c, outer)) continue; - if (best === undefined || c.startLine < best.startLine) best = c; - } - return best; -} - -function qualifiedForCapture( - def: ParseCapture, - definitions: readonly ExtractedDefinition[], -): string { - for (const d of definitions) { - if (d.startLine === def.startLine) return d.qualifiedName; - } - return ""; + // C emits NO receiver: qualified calls (`ns::foo()`) are a C++ concept, not + // applicable here. Omitting `inferReceiver` means `calleeOwner` is never set. 
+ return extractCallsGeneric(input); } /** diff --git a/packages/ingestion/src/providers/characterization-golden.ts b/packages/ingestion/src/providers/characterization-golden.ts new file mode 100644 index 00000000..c58358fe --- /dev/null +++ b/packages/ingestion/src/providers/characterization-golden.ts @@ -0,0 +1,130 @@ +/** + * GENERATED characterization golden — DO NOT hand-edit. + * + * This file is the committed, byte-stable snapshot consumed by + * `characterization.test.ts`. It is a compiled-in `const` (not a JSON asset + * read at runtime) so the test resolves it from `dist` with a plain import, + * dodging `import.meta.url` path-offset fragility on bundle collapse. + * + * Each entry maps a `LanguageId` to the `canonicalJson(...)` string of that + * language's SORTED extractor output (see the test for the sort key), one + * string per core extractor. Full-value equality against these strings is the + * safety net for the extractor-generic refactor: any drift in a hash-relevant + * field (calleeOwner / qualifiedName / startLine / owner / …) changes the + * canonical string and fails the test with a per-language, per-extractor diff. + * + * To regenerate (ONLY for a deliberate, reviewed behavior change): + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion build + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion test + * The test rewrites THIS file's `GOLDEN` literal, then re-asserts against it. + */ + +import type { LanguageId } from "@opencodehub/core-types"; + +/** Per-extractor canonical-JSON snapshots for one language. */ +export interface ExtractorSnapshot { + readonly definitions: string; + readonly calls: string; + readonly heritage: string; + readonly imports: string; +} + +// biome-ignore format: generated literal — leave the regenerator's formatting intact. 
+export const GOLDEN: Record = { + "typescript": { + "definitions": "[{\"endLine\":15,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.greet\",\"startLine\":12},{\"endLine\":18,\"filePath\":\"greeter.ts\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.log\",\"startLine\":16},{\"endLine\":19,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Welcomer\",\"qualifiedName\":\"Welcomer\",\"startLine\":10},{\"endLine\":21,\"filePath\":\"greeter.ts\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"MESSAGE\",\"qualifiedName\":\"MESSAGE\",\"startLine\":21},{\"endLine\":24,\"filePath\":\"greeter.ts\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"w\",\"owner\":\"run\",\"qualifiedName\":\"run.w\",\"startLine\":24},{\"endLine\":26,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":23},{\"endLine\":8,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":6}]", + "calls": "[{\"calleeName\":\"debug\",\"calleeOwner\":\"Logger\",\"callerQualifiedName\":\"Welcomer.log\",\"filePath\":\"greeter.ts\",\"startLine\":17},{\"calleeName\":\"greet\",\"calleeOwner\":\"w\",\"callerQualifiedName\":\"run\",\"filePath\":\"greeter.ts\",\"startLine\":25},{\"calleeName\":\"log\",\"calleeOwner\":\"this\",\"callerQualifiedName\":\"Welcomer.greet\",\"filePath\":\"greeter.ts\",\"startLine\":13}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"greeter.ts\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":6},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"greeter.ts\",\"parentName\":\"Greeter\",\"relation\":\"IMPLEMENTS\",\"startLine\":10}]", + "imports": 
"[{\"filePath\":\"greeter.ts\",\"importedNames\":[\"Logger\"],\"kind\":\"named\",\"source\":\"./logger.js\"},{\"filePath\":\"greeter.ts\",\"importedNames\":[\"other\"],\"kind\":\"named\",\"source\":\"./mixed\"},{\"filePath\":\"greeter.ts\",\"isWildcard\":true,\"kind\":\"namespace\",\"localAlias\":\"util\",\"source\":\"./util\"},{\"filePath\":\"greeter.ts\",\"kind\":\"default\",\"localAlias\":\"defaultExport\",\"source\":\"./mixed\"}]" + }, + "tsx": { + "definitions": "[{\"endLine\":10,\"filePath\":\"page.tsx\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"label\",\"owner\":\"Greeting\",\"qualifiedName\":\"Greeting.label\",\"startLine\":10},{\"endLine\":12,\"filePath\":\"page.tsx\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"Greeting\",\"qualifiedName\":\"Greeting\",\"startLine\":9},{\"endLine\":17,\"filePath\":\"page.tsx\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"render\",\"owner\":\"Page\",\"qualifiedName\":\"Page.render\",\"startLine\":15},{\"endLine\":18,\"filePath\":\"page.tsx\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Page\",\"qualifiedName\":\"Page\",\"startLine\":14},{\"endLine\":7,\"filePath\":\"page.tsx\",\"isExported\":false,\"kind\":\"Interface\",\"name\":\"Props\",\"qualifiedName\":\"Props\",\"startLine\":5}]", + "calls": "[{\"calleeName\":\"format\",\"calleeOwner\":\"svc\",\"callerQualifiedName\":\"Greeting\",\"filePath\":\"page.tsx\",\"startLine\":10}]", + "heritage": "[{\"childQualifiedName\":\"Page\",\"filePath\":\"page.tsx\",\"parentName\":\"React.Component\",\"relation\":\"EXTENDS\",\"startLine\":14}]", + "imports": "[{\"filePath\":\"page.tsx\",\"importedNames\":[\"Button\"],\"kind\":\"named\",\"source\":\"./button.js\"},{\"filePath\":\"page.tsx\",\"kind\":\"default\",\"localAlias\":\"React\",\"source\":\"react\"}]" + }, + "javascript": { + "definitions": 
"[{\"endLine\":16,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":12},{\"endLine\":17,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":11},{\"endLine\":20,\"filePath\":\"esm.js\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"g\",\"owner\":\"run\",\"qualifiedName\":\"run.g\",\"startLine\":20},{\"endLine\":22,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":19},{\"endLine\":26,\"filePath\":\"esm.js\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_internalHelper\",\"qualifiedName\":\"_internalHelper\",\"startLine\":24},{\"endLine\":8,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":6},{\"endLine\":9,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":5}]", + "calls": "[{\"calleeName\":\"debug\",\"calleeOwner\":\"Logger\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"esm.js\",\"startLine\":14},{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"esm.js\",\"startLine\":21},{\"calleeName\":\"hello\",\"calleeOwner\":\"this\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"esm.js\",\"startLine\":13}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"esm.js\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":11}]", + "imports": "[{\"filePath\":\"esm.js\",\"importedNames\":[\"Logger\"],\"kind\":\"named\",\"source\":\"./logger.js\"},{\"filePath\":\"esm.js\",\"kind\":\"default\",\"localAlias\":\"defaultExport\",\"source\":\"./default.js\"}]" + }, + "python": { + "definitions": 
"[{\"endLine\":12,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":10},{\"endLine\":12,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Base\",\"qualifiedName\":\"Base.greet\",\"startLine\":11},{\"endLine\":17,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":15},{\"endLine\":20,\"filePath\":\"mod.py\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"_private\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter._private\",\"startLine\":19},{\"endLine\":20,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":14},{\"endLine\":23,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Variable\",\"name\":\"g\",\"owner\":\"run\",\"qualifiedName\":\"run.g\",\"startLine\":23},{\"endLine\":24,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":22},{\"endLine\":7,\"filePath\":\"mod.py\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"MAX_RETRY\",\"qualifiedName\":\"MAX_RETRY\",\"startLine\":7},{\"endLine\":8,\"filePath\":\"mod.py\",\"isConst\":false,\"isExported\":false,\"kind\":\"Const\",\"name\":\"_internal_version\",\"qualifiedName\":\"_internal_version\",\"startLine\":8}]", + "calls": 
"[{\"calleeName\":\"Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"mod.py\",\"startLine\":23},{\"calleeName\":\"getenv\",\"calleeOwner\":\"os\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"mod.py\",\"startLine\":16},{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"mod.py\",\"startLine\":24},{\"calleeName\":\"greet\",\"calleeOwner\":\"self\",\"callerQualifiedName\":\"Greeter._private\",\"filePath\":\"mod.py\",\"startLine\":20},{\"calleeName\":\"super\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"mod.py\",\"startLine\":17},{\"calleeName\":\"super\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"mod.py\",\"startLine\":17}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"mod.py\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":14},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"mod.py\",\"parentName\":\"Mixin\",\"relation\":\"EXTENDS\",\"startLine\":14}]", + "imports": "[{\"filePath\":\"mod.py\",\"importedNames\":[\"List\",\"Opt\"],\"kind\":\"named\",\"source\":\"typing\"},{\"filePath\":\"mod.py\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"utils\"},{\"filePath\":\"mod.py\",\"kind\":\"namespace\",\"localAlias\":\"np\",\"source\":\"numpy\"},{\"filePath\":\"mod.py\",\"kind\":\"namespace\",\"source\":\"os\"}]" + }, + "go": { + "definitions": 
"[{\"endLine\":1,\"filePath\":\"greet.go\",\"isExported\":false,\"kind\":\"Module\",\"name\":\"greet\",\"qualifiedName\":\"greet\",\"startLine\":1},{\"endLine\":11,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":9},{\"endLine\":15,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Speaker\",\"qualifiedName\":\"Speaker\",\"startLine\":13},{\"endLine\":19,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Const\",\"name\":\"MaxGreet\",\"qualifiedName\":\"MaxGreet\",\"startLine\":17},{\"endLine\":23,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"Greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.Greet\",\"startLine\":21},{\"endLine\":28,\"filePath\":\"greet.go\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":25},{\"endLine\":30,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"Exported\",\"qualifiedName\":\"Exported\",\"startLine\":30},{\"endLine\":31,\"filePath\":\"greet.go\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"internal\",\"qualifiedName\":\"internal\",\"startLine\":31}]", + "calls": "[{\"calleeName\":\"Greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.go\",\"startLine\":27},{\"calleeName\":\"Sprintf\",\"calleeOwner\":\"fmt\",\"callerQualifiedName\":\"Greeter.Greet\",\"filePath\":\"greet.go\",\"startLine\":22},{\"calleeName\":\"ToLower\",\"calleeOwner\":\"str\",\"callerQualifiedName\":\"Greeter.Greet\",\"filePath\":\"greet.go\",\"startLine\":22}]", + "heritage": "[]", + "imports": "[{\"filePath\":\"greet.go\",\"kind\":\"package-wildcard\",\"localAlias\":\".\",\"source\":\"errors\"},{\"filePath\":\"greet.go\",\"kind\":\"package-wildcard\",\"localAlias\":\"str\",\"source\":\"strings\"},{\"filePath\":\"greet.go\",\"kind\":\"package-wildcard\",\"source\":\"fmt\"}]" + }, + "rust": { + 
"definitions": "[{\"endLine\":13,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":11},{\"endLine\":19,\"filePath\":\"lib.rs\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":16},{\"endLine\":25,\"filePath\":\"lib.rs\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":23},{\"endLine\":28,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Const\",\"name\":\"DEFAULT\",\"qualifiedName\":\"DEFAULT\",\"startLine\":28},{\"endLine\":30,\"filePath\":\"lib.rs\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"internal\",\"qualifiedName\":\"internal\",\"startLine\":30},{\"endLine\":34,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":31},{\"endLine\":9,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Trait\",\"name\":\"Greet\",\"qualifiedName\":\"Greet\",\"startLine\":7}]", + "calls": "[{\"calleeName\":\"debug\",\"calleeOwner\":\"Logger\",\"callerQualifiedName\":\"Greeter.log\",\"filePath\":\"lib.rs\",\"startLine\":24},{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"lib.rs\",\"startLine\":33},{\"calleeName\":\"log\",\"calleeOwner\":\"self\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"lib.rs\",\"startLine\":17},{\"calleeName\":\"to_string\",\"calleeOwner\":\"\\\"world\\\"\",\"callerQualifiedName\":\"run\",\"filePath\":\"lib.rs\",\"startLine\":32}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"lib.rs\",\"parentName\":\"Greet\",\"relation\":\"IMPLEMENTS\",\"startLine\":15}]", + "imports": 
"[{\"filePath\":\"lib.rs\",\"importedNames\":[\"HashMap\",\"Sorted\"],\"kind\":\"named\",\"source\":\"std::collections\"},{\"filePath\":\"lib.rs\",\"importedNames\":[\"Logger\"],\"kind\":\"named\",\"source\":\"crate::logger\"},{\"filePath\":\"lib.rs\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"crate::util\"},{\"filePath\":\"lib.rs\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"self::public_api\"}]" + }, + "java": { + "definitions": "[{\"endLine\":10,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":8},{\"endLine\":14,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":12},{\"endLine\":21,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Constructor\",\"name\":\"Welcomer\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Welcomer\",\"startLine\":19},{\"endLine\":25,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.greet\",\"startLine\":23},{\"endLine\":30,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"run\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.run\",\"startLine\":27},{\"endLine\":31,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Welcomer\",\"qualifiedName\":\"Welcomer\",\"startLine\":16},{\"endLine\":33,\"filePath\":\"Welcomer.java\",\"isExported\":false,\"kind\":\"Class\",\"name\":\"Internal\",\"qualifiedName\":\"Internal\",\"startLine\":33},{\"endLine\":9,\"filePath\":\"Welcomer.java\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":9}]", + "calls": 
"[{\"calleeName\":\"greet\",\"callerQualifiedName\":\"Welcomer.run\",\"filePath\":\"Welcomer.java\",\"startLine\":28},{\"calleeName\":\"greet\",\"callerQualifiedName\":\"Welcomer.run\",\"filePath\":\"Welcomer.java\",\"startLine\":29},{\"calleeName\":\"println\",\"calleeOwner\":\"System.out\",\"callerQualifiedName\":\"Welcomer.run\",\"filePath\":\"Welcomer.java\",\"startLine\":29}]", + "heritage": "[{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.java\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":16},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.java\",\"parentName\":\"Greeter\",\"relation\":\"IMPLEMENTS\",\"startLine\":16},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.java\",\"parentName\":\"Runnable\",\"relation\":\"IMPLEMENTS\",\"startLine\":16}]", + "imports": "[{\"filePath\":\"Welcomer.java\",\"importedNames\":[\"List\"],\"kind\":\"named\",\"source\":\"java.util\"},{\"filePath\":\"Welcomer.java\",\"importedNames\":[\"PI\"],\"kind\":\"named\",\"source\":\"java.lang.Math\"},{\"filePath\":\"Welcomer.java\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"java.util.concurrent\"}]" + }, + "csharp": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"Welcomer.cs\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"Greet\",\"owner\":\"IGreeter\",\"qualifiedName\":\"IGreeter.Greet\",\"startLine\":10},{\"endLine\":11,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"IGreeter\",\"qualifiedName\":\"IGreeter\",\"startLine\":8},{\"endLine\":16,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":13},{\"endLine\":25,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Constructor\",\"name\":\"Welcomer\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Welcomer\",\"startLine\":22},{\"endLine\":30,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"Greet\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Greet\",\"startLine\":27},{\"endLine\":35,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"Dispose\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Dispose\",\"startLine\":32},{\"endLine\":36,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Welcomer\",\"qualifiedName\":\"Welcomer\",\"startLine\":18},{\"endLine\":38,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Record\",\"name\":\"Pair\",\"qualifiedName\":\"Pair\",\"startLine\":38},{\"endLine\":40,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"Point\",\"qualifiedName\":\"Point\",\"startLine\":40},{\"endLine\":42,\"filePath\":\"Welcomer.cs\",\"isExported\":false,\"kind\":\"Class\",\"name\":\"Hidden\",\"qualifiedName\":\"Hidden\",\"startLine\":42},{\"endLine\":43,\"filePath\":\"Welcomer.cs\",\"isExported\":false,\"kind\":\"Namespace\",\"name\":\"App.Greet\",\"qualifiedName\":\"App.Greet\",\"startLine\":6}]", + "calls": "[{\"calleeName\":\"WriteLine\",\"calleeOwner\":\"Console\",\"callerQualifiedName\":\"Welcomer.Dispose\",\"filePath\":\"Welcomer.cs\",\"startLine\":34}]", + "heritage": 
"[{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.cs\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":18},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.cs\",\"parentName\":\"IDisposable\",\"relation\":\"IMPLEMENTS\",\"startLine\":18},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.cs\",\"parentName\":\"IGreeter\",\"relation\":\"IMPLEMENTS\",\"startLine\":18}]", + "imports": "[{\"filePath\":\"Welcomer.cs\",\"kind\":\"namespace\",\"localAlias\":\"Json\",\"source\":\"Newtonsoft.Json\"},{\"filePath\":\"Welcomer.cs\",\"kind\":\"namespace\",\"source\":\"System\"},{\"filePath\":\"Welcomer.cs\",\"kind\":\"namespace\",\"source\":\"System.Collections.Generic\"}]" + }, + "c": { + "definitions": "[{\"endLine\":13,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Enum\",\"name\":\"Status\",\"owner\":\"Status\",\"qualifiedName\":\"Status.Status\",\"startLine\":10},{\"endLine\":13,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Typedef\",\"name\":\"Status\",\"qualifiedName\":\"Status\",\"startLine\":10},{\"endLine\":19,\"filePath\":\"user.c\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"reset_counter\",\"qualifiedName\":\"reset_counter\",\"startLine\":17},{\"endLine\":26,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"register_user\",\"qualifiedName\":\"register_user\",\"startLine\":21},{\"endLine\":32,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"main\",\"qualifiedName\":\"main\",\"startLine\":28},{\"endLine\":8,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"User\",\"owner\":\"User\",\"qualifiedName\":\"User.User\",\"startLine\":5},{\"endLine\":8,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Typedef\",\"name\":\"User\",\"qualifiedName\":\"User\",\"startLine\":5}]", + "calls": 
"[{\"calleeName\":\"printf\",\"callerQualifiedName\":\"register_user\",\"filePath\":\"user.c\",\"startLine\":24},{\"calleeName\":\"register_user\",\"callerQualifiedName\":\"main\",\"filePath\":\"user.c\",\"startLine\":29},{\"calleeName\":\"reset_counter\",\"callerQualifiedName\":\"main\",\"filePath\":\"user.c\",\"startLine\":30}]", + "heritage": "[]", + "imports": "[{\"filePath\":\"user.c\",\"kind\":\"package-wildcard\",\"source\":\"stdio.h\"},{\"filePath\":\"user.c\",\"kind\":\"package-wildcard\",\"source\":\"user.h\"}]" + }, + "cpp": { + "definitions": "[{\"endLine\":10,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":7},{\"endLine\":14,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"mix\",\"owner\":\"Mixin\",\"qualifiedName\":\"Mixin.mix\",\"startLine\":14},{\"endLine\":15,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Mixin\",\"qualifiedName\":\"Mixin\",\"startLine\":12},{\"endLine\":19,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"Greeter\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.Greeter\",\"startLine\":19},{\"endLine\":20,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":20},{\"endLine\":23,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":17},{\"endLine\":31,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":25},{\"endLine\":33,\"filePath\":\"greet.cpp\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_internalHelper\",\"qualifiedName\":\"_internalHelper\",\"startLine\":33},{\"endLine\":35,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Namespace\",\"name\":\"auth\",\"qualifiedName\":\"auth\",\"startLine\":5
},{\"endLine\":9,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":9}]", + "calls": "[{\"calleeName\":\"hello\",\"calleeOwner\":\"Base\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.cpp\",\"startLine\":30},{\"calleeName\":\"hello\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.cpp\",\"startLine\":27},{\"calleeName\":\"hello\",\"calleeOwner\":\"ptr\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.cpp\",\"startLine\":29}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"greet.cpp\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":17},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"greet.cpp\",\"parentName\":\"Mixin\",\"relation\":\"EXTENDS\",\"startLine\":17}]", + "imports": "[{\"filePath\":\"greet.cpp\",\"kind\":\"package-wildcard\",\"source\":\"db.h\"},{\"filePath\":\"greet.cpp\",\"kind\":\"package-wildcard\",\"source\":\"string\"}]" + }, + "ruby": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":6},{\"endLine\":13,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"log\",\"qualifiedName\":\"log\",\"startLine\":13},{\"endLine\":14,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Module\",\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":12},{\"endLine\":22,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":19},{\"endLine\":26,\"filePath\":\"auth.rb\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"_private\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter._private\",\"startLine\":24},{\"endLine\":27,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":16},{\"endLine\":28,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Module\",\"name\":\"Auth\",\"qualifiedName\":\"Auth\",\"startLine\":5},{\"endLine\":33,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":30},{\"endLine\":9,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Base\",\"qualifiedName\":\"Base.greet\",\"startLine\":7}]", + "calls": "[{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"auth.rb\",\"startLine\":32},{\"calleeName\":\"greet\",\"callerQualifiedName\":\"Greeter._private\",\"filePath\":\"auth.rb\",\"startLine\":25},{\"calleeName\":\"log\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"auth.rb\",\"startLine\":20},{\"calleeName\":\"new\",\"calleeOwner\":\"Auth::Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"auth.rb\",\"startLine\":31}]", + "heritage": 
"[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.rb\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":16},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.rb\",\"parentName\":\"Logger\",\"relation\":\"IMPLEMENTS\",\"startLine\":17}]", + "imports": "[{\"filePath\":\"auth.rb\",\"kind\":\"named\",\"source\":\"./session\"},{\"filePath\":\"auth.rb\",\"kind\":\"package-wildcard\",\"source\":\"digest\"}]" + }, + "kotlin": { + "definitions": "[{\"endLine\":12,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":12},{\"endLine\":13,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":11},{\"endLine\":19,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":16},{\"endLine\":23,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":21},{\"endLine\":24,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":15},{\"endLine\":27,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Property\",\"name\":\"g\",\"owner\":\"run\",\"qualifiedName\":\"run.g\",\"startLine\":27},{\"endLine\":29,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":26},{\"endLine\":31,\"filePath\":\"Auth.kt\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_privateHelper\",\"qualifiedName\":\"_privateHelper\",\"startLine\":31},{\"endLine\":8,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Logger\",\"qualifiedName\":\"Logger.log\",\"startLine\":8},{\"endLine\":9,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Class\"
,\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":7}]", + "calls": "[{\"calleeName\":\"Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.kt\",\"startLine\":27},{\"calleeName\":\"hello\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.kt\",\"startLine\":28},{\"calleeName\":\"hello\",\"calleeOwner\":\"super\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.kt\",\"startLine\":18},{\"calleeName\":\"log\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.kt\",\"startLine\":17},{\"calleeName\":\"println\",\"callerQualifiedName\":\"Greeter.log\",\"filePath\":\"Auth.kt\",\"startLine\":22}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.kt\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":15},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.kt\",\"parentName\":\"Logger\",\"relation\":\"EXTENDS\",\"startLine\":15}]", + "imports": "[{\"filePath\":\"Auth.kt\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"kotlin.collections\"},{\"filePath\":\"Auth.kt\",\"kind\":\"named\",\"source\":\"java.util.UUID\"}]" + }, + "swift": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":8},{\"endLine\":22,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":19},{\"endLine\":26,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":24},{\"endLine\":27,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":12},{\"endLine\":32,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":29},{\"endLine\":34,\"filePath\":\"Auth.swift\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_privateHelper\",\"qualifiedName\":\"_privateHelper\",\"startLine\":34},{\"endLine\":6,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":4},{\"endLine\":9,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":9}]", + "calls": "[{\"calleeName\":\"Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.swift\",\"startLine\":30},{\"calleeName\":\"hello\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.swift\",\"startLine\":31},{\"calleeName\":\"hello\",\"calleeOwner\":\"super\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.swift\",\"startLine\":21},{\"calleeName\":\"log\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.swift\",\"startLine\":20},{\"calleeName\":\"print\",\"callerQualifiedName\":\"Greeter.log\",\"filePath\":\"Auth.swift\",\"startLine\":25}]", + "heritage": 
"[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.swift\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":12},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.swift\",\"parentName\":\"Logger\",\"relation\":\"EXTENDS\",\"startLine\":12}]", + "imports": "[{\"filePath\":\"Auth.swift\",\"kind\":\"named\",\"source\":\"Foundation\"}]" + }, + "php": { + "definitions": "[{\"endLine\":10,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Authenticatable\",\"qualifiedName\":\"Authenticatable\",\"startLine\":7},{\"endLine\":14,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"touch\",\"owner\":\"Timestamps\",\"qualifiedName\":\"Timestamps.touch\",\"startLine\":14},{\"endLine\":15,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Trait\",\"name\":\"Timestamps\",\"qualifiedName\":\"Timestamps\",\"startLine\":12},{\"endLine\":19,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":19},{\"endLine\":2,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Namespace\",\"name\":\"Auth\",\"qualifiedName\":\"Auth\",\"startLine\":2},{\"endLine\":20,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":17},{\"endLine\":31,\"filePath\":\"Auth.php\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"__construct\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.__construct\",\"startLine\":28},{\"endLine\":38,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"login\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.login\",\"startLine\":33},{\"endLine\":39,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":22},{\"endLine\":45,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\"
:\"run\",\"startLine\":41},{\"endLine\":9,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"login\",\"owner\":\"Authenticatable\",\"qualifiedName\":\"Authenticatable.login\",\"startLine\":9}]", + "calls": "[{\"calleeName\":\"hello\",\"calleeOwner\":\"$this\",\"callerQualifiedName\":\"Greeter.login\",\"filePath\":\"Auth.php\",\"startLine\":35},{\"calleeName\":\"hello\",\"calleeOwner\":\"Base\",\"callerQualifiedName\":\"Greeter.login\",\"filePath\":\"Auth.php\",\"startLine\":36},{\"calleeName\":\"login\",\"calleeOwner\":\"$g\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.php\",\"startLine\":44}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.php\",\"parentName\":\"Authenticatable\",\"relation\":\"IMPLEMENTS\",\"startLine\":22},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.php\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":22},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.php\",\"parentName\":\"Timestamps\",\"relation\":\"IMPLEMENTS\",\"startLine\":24}]", + "imports": "[{\"filePath\":\"Auth.php\",\"kind\":\"named\",\"source\":\"Psr/Log/LoggerInterface\"},{\"filePath\":\"Auth.php\",\"kind\":\"named\",\"source\":\"Timestamps\"},{\"filePath\":\"Auth.php\",\"kind\":\"package-wildcard\",\"source\":\"config.php\"}]" + }, + "dart": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"touch\",\"qualifiedName\":\"touch\",\"startLine\":10},{\"endLine\":10,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"touch\",\"qualifiedName\":\"touch\",\"startLine\":10},{\"endLine\":11,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Trait\",\"name\":\"Timestamps\",\"qualifiedName\":\"Timestamps\",\"startLine\":9},{\"endLine\":14,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":14},{\"endLine\":14,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":14},{\"endLine\":15,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":13},{\"endLine\":23,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":23},{\"endLine\":23,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":23},{\"endLine\":29,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":29},{\"endLine\":29,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":29},{\"endLine\":32,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":17},{\"endLine\":34,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":34},{\"endLine\":39,\"filePath\":\"auth.dart\",\"isExporte
d\":false,\"kind\":\"Function\",\"name\":\"_privateHelper\",\"qualifiedName\":\"_privateHelper\",\"startLine\":39},{\"endLine\":6,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Logger\",\"qualifiedName\":\"Logger.log\",\"startLine\":6},{\"endLine\":7,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":5}]", + "calls": "[]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.dart\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":17},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.dart\",\"parentName\":\"Logger\",\"relation\":\"IMPLEMENTS\",\"startLine\":17},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.dart\",\"parentName\":\"Timestamps\",\"relation\":\"IMPLEMENTS\",\"startLine\":17}]", + "imports": "[{\"filePath\":\"auth.dart\",\"kind\":\"namespace\",\"localAlias\":\"meta\",\"source\":\"package:meta/meta.dart\"},{\"filePath\":\"auth.dart\",\"kind\":\"namespace\",\"source\":\"dart:io\"}]" + }, + "cobol": { + "definitions": "[]", + "calls": "[]", + "heritage": "[]", + "imports": "[]" + } +}; diff --git a/packages/ingestion/src/providers/characterization.test.ts b/packages/ingestion/src/providers/characterization.test.ts new file mode 100644 index 00000000..bcd6a14d --- /dev/null +++ b/packages/ingestion/src/providers/characterization.test.ts @@ -0,0 +1,780 @@ +/** + * Provider extractor characterization ("golden") harness. + * + * WHY THIS EXISTS + * --------------- + * The per-language provider tests (`swift.test.ts`, `csharp.test.ts`, …) are + * STRUCTURAL: they assert set-membership ("defs include `Greeter`"), so they do + * NOT lock hash-relevant VALUES like `calleeOwner`, `qualifiedName`, `startLine`, + * or `owner`. 
A refactor that collapses the per-provider `extractCalls` / + * `extractHeritage` implementations into shared generics could silently alter one + * of those fields and still pass every existing test — but change the downstream + * `graphHash`. + * + * This harness closes that gap. For all 16 registered providers × the 4 core + * extractors, it snapshots the FULL canonical-JSON output over a representative + * fixture and asserts byte-equality against a committed golden + * (`characterization-golden.ts`). It fails loudly with a per-language, per-extractor + * diff on any value drift. + * + * DESIGN + * ------ + * - Fixtures reuse the exact `FIXTURE` string each per-language `*.test.ts` defines + * (representative; known to exercise defs/calls/heritage/imports). cobol has no + * tree-sitter grammar, so it is NOT routed through the ParsePool — its provider + * ignores inputs and returns [] for every extractor; we snapshot the empty arrays. + * - Each extractor output array is sorted by `canonicalJson(element)` before + * snapshotting. That is a stable TOTAL order, so a pure emission-order refactor + * does NOT produce a false positive, while any VALUE drift changes an element's canonical + * string and IS caught. + * - Coverage is a tripwire: the harness asserts it snapshotted exactly + * `listProviders().length` languages, so adding a provider forces a golden update. + * + * REGENERATING THE GOLDEN (deliberate, reviewed behavior changes ONLY) + * -------------------------------------------------------------------- + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion build + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion test + * The env flag rewrites `src/providers/characterization-golden.ts` from the live + * extractor output, then STILL asserts against the just-written values. Without the + * flag the golden is never mutated — the test is read-only. 
+ */ + +import { strict as assert } from "node:assert"; +import { writeFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { after, before, describe, it } from "node:test"; +import { canonicalJson, type LanguageId } from "@opencodehub/core-types"; +import { ParsePool } from "../parse/worker-pool.js"; +import { type ExtractorSnapshot, GOLDEN } from "./characterization-golden.js"; +import { getProvider, listProviders } from "./registry.js"; +import { type ParsedFixture, parseFixture } from "./test-helpers.js"; + +/** + * One representative fixture per language. The 15 tree-sitter languages reuse the + * verbatim `FIXTURE` string from their existing behavior test (each already + * exercises definitions + calls + heritage + imports). cobol gets a small program + * whose captures are empty (no grammar), which its stub provider maps to []. + */ +const FIXTURES: Record = { + typescript: { + path: "greeter.ts", + source: ` +import { Logger } from "./logger.js"; +import * as util from "./util"; +import defaultExport, { other } from "./mixed"; + +export interface Greeter extends Base { + greet(name: string): string; +} + +export abstract class Welcomer implements Greeter { + private banner: string; + public greet(name: string): string { + this.log(name); + return "hi " + name; + } + private log(msg: string): void { + Logger.debug(msg); + } +} + +export const MESSAGE = "welcome"; + +export function run(): void { + const w = new Welcomer(); + w.greet("world"); +} +`, + }, + tsx: { + path: "page.tsx", + source: ` +import React from "react"; +import { Button } from "./button.js"; + +interface Props { + name: string; +} + +export function Greeting(props: Props) { + const label = svc.format(props.name); + return