diff --git a/.claude/skills/codehub-document/references/data-source-map.md b/.claude/skills/codehub-document/references/data-source-map.md index 44fe433b..55a05e80 100644 --- a/.claude/skills/codehub-document/references/data-source-map.md +++ b/.claude/skills/codehub-document/references/data-source-map.md @@ -11,7 +11,7 @@ graph_hash: ## Repo profile # from project_profile - languages: TypeScript 87%, Rust 11%, Python 2% -- stacks: Node 22, pnpm 10, DuckDB, Vitest +- stacks: Node 24, pnpm 10, SQLite (node:sqlite), Vitest - entry points: packages/mcp/src/index.ts, packages/cli/src/bin.ts ## Top communities (≤ 10) # from sql: SELECT name, inferred_label, cohesion, symbol_count @@ -80,17 +80,20 @@ File-level fan-out means one role may seed multiple packets (for example, `doc-a ## Schema preflight (non-optional) -**Before composing any SQL query over `nodes`, `relations`, or any other -graph table, Phase 0 MUST probe the schema once and cache the result in -`.prefetch.md`.** Subagents then consult the cached schema instead of -guessing column names, which would fail with `Binder Error: Referenced -column "X" not found in FROM clause`. +**Before composing any SQL query over `nodes`, `edges`, or any other +table in `store.sqlite`, Phase 0 MUST probe the schema once and cache the +result in `.prefetch.md`.** Subagents then consult the cached schema +instead of guessing column names, which would fail with a `no such column` +SQLite error. 
-The probe is one SQL call: +The probe is one SQL call over SQLite's schema catalog: ``` -sql("SELECT table_name, column_name FROM information_schema.columns - WHERE table_name IN ('nodes','relations') ORDER BY table_name, column_name") +sql("SELECT m.name AS table_name, c.name AS column_name + FROM sqlite_master m + JOIN pragma_table_info(m.name) c + WHERE m.type = 'table' AND m.name IN ('nodes','edges') + ORDER BY table_name, column_name") ``` Write the result as a dedicated `.context.md § Schema` subsection (top 30 @@ -100,8 +103,8 @@ rows, no cap) and as a digest line in `.prefetch.md` with Historical note: `nodes` does not have a `path` column — routes store their endpoint under `name` (as `"METHOD /path"`), and the file path is `file_path`. Observed during a 2026-04-27 dogfood when subagent prompts -blindly referenced `path` and hit a Binder Error on an otherwise fresh -graph. The preflight prevents this class of bug across every subagent. +blindly referenced `path` and hit a `no such column` error on an otherwise +fresh index. The preflight prevents this class of bug across every subagent. ## Phase 0 algorithm (pseudocode) @@ -111,7 +114,7 @@ Steps marked `# wave 0a` and `# wave 0b` each run as a single parallel tool-use # wave 0a — independent precompute (one parallel batch) 1. staleness = list_repos → entry for this repo → _meta.codehub/staleness 2. profile = project_profile({repo}) -3. schema = sql("SELECT table_name, column_name FROM information_schema.columns …") +3. schema = sql("SELECT … FROM sqlite_master JOIN pragma_table_info(name) …") 4. routes = route_map({repo}) 5. tools = tool_map({repo}) 6. deps = dependencies({repo}) @@ -126,7 +129,7 @@ Steps marked `# wave 0a` and `# wave 0b` each run as a single parallel tool-use # wave 0b — depends on schema + profile (one parallel batch) 11. communities = sql("SELECT … FROM nodes WHERE kind='Community' …") 12. processes = sql("SELECT … FROM nodes WHERE kind='Process' …") -13. 
relations = sql("SELECT … FROM relations …") # for diagrams +13. relations = sql("SELECT … FROM edges …") # for diagrams 14. top_folders = top-5 folders by file count (from profile.entryPoints + glob) 15. owners_summary = [owners({path}) for path in top_folders] 16. if --group: group_hits = group_query({group, canonical_terms}) diff --git a/.claude/skills/codehub-document/references/document-templates.md b/.claude/skills/codehub-document/references/document-templates.md index c4d22c3e..6ff0e4de 100644 --- a/.claude/skills/codehub-document/references/document-templates.md +++ b/.claude/skills/codehub-document/references/document-templates.md @@ -23,7 +23,7 @@ Cites `packages/foo/src/index.ts` (200 LOC) style file references. | Layer | Technology | Source | |---|---|---| | Runtime | Node 22 | `package.json:7` | -| Storage | DuckDB + hnsw_acorn | `packages/storage/src/index.ts:12` | +| Storage | SQLite (single-file, node:sqlite) — FTS5 + vector KNN | `packages/storage/src/index.ts:12` | | ... | ... | ... | ## Module map diff --git a/.claude/skills/codehub-document/references/mermaid-patterns.md b/.claude/skills/codehub-document/references/mermaid-patterns.md index ae047e3d..83837a73 100644 --- a/.claude/skills/codehub-document/references/mermaid-patterns.md +++ b/.claude/skills/codehub-document/references/mermaid-patterns.md @@ -14,11 +14,11 @@ flowchart LR core[Core types] ingestion[Ingestion DAG] storage[Storage] - duckdb[(DuckDB)]:::external + sqlite[(store.sqlite)]:::external mcp --> core ingestion --> core ingestion --> storage - storage --> duckdb + storage --> sqlite classDef external stroke-dasharray: 3 3 ``` @@ -104,14 +104,14 @@ For `architecture/data-flow.md`. 
flowchart TB source[Repo files] parse[tree-sitter parser] - graph[DuckDB graph] + store[(store.sqlite)] embed[ONNX embedder] query[MCP query] source --> parse - parse --> graph + parse --> store parse --> embed - embed --> graph - query --> graph + embed --> store + query --> store ``` **Rules:** diff --git a/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md b/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md index 517c3959..5a40ce19 100644 --- a/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md +++ b/.claude/skills/codehub-document/templates/agents/doc-diagrams-components.md @@ -26,7 +26,7 @@ Produce `{{ docs_root }}/diagrams/architecture/components.md`: a single Mermaid | Shared context | `Read {{ context_path }}` | always first | | Prefetch ledger | `Read {{ prefetch_path }}` | always first | | Top communities | `{{ context_path }} § Top communities` | cached | -| Community relations | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT source, target, kind FROM relations WHERE kind IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | +| Community relations | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT src, dst, type FROM edges WHERE type IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | | Component method list | `mcp__codehub__context({symbol: })` per top 8 | mid-run | ## 4. 
Process diff --git a/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md b/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md index 2b629074..653f71f9 100644 --- a/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md +++ b/.claude/skills/codehub-document/templates/agents/doc-diagrams-dependency-graph.md @@ -26,7 +26,7 @@ Produce `{{ docs_root }}/diagrams/structural/dependency-graph.md`: a single Merm | Shared context | `Read {{ context_path }}` | always first | | Prefetch ledger | `Read {{ prefetch_path }}` | always first | | Top communities | `{{ context_path }} § Top communities` | cached | -| Internal edges | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT source, target, kind FROM relations WHERE kind IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | +| Internal edges | `{{ prefetch_path }} § sql relations` or `mcp__codehub__sql({query: "SELECT src, dst, type FROM edges WHERE type IN ('CONTAINS','CALLS','IMPORTS') LIMIT 500"})` | cached if digest present; mid-run otherwise | | External dependencies | `{{ context_path }} § Stack` or `mcp__codehub__dependencies({repo: "{{ repo }}"})` | cached if digest present; mid-run otherwise | ## 4. Process diff --git a/.claude/skills/codehub-onboarding/SKILL.md b/.claude/skills/codehub-onboarding/SKILL.md index 8296905a..94087f7d 100644 --- a/.claude/skills/codehub-onboarding/SKILL.md +++ b/.claude/skills/codehub-onboarding/SKILL.md @@ -52,7 +52,7 @@ Produces a single ONBOARDING.md with a ranked reading order drawn from graph cen | Layer | Tech | Source | |---|---|---| | Runtime | Node 22 | `package.json:7` | -| Storage | DuckDB | `packages/storage/src/index.ts:12` | +| Storage | SQLite (single-file, node:sqlite) | `packages/storage/src/index.ts:12` | | ... | ... | ... 
| ## Read these 10 files first (in order) diff --git a/.claude/skills/opencodehub-debugging/SKILL.md b/.claude/skills/opencodehub-debugging/SKILL.md index 59cf0db8..1655a0a4 100644 --- a/.claude/skills/opencodehub-debugging/SKILL.md +++ b/.claude/skills/opencodehub-debugging/SKILL.md @@ -86,20 +86,20 @@ Two-hop upstream trace for every caller of `validatePayment`: ```sql WITH direct AS ( - SELECT from_id, to_id, 1 AS depth - FROM relations + SELECT src, dst, 1 AS depth + FROM edges WHERE type = 'CALLS' - AND to_id IN (SELECT id FROM nodes WHERE name = 'validatePayment' AND kind = 'Function') + AND dst IN (SELECT id FROM nodes WHERE name = 'validatePayment' AND kind = 'Function') ), indirect AS ( - SELECT r.from_id, d.to_id, 2 AS depth - FROM relations r - JOIN direct d ON d.from_id = r.to_id + SELECT r.src, d.dst, 2 AS depth + FROM edges r + JOIN direct d ON d.src = r.dst WHERE r.type = 'CALLS' ) SELECT caller.name, caller.file_path, caller.start_line, u.depth FROM (SELECT * FROM direct UNION ALL SELECT * FROM indirect) u -JOIN nodes caller ON caller.id = u.from_id +JOIN nodes caller ON caller.id = u.src ORDER BY u.depth ASC, caller.name; ``` diff --git a/.claude/skills/opencodehub-exploring/SKILL.md b/.claude/skills/opencodehub-exploring/SKILL.md index a4345788..17516442 100644 --- a/.claude/skills/opencodehub-exploring/SKILL.md +++ b/.claude/skills/opencodehub-exploring/SKILL.md @@ -75,9 +75,9 @@ When a name is ambiguous, `context` returns a ranked candidate list instead of s ```sql SELECT r.step, callee.name, callee.file_path, callee.start_line -FROM relations r -JOIN nodes proc ON proc.id = r.from_id -JOIN nodes callee ON callee.id = r.to_id +FROM edges r +JOIN nodes proc ON proc.id = r.src +JOIN nodes callee ON callee.id = r.dst WHERE r.type = 'PROCESS_STEP' AND proc.kind = 'Process' AND proc.name = 'CheckoutFlow' diff --git a/.claude/skills/opencodehub-guide/SKILL.md b/.claude/skills/opencodehub-guide/SKILL.md index 29803824..d1dd833e 100644 --- 
a/.claude/skills/opencodehub-guide/SKILL.md +++ b/.claude/skills/opencodehub-guide/SKILL.md @@ -5,7 +5,7 @@ description: "Use when the user asks about OpenCodeHub itself — available MCP # OpenCodeHub Guide -Quick reference for every OpenCodeHub MCP tool, MCP resource, and the graph + temporal store schema. +Quick reference for every OpenCodeHub MCP tool, MCP resource, and the single-file `store.sqlite` schema. ## Always Start Here @@ -59,7 +59,7 @@ standalone artifact producer with its own preconditions and output path. | `mcp__codehub__context` | 360-degree symbol view + `confidenceBreakdown` + `cochanges` side-section | | `mcp__codehub__impact` | Blast radius with risk tier + `confidenceBreakdown` | | `mcp__codehub__detect_changes` | Map an uncommitted or committed diff to affected symbols and flows | -| `mcp__codehub__sql` | Read-only query: `sql` arg → temporal DuckDB (cochanges/summaries); `cypher` arg → lbug graph (5 s timeout) | +| `mcp__codehub__sql` | Read-only SQL over the single-file `store.sqlite` (all tables: nodes, edges, embeddings, cochanges, symbol_summaries, store_meta; 5 s timeout). `cypher` arg is reserved for community-fork adapters (unsupported by the default backend) | | `mcp__codehub__signature` | Symbol declaration + stubbed members (class/interface header + method/property signatures, bodies elided) | ### HTTP / RPC surface @@ -115,91 +115,135 @@ Lightweight reads for navigation (every URI uses the `codehub://` scheme): | `codehub://repo/{name}/context` | Stats + staleness envelope | | `codehub://repo/{name}/schema` | Live node kinds / relation types for `sql` | -> Cluster and process navigation resources (`codehub://repo/{name}/clusters`, `codehub://repo/{name}/processes`, etc.) are slated for a later wave. Until then, use the typed tools or Cypher (below) filtered to `kind = 'Community'` / `kind = 'Process'`. 
- -## Where the graph lives (ADR 0016) - -There are **two stores**, and they are queried differently: - -- **Graph tier — `graph.lbug`** (ladybug, Cypher dialect). Holds nodes, edges, - and embeddings. Query it via the typed tools (`query` / `context` / `impact` / - `route_map` / …) or, for bespoke questions, **Cypher** via the MCP `sql` - tool's `cypher` argument. There is NO `nodes` or `relations` SQL table. -- **Temporal tier — `temporal.duckdb`** (DuckDB SQL). Holds only the - `cochanges` and `symbol_summaries` tables. The `sql` argument of the MCP - `sql` tool (and `codehub sql` on the CLI) targets THIS store. - -Pass exactly one of `sql` (temporal DuckDB) or `cypher` (lbug graph) to the MCP -`sql` tool. - -### Graph schema (lbug / Cypher) - -One node label `CodeNode` carrying `kind` as a **property** (NOT a per-kind -label). One relationship table per relation type. Properties are **snake_case** -(`file_path`, `start_line`, `inferred_label`, `step_count`, `entry_point_id`); -a camelCase RETURN alias comes back as the alias you give it, but the stored -property names are snake_case. - -**Node kinds** (`n.kind` values): File, Folder, Function, Class, Method, +> Cluster and process navigation resources (`codehub://repo/{name}/clusters`, `codehub://repo/{name}/processes`, etc.) are slated for a later wave. Until then, use the typed tools or a `sql` query (below) filtered to `kind = 'Community'` / `kind = 'Process'`. + +## Where the index lives (ADR 0019) + +There is **one store**: a single-file `<repo>/.codehub/store.sqlite` +(WAL, via Node's built-in `node:sqlite`). ADR 0019 supersedes ADR 0016: +the old two-tier backend (a `graph.lbug` Ladybug graph plus a +`temporal.duckdb` DuckDB file) is gone. One `SqliteStore` class implements +both the graph and temporal surfaces over that single file. 
+ +Everything is directly SQL-queryable through the MCP `sql` tool's `sql` +argument (and `codehub sql` on the CLI): + +- **Graph tables (`nodes` and `edges`).** `nodes` holds the typed base + columns plus a `payload` JSON overflow; `edges` is one polymorphic table + keyed by `(src, dst, type, step)`. Query them via the typed tools + (`query` / `context` / `impact` / `route_map` / …) or, for bespoke + questions, plain SQL. Multi-hop traversal is a recursive SQL CTE over + `edges`, NOT Cypher. +- **Embeddings (the `embeddings` table).** Vectors live in a BLOB column; + there is NO Parquet sidecar (it was dropped with DuckDB). +- **Temporal tables (`cochanges` and `symbol_summaries`).** Same file, no + second engine. +- **`store_meta`.** Index metadata (graph hash, timestamps). + +Full-text search is BM25 via a SQLite FTS5 virtual table (`nodes_fts`). +The `cypher` argument to the MCP `sql` tool is **reserved for community-fork +graph adapters** (AGE / Memgraph / Neo4j / Neptune) and is **NOT supported +by the default SQLite backend**, so pass `sql` for every query against the +default store. + +### Graph schema (`nodes` / `edges` tables) + +The `nodes` table carries typed base columns (`id`, `kind`, `name`, +`file_path`, `start_line`, `end_line`) plus a `payload` JSON column holding +every kind-specific field. Reach payload fields with SQLite JSON1: +`payload->>'$.inferredLabel'`, `payload->>'$.stepCount'`, +`payload->>'$.entryPointId'`, `payload->>'$.cohesion'`, +`payload->>'$.symbolCount'`. + +The `edges` table is polymorphic: `src`, `dst`, `type`, `confidence`, +`step`, `reason`. The relation kind lives in the `type` column (there is no +per-type table). 
+ +**Node kinds** (`kind` values): File, Folder, Function, Class, Method, Interface, Constructor, Struct, Enum, Macro, Typedef, Union, Namespace, Trait, Impl, TypeAlias, Const, Static, Variable, Property, Record, Delegate, Annotation, Template, Module, CodeElement, Community, Process, Route, Tool, Finding, Dependency, Contributor, Repo, ProjectProfile, Section. -**Relationship types** (each is its own edge label): CONTAINS, DEFINES, IMPORTS, +**Relationship types** (`edges.type` values): CONTAINS, DEFINES, IMPORTS, CALLS, EXTENDS, IMPLEMENTS, HAS_METHOD, HAS_PROPERTY, ACCESSES, METHOD_OVERRIDES, OVERRIDES, METHOD_IMPLEMENTS, MEMBER_OF, PROCESS_STEP, HANDLES_ROUTE, FETCHES, HANDLES_TOOL, ENTRY_POINT_OF, WRAPS, QUERIES, REFERENCES, FOUND_IN, DEPENDS_ON, OWNED_BY. -Cochanges live only in the **temporal** `cochanges` table (DuckDB SQL), never as -graph edges. +Cochanges live only in the `cochanges` table, never as graph edges. -## Cypher cheat-sheet (MCP `sql` tool, `cypher` arg) +## SQL cheat-sheet (MCP `sql` tool, `sql` arg) -All inbound callers of a function by name: +All inbound callers of a function by name (join `edges` to `nodes` on both +endpoints): -```cypher -MATCH (caller:CodeNode)-[r:CALLS]->(callee:CodeNode) +```sql +SELECT caller.name AS name, caller.file_path AS file, + caller.start_line AS line, e.confidence AS confidence, + e.reason AS reason +FROM edges e +JOIN nodes caller ON caller.id = e.src +JOIN nodes callee ON callee.id = e.dst WHERE callee.name = 'validateUser' AND callee.kind = 'Function' -RETURN caller.name AS name, caller.file_path AS file, caller.start_line AS line, - r.confidence AS confidence, r.reason AS reason -ORDER BY r.confidence DESC -LIMIT 50 + AND e.type = 'CALLS' +ORDER BY e.confidence DESC +LIMIT 50; ``` -Top communities by cohesion: +Top communities by cohesion (kind-specific fields via JSON1): -```cypher -MATCH (n:CodeNode) -WHERE n.kind = 'Community' -RETURN n.name AS name, n.inferred_label AS label, n.cohesion AS cohesion, - 
n.symbol_count AS symbols -ORDER BY n.cohesion DESC -LIMIT 20 +```sql +SELECT name, + payload->>'$.inferredLabel' AS label, + payload->>'$.cohesion' AS cohesion, + payload->>'$.symbolCount' AS symbols +FROM nodes +WHERE kind = 'Community' +ORDER BY cohesion DESC +LIMIT 20; ``` Process entry points: -```cypher -MATCH (n:CodeNode) -WHERE n.kind = 'Process' -RETURN n.name AS name, n.inferred_label AS label, n.step_count AS steps, - n.entry_point_id AS entry_point -ORDER BY n.step_count DESC +```sql +SELECT name, + payload->>'$.inferredLabel' AS label, + payload->>'$.stepCount' AS steps, + payload->>'$.entryPointId' AS entry_point +FROM nodes +WHERE kind = 'Process' +ORDER BY steps DESC; ``` SCIP-confirmed CALLS edges only (strict impact): -```cypher -MATCH ()-[r:CALLS]->() -WHERE r.confidence >= 0.95 AND r.reason STARTS WITH 'scip:' -RETURN r +```sql +SELECT * FROM edges +WHERE type = 'CALLS' + AND confidence >= 0.95 + AND reason LIKE 'scip:%'; +``` + +Multi-hop blast radius is a recursive CTE over `edges`. 
The typed `impact` +tool wraps this, so prefer it unless you need a bespoke traversal: + +```sql +WITH RECURSIVE reach(id, depth) AS ( + SELECT id, 0 FROM nodes WHERE name = 'validateUser' + UNION + SELECT e.src, r.depth + 1 + FROM edges e JOIN reach r ON e.dst = r.id + WHERE e.type IN ('CALLS', 'REFERENCES') AND r.depth < 3 +) +SELECT DISTINCT n.name, n.file_path, MIN(r.depth) AS depth +FROM reach r JOIN nodes n ON n.id = r.id +GROUP BY n.id ORDER BY depth; ``` -### Temporal SQL cheat-sheet (MCP `sql` tool, `sql` arg) +### Co-change cheat-sheet (MCP `sql` tool, `sql` arg) -Tightest co-change pairs (DuckDB SQL — temporal store): +Tightest co-change pairs (`cochanges` table): ```sql SELECT source_file, target_file, lift, cocommit_count diff --git a/.claude/skills/opencodehub-refactoring/SKILL.md b/.claude/skills/opencodehub-refactoring/SKILL.md index 8acd833f..264f12c1 100644 --- a/.claude/skills/opencodehub-refactoring/SKILL.md +++ b/.claude/skills/opencodehub-refactoring/SKILL.md @@ -149,20 +149,27 @@ mcp__codehub__shape_check({ route: "GET /users/:id", repo: "my-app" }) → mismatches: [{ consumer, expected, actual }] ``` -### `mcp__codehub__sql` — custom reference query (temporal store) - -The `sql` arg is read-only DuckDB over the temporal store (cochanges + -symbol_summaries). To enumerate every file referencing a symbol from the graph, -use the `cypher` arg of the same tool instead (the node/edge graph lives in -`graph.lbug`, not the SQL store): - -```cypher -MATCH (caller:CodeNode)-[r:REFERENCES|CALLS|IMPORTS]->(target:CodeNode) +### `mcp__codehub__sql` — custom reference query (single-file SQLite) + +The `sql` arg is read-only SQL over the single-file `store.sqlite` index +(ADR 0019). Every table is directly queryable: `nodes`, `edges`, `embeddings`, +`cochanges`, `symbol_summaries`, `store_meta`. 
To enumerate every file +referencing a symbol from the graph, join `edges` to `nodes` on both endpoints: + +```sql +SELECT DISTINCT caller.file_path AS file +FROM edges e +JOIN nodes caller ON caller.id = e.src +JOIN nodes target ON target.id = e.dst WHERE target.name = 'validateUser' -RETURN DISTINCT caller.file_path AS file + AND e.type IN ('REFERENCES', 'CALLS', 'IMPORTS') ORDER BY file ``` +The `cypher` arg of the same tool is reserved for community-fork graph +adapters (AGE / Memgraph / Neo4j / Neptune) and is not supported by the +default SQLite backend. + This catches references a textual rename might miss — useful as a manual-check list before and after you edit. @@ -172,7 +179,7 @@ list before and after you edit. | --------------------------------- | ----------------------------------------------------------------------- | | Many callers (> 5) | Use your editor's LSP rename for the mechanical work; `impact` is the checklist | | Cross-module references | Run `detect_changes` after editing; watch for missed imports | -| String / dynamic references | Use the `cypher` arg with `REFERENCES`; the graph cannot see string-keyed dispatch — read those by hand | +| String / dynamic references | Query the `edges` table for `REFERENCES` rows; the graph cannot see string-keyed dispatch, so read those by hand | | Public / exported API | Version and deprecate; mirror symbol names in a transition layer | | Heuristic edges (confirmed = 0) | Cross-check by reading source; the SCIP oracle did not weigh in | diff --git a/.erpaval/INDEX.md b/.erpaval/INDEX.md index de8fd5a4..e738b0c0 100644 --- a/.erpaval/INDEX.md +++ b/.erpaval/INDEX.md @@ -11,6 +11,7 @@ development sessions. Solutions are reusable; specs are per-feature. 
- [New code-pack BOM items must anchor on graph nodes, not chunker data](solutions/architecture-patterns/pack-bom-additions-anchor-on-graph-nodes-not-chunker.md) — `generatePack` only gets `chunkerFiles` from the determinism TEST fixture; production `runPackEngine` never wires it, so `ast-chunks.jsonl` is empty in real packs. Anchor new BOM items on `File` nodes (filePath/contentHash/lineCount/language, prod-populated); byte ranges/token counts are best-effort. (Latent prod bug now FIXED — see next entry.) - [Pack provenance is derived in the CLI, with hash-verified disk bytes](solutions/architecture-patterns/pack-provenance-derived-in-cli-with-hash-verified-bytes.md) — `runPackEngine` derives commit/origin from the `Repo` node, chunkerFiles from disk (each hash-verified against `FileNode.contentHash`, drifted files skipped), and grammar pins from `parse.grammarVersions()`, then threads them through `generatePack`'s `internal` seam. Derivation is the unset-path fallback so pack fixtures keep their behavior; defensive against a stubbed/empty graph. Fixes the empty-ast-chunks/hollow-manifest bug while preserving byte-identity. +- [Pack provenance/channel fields ride outside the packHash preimage unless they change the decision set](solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md) — classify every new pack field FIRST: hash-bound iff it changes the decision set (selection/sizing/BOM content), hash-free iff it only annotates delivery. tokenizer lane + CycloneDX 1.7 citation = hash-bound (legitimately flip packHash); `--cache-channel` = hash-free (kept out of the manifest preimage so `auto` stays byte-identical). Mis-binding breaks pinned fixtures; mis-unbinding breaks the re-derivability contract. - [`@opencodehub/pack` strict `tsc -b` fails on code the test runner accepts](solutions/build-errors/pack-strict-tsc-vs-loose-test-runner-exact-optional.md) — green `test` ≠ green `build`. 
`noUncheckedIndexedAccess` (iterate `for..of` + accumulator, not `arr[i]`) and `exactOptionalPropertyTypes` (explicit literals, not spread-over-readonly) bite the build only. Run `build` after pack edits; strip ANSI before grepping `error TS`. - [Collapse a publish-many TS monorepo into one bundled CLI with tsup](solutions/architecture-patterns/tsup-collapse-monorepo-to-single-cli.md) — `noExternal:[/^@scope//]` + `external:[/^[^.]/]`; workers as named entries (esbuild won't follow `new URL(...,import.meta.url)`); copy import.meta.url assets in onSuccess; tsconfig.test.json → dist-test/ because tsup drops *.test.ts; convert hidden string-imports to static. Kills the pack-all-publishables bug class. - [Make a heavy native dep optional + lazy so a default install can prune it](solutions/architecture-patterns/optional-native-dep-lazy-import.md) — onnxruntime-node 254MB: deps→optionalDependencies, top-level value-import→`import type`, dynamic `import()` at use site threading the runtime constructor in; bundler must keep it `external`. @@ -79,6 +80,13 @@ development sessions. Solutions are reusable; specs are per-feature. - [Intersect filesystem-walk paths with HEAD-tracked set before git blame](solutions/best-practices/filter-fs-walk-paths-against-head-before-git-blame.md) — the scan phase's fs walk includes untracked files (everything `codehub init` writes), so the ownership phase's per-file `git blame` fails with ~45 `fatal: no such path in HEAD` warnings on a fresh analyze. Capture the tracked set once via `git ls-files -z`, expose it as an optional `ScanOutput.trackedPaths`, and filter the blame list when defined (undefined ⇒ non-git, don't filter; empty set is meaningful). Leave onWarn intact so real failures on tracked files still surface. 
+- [Build a characterization harness before a hash-preserving refactor](solutions/best-practices/characterization-harness-before-hash-preserving-refactor.md) — when unit tests are STRUCTURAL (set-membership) and the only field-level gate is a downstream hash over one fixture, a refactor can drift a hash-relevant value (calleeOwner/qualifiedName/startLine) and pass everything. Snapshot full canonical-JSON output of every unit × operation into a committed golden, prove it with a negative self-check, convert one unit at a time. Caveat: a snapshot only locks the paths its fixture exercises — audit each unit's fixture for the risky branch. +- [jscpd overcounts loop shells that already wrap shared helpers](solutions/best-practices/jscpd-overcounts-shells-wrapping-shared-helpers.md) — a token-level clone scan counts the loop shell around an already-shared helper as duplication. Ground-truth what's ALREADY extracted before trusting the LOC-collapse estimate; the scan's "most-duplicated file" may be the shared module itself (ts-shared.ts). Rank by dup-lines ÷ divergence-risk, and don't force a generic where every variant is genuinely distinct (extractImports regex sets). +- [Verify a per-variant guard filters before preserving it when unifying](solutions/conventions/dead-guard-branch-reduces-to-noop-strategy-when-unifying.md) — a guard can be DEAD (both branches return the same value; a `.includes(".")` gate before a `lastIndexOf(".x")` that no-ops), so folding it to the simpler shared strategy is behavior-preserving. Prove deadness from the old code. Symmetric: also prove two guards equivalent before merging — cpp `->`/`::` strip and rust `::`-then-`.` are REAL behavior, must be reproduced verbatim. 
+- [Config-factory collapse of N parallel implementations](solutions/architecture-patterns/config-factory-collapse-of-parallel-extractors.md) — collapse N near-duplicate implementations into ONE generic + small config factories (strategy closures, kindFromMap, predicates), gated by a characterization harness, one variant at a time. Don't over-unify genuinely-different algorithms (4 receiver algs stay separate); don't over-extract entangled variants (python defs stay custom). ~1,100 LOC removed, graphHash byte-identical, 2 reviewers found no drift. +- [Verify grammar-reachability before covering a config branch](solutions/conventions/verify-grammar-reachability-before-covering-a-config-branch.md) — a branch with "no fixture coverage" may be UNREACHABLE by grammar construction (dead defensive code), not a gap. Probe the WASM grammar before fabricating a fixture: dart has no invocation node; swift tags struct/enum as `definition.class`; ruby excludes `definition.module` from CALLABLE_SCOPE_TAGS. Document unreachable branches; only cover genuinely-reachable ones (isExported=false via `_`-prefixed decls). +- [Backtick in a template-literal-embedded DSL comment breaks tsc](solutions/build-errors/backtick-in-template-literal-embedded-dsl-comment.md) — a Markdown-style `code` span inside a backtick-delimited template literal (tree-sitter query, SQL, gql``) closes the literal early → TS1005 on the orphaned prose. The DSL's own `;` comment marker doesn't protect (JS lexer delimits first). Drop the inline-code backticks or move the note to a host `//` comment outside the literal. + ## Specs - [001-scip-replaces-lsp](specs/001-scip-replaces-lsp/spec.md) — rip-and-replace LSP with SCIP for TS/Py/Go/Rust/Java. Task map: [tasks.md](specs/001-scip-replaces-lsp/tasks.md). diff --git a/.erpaval/ROADMAP.md b/.erpaval/ROADMAP.md index 810e97a8..cf58e2d9 100644 --- a/.erpaval/ROADMAP.md +++ b/.erpaval/ROADMAP.md @@ -58,7 +58,7 @@ Sequenced by dependency only. No calendar estimates. 
## M3 — LadybugDB phase-1 (PENDING, parallel with M4) -Replace recursive-CTE traversals with polymorphic rel-table-per-edge schema (**corrected 2026-05-05** — the v1 roadmap proposed a single rel-table with a `type` column; LadybugDB docs recommend one named rel table per edge kind with multiple `FROM/TO` pairs for columnar predicate pushdown). Current OCH edge-kind count is **23** (post-M2 additions `FOUND_IN`, `DEPENDS_ON`, `OWNED_BY`, `WRAPS`, `QUERIES`, `REFERENCES`, `ACCESSES`), not 21 as originally estimated. +Replace recursive-CTE traversals with polymorphic rel-table-per-edge schema (**corrected 2026-05-05** — the v1 roadmap proposed a single rel-table with a `type` column; LadybugDB docs recommend one named rel table per edge kind with multiple `FROM/TO` pairs for columnar predicate pushdown). Current OCH edge-kind count is **25** (post-M2 additions `FOUND_IN`, `DEPENDS_ON`, `OWNED_BY`, `WRAPS`, `QUERIES`, `REFERENCES`, `ACCESSES`), not 21 as originally estimated. LadybugDB = community successor to Kuzu (Apple acquisition). Pre-1.0 with ABI breaks every few months. **Current npm package: `@ladybugdb/core@0.16.1`** (released 2026-05-04, one day before roadmap review). Source-level naming uses `GraphDbStore` / `graphdb-adapter.ts` / `graphdb-pool.ts` to stay within `scripts/check-banned-strings.sh` limits — the `ladybug` and `kuzu` literals are rejected in tracked source files; the `@ladybugdb/core` dep in `package.json` is permitted under package-scope precedent. 
diff --git a/.erpaval/solutions/architecture-patterns/config-factory-collapse-of-parallel-extractors.md b/.erpaval/solutions/architecture-patterns/config-factory-collapse-of-parallel-extractors.md new file mode 100644 index 00000000..7b0a7886 --- /dev/null +++ b/.erpaval/solutions/architecture-patterns/config-factory-collapse-of-parallel-extractors.md @@ -0,0 +1,55 @@ +--- +title: Collapse N near-duplicate per-variant implementations into one generic + config factories, gated by a characterization harness +track: knowledge +category: architecture-patterns +module: packages/ingestion/src/providers +component: language-provider extractors +severity: info +tags: [dedup, generic, config-factory, strategy, behavior-preserving, characterization, extractors, dry] +applies_when: + - "N implementations of the same operation share a ~80% identical loop shell and diverge only in small parameterized ways" + - "behavior must be preserved exactly (a downstream hash or contract depends on the output)" + - "the per-variant tests are structural and won't catch value drift" +pattern: | + The safe recipe for collapsing N parallel near-duplicate implementations (here: 14 + language providers x 3 extractors — extractCalls, extractDefinitions, extractHeritage): + 1. Build a value-locking characterization harness FIRST (see the sibling lesson + characterization-harness-before-hash-preserving-refactor) and prove it with a + negative self-check. This is the arbiter for every subsequent step. + 2. Extract ONE generic function owning the identical loop shell, parameterized by a + small `Config` object. Keep the generic in the deepest shared module (extract-helpers.ts). + 3. For each axis of variance, provide a FACTORY that returns a closure reproducing + the original algorithm EXACTLY — e.g. receiver strategies (dotPrefixReceiver, + sepStripReceiver, multiSepReceiver), kindFromMap(map), promoteToMethod predicates, + ownerOverride. 
Do NOT collapse genuinely-different algorithms into one "unified" + parameter — the extractCalls receiver had 4 distinct algorithms (lastIndexOf(.name) + vs bare-name+strip vs multi-sep-preference vs none) that MUST stay separate. + 4. Accept two escape hatches: (a) a variant too entangled to fit stays custom + (python's extractDefinitions — dual property/const dedup + Variable kind), and + (b) some config providers pass a FUNCTION where others pass data (csharp/java + kindFor reads nodeType; the generic takes the function form, data-providers wrap + a Record). Forcing uniformity where the variants genuinely differ is the anti-goal. + 5. Convert ONE variant at a time, re-running the harness after each; a snapshot flip + means you drifted — fix the config, never regenerate the golden. + 6. Prove dead/defensive config branches are grammar-unreachable rather than forcing + fixtures for them (see verify-grammar-reachability-before-covering-a-config-branch). + Result across OCH: ~1,100 net lines removed with graphHash byte-identity intact, + verified by 2 adversarial reviewers finding NO drift. +example_files: + - packages/ingestion/src/providers/extract-helpers.ts + - packages/ingestion/src/providers/characterization.test.ts +--- + +# Why this matters + +This is the generalization of collapse-parallel-switches-into-record-registry to a +LOOP body, not just a switch. The load-bearing discipline is: the characterization +harness makes "did I preserve behavior?" a mechanical per-variant check instead of a +judgment call, so the collapse can proceed one variant at a time with a hard gate, +and genuinely-divergent variants (4 receiver algorithms, python defs, csharp/java +nodeType kind resolution) are left as explicit config branches or custom code rather +than smeared into a lowest-common-denominator abstraction that drifts. 
The two failure +modes it avoids: over-unification (merging 4 receiver algs into 1 regex → silent +drift) and over-extraction (forcing python/extractImports into a generic that becomes +a pass-through lambda — the extractImports non-win). Config-factory + harness + one-at- +a-time is how you get the DRY win without the drift. diff --git a/.erpaval/solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md b/.erpaval/solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md new file mode 100644 index 00000000..74403266 --- /dev/null +++ b/.erpaval/solutions/architecture-patterns/pack-provenance-fields-stay-out-of-packhash-preimage.md @@ -0,0 +1,49 @@ +--- +name: Pack provenance/channel fields ride outside the packHash preimage unless they change the decision set +description: New pack metadata (tokenizer lane, cache channel, provenance citations) must be classified as either hash-bound (changes what was selected) or hash-free (only annotates), or it silently breaks determinism fixtures or the decision-equivalence contract +type: architecture-patterns +--- + +When adding a new field to the pack, decide up front whether it belongs in +the `packHash` preimage. The preimage (`packages/pack/src/manifest.ts` +`toSnakeCaseManifest` → `canonicalJson` → `sha256Hex`) binds the fields +that define WHAT was selected: `commit`, `tokenizerId`, `budgetTokens`, +`determinismClass`, `pins`, per-file `fileHash`, and `contextBomHash`. Under +ADR-0020 the real contract is decision-equivalence (same inputs ⇒ same +retrieval decision set); byte-identity is a cheap witness. So the test is: +does the field change the decision set? + +Three fields added across Moves 1/2/4 sorted cleanly into the two classes: + +- **Sonnet-5 tokenizer lane (Move 1)** — HASH-BOUND. 
`tokenizerId` is already + in the preimage; a new lane value (`anthropic:claude-sonnet-5@2026-06-30`) + legitimately flips `packHash` because it changes chunk sizing and the + `resolveDeterminism` verdict (`index.ts:287` downgrades any `anthropic:` + prefix to `best_effort`). No fixture broke because the determinism tests + assert cross-run EQUALITY (`m1 === m2`), not golden literals. +- **CycloneDX 1.7 + per-file provenance citation (Move 2)** — HASH-BOUND, by + design. `context-bom.json` is a BOM item AND its `contextBomHash` is a named + preimage field, so bumping `specVersion`/`$schema` and adding a per-file + `externalReferences[{type:"vcs",url}]` + `opencodehub:commit` property flips + the hash. That is correct: the receipt's content genuinely changed. Only the + one hard `specVersion === "1.6"` assertion needed editing; equality-based + determinism suites passed untouched. +- **`--cache-channel` (Move 4)** — HASH-FREE. The channel only shapes the + agent-facing assembled context string (`assemblePackContext` in + `variance-probe.ts`), not the BOM. It is recorded on `PackOpts.cacheChannel` + but deliberately kept OUT of `toSnakeCaseManifest`, so the default (`auto`, + marker-free) path is byte-identical to pre-Move-4 packs and every + determinism/golden fixture stays green. If it had leaked into the preimage, + every existing pinned pack would have broken for a field that changes nothing + about what was selected. + +Mechanism: classify the field FIRST. Hash-bound iff it changes the decision +set (selection, sizing, or the recorded content of a BOM item). Hash-free iff +it only annotates delivery/consumption. A cache/delivery/rendering knob is +hash-free; a tokenizer/budget/selection/content knob is hash-bound. Getting +this wrong is silent: a mis-bound knob breaks every pinned fixture; a +mis-unbound content field breaks the re-derivability contract. 
+ +Related: [[collapse-parallel-switches-into-record-registry]] (the channel enum ++ `cacheChannelNeedsMarkers` switch is exhaustive over the union so a new +channel forces a compile-time decision). diff --git a/.erpaval/solutions/best-practices/characterization-harness-before-hash-preserving-refactor.md b/.erpaval/solutions/best-practices/characterization-harness-before-hash-preserving-refactor.md new file mode 100644 index 00000000..337ee061 --- /dev/null +++ b/.erpaval/solutions/best-practices/characterization-harness-before-hash-preserving-refactor.md @@ -0,0 +1,52 @@ +--- +title: Build a full-value characterization harness before a hash-preserving refactor when unit tests are structural +track: knowledge +category: best-practices +module: packages/ingestion/src/providers +component: graph extractors / graphHash determinism +severity: info +tags: [characterization, golden-test, refactor, determinism, graphhash, snapshot, negative-self-check, behavior-preserving] +applies_when: + - "refactoring code whose only field-level correctness gate is a downstream hash (graphHash byte-identity)" + - "the existing unit tests assert set-membership / structure, not exact VALUES of hash-relevant fields" + - "you are collapsing N near-duplicate implementations into one parameterized generic" +pattern: | + When the per-unit tests are structural (e.g. "defs include Greeter") they do NOT + lock the hash-relevant VALUES a refactor can silently drift: calleeOwner, + qualifiedName, startLine, owner, the undefined-vs-null-vs-empty distinction. The + only true gate (incremental-determinism / graphHash parity) may run over ONE + fixture (e.g. a TS repo), leaving every other language's extractor field-drift + uncaught. 
Close the gap BEFORE touching the code: build a characterization test + that snapshots the FULL canonical-JSON output of every unit x every operation over + a representative fixture, asserts byte-equality against a committed golden, and + fails with a per-unit/per-operation diff. Prove the net works with a NEGATIVE + SELF-CHECK (perturb one output, confirm it fails with a precise diff, revert) + before trusting it. Then convert one unit at a time, re-running the harness after + each. Sort each output array by canonicalJson(element) so pure-reorder churn is + cancelled but value drift is caught. Gate golden regeneration behind an explicit + env flag; never let it rewrite silently. +example_files: + - packages/ingestion/src/providers/characterization.test.ts + - packages/ingestion/src/providers/characterization-golden.ts +--- + +# Why this matters + +The extractCallsGeneric refactor (session-6a05ac) collapsed 14 language providers' +extractCalls into one generic + 4 receiver-strategy factories. The per-provider +tests would have passed even if the generic drifted a `calleeOwner` value for +several languages — and that drift only shows up as a graphHash change in +production, long after the refactor. The characterization harness (65 snapshots: +16 providers × 4 extractors + a registry-count tripwire) made any field drift fail +immediately with `characterization drift: <provider>.<extractor> changed value`. Its +negative self-check (perturb swift calleeName → fail → revert) proved it before a +single provider was converted. Result: all 14 conversions landed with graphHash +byte-identity intact, verified from a clean rebuild. + +A coverage caveat the harness itself surfaced: a snapshot only locks the paths its +fixture EXERCISES. 
Five providers had no receiver-bearing call site, so their +`calleeOwner` branch had no live tripwire — found in the Validate phase, fixed by +adding receiver-bearing calls to those fixtures (dart couldn't be fixed at fixture +level: its parse query has no @reference.call capture, so it structurally emits 0 +calls — a genuine gap to log, not force). Lesson within the lesson: after building +the harness, audit whether each unit's fixture actually exercises the risky branch. diff --git a/.erpaval/solutions/best-practices/jscpd-overcounts-shells-wrapping-shared-helpers.md b/.erpaval/solutions/best-practices/jscpd-overcounts-shells-wrapping-shared-helpers.md new file mode 100644 index 00000000..75c7b856 --- /dev/null +++ b/.erpaval/solutions/best-practices/jscpd-overcounts-shells-wrapping-shared-helpers.md @@ -0,0 +1,44 @@ +--- +title: A token-level duplication scan overcounts loop shells that already wrap shared helpers +track: knowledge +category: best-practices +module: packages/ingestion/src/providers +component: duplication analysis / dedup planning +severity: info +tags: [jscpd, duplication, dedup, scan, ground-truth, refactor-planning, false-signal] +applies_when: + - "planning a dedup effort off a jscpd (or any token-level clone) report" + - "the flagged files may already share the hard logic via a helper module" + - "estimating LOC-collapse before committing to a refactor" +pattern: | + A token-level duplication scanner counts the repeated LOOP SHELL that wraps an + already-shared helper as a clone — even when the substantive logic was extracted + long ago. Before trusting the scan's headline LOC-collapse number, ground-truth + what is ALREADY shared: read the flagged files and the helper module they import. + In OCH's providers, jscpd reported ~1,733 dup lines across the language providers, + but the generic capture-pairing / owner-derivation / enclosing-scope walk was + already centralized in extract-helpers.ts and consumed by all 14 providers. 
The + REAL residual was two byte-identical private helpers (findNameInside, + qualifiedForCapture — 13 verbatim copies) plus the ~20-30 line extractCalls/ + extractDefinitions loop shells that call the shared helpers. Also: the scan's + "most-duplicated file" (ts-shared.ts) was not a duplicate at all — it is the + shared TS-family module, consumed by typescript/tsx/javascript. And one extractor + (extractImports) is irreducibly per-language (hand-rolled regex sets) — forcing a + generic there adds indirection with no dedup win. Rank targets by + (dup-lines-saved ÷ divergence-risk), not by the scanner's raw clone count. +example_files: + - packages/ingestion/src/providers/extract-helpers.ts + - packages/ingestion/src/providers/ts-shared.ts +--- + +# Why this matters + +Acting on the scan's headline (~1.7k lines in providers) without grounding would +have (a) over-promised the collapse, (b) wasted effort trying to genericize +extractImports where every language's regex set is genuinely distinct, and (c) +risked "deduplicating" ts-shared.ts, which is the shared module the scan misread as +the biggest clone. Five parallel Explore agents ground-truthed the scan first; the +actual, safe collapse was the two zero-variance helpers (−247 LOC, near-zero risk) ++ extractCallsGeneric across all 14 providers (−472 LOC). The scan pointed at the +right AREA (providers, not the MCP↔CLI axis it originally emphasized) but its +per-file LOC attribution needed verification against what was already extracted. 
diff --git a/.erpaval/solutions/build-errors/backtick-in-template-literal-embedded-dsl-comment.md b/.erpaval/solutions/build-errors/backtick-in-template-literal-embedded-dsl-comment.md new file mode 100644 index 00000000..5d681933 --- /dev/null +++ b/.erpaval/solutions/build-errors/backtick-in-template-literal-embedded-dsl-comment.md @@ -0,0 +1,46 @@ +--- +title: A backtick inside a template-literal-embedded DSL string terminates the literal (TS1005) +track: bug +category: build-errors +module: packages/ingestion/src/parse +component: tree-sitter query strings (JS template literals) +severity: low +tags: [template-literal, backtick, tsc, TS1005, tree-sitter, query, comment, dsl] +symptoms: + - "tsc -b fails with TS1005 ',' expected on prose lines inside a template literal" + - "the error points at natural-language text that was meant to be a comment inside an embedded DSL string" +root_cause: | + A tree-sitter query (or any DSL) stored as a JS template literal is delimited by + backticks. Writing an explanatory comment INSIDE that query body — using the DSL's + own line-comment syntax (`; ...` for tree-sitter S-expressions) — is fine UNTIL the + comment prose contains a backtick (e.g. a Markdown-style `obj.field` inline-code + span). The backtick closes the JS template literal early; everything after it is + re-parsed as JavaScript, and tsc emits TS1005 (',' expected) on the now-orphaned + prose. The DSL comment marker (`;`) does NOT protect against this — comment-ness is + a property of the DSL parser, but the JS template literal is delimited by the JS + lexer FIRST, before the string ever reaches the DSL. +resolution_type: code-fix +applies_when: + - "writing an explanatory comment inside a DSL stored as a JS/TS template literal" + - "the DSL is delimited by backticks (tree-sitter queries, SQL-in-backticks, GraphQL gql``, etc.)" +--- + +# Fix + +Do not use backticks in prose that lives inside a backtick-delimited template +literal. 
Write obj.field as plain text, without the code-span backticks, +or escape each backtick as `` \` ``, or move the explanation to a JS `//` comment +OUTSIDE the template literal. In OCH this bit when documenting why dart's +`DART_QUERY` intentionally omits `@reference.call`: the comment said "field READs +like `obj.field`" and the backticks around `obj.field` truncated the query template, +breaking `tsc -b` (unified-queries.ts). Fix was to drop the inline-code backticks +("field READs such as obj.field"). + +# Why this matters + +The build gate caught it immediately (TS1005), so it never shipped — but the error +message points at the PROSE, not the backtick, so it reads as a nonsense parse error +until you notice the stray backtick closed the literal. Any embedded-DSL-in-template- +literal comment is exposed: SQL in backticks, GraphQL `` gql`...` ``, tree-sitter query +bodies. Sweep DSL-in-template-literal comments for backticks, or keep such +explanations in host-language comments outside the literal. 
diff --git a/.erpaval/solutions/conventions/dead-guard-branch-reduces-to-noop-strategy-when-unifying.md b/.erpaval/solutions/conventions/dead-guard-branch-reduces-to-noop-strategy-when-unifying.md new file mode 100644 index 00000000..aefd0443 --- /dev/null +++ b/.erpaval/solutions/conventions/dead-guard-branch-reduces-to-noop-strategy-when-unifying.md @@ -0,0 +1,49 @@ +--- +title: Verify a per-variant guard actually filters before preserving it when unifying implementations +track: knowledge +category: conventions +module: packages/ingestion/src/providers +component: extractCalls receiver inference +severity: info +tags: [refactor, dead-code, guard, regex, receiver-inference, unify, behavior-preserving, no-op] +applies_when: + - "collapsing N per-variant implementations into one parameterized generic" + - "a variant carries a guard (regex test, early-return, includes-check) that LOOKS like it differentiates behavior" + - "you must preserve behavior exactly (hash-identity / characterization gate)" +pattern: | + When unifying near-duplicate implementations, a per-variant guard can be DEAD — + present in the source but with no effect on output — and collapsing it to the + simpler shared strategy is then behavior-PRESERVING, not a change. Prove deadness + from the OLD code before folding. Two real cases from the OCH extractCalls unify: + - python's inferPyReceiver and ts-shared's inferTsReceiver each had a regex + guard whose BOTH branches returned the same prefix + (`if (RE.test(prefix)) return prefix; return prefix;`) — the regex never + filtered anything. Folding them to the no-regex strategy (dotPrefixNoRegex) + was exact. The packet had said "defer python/ts unless trivial"; proving the + guard dead made it trivial. + - java's receiver block was gated on `ref.text.includes(".")` before a + `lastIndexOf(".name")` — a proven no-op, because lastIndexOf returns -1 when + there is no ".", so the guard short-circuits nothing. 
java folded into the + plain dot-prefix strategy. + Contrast: do NOT assume all guards are dead. cpp/php use `lastIndexOf(bareName)` + + strip-trailing-separator + a REAL regex that filters; swift/ruby use + `lastIndexOf(".name")` with a REAL regex; rust tries `::` then `.` in order. + Those are genuine behavior and must each be reproduced verbatim (do NOT collapse + the four algorithms into one "unified" regex). The discriminator is: does the + guard change the output for any input the code actually sees? Read both branches; + if they converge, it's dead. +example_files: + - packages/ingestion/src/providers/extract-helpers.ts +--- + +# Why this matters + +The unify agent could have "played it safe" and preserved python/ts/java's guards +as distinct strategies, leaving three near-duplicate factories that don't earn +their existence. By proving the guards dead from the deleted code, it folded all 14 +providers into 4 honest strategy factories with the characterization harness green +at every step. Equally important is the inverse discipline: the same agent did NOT +merge cpp's `->`/`::` strip or rust's separator-preference loop into the dot-prefix +strategy, because those guards DO filter — merging them would have drifted +`calleeOwner` and failed the harness. The rule is symmetric: prove a guard dead +before dropping it, and prove two guards equivalent before merging them. 
diff --git a/.erpaval/solutions/conventions/verify-grammar-reachability-before-covering-a-config-branch.md b/.erpaval/solutions/conventions/verify-grammar-reachability-before-covering-a-config-branch.md new file mode 100644 index 00000000..13b03618 --- /dev/null +++ b/.erpaval/solutions/conventions/verify-grammar-reachability-before-covering-a-config-branch.md @@ -0,0 +1,55 @@ +--- +title: Verify a config/AST branch is grammar-reachable before treating it as a coverage gap +track: knowledge +category: conventions +module: packages/ingestion/src/providers +component: tree-sitter queries / extractor config branches +severity: info +tags: [tree-sitter, grammar, reachability, coverage, characterization, dead-branch, dart, promote-to-method, empirical] +applies_when: + - "a review flags a config branch or AST-handling path as having no test coverage" + - "you are about to add a fixture to exercise a per-language/per-variant branch" + - "working with tree-sitter grammars whose node vocabulary you have not empirically confirmed" +pattern: | + A branch with "no fixture coverage" is not always a coverage gap — sometimes the + branch is UNREACHABLE by the grammar's construction, i.e. defensive/dead code that + no valid input can trigger. Before writing a fixture to cover it, PROVE the branch + is reachable by parsing a candidate input through the actual (WASM) grammar and + inspecting the node types / captures. If no valid source produces the node the + branch keys on, the branch is unreachable-by-construction: document it, do NOT + fabricate invalid source to force it, and do NOT treat its absence from the golden + as a defect. + + Three real cases from OCH (session-6a05ac), all confirmed by live-parse probes: + - dart @reference.call: dart's grammar has NO invocation node (function_expression_ + invocation → "Bad node name"); calls are flat sibling chains, so a sound single- + S-expression call capture is impossible. Left absent + documented. 
+ - promoteToMethod struct/enum (swift), module (ruby), interface (kotlin): the + provider config lists these owner tags, but the grammars never emit a + `definition.function` nested under those container tags (methods in a struct + parse as definition.method, etc.), so the promotion branch for those owners is + dead. Only the class (and dart mixin) branch actually fires. + Conversely, the isExported=false branch WAS reachable for swift/dart/kotlin/cpp/js + (a `_`-prefixed decl fires it) — that one was a genuine gap, and adding a + `_privateHelper` fixture closed it. + + The discriminator: reachability is a property of the grammar + the extractor's + tag vocabulary, not of the current fixture. Probe the grammar, don't guess. +example_files: + - packages/ingestion/src/providers/characterization.test.ts + - packages/ingestion/src/parse/unified-queries.ts +--- + +# Why this matters + +Two adversarial reviewers flagged ~8 "uncovered config branches" after the +extractor-generic refactor. A naive response fabricates a fixture for each. The +correct response, taken here: probe the dart/swift/ruby/kotlin WASM grammars, find +that most of the flagged promotion branches are unreachable-by-construction (dead +defensive code, not gaps), close only the genuinely-reachable ones (isExported=false +via `_`-prefixed decls), and document the unreachable ones so the next reviewer +doesn't re-flag them. Fabricating invalid source to hit a dead branch would add a +misleading fixture and could even mask a real future regression. The same probe +discipline killed the dart-call-capture dead end before it shipped an unsound query. +Grep the grammar; a "gap" that no valid input can reach is a documentation task, not +a fixture task. 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 36d0be40..36729fbc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,8 +39,8 @@ jobs: # Parsing is WASM-only on every supported Node version (ADR 0015), so the # test suite needs no native grammar build — `--ignore-scripts` is the # single install path across the matrix. The remaining native deps - # (@duckdb/node-api, @ladybugdb/core, onnxruntime-node) ship prebuilds, so - # storage/embedder tests pass without running postinstall. + # are gone: the WASM embedder (onnxruntime-web) and node:sqlite need no native build, so + # storage/embedder tests pass without running postinstall (ADR 0019). # # Build before test: every package's `test` runs `node --test` against its # built `dist/` (and the cli compiles `src` → `dist-test/`), so the dist diff --git a/.github/workflows/verify-global-install.yml b/.github/workflows/verify-global-install.yml index 4f375845..e6b93a5e 100644 --- a/.github/workflows/verify-global-install.yml +++ b/.github/workflows/verify-global-install.yml @@ -208,7 +208,7 @@ jobs: TARBALL_DIR: ${{ runner.temp }}/opencodehub-tarballs FIXTURE_DIR: tests/fixtures/multi-lang # Use the script's documented default budget. A cold-cache global - # install of the native prebuilts (ladybug + duckdb + onnxruntime) + # install of a pure-JS + WASM package (no native storage/embedder bindings) # on a loaded shared runner legitimately varies 30-90s, so the old # hardcoded 60s tripped on slow macOS cells despite a clean install # (see scripts/verify-global-install.sh header + issue #163). The diff --git a/AGENTS.md b/AGENTS.md index e86121e0..deab0320 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -13,14 +13,15 @@ tiers. - `context` — inbound/outbound refs and participating flows for one symbol. - `impact` — dependents of a target up to a configurable depth, with a risk tier. - `detect_changes` — map an uncommitted or committed diff to affected symbols. 
-- `sql` — read-only SQL against the local temporal store (the `cochanges` and `symbol_summaries` tables), 5 s timeout. The node/edge graph lives in `graph.lbug` (ADR 0016) and is reached via the typed tools (`query`/`context`/`impact`) or Cypher via the MCP `sql` tool's `cypher` arg — NOT via this SQL path. +- `list_findings` — browse SARIF findings from the latest scan by severity and rule. +- `sql` — read-only SQL against the local temporal store (cochanges + symbol_summaries), 5 s timeout; the node/edge graph is queried via the typed tools or Cypher via the MCP `sql` tool. Run `codehub analyze` after pulling new commits so the index stays aligned with the working tree. `codehub status` reports staleness. ## Full MCP surface -The full MCP surface is **28 tools** (see `packages/mcp/src/server.ts`); +The full MCP surface is **29 tools** (see `packages/mcp/src/server.ts`); the 6 listed above are the high-frequency exploration tools. For the full inventory, use the `/opencodehub-guide` skill. @@ -81,22 +82,23 @@ This repo ships a Claude Code plugin at `plugins/opencodehub/` — it provides a `code-analyst` subagent and 11 skills. Install via `codehub init` (writes `.mcp.json` + links the plugin). -## Storage backend — lbug graph + DuckDB temporal - -The graph tier is always `@ladybugdb/core` (`graph.lbug`); the temporal -tier — cochanges, structured symbol summaries, and the -`codehub query --sql` escape hatch — is always DuckDB -(`temporal.duckdb`). Both files live under `/.codehub/`. There is -no env-var, no probe, no fallback; if the lbug binding fails to load, -`open()` throws `GraphDbBindingError` and the operation aborts. See -ADR 0016 (`docs/adr/0016-duckdb-graph-rip.md`) for the rationale and the -AGE/Memgraph/Neo4j/Neptune community-adapter contract that survives the -rip-out (the segregated `IGraphStore` / `ITemporalStore` interfaces stay -exactly because community-fork adapters are a deliberate escape hatch). 
- -`IGraphStore` lives only on `GraphDbStore`; `DuckDbStore` implements -`ITemporalStore` only. Embeddings live in `graph.lbug` and stream into a -per-call DuckDB temp table at pack time so the byte-identical Parquet -sidecar still works (see `packages/pack/src/embeddings-sidecar.ts`). -Future temporal swap (e.g. SQLite-WASM) only needs a new `ITemporalStore` -implementor — no graph-tier change. +## Storage backend — single-file SQLite (ADR 0019) + +The entire index lives in ONE `<repo>/.codehub/store.sqlite` file (WAL), +via Node's built-in `node:sqlite` — graph nodes, edges, embeddings, and +the temporal tables (cochanges, structured symbol summaries, and the +`codehub query --sql` escape hatch). One `SqliteStore` class implements +BOTH `IGraphStore` and `ITemporalStore`; `openStore()` returns that single +instance as both the `graph` and `temporal` views, so call sites use +`store.graph.X()` / `store.temporal.Y()` unchanged. There are zero native +storage bindings — `@ladybugdb/core` and `@duckdb/node-api` were both +removed. See ADR 0019 (`docs/adr/0019-single-file-sqlite-storage.md`) for +the rationale; it supersedes ADR 0016 +(`docs/adr/0016-duckdb-graph-rip.md`). + +The segregated `IGraphStore` / `ITemporalStore` interfaces remain as the +community-fork escape hatch: an AGE / Memgraph / Neo4j / Neptune adapter +implements `IGraphStore` and pairs with any SQL-shaped `ITemporalStore`. +Embeddings live in the `embeddings` table inside `store.sqlite` — the +write-only Parquet sidecar was dropped with DuckDB (nothing ever read it +back). diff --git a/CLAUDE.md b/CLAUDE.md index 07c4a795..67042268 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,14 +10,15 @@ tiers. - `context` — inbound/outbound refs and participating flows for one symbol. - `impact` — dependents of a target up to a configurable depth, with a risk tier. - `detect_changes` — map an uncommitted or committed diff to affected symbols. 
-- `sql` — read-only SQL against the single-file `store.sqlite` index (ADR 0019). `nodes`, `edges`, `embeddings`, `cochanges`, `symbol_summaries`, and `store_meta` are all directly SQL-queryable (e.g. `SELECT id, name FROM nodes WHERE kind = 'Function'`; reach kind-specific fields via SQLite JSON1, `payload->>'$.field'`). 5 s timeout. The typed tools (`query`/`context`/`impact`) remain the high-level path; the `cypher` arg is reserved for community-fork graph adapters and is not supported by the default backend. +- `list_findings` — browse SARIF findings from the latest scan by severity and rule. +- `sql` — read-only SQL against the local temporal store (cochanges + symbol_summaries), 5 s timeout; the node/edge graph is queried via the typed tools or Cypher via the MCP `sql` tool. Run `codehub analyze` after pulling new commits so the index stays aligned with the working tree. `codehub status` reports staleness. ## Full MCP surface -The full MCP surface is **28 tools** (see `packages/mcp/src/server.ts`); +The full MCP surface is **29 tools** (see `packages/mcp/src/server.ts`); the 6 listed above are the high-frequency exploration tools. For the full inventory, use the `/codehub-guide` skill. diff --git a/OBJECTIVES.md b/OBJECTIVES.md index f6784b00..f59a400c 100644 --- a/OBJECTIVES.md +++ b/OBJECTIVES.md @@ -52,9 +52,10 @@ quality bar sits, and what is deliberately out of scope. ## Non-goals -8. **Do not operate a server or SaaS.** DuckDB is embedded. The MCP server is - a stdio process. ADR 0001 rejects any engine that would need a daemon. The - product ships as a CLI plus an MCP server, nothing hosted. +8. **Do not operate a server or SaaS.** The index is an embedded single-file + SQLite store (ADR 0019). The MCP server is a stdio process. ADR 0001 + rejects any engine that would need a daemon. The product ships as a CLI + plus an MCP server, nothing hosted. 9. 
**Do not port to Rust before it is needed.** ADR 0002 measured p95 single-file incremental analysis at 195-250ms on the 100-file fixture, well diff --git a/README.md b/README.md index 05e19d49..a032b61a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ npm install -g @opencodehub/cli cd /path/to/your/repo codehub init && codehub analyze -# your agent now has impact, query, context, detect_changes — 28 tools over MCP +# your agent now has impact, query, context, detect_changes — 29 tools over MCP ``` ## Why this exists @@ -63,11 +63,11 @@ ten round-trips. flowchart LR A[Source tree] -->|tree-sitter parse| B[Symbol graph] B -->|resolve imports / MRO| C[Typed relations] - C -->|BM25 + HNSW index| D[Hybrid graph store] + C -->|BM25 + vector KNN| D[Hybrid graph store] C -->|detect communities + flows| E[Processes / clusters] D --> F[MCP server] E --> F - F -->|28 tools| G[AI coding agent] + F -->|29 tools| G[AI coding agent] ``` ## Design choices worth knowing @@ -79,7 +79,7 @@ flowchart LR | **Deterministic indexing** | Identical inputs produce a byte-identical graph hash. Reproducible. Auditable. Cacheable in CI. | | **First-party source only** | `analyze` honors the repo's `.gitignore` (nested files included) and always skips dependency installs, virtualenvs, build output, and tool caches — `node_modules`, `.venv`/`venv`, `__pycache__`, `dist`/`build`/`target`, `.next`/`.nuxt`/`.turbo`, `.mypy_cache`/`.pytest_cache`/`.ruff_cache`, `coverage`, and similar. Exclusion is decided once at scan time (`HARDCODED_IGNORES` in `packages/ingestion/src/pipeline/gitignore.ts`), so every retrieval surface — `query`, `context`, `impact`, `sql`, `pack` — inherits it. Ambiguous names that are often real source (`vendor`, `env`, `out`, `bin`) are left to your `.gitignore`, which supports `!`-negation a hardcoded rule can't. | | **MCP-native** | Works out-of-the-box with Claude Code, Cursor, Codex, Windsurf, OpenCode. 
The MCP server is the primary interface; CLI exists for scripts and CI. | -| **Single-file embedded storage** | One `store.sqlite` file holds everything — symbols, edges, embeddings, BM25 (FTS5) + HNSW traversal, and the temporal views (cochanges, summaries) — via Node's built-in `node:sqlite`. No daemon, no database to operate, and **zero native storage bindings** (ADR 0019 removed both `@ladybugdb/core` and `@duckdb/node-api`). | +| **Single-file embedded storage** | One `store.sqlite` file holds everything — symbols, edges, embeddings, BM25 (FTS5) + brute-force vector KNN, and the temporal views (cochanges, summaries) — via Node's built-in `node:sqlite`. No daemon, no database to operate, and **zero native storage bindings** (ADR 0019 removed both `@ladybugdb/core` and `@duckdb/node-api`). | | **15 languages at GA** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, C, C++, Ruby, Kotlin, Swift, PHP, Dart, COBOL — tree-sitter for the first 14 plus a regex provider for fixed-format COBOL. | | **WASM-only parse runtime** | `web-tree-sitter` WASM is the only parse runtime. The 15 grammar `.wasm` blobs are vendored at `packages/ingestion/vendor/wasms/`, so parsing does **zero grammar/native builds and zero GitHub fetches** at install time — there is no native parser opt-in. Storage is pure `node:sqlite`; the only optional native dep is the local embedder (see Platform support). | @@ -149,7 +149,7 @@ pnpm run check # lint + typecheck + test + banned-strings mise run cli:link # puts `codehub` on your PATH ``` -## MCP tool surface (28 tools) +## MCP tool surface (29 tools) | Tool | Purpose | |---|---| @@ -170,7 +170,7 @@ skills + a code-analyst subagent — install via `codehub init`. 
## Repository layout -The monorepo is organised as 18 workspace packages under `packages/`: +The monorepo is organised as 19 workspace packages under `packages/`: | Package | Purpose | |---|---| @@ -178,24 +178,26 @@ The monorepo is organised as 18 workspace packages under `packages/`: | `cli` | `codehub` command — `init`, `analyze`, `status`, `setup`, scanners, group federation | | `cobol-proleap` | ProLeap-backed deep-parse path for free-format COBOL (regex provider handles fixed-format) | | `core-types` | Shared TypeScript types, Zod schemas, error codes, canonical `LanguageId` and node/edge kinds | +| `docs` | This Starlight documentation site (private workspace package) | | `embedder` | Embedding backends — local ONNX, HTTP, SageMaker; deterministic `embedderId` fingerprint | +| `eval` | Retrieval / graph-quality evaluation harness (private, test-time only) | | `frameworks` | HTTP route + MCP tool detectors used by `route_map` / `api_impact` / `tool_map` | | `ingestion` | Tree-sitter + WASM parsers, symbol extraction, import resolution, complexity phase | -| `mcp` | Model Context Protocol server — 28 tools, resources, structured error envelopes | +| `mcp` | Model Context Protocol server — 29 tools, resources, structured error envelopes | | `pack` | Deterministic Repomix-compatible code-pack generator (M5) | | `policy` | Allowlist + license-tier policy engine driving `license_audit` and CI gates | | `sarif` | SARIF schema validation and scanner output normalisation | | `scanners` | Subprocess wrappers for 19 scanners — OSV, Semgrep, hadolint, tflint, betterleaks, and the rest | | `scip-ingest` | SCIP indexer runners (TS, Python, Go, Rust, Java) — emits CALLS, REFERENCES, IMPLEMENTS, TYPE_OF | -| `search` | Hybrid BM25 + HNSW (ACORN-1 + RaBitQ) query layer | +| `search` | Hybrid BM25 (FTS5) + vector query layer over `store.sqlite` | | `storage` | One `SqliteStore` (`node:sqlite`) implementing both `IGraphStore` + `ITemporalStore` over a single `store.sqlite`; 
deterministic `graphHash` | | `summarizer` | Process + cluster summaries for MCP responses | | `wiki` | LLM-narrated module pages emitted by `codehub wiki --llm` | -The retrieval / graph-quality evaluation harness and the per-language F1 -regression gym used to live here as `eval` and `gym`; they were +The per-language F1 regression gym used to live here as `gym`; it was extracted into the sibling `opencodehub-testbed` repository so the -production package set ships free of test-time dependencies. +production package set ships free of that test-time dependency. The +`eval` harness stays in-tree as a private, test-time-only package. ## Embedding backends @@ -281,8 +283,8 @@ superseded. `IGraphStore` / `ITemporalStore` interface segregation), B (19-scanner fleet incl. betterleaks), C (debt sweep — embedder fingerprint, SCIP REFERENCES + TYPE_OF), and D (dogfood polish) have all merged. The -published package is `@opencodehub/cli` (currently `0.7.0`; the monorepo -root tracks `0.8.0`); `1.0.0` is cut once schema + tool-surface stability +published package is `@opencodehub/cli` (currently `0.10.6`; the monorepo +root tracks `0.10.6`); `1.0.0` is cut once schema + tool-surface stability is signed off. While on `0.x`, **any release may contain breaking changes** to the diff --git a/SPECS.md b/SPECS.md index bbba8926..e3ccac7b 100644 --- a/SPECS.md +++ b/SPECS.md @@ -4,9 +4,9 @@ OpenCodeHub is an Apache-2.0, local-first code-intelligence toolchain for AI coding agents. 
It ingests a source tree into a hybrid knowledge graph -(structural relations + semantic vectors) stored as a two-tier split — an -lbug graph (`@ladybugdb/core`, `graph.lbug`) plus a DuckDB temporal sibling -(`temporal.duckdb`), both under `/.codehub/` (ADR 0016) — and exposes +(structural relations + semantic vectors) stored in a single-file SQLite +index (`store.sqlite`, WAL, via Node's built-in `node:sqlite`) under +`/.codehub/` (ADR 0019, superseding ADR 0016) — and exposes that graph over the Model Context Protocol and a `codehub` CLI. Agents use it to answer "what breaks if I change this, what depends on it, where does this data flow" *before* they produce a diff. @@ -20,7 +20,7 @@ Communities and Processes, and optionally populates embeddings from a pinned F2LLM-v2-80M ONNX model (320-dim; fp32 ~321 MB or int8 ~81 MB) or an OpenAI-compatible HTTP endpoint. -At query time it exposes an MCP server with 28 tools (`query`, `context`, +At query time it exposes an MCP server with 29 tools (`query`, `context`, `impact`, `signature`, `detect_changes`, `sql`, scanner / finding / dependency / verdict / route tools, and cross-repo `group_*` tools), along with a CLI that mirrors the main tools plus administrative @@ -32,9 +32,8 @@ working tree. - Not a language server. It runs SCIP indexers as one-shot artifact producers and does not speak LSP to editors directly. -- Not a SaaS. There is no server to operate; the graph lives as two - embedded files under `/.codehub/` (the lbug `graph.lbug` plus the - DuckDB `temporal.duckdb`). +- Not a SaaS. There is no server to operate; the graph lives as one + embedded file under `/.codehub/` (`store.sqlite`). - Not a hosted vector DB. Embeddings are optional and local; there is no network dependency for analyze or query. - Not a ranking / recommendation product. The graph is precomputed at index @@ -130,35 +129,38 @@ indexers agree on `package{manager,name,version}`. ## 3. 
Storage & schema -3.1 The system shall persist the graph tier to an lbug graph file -(`graph.lbug`, `@ladybugdb/core`) and the temporal tier — cochanges and -structured symbol summaries — to a DuckDB file (`temporal.duckdb`), both -under `/.codehub/`. Both files are written on every analyze; there is -no `CODEHUB_STORE` env var, no backend probe, and no single-file DuckDB -graph layout (ADR 0016). - -3.2 The storage layer shall segregate `IGraphStore` (graph workload: nodes, -edges, embeddings, multi-hop traversal) from `ITemporalStore` (cochanges, -summary cache). `IGraphStore` lives only on `GraphDbStore`; `DuckDbStore` -implements `ITemporalStore` only; `openStore()` composes them. The -segregated interfaces are the v1.0 contract for community-fork adapters -(AGE / Memgraph / Neo4j / Neptune target `IGraphStore`). If the lbug -binding fails to load, `open()` throws `GraphDbBindingError`. +3.1 The system shall persist the entire index — graph nodes, edges, +embeddings, and the temporal tables (cochanges and structured symbol +summaries) — to a single `store.sqlite` file (WAL, via `node:sqlite`) under +`/.codehub/`. The file is written on every analyze; there is no +`CODEHUB_STORE` env var, no backend probe, and no separate graph/temporal +file split (ADR 0019, superseding ADR 0016). + +3.2 The storage layer shall retain the segregated `IGraphStore` (graph +workload: nodes, edges, embeddings, multi-hop traversal) and `ITemporalStore` +(cochanges, summary cache) interfaces. One `SqliteStore` class implements +BOTH over the single file, and `openStore()` returns that instance as both +views. The segregated interfaces are the v1.0 contract for community-fork +adapters (AGE / Memgraph / Neo4j / Neptune implement `IGraphStore` and pair +with any SQL-shaped `ITemporalStore`). There is no native storage binding to +load, so `open()` cannot fail on a missing binding. 
3.3 While executing the `sql` MCP tool or `codehub sql` CLI, the system shall reject non-read-only statements and apply a 5-second default timeout. -The `sql` path targets the DuckDB temporal store (`cochanges` + -`symbol_summaries`); the node/edge graph is queried via the typed tools or -via Cypher (the `sql` tool's `cypher` argument), not this SQL path. +The `sql` path targets the SQLite index directly (`nodes`, `edges`, +`embeddings`, `cochanges`, `symbol_summaries`, `store_meta` are all +SQL-queryable); the typed tools remain the high-level path. The `cypher` +argument is reserved for community-fork graph adapters and is unsupported by +the default backend. -3.4 The vector search path shall use the lbug graph's filter-aware -nearest-neighbour traversal when embeddings are populated. +3.4 The vector search path shall use filter-aware nearest-neighbour search +over the `embeddings` table when embeddings are populated. -3.5 The full-text search path shall use BM25 scoring over the indexed -symbols. +3.5 The full-text search path shall use BM25 scoring (SQLite FTS5) over the +indexed symbols. -3.6 Multi-hop graph traversal shall be expressed in the lbug graph's Cypher -dialect rather than recursive SQL CTEs. +3.6 Multi-hop graph traversal shall be expressed as recursive SQL CTEs over +the `edges` table. 3.7 The storage layer shall write metadata (schema version, graph hash, last-analyzed commit) atomically and expose it via `getMeta`. @@ -215,12 +217,12 @@ merge-safe tiers, 1 for review-required tiers, and 2 for `block`. 6.1 The MCP server shall advertise itself as `opencodehub` over stdio with an `instructions` block steering clients to call `list_repos` first. 
-6.2 The server shall register 28 tools: `list_repos`, `query`, `context`, +6.2 The server shall register 29 tools: `list_repos`, `query`, `context`, `impact`, `signature`, `detect_changes`, `sql`, `group_list`, `group_query`, `group_status`, `group_contracts`, `group_cross_repo_links`, `group_sync`, `project_profile`, `dependencies`, `license_audit`, `owners`, `list_findings`, `list_findings_delta`, `list_dead_code`, -`scan`, `verdict`, `risk_trends`, `route_map`, +`scan`, `verdict`, `change_pack`, `risk_trends`, `route_map`, `api_impact`, `shape_check`, `tool_map`, and `pack_codebase`. No registered tool mutates user source files; the MCP surface is read-only with respect to the working tree. @@ -258,7 +260,8 @@ shall reject it with `SqlGuardError`. and `sql`. 7.2 The CLI shall lazy-load every subcommand via `await import(...)` so -`codehub --help` does not transitively load DuckDB or tree-sitter. +`codehub --help` does not transitively load the WASM parser or the embedder +runtime. 7.3 The `setup` command shall write MCP configuration stanzas for claude-code, cursor, codex, windsurf, and opencode; pass `--undo` to diff --git a/commitlint.config.mjs b/commitlint.config.mjs index 29dbda1b..037c9636 100644 --- a/commitlint.config.mjs +++ b/commitlint.config.mjs @@ -40,6 +40,7 @@ export default { "analysis", "cli", "cobol-proleap", + "core-ops", "core-types", "embedder", "eval", diff --git a/mise.toml b/mise.toml index 65344279..b8cba988 100644 --- a/mise.toml +++ b/mise.toml @@ -3,7 +3,7 @@ node = "24" pnpm = "11.1.0" python = "3.12" uv = "latest" -"npm:node-gyp" = "latest" # fallback native build for @duckdb/node-api / onnxruntime-node when a platform prebuild is missing (parsing is WASM-only — ADR 0015) +"npm:node-gyp" = "latest" # defensive fallback: OCH's own deps are pure JS + WASM (no native storage binding after ADR 0019; parsing is WASM-only per ADR 0015), but a transitive optional native dep could still need a source build on a platform with no prebuild 
"aqua:betterleaks/betterleaks" = "1.2.0" # secret scanner — used by analyze + pre-release gate lefthook = "2.1.8" # git hooks — must satisfy lefthook.yml min_version (2.1.6); matches root devDep so a stale global mise install can't shadow it diff --git a/packages/analysis/src/test-utils.ts b/packages/analysis/src/test-utils.ts index da1ee086..fd696940 100644 --- a/packages/analysis/src/test-utils.ts +++ b/packages/analysis/src/test-utils.ts @@ -132,7 +132,7 @@ function sortNodesById(nodes: readonly FakeNode[]): FakeNode[] { /** * Sort edges by `(from, to, type)` so callers see the same order as - * `listEdges` returns from DuckDb/GraphDb. + * `listEdges` returns from the store. */ function sortEdges(edges: readonly FakeEdge[]): FakeEdge[] { return [...edges].sort((a, b) => { diff --git a/packages/cli/README.md b/packages/cli/README.md index d28098f0..d83a224f 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -69,7 +69,7 @@ top-level subcommands by phase of the workflow. ## Design - **Lazy loading** — each `.action()` does `await import(...)` so cold - startup is bounded by Commander, not DuckDB or the parse pool + startup is bounded by Commander, not the store or the parse pool (`packages/cli/src/index.ts:78-81`). - **No stateful daemon** — `analyze` runs to completion and exits; `mcp` is the only long-running process. @@ -81,13 +81,12 @@ top-level subcommands by phase of the workflow. - **`mcp` is launched, never embedded** — agents that need the MCP surface spawn `codehub mcp` over stdio (`packages/cli/src/commands/mcp.ts`). -See ADR 0016 for the lbug-graph + DuckDB-temporal storage layout and the +See ADR 0019 for the single-file `store.sqlite` storage layout and the root README's "MCP tool surface" section for the agent-facing tool inventory. -The bundled graph backend is `@ladybugdb/core` **0.17.1** or newer. 
0.17.0 -changed empty-`STRING[]` serialization (empty lists now round-trip as a typed -empty array rather than collapsing to NULL); the adapter decodes a bare empty -array as an absent field on every supported lbug version, so `graphHash` -byte-identity — and the Parquet embeddings sidecar `packHash` that depends on -it — is preserved across the upgrade. +Storage is one `store.sqlite` file (WAL) via Node's built-in `node:sqlite`, +with zero native bindings (ADR 0019). Empty `keywords: []` round-trips as a +typed empty array distinct from an absent field — stored in the node's JSON +`payload` column — so `graphHash` byte-identity is preserved. Embeddings live +in the `embeddings` table (no Parquet sidecar). diff --git a/packages/cli/package.json b/packages/cli/package.json index 269db729..e640c349 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -1,7 +1,7 @@ { "name": "@opencodehub/cli", "version": "0.10.7", - "description": "OpenCodeHub — codehub CLI (analyze, setup, mcp, list, status, clean, query, context, impact, sql)", + "description": "OpenCodeHub \u2014 codehub CLI (analyze, setup, mcp, list, status, clean, query, context, impact, sql)", "license": "Apache-2.0", "repository": { "type": "git", @@ -39,7 +39,7 @@ "test": "pnpm run build:test && node --test \"./dist-test/**/*.test.js\"", "clean": "rm -rf dist dist-test *.tsbuildinfo" }, - "//deps": "The 15 @opencodehub/* workspace libs are INLINED into the bundle at build time (tsup noExternal) — they are devDependencies, not runtime deps. `dependencies` below is exactly the third-party set the bundle imports at runtime (kept `external`), plus the two @sourcegraph/scip-* indexers the parse pipeline spawns as subprocesses. 
onnxruntime-web (prebuilt WASM, no native binding) is optional (lazy-loaded only when embeddings are enabled).", + "//deps": "The 15 @opencodehub/* workspace libs are INLINED into the bundle at build time (tsup noExternal) \u2014 they are devDependencies, not runtime deps. `dependencies` below is exactly the third-party set the bundle imports at runtime (kept `external`), plus the two @sourcegraph/scip-* indexers the parse pipeline spawns as subprocesses. onnxruntime-web (prebuilt WASM, no native binding) is optional (lazy-loaded only when embeddings are enabled).", "dependencies": { "@apidevtools/swagger-parser": "12.1.0", "@aws-sdk/client-bedrock-runtime": "3.1076.0", @@ -68,6 +68,7 @@ }, "devDependencies": { "@opencodehub/analysis": "workspace:*", + "@opencodehub/core-ops": "workspace:*", "@opencodehub/core-types": "workspace:*", "@opencodehub/embedder": "workspace:*", "@opencodehub/eval": "workspace:*", diff --git a/packages/cli/src/commands/analyze-carry-forward.test.ts b/packages/cli/src/commands/analyze-carry-forward.test.ts index cd2657d8..163179b4 100644 --- a/packages/cli/src/commands/analyze-carry-forward.test.ts +++ b/packages/cli/src/commands/analyze-carry-forward.test.ts @@ -3,10 +3,10 @@ * {@link loadPreviousGraph}. * * What this exercises: - * - After a prior DuckDB index + scan-state.json are on disk, + * - After a prior index (`store.sqlite`) + scan-state.json are on disk, * `loadPreviousGraph` returns a {@link pipeline.PreviousGraph} whose * `nodes` AND `edges` fields are populated (non-empty, round-tripped - * through the `rowToGraphNode` / `rowToCodeRelation` mappers). + * through the store's typed `listNodes` / `listEdges` finders). * - That shape is the exact precondition `resolveIncrementalView` * (`packages/ingestion/src/pipeline/phases/incremental-helper.ts:95-102`) * checks before it flips `active=true`. A `PreviousGraph` satisfying @@ -15,7 +15,7 @@ * run their carry-forward codepath. 
* - The negative case (missing DB) still returns `undefined`. * - * The test builds its own DuckDB from scratch via a synthetic + * The test builds its own `store.sqlite` from scratch via a synthetic * `KnowledgeGraph` rather than running the full `runIngestion` pipeline — * keeps the test fast (no tree-sitter / SCIP invocations) and isolates the * storage ↔ `loadPreviousGraph` round-trip being exercised. @@ -190,7 +190,7 @@ async function seedPriorIndex(repoPath: string): Promise<{ return { nodeCount: graph.nodeCount(), edgeCount: graph.edgeCount() }; } -test("loadPreviousGraph: returns full nodes + edges from a seeded DuckDB", async () => { +test("loadPreviousGraph: returns full nodes + edges from a seeded store", async () => { const repoPath = await mkdtemp(join(tmpdir(), "och-carry-forward-")); const seeded = await seedPriorIndex(repoPath); diff --git a/packages/cli/src/commands/analyze-findings-survival.test.ts b/packages/cli/src/commands/analyze-findings-survival.test.ts new file mode 100644 index 00000000..33927d98 --- /dev/null +++ b/packages/cli/src/commands/analyze-findings-survival.test.ts @@ -0,0 +1,171 @@ +/** + * Regression test for the incremental-analyze findings-wipe bug. + * + * `runAnalyze` rebuilds the graph with a replace-mode `bulkLoad` (ADR 0019), + * which truncates EVERY node — including the `Finding` nodes and `FOUND_IN` + * edges that a prior `codehub scan` ingested. On the scan-skip fast-path + * (fingerprint match + `scan.sarif` present) the scanners do NOT re-run, so + * before the fix nothing re-populated those findings and the freshly-rebuilt + * graph reported zero findings — `list_findings`, `list_findings_delta`, and + * `verdict` all silently saw a clean scan. + * + * The fix re-ingests the cached `scan.sarif` on the skip path. `runIngestSarif` + * is idempotent (fingerprint-stable enrichment + upsert-mode bulkLoad), so it + * restores exactly the findings the wipe removed. 
+ * + * These tests exercise the store-level composition directly — seed findings, + * simulate the replace-mode graph wipe, then run the skip-path re-ingest — so + * the regression is caught without driving a full git+scanner analyze run + * (which the determinism suite never exercised, which is why the bug shipped). + */ + +import assert from "node:assert/strict"; +import { mkdir, mkdtemp, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "node:test"; +import { KnowledgeGraph } from "@opencodehub/core-types"; +import type { SarifLog } from "@opencodehub/sarif"; +import { openStore, resolveGraphPath, resolveRepoMetaDir } from "@opencodehub/storage"; +import { runIngestSarif } from "./ingest-sarif.js"; + +/** A SARIF log with two findings on two files — the cached `scan.sarif`. */ +function scanLog(): SarifLog { + return { + version: "2.1.0", + runs: [ + { + tool: { driver: { name: "semgrep", version: "1.0.0" } }, + results: [ + { + ruleId: "r.xss", + message: { text: "XSS risk" }, + locations: [ + { + physicalLocation: { + artifactLocation: { uri: "web/a.ts" }, + region: { startLine: 10 }, + }, + }, + ], + partialFingerprints: { "opencodehub/v1": "a".repeat(32) }, + }, + { + ruleId: "r.sqli", + message: { text: "SQLi risk" }, + locations: [ + { + physicalLocation: { + artifactLocation: { uri: "api/b.ts" }, + region: { startLine: 20 }, + }, + }, + ], + partialFingerprints: { "opencodehub/v1": "b".repeat(32) }, + }, + ], + }, + ], + }; +} + +async function countFindings(repoPath: string): Promise<number> { + const store = await openStore({ path: resolveGraphPath(repoPath) }); + try { + await store.graph.open(); + let n = 0; + for (const node of await store.graph.listNodes()) { + if (node.kind === "Finding") n += 1; + } + return n; + } finally { + await store.close(); + } +} + +/** + * Write the cached `scan.sarif` and seed its findings into the graph the way + * a prior `codehub scan` would have 
(via the same idempotent ingest path). + * Returns the sarif path so the test can re-ingest it on the skip branch. + */ +async function seedRepoWithFindings(repoPath: string): Promise<string> { + await mkdir(resolveRepoMetaDir(repoPath), { recursive: true }); + const sarifPath = join(resolveRepoMetaDir(repoPath), "scan.sarif"); + await writeFile(sarifPath, `${JSON.stringify(scanLog(), null, 2)}\n`, "utf8"); + await runIngestSarif(sarifPath, { repo: repoPath }); + return sarifPath; +} + +/** + * Reproduce the replace-mode graph rebuild that `runAnalyze` performs: a + * `bulkLoad` in the default replace mode truncates all nodes/edges. This is + * the step that wipes the seeded Finding nodes. + */ +async function simulateGraphRebuildWipe(repoPath: string): Promise<void> { + const store = await openStore({ path: resolveGraphPath(repoPath) }); + try { + await store.graph.open(); + await store.temporal.open(); + await store.graph.createSchema(); + // Empty graph in replace mode == the "rebuilt from the pipeline" graph + // that carries no Finding nodes (findings come only from the scan step). + await store.graph.bulkLoad(new KnowledgeGraph()); + } finally { + await store.close(); + } +} + +test("scan-skip path: replace-mode graph rebuild wipes seeded Finding nodes", async () => { + const repoPath = await mkdtemp(join(tmpdir(), "och-findings-wipe-")); + await seedRepoWithFindings(repoPath); + assert.equal(await countFindings(repoPath), 2, "seed should ingest two findings"); + + await simulateGraphRebuildWipe(repoPath); + + // This asserts the BUG precondition: after the replace-mode rebuild the + // findings are gone. If a future change makes the rebuild preserve findings + // this assertion flips and the guard below becomes redundant — update both. 
+ assert.equal( + await countFindings(repoPath), + 0, + "replace-mode bulkLoad must truncate the prior Finding nodes", + ); +}); + +test("scan-skip path: re-ingesting the cached SARIF restores findings after the wipe", async () => { + const repoPath = await mkdtemp(join(tmpdir(), "och-findings-restore-")); + const sarifPath = await seedRepoWithFindings(repoPath); + + // Graph rebuild wipes the findings... + await simulateGraphRebuildWipe(repoPath); + assert.equal(await countFindings(repoPath), 0); + + // ...and the fix re-ingests the reused scan.sarif on the fingerprint-match + // skip branch (exactly what analyze.ts now does instead of only logging). + const ingested = await runIngestSarif(sarifPath, { repo: repoPath }); + + assert.equal(ingested.findingsEmitted, 2, "re-ingest must emit both cached findings"); + assert.equal( + await countFindings(repoPath), + 2, + "findings must survive an incremental re-analyze that skips the scanners", + ); +}); + +test("scan-skip re-ingest is idempotent — no duplicate Finding nodes", async () => { + const repoPath = await mkdtemp(join(tmpdir(), "och-findings-idem-")); + const sarifPath = await seedRepoWithFindings(repoPath); + await simulateGraphRebuildWipe(repoPath); + + // Two consecutive skip-path re-ingests (two incremental analyze runs) must + // not double-count — the ingest bulkLoad runs in upsert mode keyed on the + // finding fingerprint. 
+ await runIngestSarif(sarifPath, { repo: repoPath }); + await runIngestSarif(sarifPath, { repo: repoPath }); + + assert.equal( + await countFindings(repoPath), + 2, + "repeated skip-path re-ingests must stay at two findings (idempotent upsert)", + ); +}); diff --git a/packages/cli/src/commands/analyze.test.ts b/packages/cli/src/commands/analyze.test.ts index 98226567..7c52612f 100644 --- a/packages/cli/src/commands/analyze.test.ts +++ b/packages/cli/src/commands/analyze.test.ts @@ -101,7 +101,7 @@ test("resolveMaxSummariesCap: auto clamps at the 500 cap for large repos", async }); test("resolveMaxSummariesCap: auto falls back to 50 on first run (no prior seed)", async () => { - // `undefined` models "no prior DuckDB store at the expected path". + // `undefined` models "no prior store at the expected path". const cap = await resolveMaxSummariesCap("/unused", "auto", true, async () => undefined); assert.equal(cap, 50); }); diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index e930aedb..669e84ee 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -20,17 +20,7 @@ import { spawn } from "node:child_process"; import { mkdir } from "node:fs/promises"; import { basename, join, resolve } from "node:path"; -import { - type CodeRelation, - type EdgeId, - type GraphNode, - NODE_KINDS, - type NodeId, - type NodeKind, - RELATION_TYPES, - type RelationType, - SCHEMA_VERSION, -} from "@opencodehub/core-types"; +import { type CodeRelation, type GraphNode, SCHEMA_VERSION } from "@opencodehub/core-types"; import { embedderModelId } from "@opencodehub/embedder"; import { pipeline } from "@opencodehub/ingestion"; import { @@ -47,7 +37,6 @@ import { type RepoEntry, readRegistry, upsertRegistry } from "../registry.js"; import { generateSkills } from "../skills-gen.js"; import { computeScanFingerprint, - countSarifFindings, readScanFingerprint, shouldSkipScan, writeScanFingerprint, @@ -556,9 +545,25 @@ 
export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi sarifExists, }) ) { - const priorCount = await countSarifFindings(sarifPath); - const countNote = priorCount !== undefined ? `, reusing ${priorCount} finding(s)` : ""; - log(`codehub analyze: scan — up to date (fingerprint match)${countNote}`); + // The graph bulkLoad above ran in replace mode (ADR 0019), which + // truncated every node — including the `Finding` nodes and + // `FOUND_IN` edges from the prior scan. When we skip re-running the + // scanners we still MUST re-ingest the reused `scan.sarif`, or the + // freshly-rebuilt graph ends up with zero findings and + // `list_findings`/`verdict`/`list_findings_delta` silently report a + // clean scan. `runIngestSarif` is idempotent (fingerprint-stable + // enrichment + upsert-mode bulkLoad), so re-ingesting the unchanged + // SARIF restores exactly the findings the wipe removed. + const { runIngestSarif } = await import("./ingest-sarif.js"); + const ingestOpts: { repo: string; home?: string } = { + repo: repoName, + ...(opts.home !== undefined ? { home: opts.home } : {}), + }; + const ingested = await runIngestSarif(sarifPath, ingestOpts); + log( + `codehub analyze: scan — up to date (fingerprint match), ` + + `re-ingested ${ingested.findingsEmitted} finding(s) from cached SARIF`, + ); } else { await runScanAndLog(); // Refresh the sidecar only after a successful scan so a thrown @@ -598,9 +603,9 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi * `.codehub/scan-state.json` (written at the tail of the prior run), * - IMPORTS + EXTENDS + IMPLEMENTS edges recovered from the `relations` * table by stripping each endpoint id back to its enclosing file path, - * - the FULL prior node and edge snapshot, mapped back into - * {@link GraphNode} / {@link CodeRelation} via {@link rowToGraphNode} - * and {@link rowToCodeRelation}. 
Shipping these two arrays is what + * - the FULL prior node and edge snapshot as {@link GraphNode} / + * {@link CodeRelation} arrays (via the store's typed `listNodes` / + * `listEdges` finders). Shipping these two arrays is what * flips `resolveIncrementalView` * (`packages/ingestion/src/pipeline/phases/incremental-helper.ts:95-102`) * from `active=false` (passive mode) to `active=true`, so the four @@ -629,10 +634,7 @@ export async function loadPreviousGraph( // Full node + edge dumps via typed finders. For a typical OCH repo // this is 10K-50K nodes and 20K-100K edges — fits in memory in one // shot. The `listNodes` / `listEdges` finders already return - // rehydrated `GraphNode` / `CodeRelation` objects, so the legacy - // `rowToGraphNode` / `rowToCodeRelation` adapters are no longer - // needed on this read path — they remain exported for external - // consumers that hand-roll over the wide-column shape. + // rehydrated `GraphNode` / `CodeRelation` objects. const nodes = [...(await store.graph.listNodes())]; const edges = [...(await store.graph.listEdges())]; // Derive the legacy file-granular projections from the full edge set so @@ -985,12 +987,11 @@ async function openEmbeddingHashCacheAdapter( adapter: { // listEmbeddingHashes is on the graph-tier interface — embeddings // travel with the graph view, not the temporal cochange table. - // Wrapped in try/catch: on a freshly-created lbug db that has no - // schema yet, the Cypher query inside listEmbeddingHashes() can - // throw "Cannot create an empty database under READ ONLY mode" - // because lbug defers some internal initialization until first - // query. Returning an empty map matches the interface contract - // ("Empty map on a fresh database or any error"). + // Wrapped in try/catch: querying a freshly-created store that has no + // schema yet (or a read-only handle on a not-yet-initialized file) can + // throw before the embeddings table exists. 
Returning an empty map + // matches the interface contract ("Empty map on a fresh database or + // any error"). list: async () => { try { return await store.graph.listEmbeddingHashes(); @@ -1018,350 +1019,6 @@ function fileFromNodeId(id: string): string | undefined { return rest.slice(0, second); } -// `PREV_NODE_SELECT_COLUMNS` was the explicit column whitelist used by the -// legacy SQL `SELECT * FROM nodes` round-trip in {@link loadPreviousGraph}. -// That read path now goes through `store.graph.listNodes()`, which already -// returns rehydrated `GraphNode` objects, so the constant is no longer -// load-bearing here. The `rowToGraphNode` / `rowToCodeRelation` adapters -// below remain exported for external consumers that hand-roll over the -// SQLite wide-column shape. - -const NODE_KIND_SET: ReadonlySet = new Set(NODE_KINDS); -const RELATION_TYPE_SET: ReadonlySet = new Set(RELATION_TYPES); - -function strField(r: Record, col: string): string | undefined { - const v = r[col]; - return typeof v === "string" && v.length > 0 ? v : undefined; -} - -function numField(r: Record, col: string): number | undefined { - const v = r[col]; - if (typeof v === "number" && Number.isFinite(v)) return v; - if (typeof v === "bigint") return Number(v); - return undefined; -} - -function boolField(r: Record, col: string): boolean | undefined { - const v = r[col]; - return typeof v === "boolean" ? v : undefined; -} - -function stringArrayField(r: Record, col: string): readonly string[] | undefined { - // Preserve `[]` distinct from absent. 
The SQLite TEXT[] binder returns - // a 0-length JS array for an empty SQL array literal and `null` for - // SQL NULL; mirror the storage adapter's `setStringArrayField` and - // return the array verbatim so a Community / Route node written as - // `{keywords: []}` (or `{responseKeys: []}`) survives the carry-forward - // load with its empty array intact — required so canonical-JSON / - // graphHash byte-identity holds across the incremental re-index. - const v = r[col]; - if (!Array.isArray(v)) return undefined; - const out: string[] = []; - for (const item of v) { - if (typeof item === "string") out.push(item); - } - return out; -} - -function parseJsonStringArrayField( - r: Record, - col: string, -): readonly string[] | undefined { - const raw = r[col]; - if (typeof raw !== "string" || raw.length === 0) return undefined; - try { - const parsed = JSON.parse(raw) as unknown; - if (!Array.isArray(parsed)) return undefined; - return parsed.filter((x): x is string => typeof x === "string"); - } catch { - return undefined; - } -} - -function parseJsonObjectField( - r: Record, - col: string, -): Record | undefined { - const raw = r[col]; - if (typeof raw !== "string" || raw.length === 0) return undefined; - try { - const parsed = JSON.parse(raw) as unknown; - if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) return undefined; - return parsed as Record; - } catch { - return undefined; - } -} - -/** - * Reverse of `nodeToRow` (`packages/storage/src/duckdb-adapter.ts:1169`): - * translate one row of the polymorphic `nodes` table back into a - * {@link GraphNode}. 
Only the `nodes`/`edges` fidelity required by the four - * incremental consumer phases (`cross-file`, `mro`, `communities`, - * `processes`) is load-bearing — Community / Process nodes are re-added - * verbatim by `communities.ts:90-94` / `processes.ts:306-310`, so their - * `name` / `filePath` / `inferredLabel` / `keywords` / `symbolCount` / - * `cohesion` / `entryPointId` / `stepCount` must round-trip. Other kinds - * survive the round trip best-effort; fields we can't recover stay - * `undefined` and the caller treats the resulting node as lossy — safe - * because the carry-forward only lives long enough to be hashed into the - * next graph. - * - * Returns `undefined` when the row carries a `kind` we don't recognise or - * when required scalar slots (`id`, `name`, `file_path`) are missing. - * - * Exported for tests; the production call site is {@link loadPreviousGraph}. - */ -export function rowToGraphNode(row: Record): GraphNode | undefined { - const idRaw = row["id"]; - const nameRaw = row["name"]; - const fileRaw = row["file_path"]; - const kindRaw = row["kind"]; - if (typeof idRaw !== "string" || idRaw.length === 0) return undefined; - if (typeof nameRaw !== "string") return undefined; - if (typeof fileRaw !== "string") return undefined; - if (typeof kindRaw !== "string" || !NODE_KIND_SET.has(kindRaw)) return undefined; - const kind = kindRaw as NodeKind; - - // Build a permissive record keyed by TS field names. The discriminated- - // union cast at the end is safe because every `GraphNode` member only - // requires `id`/`kind`/`name`/`filePath` plus optional fields beyond that; - // required fields unique to a kind (e.g. `FindingNode.propertiesBag`) are - // populated explicitly in the per-kind branches below. 
- const node: Record = { - id: idRaw as NodeId, - kind, - name: nameRaw, - filePath: fileRaw, - }; - - // LocatedNode fields — set only when non-NULL because some non-LocatedNode - // kinds (Community / Process / File / Folder) intentionally leave them - // NULL and re-hydrating a spurious zero would change the graph hash. - const startLine = numField(row, "start_line"); - if (startLine !== undefined) node["startLine"] = startLine; - const endLine = numField(row, "end_line"); - if (endLine !== undefined) node["endLine"] = endLine; - - const isExported = boolField(row, "is_exported"); - if (isExported !== undefined) node["isExported"] = isExported; - const signature = strField(row, "signature"); - if (signature !== undefined) node["signature"] = signature; - const parameterCount = numField(row, "parameter_count"); - if (parameterCount !== undefined) node["parameterCount"] = parameterCount; - const returnType = strField(row, "return_type"); - if (returnType !== undefined) node["returnType"] = returnType; - const declaredType = strField(row, "declared_type"); - if (declaredType !== undefined) node["declaredType"] = declaredType; - const owner = strField(row, "owner"); - if (owner !== undefined) node["owner"] = owner; - const description = strField(row, "description"); - if (description !== undefined) node["description"] = description; - const contentHash = strField(row, "content_hash"); - if (contentHash !== undefined) node["contentHash"] = contentHash; - const content = strField(row, "content"); - if (content !== undefined) node["content"] = content; - - // Community / Process — the two carry-forward-critical kinds. 
- const inferredLabel = strField(row, "inferred_label"); - if (inferredLabel !== undefined) node["inferredLabel"] = inferredLabel; - const symbolCount = numField(row, "symbol_count"); - if (symbolCount !== undefined) node["symbolCount"] = symbolCount; - const cohesion = numField(row, "cohesion"); - if (cohesion !== undefined) node["cohesion"] = cohesion; - const keywords = stringArrayField(row, "keywords"); - if (keywords !== undefined) node["keywords"] = keywords; - const entryPointId = strField(row, "entry_point_id"); - if (entryPointId !== undefined) node["entryPointId"] = entryPointId; - const stepCount = numField(row, "step_count"); - if (stepCount !== undefined) node["stepCount"] = stepCount; - - // Section (markdown heading) — `level` round-trips for completeness. - const level = numField(row, "level"); - if (level !== undefined) node["level"] = level; - - // Route: `url` + `responseKeys` + `method` (shared column with Tool / Operation). - const url = strField(row, "url"); - if (url !== undefined) node["url"] = url; - const responseKeys = stringArrayField(row, "response_keys"); - if (responseKeys !== undefined) node["responseKeys"] = responseKeys; - - if (kind === "Tool") { - const toolName = strField(row, "tool_name"); - if (toolName !== undefined) node["toolName"] = toolName; - const inputSchemaJson = strField(row, "input_schema_json"); - if (inputSchemaJson !== undefined) node["inputSchemaJson"] = inputSchemaJson; - } else if (kind === "Route") { - const method = strField(row, "method"); - if (method !== undefined) node["method"] = method; - } - - if (kind === "Finding") { - const ruleId = strField(row, "rule_id"); - const severity = strField(row, "severity"); - const scannerId = strField(row, "scanner_id"); - const message = strField(row, "message"); - const propertiesBag = parseJsonObjectField(row, "properties_bag"); - if (ruleId !== undefined) node["ruleId"] = ruleId; - if (severity !== undefined) node["severity"] = severity; - if (scannerId !== 
undefined) node["scannerId"] = scannerId; - if (message !== undefined) node["message"] = message; - // propertiesBag is REQUIRED on FindingNode; default to {} on lossy reads - // so the resulting object still structurally satisfies the union. - node["propertiesBag"] = propertiesBag ?? {}; - const partialFingerprint = strField(row, "partial_fingerprint"); - if (partialFingerprint !== undefined) node["partialFingerprint"] = partialFingerprint; - const baselineState = strField(row, "baseline_state"); - if (baselineState !== undefined) node["baselineState"] = baselineState; - const suppressedJson = strField(row, "suppressed_json"); - if (suppressedJson !== undefined) node["suppressedJson"] = suppressedJson; - } - - if (kind === "Dependency") { - const version = strField(row, "version"); - const ecosystem = strField(row, "ecosystem"); - const lockfileSource = strField(row, "lockfile_source"); - const license = strField(row, "license"); - // version / ecosystem / lockfileSource are REQUIRED on the type; default - // to safe values when NULL so the object still passes the structural - // union at runtime. The carry-forward path only hashes these fields. - node["version"] = version ?? ""; - node["ecosystem"] = ecosystem ?? "npm"; - node["lockfileSource"] = lockfileSource ?? ""; - if (license !== undefined) node["license"] = license; - } - - if (kind === "Operation") { - const httpMethod = strField(row, "http_method"); - const httpPath = strField(row, "http_path"); - node["method"] = httpMethod ?? "GET"; - node["path"] = httpPath ?? "/"; - const summary = strField(row, "summary"); - if (summary !== undefined) node["summary"] = summary; - const operationId = strField(row, "operation_id"); - if (operationId !== undefined) node["operationId"] = operationId; - } - - if (kind === "Contributor") { - const emailHash = strField(row, "email_hash"); - node["emailHash"] = emailHash ?? 
""; - const emailPlain = strField(row, "email_plain"); - if (emailPlain !== undefined) node["emailPlain"] = emailPlain; - } - - // ProjectProfile — JSON-encoded array columns plus a polymorphic - // `frameworks_json` (flat `string[]` OR `{ flat, detected }`). - if (kind === "ProjectProfile") { - node["languages"] = parseJsonStringArrayField(row, "languages_json") ?? []; - const frameworksRaw = strField(row, "frameworks_json"); - let frameworksFlat: readonly string[] = []; - if (frameworksRaw !== undefined) { - try { - const parsed = JSON.parse(frameworksRaw) as unknown; - if (Array.isArray(parsed)) { - frameworksFlat = parsed.filter((x): x is string => typeof x === "string"); - } else if (typeof parsed === "object" && parsed !== null) { - const rec = parsed as Record; - const flat = rec["flat"]; - if (Array.isArray(flat)) { - frameworksFlat = flat.filter((x): x is string => typeof x === "string"); - } - const detected = rec["detected"]; - if (Array.isArray(detected)) node["frameworksDetected"] = detected; - } - } catch { - /* ignore — leave frameworks as [] */ - } - } - node["frameworks"] = frameworksFlat; - node["iacTypes"] = parseJsonStringArrayField(row, "iac_types_json") ?? []; - node["apiContracts"] = parseJsonStringArrayField(row, "api_contracts_json") ?? []; - node["manifests"] = parseJsonStringArrayField(row, "manifests_json") ?? []; - node["srcDirs"] = parseJsonStringArrayField(row, "src_dirs_json") ?? []; - } - - // File ownership (H.5) + Community ownership (H.4) — shared across kinds. 
- const orphanGrade = strField(row, "orphan_grade"); - if (orphanGrade !== undefined) node["orphanGrade"] = orphanGrade; - const isOrphan = boolField(row, "is_orphan"); - if (isOrphan !== undefined) node["isOrphan"] = isOrphan; - const truckFactor = numField(row, "truck_factor"); - if (truckFactor !== undefined) node["truckFactor"] = truckFactor; - const od30 = numField(row, "ownership_drift_30d"); - if (od30 !== undefined) node["ownershipDrift30d"] = od30; - const od90 = numField(row, "ownership_drift_90d"); - if (od90 !== undefined) node["ownershipDrift90d"] = od90; - const od365 = numField(row, "ownership_drift_365d"); - if (od365 !== undefined) node["ownershipDrift365d"] = od365; - - // v1.2 extensions - const deadness = strField(row, "deadness"); - if (deadness !== undefined) node["deadness"] = deadness; - const coveragePercent = numField(row, "coverage_percent"); - if (coveragePercent !== undefined) node["coveragePercent"] = coveragePercent; - const coveredLinesJson = strField(row, "covered_lines_json"); - if (coveredLinesJson !== undefined) node["coveredLinesJson"] = coveredLinesJson; - const cyclomaticComplexity = numField(row, "cyclomatic_complexity"); - if (cyclomaticComplexity !== undefined) node["cyclomaticComplexity"] = cyclomaticComplexity; - const nestingDepth = numField(row, "nesting_depth"); - if (nestingDepth !== undefined) node["nestingDepth"] = nestingDepth; - const nloc = numField(row, "nloc"); - if (nloc !== undefined) node["nloc"] = nloc; - const halsteadVolume = numField(row, "halstead_volume"); - if (halsteadVolume !== undefined) node["halsteadVolume"] = halsteadVolume; - - return node as unknown as GraphNode; -} - -/** - * Reverse of the relations row builder at - * `packages/storage/src/duckdb-adapter.ts:299-340`. Relations round-trip - * cleanly because their schema is 7 scalar columns with no polymorphism. - * Returns `undefined` when `type` is not a known {@link RelationType} or - * when required scalars are missing. 
- * - * Exported for tests; the production call site is {@link loadPreviousGraph}. - */ -export function rowToCodeRelation(row: Record): CodeRelation | undefined { - const id = row["id"]; - const from = row["from_id"]; - const to = row["to_id"]; - const type = row["type"]; - const confidence = row["confidence"]; - if (typeof id !== "string" || id.length === 0) return undefined; - if (typeof from !== "string" || from.length === 0) return undefined; - if (typeof to !== "string" || to.length === 0) return undefined; - if (typeof type !== "string" || !RELATION_TYPE_SET.has(type)) return undefined; - const conf = - typeof confidence === "number" && Number.isFinite(confidence) ? confidence : Number(confidence); - if (!Number.isFinite(conf)) return undefined; - - const reason = row["reason"]; - const step = row["step"]; - const base = { - id: id as EdgeId, - from: from as NodeId, - to: to as NodeId, - type: type as RelationType, - confidence: conf, - }; - const stepNum: number | undefined = - typeof step === "number" && Number.isFinite(step) - ? step - : typeof step === "bigint" - ? Number(step) - : undefined; - const hasReason = typeof reason === "string" && reason.length > 0; - // Build the final record in a single statement so we match the optional- - // field discipline required by `exactOptionalPropertyTypes`. - if (hasReason && stepNum !== undefined) { - return { ...base, reason: reason as string, step: stepNum }; - } - if (hasReason) return { ...base, reason: reason as string }; - if (stepNum !== undefined) return { ...base, step: stepNum }; - return base; -} - /** Per-file record persisted to `.codehub/scan-state.json`. 
*/ interface ScanStateFile { readonly relPath: string; diff --git a/packages/cli/src/commands/api-impact.test.ts b/packages/cli/src/commands/api-impact.test.ts index 65eb875b..ba26ebf9 100644 --- a/packages/cli/src/commands/api-impact.test.ts +++ b/packages/cli/src/commands/api-impact.test.ts @@ -69,8 +69,8 @@ function makeFakeStore( const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/augment.test.ts b/packages/cli/src/commands/augment.test.ts index 77ded290..ad6e5da2 100644 --- a/packages/cli/src/commands/augment.test.ts +++ b/packages/cli/src/commands/augment.test.ts @@ -4,7 +4,7 @@ * Coverage matches the P0-1 contract: * - empty output when no repo is registered for the cwd * - empty output for sub-threshold patterns (<3 chars) - * - surface callers + processes when a real DuckDB fixture has them + * - surface callers + processes when a real store fixture has them * - never throws, regardless of registry corruption or missing index * - cold-start budget (<750ms) on a 10k-node fixture */ @@ -113,11 +113,11 @@ test("augment: returns empty when cwd maps to no registered repo", async () => { assert.equal(out, ""); }); -test("augment: returns empty when the registered repo has no DuckDB file", async () => { +test("augment: returns empty when the registered repo has no store file", async () => { const home = await scratch("no-db"); const repoPath = resolve(home, "ghost"); await mkdir(join(repoPath, ".codehub"), { recursive: true }); - // Registry entry points at a repo whose graph.duckdb does not exist. + // Registry entry points at a repo whose store.sqlite does not exist. 
await upsertRegistry( { name: "ghost", diff --git a/packages/cli/src/commands/baseline.test.ts b/packages/cli/src/commands/baseline.test.ts index ff8f2511..e6587eb3 100644 --- a/packages/cli/src/commands/baseline.test.ts +++ b/packages/cli/src/commands/baseline.test.ts @@ -3,7 +3,7 @@ * * These tests write SARIF files on disk under a scratch tmp dir, run the * command handlers directly (no commander round-trip), and assert on the - * returned summary + on-disk artifact. No registry or DuckDB is touched. + * returned summary + on-disk artifact. No registry or store is touched. */ import assert from "node:assert/strict"; diff --git a/packages/cli/src/commands/change-pack.test.ts b/packages/cli/src/commands/change-pack.test.ts index 9d851306..ad1a4286 100644 --- a/packages/cli/src/commands/change-pack.test.ts +++ b/packages/cli/src/commands/change-pack.test.ts @@ -11,7 +11,7 @@ * 4. The store is always closed (finally), even on the summary path. * * Each test injects an `_openStore` factory + an `_runChangePack` stand-in - * so nothing hits lbug/DuckDB or git. The CLI's contract under test is the + * so nothing hits the store or git. The CLI's contract under test is the * exit-code passthrough (`pack.verdict.exitCode`) and the JSON shape — not * the analysis module's compose logic, which has its own suite. 
*/ @@ -39,8 +39,8 @@ function fakeStore(): FakeStoreHandle { const store = { graph: FAKE_GRAPH, temporal: {} as unknown, - graphFile: "/tmp/fake-repo/.codehub/graph.lbug", - temporalFile: "/tmp/fake-repo/.codehub/temporal.duckdb", + graphFile: "/tmp/fake-repo/.codehub/store.sqlite", + temporalFile: "/tmp/fake-repo/.codehub/store.sqlite", close: async () => { wasClosed = true; }, diff --git a/packages/cli/src/commands/code-pack.ts b/packages/cli/src/commands/code-pack.ts index 7fcbcf8d..4442bb2d 100644 --- a/packages/cli/src/commands/code-pack.ts +++ b/packages/cli/src/commands/code-pack.ts @@ -35,15 +35,13 @@ import { existsSync, statSync } from "node:fs"; import { mkdir, mkdtemp, readFile, rename, rm, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join, resolve } from "node:path"; -import type { FileNode, GraphNode, RepoNode } from "@opencodehub/core-types"; -import { sha256Hex } from "@opencodehub/core-types"; -import { parse as ingestionParse } from "@opencodehub/ingestion"; import { buildContextAttestation, type CacheChannel, DEFAULT_CACHE_CHANNEL, generatePack, type PackManifest, + resolvePackProvenance, serializeAttestation, } from "@opencodehub/pack"; import { type IGraphStore, openStore, resolveGraphPath, type Store } from "@opencodehub/storage"; @@ -285,111 +283,6 @@ async function runRepomixEngine(repoPath: string, args: CodePackArgs): Promise; - readonly grammarCommits: Readonly>; -} - -/** - * Derive {@link PackProvenance} from the opened graph and the repo working - * tree. - * - * - commit / repoOriginUrl: read from the singleton `Repo` node, so the - * pack stays a pure read of the indexed state (no `git` spawn here). - * - chunkerFiles: every indexed `File` node's bytes, read from disk and - * **hash-verified against the node's `contentHash`**. 
A file whose - * working-tree bytes drifted from the index is skipped, so the pack never - * chunks content that disagrees with what was analyzed — preserving the - * "pack reflects the indexed commit" contract. - * - grammarCommits: the vendored grammar version pins. - * - * A `graph` of `undefined` (no store) or one lacking `listNodes` (a bare test - * stub) yields empty file/commit provenance but still returns grammar pins. - */ -async function resolvePackProvenance( - graph: IGraphStore | undefined, - repoPath: string, -): Promise { - const grammarCommits = await loadGrammarCommits(); - - const canList = typeof graph?.listNodes === "function"; - if (graph === undefined || !canList) { - return { commit: "", repoOriginUrl: null, chunkerFiles: [], grammarCommits }; - } - - const [repoNodes, fileNodes] = await Promise.all([ - graph.listNodes({ kinds: ["Repo"] }), - graph.listNodes({ kinds: ["File"] }), - ]); - - const repo = repoNodes.find((n): n is RepoNode => n.kind === "Repo"); - const commit = repo?.commitSha ?? ""; - const repoOriginUrl = repo?.originUrl ?? null; - - const chunkerFiles = await collectChunkerFiles(fileNodes, repoPath); - return { commit, repoOriginUrl, chunkerFiles, grammarCommits }; -} - -/** - * Read + hash-verify the bytes of every indexed `File` node. Only files whose - * on-disk sha256 matches the indexed `contentHash` are returned, so a pack run - * against a dirty working tree silently drops drifted files rather than - * chunking stale bytes. Files with no recorded `contentHash` are read as-is - * (the index never claimed a hash to verify against). 
- */ -async function collectChunkerFiles( - fileNodes: readonly GraphNode[], - repoPath: string, -): Promise { - const out: Array<{ path: string; bytes: Uint8Array; language?: string }> = []; - for (const node of fileNodes) { - if (node.kind !== "File") continue; - const file = node as FileNode; - let buf: Buffer; - try { - buf = await readFile(resolve(repoPath, file.filePath)); - } catch { - continue; // file vanished from the tree since indexing — skip it - } - const bytes = new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength); - if (file.contentHash !== undefined && sha256Hex(bytes) !== file.contentHash) { - continue; // working-tree bytes drifted from the indexed state — skip - } - out.push({ - path: file.filePath, - bytes, - ...(file.language !== undefined ? { language: file.language } : {}), - }); - } - return out; -} - -/** - * Load the vendored grammar version pins for the manifest. Best-effort: an - * unreadable manifest yields `{}` rather than failing the pack. - */ -async function loadGrammarCommits(): Promise>> { - try { - return await ingestionParse.grammarVersions(); - } catch { - return {}; - } -} - /** * Read the on-disk size of `path`. Exported so the CLI's user-facing * recap can format byte counts without re-walking the dir tree. @@ -531,9 +424,9 @@ export function formatContextSummary(s: ContextSummary): string { * via `_store`; production passes the full Store envelope from * {@link openStore}. The composed envelope is the only shape carrying both * a `graph` and a `temporal` view, so the presence of both uniquely - * identifies it. (The pre-ADR-0016 envelope also carried a `backend` - * discriminator; that field was removed when the DuckDB-as-graph backend was - * ripped out, so this no longer keys off it.) + * identifies it. (An earlier envelope also carried a `backend` discriminator; + * the single-backend collapse in ADR 0019 removed it, so this no longer keys + * off it.) 
*/ function isStoreShape(s: Store | IGraphStore | undefined): s is Store { if (s === undefined) return false; diff --git a/packages/cli/src/commands/context.test.ts b/packages/cli/src/commands/context.test.ts index bd08a02e..71b088c0 100644 --- a/packages/cli/src/commands/context.test.ts +++ b/packages/cli/src/commands/context.test.ts @@ -100,8 +100,8 @@ function makeFakeStore(opts: FakeStoreOptions = {}): FakeStoreHandle { const composed: Store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.duckdb", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/context.ts b/packages/cli/src/commands/context.ts index 02551610..dc6980e2 100644 --- a/packages/cli/src/commands/context.ts +++ b/packages/cli/src/commands/context.ts @@ -15,6 +15,7 @@ * `IGraphStore` typed-finder surface stays the only contract. */ +import { contextCapability } from "@opencodehub/core-ops"; import type { GraphNode, NodeKind } from "@opencodehub/core-types"; import type { IGraphStore, SearchResult } from "@opencodehub/storage"; import { type OpenStoreResult, openStoreForCommand } from "./open-store.js"; @@ -32,12 +33,6 @@ export interface ContextRuntimeHooks { readonly openStore?: (opts: ContextOptions) => Promise; } -interface ProcessParticipation { - readonly id: string; - readonly label: string; - readonly step: number | null; -} - interface ResolvedNode { readonly nodeId: string; readonly name: string; @@ -55,62 +50,6 @@ type Resolution = | { readonly kind: "ambiguous"; readonly candidates: readonly ResolvedNode[] } | { readonly kind: "not_found" }; -/** - * Find Process-kind partners reachable from the target via `PROCESS_STEP` - * edges. Mirrors the post-A-6c MCP equivalent in - * `packages/mcp/src/tools/context.ts:567` so the two surfaces stay in - * lockstep on edge semantics + ordering. 
- */ -async function fetchProcessParticipation( - graph: IGraphStore, - targetId: string, -): Promise { - const [outEdges, inEdges] = await Promise.all([ - graph.listEdgesByType("PROCESS_STEP", { fromIds: [targetId] }), - graph.listEdgesByType("PROCESS_STEP", { toIds: [targetId] }), - ]); - const partnerIds = new Set(); - for (const e of [...outEdges, ...inEdges]) { - const id = e.from === targetId ? e.to : e.from; - partnerIds.add(id); - } - if (partnerIds.size === 0) return []; - const partners = await graph.listNodes({ ids: [...partnerIds] }); - const partnerById = new Map(); - for (const p of partners) partnerById.set(p.id, p); - const dedup = new Map(); - for (const e of [...outEdges, ...inEdges]) { - const partnerId = e.from === targetId ? e.to : e.from; - const partner = partnerById.get(partnerId); - if (partner?.kind !== "Process") continue; - if (dedup.has(partner.id)) continue; - const inferredLabelRaw = (partner as unknown as { inferredLabel?: unknown }).inferredLabel; - const label = - typeof inferredLabelRaw === "string" && inferredLabelRaw.length > 0 - ? inferredLabelRaw - : partner.name; - const stepRaw = e.step; - const stepNum = - typeof stepRaw === "number" && Number.isFinite(stepRaw) && stepRaw > 0 - ? Math.trunc(stepRaw) - : null; - dedup.set(partner.id, { label, step: stepNum }); - } - const items = Array.from(dedup.entries()).map(([id, v]) => ({ - id, - label: v.label, - step: v.step, - })); - // Match the prior `ORDER BY r.step` then deterministic id tiebreak. - items.sort((a, b) => { - const as = a.step ?? Number.POSITIVE_INFINITY; - const bs = b.step ?? Number.POSITIVE_INFINITY; - if (as !== bs) return as - bs; - return a.id < b.id ? -1 : a.id > b.id ? 
1 : 0; - }); - return items.slice(0, 20); -} - function nodeToResolved(n: GraphNode): ResolvedNode { return { nodeId: n.id, @@ -250,7 +189,7 @@ export async function runContext( const target = resolution.target; - const [up, down, processes] = await Promise.all([ + const [up, down, ctxOut] = await Promise.all([ graph.traverse({ startId: target.nodeId, direction: "up", @@ -263,8 +202,10 @@ export async function runContext( maxDepth: 1, relationTypes: ["CALLS"], }), - fetchProcessParticipation(graph, target.nodeId), + // Shared PROCESS_STEP reader — the one piece both surfaces run identically. + contextCapability.execute({ targetId: target.nodeId }, { store, repoName: repoPath }), ]); + const processes = ctxOut.processes; if (opts.json) { console.log( diff --git a/packages/cli/src/commands/dead-code.test.ts b/packages/cli/src/commands/dead-code.test.ts index 76c4a42b..c3e4847e 100644 --- a/packages/cli/src/commands/dead-code.test.ts +++ b/packages/cli/src/commands/dead-code.test.ts @@ -53,8 +53,8 @@ function makeFakeStore(syms: readonly FakeSym[]): { store: Store; closed: () => const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/dependencies.test.ts b/packages/cli/src/commands/dependencies.test.ts index d7829f9d..beb93ff4 100644 --- a/packages/cli/src/commands/dependencies.test.ts +++ b/packages/cli/src/commands/dependencies.test.ts @@ -54,8 +54,8 @@ function makeFakeStore(deps: readonly DependencyNode[]): FakeHandle { handle.store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git 
a/packages/cli/src/commands/findings.test.ts b/packages/cli/src/commands/findings.test.ts index e95324d9..43a5a1e2 100644 --- a/packages/cli/src/commands/findings.test.ts +++ b/packages/cli/src/commands/findings.test.ts @@ -56,8 +56,8 @@ function makeFakeStore(rows: readonly FindingNode[]): FakeHandle { handle.store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/findings.ts b/packages/cli/src/commands/findings.ts index 4bc19749..ef816f08 100644 --- a/packages/cli/src/commands/findings.ts +++ b/packages/cli/src/commands/findings.ts @@ -1,18 +1,14 @@ /** * `codehub findings` — enumerate SARIF Finding nodes for an indexed repo. * - * CLI sibling of the MCP `list_findings` tool. Reuses the same storage - * reader (`store.graph.listFindings`) plus the identical TS post-finder for - * `scanner` / `filePath` substring narrowing and the `severity==="none"` - * filter. Only `note|warning|error` are pushed into `listFindings`; the - * `none` severity is handled entirely in the TS post-finder (both halves — - * we never pass it to the storage tier and we drop rows whose severity is - * not `none` when the caller asked for `none`). - * - * Mirrors `packages/mcp/src/tools/list-findings.ts:runListFindings`. Does NOT - * emit the MCP next_steps / staleness envelope — that is MCP-only. + * CLI sibling of the MCP `list_findings` tool. The shared reader/filter/ + * projection now lives in `@opencodehub/core-ops` `findingsCapability` — this + * command is the thin CLI adapter: open the store, run the capability, render + * to stdout (text or `--json`). Does NOT emit the MCP next_steps / staleness + * envelope — that is MCP-only. 
*/ +import { type FindingsInput, findingsCapability } from "@opencodehub/core-ops"; import type { Store } from "@opencodehub/storage"; import { openStoreForCommand } from "./open-store.js"; @@ -29,72 +25,34 @@ export interface FindingsOptions { readonly storeFactory?: () => Promise<{ store: Store; repoPath: string }>; } -interface FindingRow { - readonly id: string; - readonly scanner: string; - readonly ruleId: string; - readonly severity: string; - readonly message: string; - readonly filePath: string; - readonly startLine?: number; - readonly endLine?: number; - readonly properties: Record; -} - export async function runFindings(opts: FindingsOptions = {}): Promise { - const limit = opts.limit ?? 500; const factory = opts.storeFactory ?? (() => openStoreForCommand({ ...opts, readOnly: true })); - const { store } = await factory(); + const { store, repoPath } = await factory(); try { - const findingsOpts: { - severity?: readonly ("note" | "warning" | "error")[]; - ruleId?: string; - limit?: number; - } = { limit }; - if ( - opts.severity !== undefined && - (opts.severity === "note" || opts.severity === "warning" || opts.severity === "error") - ) { - findingsOpts.severity = [opts.severity]; - } - if (opts.ruleId !== undefined) findingsOpts.ruleId = opts.ruleId; - const all = await store.graph.listFindings(findingsOpts); - - const filtered = all.filter((f) => { - if (opts.severity === "none" && f.severity !== "none") return false; - if (opts.scanner !== undefined && f.scannerId !== opts.scanner) return false; - if (opts.filePath !== undefined && !f.filePath.includes(opts.filePath)) return false; - return true; + const input: FindingsInput = { + ...(opts.severity !== undefined ? { severity: opts.severity } : {}), + ...(opts.scanner !== undefined ? { scanner: opts.scanner } : {}), + ...(opts.ruleId !== undefined ? { ruleId: opts.ruleId } : {}), + ...(opts.filePath !== undefined ? { filePath: opts.filePath } : {}), + ...(opts.limit !== undefined ? 
{ limit: opts.limit } : {}), + }; + const out = await findingsCapability.execute(input, { + store, + repoName: opts.repo ?? repoPath, }); - const rows: FindingRow[] = filtered.map((f) => ({ - id: f.id, - scanner: stringOr(f.scannerId, "unknown"), - ruleId: stringOr(f.ruleId, ""), - severity: stringOr(f.severity, "note"), - message: stringOr(f.message, ""), - filePath: stringOr(f.filePath, ""), - properties: f.propertiesBag, - ...(typeof f.startLine === "number" && Number.isFinite(f.startLine) - ? { startLine: f.startLine } - : {}), - ...(typeof f.endLine === "number" && Number.isFinite(f.endLine) - ? { endLine: f.endLine } - : {}), - })); - if (opts.json) { - console.log(JSON.stringify({ findings: rows, total: rows.length }, null, 2)); + console.log(JSON.stringify({ findings: out.findings, total: out.total }, null, 2)); return; } - if (rows.length === 0) { + if (out.total === 0) { console.warn( "findings: no findings matched — run `codehub scan` or `codehub ingest-sarif ` to populate Finding nodes", ); return; } - for (const f of rows) { + for (const f of out.findings) { const loc = f.startLine !== undefined ? `:${f.startLine}` : ""; const msg = f.message ? 
` — ${f.message}` : ""; console.log(`[${f.severity}] ${f.scanner}:${f.ruleId} at ${f.filePath}${loc}${msg}`); @@ -103,9 +61,3 @@ export async function runFindings(opts: FindingsOptions = {}): Promise { await store.close(); } } - -function stringOr(v: unknown, fallback: string): string { - if (typeof v === "string") return v; - if (typeof v === "number" || typeof v === "boolean") return String(v); - return fallback; -} diff --git a/packages/cli/src/commands/license-audit.test.ts b/packages/cli/src/commands/license-audit.test.ts index 16fa2f6a..5f8ff19d 100644 --- a/packages/cli/src/commands/license-audit.test.ts +++ b/packages/cli/src/commands/license-audit.test.ts @@ -38,8 +38,8 @@ function makeFakeStore(deps: readonly DependencyNode[]): { store: Store; closed: const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/open-store.ts b/packages/cli/src/commands/open-store.ts index 37078fb6..93e31b4a 100644 --- a/packages/cli/src/commands/open-store.ts +++ b/packages/cli/src/commands/open-store.ts @@ -7,8 +7,8 @@ * so callers can route graph-tier queries through `store.graph` and * temporal-tier queries (cochanges, summaries, `--sql` escape hatch) * through `store.temporal`. Post-ADR 0019 both views are one `SqliteStore` - * over a single `/.codehub/store.sqlite`; the legacy backend selector - * was removed when the lbug + DuckDB pair was replaced (see ADR 0019). + * over a single `/.codehub/store.sqlite`; the prior two-backend + * selector was removed in that single-file migration (see ADR 0019). 
*/ import { resolve } from "node:path"; diff --git a/packages/cli/src/commands/owners.test.ts b/packages/cli/src/commands/owners.test.ts index e1387d6d..15f23bec 100644 --- a/packages/cli/src/commands/owners.test.ts +++ b/packages/cli/src/commands/owners.test.ts @@ -63,8 +63,8 @@ function makeFakeStore( const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/project-profile.test.ts b/packages/cli/src/commands/project-profile.test.ts index f994c6b2..c308b508 100644 --- a/packages/cli/src/commands/project-profile.test.ts +++ b/packages/cli/src/commands/project-profile.test.ts @@ -29,8 +29,8 @@ function makeFakeStore(profile: ProjectProfileNode | undefined): { const store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { closed = true; }, diff --git a/packages/cli/src/commands/query.test.ts b/packages/cli/src/commands/query.test.ts index 14ec185b..c4666abf 100644 --- a/packages/cli/src/commands/query.test.ts +++ b/packages/cli/src/commands/query.test.ts @@ -10,7 +10,7 @@ * - `--bm25-only` skips the embedder probe entirely. * * The fake store intercepts the `embeddings` count probe so we can steer - * the hybrid-vs-BM25 branch without staging DuckDB or ONNX weights. + * the hybrid-vs-BM25 branch without staging the store or ONNX weights. 
*/ import assert from "node:assert/strict"; @@ -153,8 +153,8 @@ function makeFakeStore(opts: FakeStoreOptions = {}): FakeStoreHandle { const composed: Store = { graph: graph as unknown as IGraphStore, temporal: temporal as unknown as ITemporalStore, - graphFile: "/tmp/fake.duckdb", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/replay.test.ts b/packages/cli/src/commands/replay.test.ts index 00b7e889..3aa3c2be 100644 --- a/packages/cli/src/commands/replay.test.ts +++ b/packages/cli/src/commands/replay.test.ts @@ -215,7 +215,7 @@ describe("loadPack (real on-disk)", () => { await rm(dir, { recursive: true, force: true }); }); - it("parses manifest (schema 2, no duckdb pin), ast-chunks, and context-bom ranges", async () => { + it("parses manifest (schema 2, no legacy backend pin), ast-chunks, and context-bom ranges", async () => { const loaded = await loadPack(dir); assert.equal(loaded.manifest.packHash, "deadbeef"); assert.equal(loaded.manifest.budgetTokens, 100); diff --git a/packages/cli/src/commands/replay.ts b/packages/cli/src/commands/replay.ts index 634d8a77..afdb0b64 100644 --- a/packages/cli/src/commands/replay.ts +++ b/packages/cli/src/commands/replay.ts @@ -202,8 +202,8 @@ export async function loadPack(dir: string): Promise { /** * Parse the on-disk snake_case manifest into the fields `replay` needs. - * Corrected for schema 2 (ADR 0019): no `duckdb_version` pin, `budget_tokens` - * is read for the decision set. + * Corrected for schema 2 (ADR 0019): no legacy native-backend version pin, + * `budget_tokens` is read for the decision set. 
*/ function parseManifest(json: string): ReplayManifest { const w = JSON.parse(json) as Record; diff --git a/packages/cli/src/commands/route-map.test.ts b/packages/cli/src/commands/route-map.test.ts index 5d14710a..c9791507 100644 --- a/packages/cli/src/commands/route-map.test.ts +++ b/packages/cli/src/commands/route-map.test.ts @@ -61,8 +61,8 @@ function makeFakeStore(routes: readonly RouteNode[], edges: readonly CodeRelatio handle.store = { graph: graph as unknown as IGraphStore, temporal: {} as unknown as ITemporalStore, - graphFile: "/tmp/fake.lbug", - temporalFile: "/tmp/fake.duckdb", + graphFile: "/tmp/fake.sqlite", + temporalFile: "/tmp/fake.sqlite", close: async () => { handle.closed = true; }, diff --git a/packages/cli/src/commands/status.test.ts b/packages/cli/src/commands/status.test.ts index 5c899c71..ba063696 100644 --- a/packages/cli/src/commands/status.test.ts +++ b/packages/cli/src/commands/status.test.ts @@ -157,7 +157,7 @@ test("status degrades to summaries:- / vectors:unknown when the store can't open const repoPath = await seedRepo(home, "degraded"); const cap = captureStdout(); try { - // Default probe: no graph.lbug exists in the seeded repo → undefined. + // Default probe: no store.sqlite exists in the seeded repo → undefined. await runStatus(repoPath, { home, probeRetrieval: async () => undefined }); } finally { cap.restore(); diff --git a/packages/cli/src/commands/verdict.test.ts b/packages/cli/src/commands/verdict.test.ts index 7ec53ba0..87049c97 100644 --- a/packages/cli/src/commands/verdict.test.ts +++ b/packages/cli/src/commands/verdict.test.ts @@ -9,7 +9,7 @@ * 5. `--exit-code` on auto_merge tier → exit 0. * * Each test injects a stub `computeVerdictFn` + a fake store so nothing - * hits DuckDB or git. The CLI's real exit-code ladder (0/1/2/3) is what + * hits the store or git. The CLI's real exit-code ladder (0/1/2/3) is what * the assertions target, so the test pinsbehavior — not the * analysis module's 0/1/2 mapping. 
*/ @@ -88,7 +88,7 @@ function verdictFixture( communitiesTouched: ["c1", "c2", "c3"], changedFileCount: 7, changedFiles: [ - "packages/storage/src/duckdb-adapter.ts", + "packages/storage/src/sqlite-adapter.ts", "packages/cli/src/index.ts", "README.md", ], @@ -451,7 +451,7 @@ test("runVerdict: ownership_required rule passes when approvals are supplied", a ], }; // touchedPaths now comes from the verdict pipeline (verdict.changedFiles). - // The auto_merge fixture touches `packages/storage/src/duckdb-adapter.ts`, + // The auto_merge fixture touches `packages/storage/src/sqlite-adapter.ts`, // which matches the rule glob — so the rule fires, but the supplied // @storage-team approval satisfies require_approval_from → pass. const { exitCode } = await withExitCode(async () => { @@ -598,7 +598,7 @@ test("runVerdict: ownership_required blocks (exit 3) when a changed path lacks a }, ], }; - // The auto_merge fixture touches packages/storage/src/duckdb-adapter.ts, + // The auto_merge fixture touches packages/storage/src/sqlite-adapter.ts, // which matches the rule glob. No approval supplied → block, proving the // rule sees the real changedFiles threaded through touchedPaths. const { exitCode } = await withExitCode(async () => { @@ -618,7 +618,7 @@ test("runVerdict: ownership_required blocks (exit 3) when a changed path lacks a assert.match(output, /Policy: block/); assert.match( output, - /storage-owner: path "packages\/storage\/src\/duckdb-adapter.ts" requires approval from one of: @storage-team/, + /storage-owner: path "packages\/storage\/src\/sqlite-adapter.ts" requires approval from one of: @storage-team/, ); assert.equal(exitCode, 3); }); diff --git a/packages/cli/src/skills-gen.test.ts b/packages/cli/src/skills-gen.test.ts index d9041533..6215a9d8 100644 --- a/packages/cli/src/skills-gen.test.ts +++ b/packages/cli/src/skills-gen.test.ts @@ -6,7 +6,7 @@ * "listNodesByEntryPoint" | "listEdgesByType">`). 
The fake store below * implements those four methods over an in-memory fixture so the tests * exercise the real code path down to the markdown renderer and the - * filesystem writer without standing up DuckDB. + * filesystem writer without standing up the store. */ import { strict as assert } from "node:assert"; diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index fb355d33..1fb506ef 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -8,6 +8,7 @@ "include": ["src/**/*", "test/**/*"], "references": [ { "path": "../analysis" }, + { "path": "../core-ops" }, { "path": "../core-types" }, { "path": "../embedder" }, { "path": "../eval" }, diff --git a/packages/cli/tsup.config.ts b/packages/cli/tsup.config.ts index 4331b9e9..81c95ca2 100644 --- a/packages/cli/tsup.config.ts +++ b/packages/cli/tsup.config.ts @@ -50,8 +50,8 @@ const distDir = join(here, "dist"); * bundled source are still followed. `noExternal` takes precedence for the * `@opencodehub/*` scope, so our workspace libs are still inlined. * - * This implicitly covers the native bindings (`@ladybugdb/core`, - * `@duckdb/node-api`, `onnxruntime-node`, `web-tree-sitter`), the worker host + * This implicitly covers the WASM/optional runtimes (`web-tree-sitter`, + * the `onnxruntime-web` embedder), the worker host * (`piscina`), the CJS MCP SDK, and the lazily-imported packages * (`@chonkiejs/core`, `@apidevtools/swagger-parser`, * `@aws-sdk/client-sagemaker-runtime`, `ts-morph`). 
diff --git a/packages/core-ops/package.json b/packages/core-ops/package.json new file mode 100644 index 00000000..61c66f9c --- /dev/null +++ b/packages/core-ops/package.json @@ -0,0 +1,64 @@ +{ + "name": "@opencodehub/core-ops", + "version": "0.1.0", + "private": true, + "description": "OpenCodeHub — transport-free capability core shared by the CLI and MCP surfaces", + "license": "Apache-2.0", + "repository": { + "type": "git", + "url": "git+https://github.com/theagenticguy/opencodehub.git", + "directory": "packages/core-ops" + }, + "homepage": "https://github.com/theagenticguy/opencodehub#readme", + "bugs": { + "url": "https://github.com/theagenticguy/opencodehub/issues" + }, + "type": "module", + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "files": [ + "dist/**/*.js", + "!dist/**/*.test.js", + "dist/**/*.d.ts", + "!dist/**/*.test.d.ts", + "dist/**/*.js.map", + "!dist/**/*.test.js.map", + "dist/**/*.d.ts.map", + "!dist/**/*.test.d.ts.map" + ], + "scripts": { + "build": "tsc -b", + "test": "node --test \"./dist/**/*.test.js\"", + "clean": "rm -rf dist *.tsbuildinfo" + }, + "dependencies": { + "@opencodehub/analysis": "workspace:*", + "@opencodehub/core-types": "workspace:*", + "@opencodehub/storage": "workspace:*" + }, + "devDependencies": { + "@types/node": "26.0.1", + "typescript": "6.0.3" + }, + "publishConfig": { + "access": "public" + }, + "keywords": [ + "opencodehub", + "code-intelligence", + "mcp", + "model-context-protocol", + "cli", + "capability", + "typescript" + ], + "engines": { + "node": ">=24.15.0" + } +} diff --git a/packages/core-ops/src/capability.ts b/packages/core-ops/src/capability.ts new file mode 100644 index 00000000..c0d20e13 --- /dev/null +++ b/packages/core-ops/src/capability.ts @@ -0,0 +1,73 @@ +/** + * The `Capability` contract — a transport-free unit of work shared by the MCP + * tool and the CLI command for one 
code-intelligence operation. + * + * WHY THIS EXISTS. Today the MCP tool (`packages/mcp/src/tools/.ts`) and the + * CLI command (`packages/cli/src/commands/.ts`) for the same operation run + * byte-identical resolve → open-store → typed-finder → filter → row-projection + * logic and diverge ONLY at output (an MCP `CallToolResult` envelope vs the + * CLI's `console.log` / `--json`). A `Capability` owns the shared middle and + * returns a PLAIN typed `Output`; each surface keeps a thin adapter that maps + * that `Output` into its own transport. A filter fix then lands once, not twice. + * + * SCOPE (v1, the findings proof-of-concept). `execute` receives an ALREADY-OPEN + * store view plus the resolved repo's display name — both of which each surface + * already has at its call site (MCP via `withStore`, CLI via + * `openStoreForCommand`). Repo resolution and store lifecycle stay in the two + * surfaces for now because their resolvers differ meaningfully (the MCP side + * carries `AMBIGUOUS_REPO` semantics the CLI does not). Unifying resolution + + * lifecycle behind a `StoreProvider`, and folding the register/try-catch + * boilerplate into `defineTool`/`defineCommand` factories, is the natural + * follow-up once this seam is proven — see `artifacts/och-shared-core/`. + * + * A capability NEVER touches `console`, NEVER builds a `CallToolResult`, and + * NEVER renders. + * + * INPUT VALIDATION stays at each surface's boundary, deliberately. The MCP + * tool validates via the SDK's zod `inputSchema` (raw-shape idiom); the CLI + * validates + coerces commander flags. Both then hand `execute` a plain, + * already-validated `Input` object. Keeping the zod schema out of the + * capability keeps this core package dependency-light and lets each surface + * own the schema shape its transport requires — the shared, duplicated part + * was always the `execute` body (finder → filter → projection), never the + * schema. 
(A future revision may thread a shared schema through once the two + * surfaces' validation needs are unified; not required for the dedup win.) + */ + +import type { IGraphStore, ITemporalStore } from "@opencodehub/storage"; + +/** + * The already-open store views a capability's `execute` reads. Mirrors the + * `store.graph` / `store.temporal` split every call site uses today, so an + * `execute` body reads exactly like the inline code it replaces. (When the + * deferred A1 accessor-collapse lands, this interface is its single flip + * point: change it to one `store` and update the `execute` bodies, not the + * ~28 adapter files.) + */ +export interface CapabilityStore { + readonly graph: IGraphStore; + readonly temporal: ITemporalStore; +} + +/** Everything an `execute` needs beyond the validated input. */ +export interface CapabilityContext { + /** The open store views. */ + readonly store: CapabilityStore; + /** The resolved repo's display name, for `Output` headers/labels. */ + readonly repoName: string; +} + +/** + * A transport-free operation shared by the MCP tool and CLI command. + * + * - `id` is a stable identifier (e.g. "findings"), used for logging and as + * the default tool/command name. + * - `execute` receives an already-validated, plain `Input` (each surface + * validates at its own boundary), does finder → filter → project, and + * returns a PLAIN `Output`. It must not import commander, the MCP SDK, or + * `console`. 
+ */ +export interface Capability { + readonly id: string; + readonly execute: (input: Input, ctx: CapabilityContext) => Promise; +} diff --git a/packages/core-ops/src/caps/context.test.ts b/packages/core-ops/src/caps/context.test.ts new file mode 100644 index 00000000..73f6def4 --- /dev/null +++ b/packages/core-ops/src/caps/context.test.ts @@ -0,0 +1,182 @@ +/** + * Unit tests for `contextCapability.execute` — the shared PROCESS_STEP reader + * lifted from the (behaviourally identical) `fetchProcessParticipation` in the + * MCP `context` tool and CLI `codehub context` command. Exercises `execute` + * directly against a fake `CapabilityStore`, so it needs no real store, no repo + * resolution, and no transport. This is the one place the shared reader is now + * tested; the two surfaces' resolvers + presenters keep their own tests. + */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { CodeRelation, GraphNode, NodeId } from "@opencodehub/core-types"; +import type { IGraphStore, ListEdgesByTypeOptions, ListNodesOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { type ContextInput, contextCapability } from "./context.js"; + +interface EdgeSpec { + readonly from: string; + readonly to: string; + readonly step?: number; +} + +interface NodeSpec { + readonly id: string; + readonly name: string; + readonly kind: string; + readonly inferredLabel?: string; +} + +/** + * A fake store implementing only the two finders the capability calls — + * `listEdgesByType` (PROCESS_STEP, filtered by fromIds/toIds) and `listNodes` + * (by ids). Everything else on IGraphStore throws so an accidental new read is + * caught loudly. 
+ */ +function fakeStore(edges: readonly EdgeSpec[], nodes: readonly NodeSpec[]): CapabilityStore { + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listEdgesByType") { + return async ( + type: string, + opts?: ListEdgesByTypeOptions, + ): Promise => { + assert.equal(type, "PROCESS_STEP"); + const fromIds = opts?.fromIds ? new Set(opts.fromIds.map(String)) : undefined; + const toIds = opts?.toIds ? new Set(opts.toIds.map(String)) : undefined; + return edges + .filter( + (e) => (fromIds ? fromIds.has(e.from) : true) && (toIds ? toIds.has(e.to) : true), + ) + .map( + (e) => + ({ + from: e.from as NodeId, + to: e.to as NodeId, + type: "PROCESS_STEP", + ...(e.step !== undefined ? { step: e.step } : {}), + }) as unknown as CodeRelation, + ); + }; + } + if (prop === "listNodes") { + return async (opts?: ListNodesOptions): Promise => { + const ids = opts?.ids ? new Set(opts.ids.map(String)) : undefined; + return nodes + .filter((n) => (ids ? ids.has(n.id) : true)) + .map( + (n) => + ({ + id: n.id as NodeId, + name: n.name, + kind: n.kind, + filePath: "src/x.ts", + ...(n.inferredLabel !== undefined ? 
{ inferredLabel: n.inferredLabel } : {}), + }) as unknown as GraphNode, + ); + }; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in context capability test`); + }, + }); + return { graph, temporal: {} as CapabilityStore["temporal"] }; +} + +async function run(input: ContextInput, edges: readonly EdgeSpec[], nodes: readonly NodeSpec[]) { + const ctx: CapabilityContext = { store: fakeStore(edges, nodes), repoName: "demo-repo" }; + return contextCapability.execute(input, ctx); +} + +test("context: no PROCESS_STEP edges yields empty processes and echoes repoName", async () => { + const out = await run({ targetId: "F:foo" }, [], []); + assert.equal(out.repoName, "demo-repo"); + assert.deepEqual(out.processes, []); +}); + +test("context: collects Process partners in both edge directions", async () => { + const out = await run( + { targetId: "F:foo" }, + [ + { from: "F:foo", to: "P:out", step: 2 }, + { from: "P:in", to: "F:foo", step: 1 }, + ], + [ + { id: "P:out", name: "outbound-process", kind: "Process" }, + { id: "P:in", name: "inbound-process", kind: "Process" }, + ], + ); + // Sorted by step asc: P:in (1) before P:out (2). 
+ assert.deepEqual(out.processes, [ + { id: "P:in", label: "inbound-process", step: 1 }, + { id: "P:out", label: "outbound-process", step: 2 }, + ]); +}); + +test("context: non-Process partners are dropped", async () => { + const out = await run( + { targetId: "F:foo" }, + [ + { from: "F:foo", to: "F:notaprocess", step: 1 }, + { from: "F:foo", to: "P:real", step: 2 }, + ], + [ + { id: "F:notaprocess", name: "sibling", kind: "Function" }, + { id: "P:real", name: "real-process", kind: "Process" }, + ], + ); + assert.equal(out.processes.length, 1); + assert.equal(out.processes[0]?.id, "P:real"); +}); + +test("context: inferredLabel wins over name when present, else falls back to name", async () => { + const out = await run( + { targetId: "F:foo" }, + [ + { from: "F:foo", to: "P:a", step: 1 }, + { from: "F:foo", to: "P:b", step: 2 }, + ], + [ + { id: "P:a", name: "raw-name-a", kind: "Process", inferredLabel: "Nice Label A" }, + { id: "P:b", name: "raw-name-b", kind: "Process" }, + ], + ); + assert.equal(out.processes[0]?.label, "Nice Label A"); + assert.equal(out.processes[1]?.label, "raw-name-b"); +}); + +test("context: step is null when absent or non-positive; nulls sort last, id tiebreak", async () => { + const out = await run( + { targetId: "F:foo" }, + [ + { from: "F:foo", to: "P:nostep" }, + { from: "F:foo", to: "P:zero", step: 0 }, + { from: "F:foo", to: "P:one", step: 1 }, + ], + [ + { id: "P:nostep", name: "no-step", kind: "Process" }, + { id: "P:zero", name: "zero-step", kind: "Process" }, + { id: "P:one", name: "one-step", kind: "Process" }, + ], + ); + // step=1 first; the two null-step (absent + zero) sort by id: P:nostep < P:zero. 
+ assert.deepEqual( + out.processes.map((p) => [p.id, p.step]), + [ + ["P:one", 1], + ["P:nostep", null], + ["P:zero", null], + ], + ); +}); + +test("context: caps participation at 20 partners", async () => { + const edges: EdgeSpec[] = []; + const nodes: NodeSpec[] = []; + for (let i = 0; i < 30; i += 1) { + const id = `P:${String(i).padStart(2, "0")}`; + edges.push({ from: "F:foo", to: id, step: i + 1 }); + nodes.push({ id, name: `proc-${i}`, kind: "Process" }); + } + const out = await run({ targetId: "F:foo" }, edges, nodes); + assert.equal(out.processes.length, 20); +}); diff --git a/packages/core-ops/src/caps/context.ts b/packages/core-ops/src/caps/context.ts new file mode 100644 index 00000000..804199f0 --- /dev/null +++ b/packages/core-ops/src/caps/context.ts @@ -0,0 +1,111 @@ +/** + * `contextCapability` — the shared graph-read middle behind the MCP `context` + * tool and the CLI `codehub context` command. + * + * Only the PROCESS_STEP reader is shared: `fetchProcessParticipation` was + * byte-identical (behaviourally) in both surfaces + * (`cli/src/commands/context.ts` ⇄ `mcp/src/tools/context.ts`) and is lifted + * here VERBATIM. The two surfaces' `resolveTarget` and CALLS traversal are + * NOT shared — they diverge meaningfully (the CLI filters synthetic + * ``/`CodeElement` stubs and falls back to BM25, then reads + * callers/callees via `graph.traverse`; the MCP side short-circuits on `uid`, + * carries line/coverage metadata, and reads callers/callees via categorised + * `listEdges` buckets), so each keeps its own. The capability therefore + * exposes exactly the one provably-identical piece. + * + * Each surface keeps its own input validation, store lifecycle, resolver, and + * presenter. The MCP-only enrichment (owner / cochange / confidence / buckets / + * next_steps / staleness) STAYS in the MCP presenter. 
+ */ + +import type { GraphNode } from "@opencodehub/core-types"; +import type { Capability, CapabilityContext } from "../capability.js"; + +/** + * The validated, plain input `contextCapability.execute` consumes. Each surface + * resolves its target to a concrete node id BEFORE `execute` runs (the CLI via + * its stub-filtering + BM25 resolver, the MCP tool via its uid/name resolver), + * then passes that id here. `repo`/`repo_uri` are resolved to a store by the + * surface upstream, so they are not read here — they live on the input only so + * a surface can pass its parsed args object through unchanged. + */ +export interface ContextInput { + readonly repo?: string; + readonly repo_uri?: string; + /** The resolved graph node id of the target symbol. */ + readonly targetId: string; +} + +/** One Process-kind partner reachable from the target via `PROCESS_STEP`. */ +export interface ContextProcessParticipation { + readonly id: string; + readonly label: string; + readonly step: number | null; +} + +export interface ContextOutput { + readonly repoName: string; + readonly processes: readonly ContextProcessParticipation[]; +} + +/** + * Find Process-kind partners reachable from the target via `PROCESS_STEP` + * edges. PROCESS_STEP edges are emitted symbol-to-symbol under a Process node, + * so we accept either direction on the join and filter on `kind = 'Process'`. + * Ordering matches the prior `ORDER BY r.step` with a deterministic id + * tiebreak; the result is capped at 20 partners. + * + * Lifted verbatim from the (behaviourally identical) bodies in + * `cli/src/commands/context.ts` and `mcp/src/tools/context.ts`. 
+ */ +export const contextCapability: Capability = { + id: "context", + async execute(input: ContextInput, ctx: CapabilityContext): Promise { + const graph = ctx.store.graph; + const targetId = input.targetId; + const [outEdges, inEdges] = await Promise.all([ + graph.listEdgesByType("PROCESS_STEP", { fromIds: [targetId] }), + graph.listEdgesByType("PROCESS_STEP", { toIds: [targetId] }), + ]); + const partnerIds = new Set(); + for (const e of [...outEdges, ...inEdges]) { + const id = e.from === targetId ? e.to : e.from; + partnerIds.add(id); + } + if (partnerIds.size === 0) return { repoName: ctx.repoName, processes: [] }; + const partners = await graph.listNodes({ ids: [...partnerIds] }); + const partnerById = new Map(); + for (const p of partners) partnerById.set(p.id, p); + const dedup = new Map(); + for (const e of [...outEdges, ...inEdges]) { + const partnerId = e.from === targetId ? e.to : e.from; + const partner = partnerById.get(partnerId); + if (partner?.kind !== "Process") continue; + if (dedup.has(partner.id)) continue; + const inferredLabelRaw = (partner as unknown as { inferredLabel?: unknown }).inferredLabel; + const label = + typeof inferredLabelRaw === "string" && inferredLabelRaw.length > 0 + ? inferredLabelRaw + : partner.name; + const stepRaw = e.step; + const stepNum = + typeof stepRaw === "number" && Number.isFinite(stepRaw) && stepRaw > 0 + ? Math.trunc(stepRaw) + : null; + dedup.set(partner.id, { label, step: stepNum }); + } + const items = Array.from(dedup.entries()).map(([id, v]) => ({ + id, + label: v.label, + step: v.step, + })); + // Match the prior `ORDER BY r.step` then deterministic id tiebreak. + items.sort((a, b) => { + const as = a.step ?? Number.POSITIVE_INFINITY; + const bs = b.step ?? Number.POSITIVE_INFINITY; + if (as !== bs) return as - bs; + return a.id < b.id ? -1 : a.id > b.id ? 
1 : 0; + }); + return { repoName: ctx.repoName, processes: items.slice(0, 20) }; + }, +}; diff --git a/packages/core-ops/src/caps/dependencies.test.ts b/packages/core-ops/src/caps/dependencies.test.ts new file mode 100644 index 00000000..7db49f70 --- /dev/null +++ b/packages/core-ops/src/caps/dependencies.test.ts @@ -0,0 +1,114 @@ +/** + * Unit tests for `dependenciesCapability.execute` — the shared reader/filter/ + * projection lifted from the MCP `dependencies` tool. Exercises `execute` + * directly against a fake `CapabilityStore`, so it needs no real store, no repo + * resolution, and no transport. + */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { DependencyNode, NodeId } from "@opencodehub/core-types"; +import type { IGraphStore, ListDependenciesOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { type DependenciesInput, dependenciesCapability } from "./dependencies.js"; + +/** Build a Dependency fixture from a plain string id (kept verbatim). 
*/ +function dep(over: Omit, "id"> & { id: string }): DependencyNode { + return { + kind: "Dependency", + name: over.id, + filePath: "package.json", + version: "1.0.0", + ecosystem: "npm", + lockfileSource: "package-lock.json", + ...over, + id: over.id as NodeId, + } as DependencyNode; +} + +function fakeStore(corpus: readonly DependencyNode[]): { + store: CapabilityStore; + lastOpts: () => ListDependenciesOptions | undefined; +} { + let captured: ListDependenciesOptions | undefined; + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listDependencies") { + return async (opts?: ListDependenciesOptions): Promise => { + captured = opts; + let rows = corpus; + if (opts?.ecosystem !== undefined) + rows = rows.filter((d) => d.ecosystem === opts.ecosystem); + if (opts?.limit !== undefined) rows = rows.slice(0, opts.limit); + return rows; + }; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in dependencies capability test`); + }, + }); + const store: CapabilityStore = { graph, temporal: {} as CapabilityStore["temporal"] }; + return { store, lastOpts: () => captured }; +} + +async function run(input: DependenciesInput, corpus: readonly DependencyNode[]) { + const { store, lastOpts } = fakeStore(corpus); + const ctx: CapabilityContext = { store, repoName: "demo-repo" }; + const out = await dependenciesCapability.execute(input, ctx); + return { out, lastOpts }; +} + +test("dependencies: projects rows, echoes repoName, defaults limit to 500", async () => { + const { out, lastOpts } = await run({}, [dep({ id: "a" }), dep({ id: "b" })]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.total, 2); + assert.equal(out.dependencies.length, 2); + assert.equal(lastOpts()?.limit, 500, "default limit pushed to the storage tier"); + const r = out.dependencies[0]; + assert.equal(r?.id, "a"); + assert.equal(r?.ecosystem, "npm"); + assert.equal(r?.version, "1.0.0"); +}); + +test("dependencies: ecosystem is pushed to the storage tier", 
async () => { + const { out, lastOpts } = await run({ ecosystem: "cargo" }, [ + dep({ id: "rust", ecosystem: "cargo" }), + dep({ id: "node", ecosystem: "npm" }), + ]); + assert.equal(lastOpts()?.ecosystem, "cargo", "ecosystem pushed down"); + assert.equal(out.total, 1); + assert.equal(out.dependencies[0]?.id, "rust"); +}); + +test("dependencies: filePath substring is applied in the TS post-finder over lockfileSource", async () => { + const { out } = await run({ filePath: "apps/web/" }, [ + dep({ id: "hit", lockfileSource: "apps/web/package-lock.json" }), + dep({ id: "miss", lockfileSource: "apps/api/package-lock.json" }), + ]); + assert.equal(out.total, 1); + assert.equal(out.dependencies[0]?.id, "hit"); +}); + +test("dependencies: missing/loose fields fall back through stringOr; lockfile falls back to filePath", async () => { + // A deliberately loose runtime row built WITHOUT the `dep()` defaults: no + // lockfileSource, no license. Production rehydration can produce rows looser + // than the typed shape, which is exactly why the projection uses `stringOr` + + // the `lockfileSource ?? filePath` guard. 
+ const loose = { + kind: "Dependency", + id: "loose" as NodeId, + name: "loose", + filePath: "pkg.json", + version: "1.0.0", + ecosystem: "npm", + } as unknown as DependencyNode; + const { out } = await run({}, [loose]); + const r = out.dependencies[0]; + assert.equal(r?.license, "UNKNOWN", "missing license → UNKNOWN sentinel"); + assert.equal(r?.lockfileSource, "pkg.json", "missing lockfileSource → filePath fallback"); +}); + +test("dependencies: empty corpus yields total 0", async () => { + const { out } = await run({}, []); + assert.equal(out.total, 0); + assert.equal(out.dependencies.length, 0); +}); diff --git a/packages/core-ops/src/caps/dependencies.ts b/packages/core-ops/src/caps/dependencies.ts new file mode 100644 index 00000000..171aec15 --- /dev/null +++ b/packages/core-ops/src/caps/dependencies.ts @@ -0,0 +1,92 @@ +/** + * `dependenciesCapability` — the shared reader/filter/projection behind the MCP + * `dependencies` tool (and, once the CLI adopts it, `codehub dependencies`). + * + * Lifted verbatim from the body of `mcp/src/tools/dependencies.ts`: the typed + * `listDependencies({ecosystem?, limit})` finder, the TS `filePath` substring + * post-filter over `lockfileSource ?? filePath`, and the row projection through + * the one canonical `stringOr`. The surface maps `DependenciesOutput` into its + * own transport (text body + next_steps + staleness envelope). + */ + +import type { Capability, CapabilityContext } from "../capability.js"; +import { stringOr } from "../string-or.js"; + +/** + * The validated, plain input `dependenciesCapability.execute` consumes. + * `repo`/`repo_uri` are resolved to a concrete store by the surface BEFORE + * `execute` runs; they live on the input only so a surface can pass its parsed + * args object through unchanged. 
+ */ +export interface DependenciesInput { + readonly repo?: string; + readonly repo_uri?: string; + readonly ecosystem?: "npm" | "pypi" | "go" | "cargo" | "maven" | "nuget"; + readonly filePath?: string; + readonly limit?: number; +} + +/** + * One projected dependency row — the flat shape the surface renders. Kept as a + * flat object so clients that only inspect `structuredContent` can grok it + * without crawling the graph. + */ +export interface DependencyRow { + readonly id: string; + readonly name: string; + readonly version: string; + readonly ecosystem: string; + readonly license: string; + readonly lockfileSource: string; +} + +/** The applied filters, echoed back so presenters can label the output. */ +export interface DependenciesFilters { + readonly ecosystem?: string; + readonly filePath?: string; +} + +export interface DependenciesOutput { + readonly repoName: string; + readonly dependencies: readonly DependencyRow[]; + readonly total: number; + readonly filters: DependenciesFilters; +} + +export const dependenciesCapability: Capability = { + id: "dependencies", + async execute(input: DependenciesInput, ctx: CapabilityContext): Promise { + const limit = input.limit ?? 500; + + // Typed `listDependencies` finder reads the Dependency rows directly, + // already rehydrated into the typed shape. The `filePath` substring + // filter is applied in TS because the finder doesn't expose a LIKE + // option — dependencies are bounded per repo so a TS filter is fine. + const opts: { ecosystem?: string; limit?: number } = { limit }; + if (input.ecosystem !== undefined) opts.ecosystem = input.ecosystem; + const all = await ctx.store.graph.listDependencies(opts); + const filtered = + input.filePath === undefined + ? all + : all.filter((d) => { + const lf = d.lockfileSource ?? 
d.filePath; + return lf.includes(input.filePath as string); + }); + + const dependencies: DependencyRow[] = filtered.map((d) => ({ + id: d.id, + name: d.name, + version: stringOr(d.version, "UNKNOWN"), + ecosystem: stringOr(d.ecosystem, "unknown"), + license: stringOr(d.license, "UNKNOWN"), + lockfileSource: stringOr(d.lockfileSource, d.filePath), + })); + + const filters: DependenciesFilters = { + ...(input.ecosystem !== undefined ? { ecosystem: input.ecosystem } : {}), + ...(input.filePath !== undefined ? { filePath: input.filePath } : {}), + }; + + return { repoName: ctx.repoName, dependencies, total: dependencies.length, filters }; + }, +}; diff --git a/packages/core-ops/src/caps/findings.test.ts b/packages/core-ops/src/caps/findings.test.ts new file mode 100644 index 00000000..6ee747b2 --- /dev/null +++ b/packages/core-ops/src/caps/findings.test.ts @@ -0,0 +1,150 @@ +/** + * Unit tests for `findingsCapability.execute` — the shared reader/filter/ + * projection lifted from the (byte-identical) MCP `list_findings` tool and CLI + * `codehub findings` command. Exercises `execute` directly against a fake + * `CapabilityStore`, so it needs no real store, no repo resolution, and no + * transport. This is the one place the shared logic is now tested. + */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { FindingNode, NodeId } from "@opencodehub/core-types"; +import type { IGraphStore, ListFindingsOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { type FindingsInput, findingsCapability } from "./findings.js"; + +/** + * Build a Finding fixture from a plain string id. The id is kept verbatim (not + * a real `${kind}:${path}:${name}` node id) so assertions can compare against + * the literal; the cast satisfies the `NodeId` brand without altering the value. 
+ */ +function finding(over: Omit, "id"> & { id: string }): FindingNode { + return { + kind: "Finding", + name: over.id, + filePath: "src/a.ts", + ruleId: "rule-x", + severity: "warning", + scannerId: "semgrep", + message: "msg", + propertiesBag: {}, + ...over, + id: over.id as NodeId, + } as FindingNode; +} + +/** + * A fake store whose `listFindings` records the opts it was called with and + * returns a fixed corpus filtered by the storage-tier predicates the capability + * pushes down (severity + ruleId + limit). Everything else on IGraphStore + * throws so an accidental new read is caught loudly. + */ +function fakeStore(corpus: readonly FindingNode[]): { + store: CapabilityStore; + lastOpts: () => ListFindingsOptions | undefined; +} { + let captured: ListFindingsOptions | undefined; + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listFindings") { + return async (opts?: ListFindingsOptions): Promise => { + captured = opts; + let rows = corpus; + if (opts?.severity !== undefined) { + const set = new Set(opts.severity); + rows = rows.filter((f) => set.has(f.severity as "note" | "warning" | "error")); + } + if (opts?.ruleId !== undefined) rows = rows.filter((f) => f.ruleId === opts.ruleId); + if (opts?.limit !== undefined) rows = rows.slice(0, opts.limit); + return rows; + }; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in findings capability test`); + }, + }); + const store: CapabilityStore = { + graph, + temporal: {} as CapabilityStore["temporal"], + }; + return { store, lastOpts: () => captured }; +} + +function ctxFor(corpus: readonly FindingNode[]): { + ctx: CapabilityContext; + lastOpts: () => ListFindingsOptions | undefined; +} { + const { store, lastOpts } = fakeStore(corpus); + return { ctx: { store, repoName: "demo-repo" }, lastOpts }; +} + +async function run(input: FindingsInput, corpus: readonly FindingNode[]) { + const { ctx, lastOpts } = ctxFor(corpus); + const out = await 
findingsCapability.execute(input, ctx); + return { out, lastOpts }; +} + +test("findings: projects rows, echoes repoName, defaults limit to 500", async () => { + const { out, lastOpts } = await run({}, [finding({ id: "f1" }), finding({ id: "f2" })]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.total, 2); + assert.equal(out.findings.length, 2); + assert.equal(lastOpts()?.limit, 500, "default limit pushed to the storage tier"); + const r = out.findings[0]; + assert.equal(r?.id, "f1"); + assert.equal(r?.scanner, "semgrep"); + assert.equal(r?.severity, "warning"); +}); + +test("findings: severity + ruleId are pushed to the storage tier", async () => { + const { out, lastOpts } = await run({ severity: "error", ruleId: "rule-x" }, [ + finding({ id: "e1", severity: "error", ruleId: "rule-x" }), + finding({ id: "w1", severity: "warning", ruleId: "rule-x" }), + finding({ id: "e2", severity: "error", ruleId: "rule-y" }), + ]); + assert.deepEqual(lastOpts()?.severity, ["error"], "severity pushed down"); + assert.equal(lastOpts()?.ruleId, "rule-x", "ruleId pushed down"); + assert.equal(out.total, 1); + assert.equal(out.findings[0]?.id, "e1"); + assert.deepEqual(out.filters, { severity: "error", ruleId: "rule-x" }); +}); + +test("findings: severity='none' is NOT pushed down; filtered in TS to none-only", async () => { + const { out, lastOpts } = await run({ severity: "none" }, [ + finding({ id: "n1", severity: "none" }), + finding({ id: "w1", severity: "warning" }), + ]); + assert.equal(lastOpts()?.severity, undefined, "'none' must not reach the storage tier"); + assert.equal(out.total, 1); + assert.equal(out.findings[0]?.id, "n1"); +}); + +test("findings: scanner + filePath substring are applied in the TS post-finder", async () => { + const { out } = await run({ scanner: "osv-scanner", filePath: "pkg/" }, [ + finding({ id: "a", scannerId: "osv-scanner", filePath: "pkg/dep.ts" }), + finding({ id: "b", scannerId: "semgrep", filePath: "pkg/dep.ts" }), // wrong 
scanner + finding({ id: "c", scannerId: "osv-scanner", filePath: "src/app.ts" }), // path miss + ]); + assert.equal(out.total, 1); + assert.equal(out.findings[0]?.id, "a"); + assert.deepEqual(out.filters, { scanner: "osv-scanner", filePath: "pkg/" }); +}); + +test("findings: startLine/endLine included only when finite; missing fields fall back", async () => { + const { out } = await run({}, [ + finding({ id: "withLines", startLine: 3, endLine: 7 }), + finding({ id: "noLines" }), + ]); + const withLines = out.findings.find((f) => f.id === "withLines"); + const noLines = out.findings.find((f) => f.id === "noLines"); + assert.equal(withLines?.startLine, 3); + assert.equal(withLines?.endLine, 7); + assert.equal(noLines?.startLine, undefined, "absent startLine stays absent"); + assert.equal(noLines?.endLine, undefined); +}); + +test("findings: empty corpus yields total 0 and empty filters when unfiltered", async () => { + const { out } = await run({}, []); + assert.equal(out.total, 0); + assert.equal(out.findings.length, 0); + assert.deepEqual(out.filters, {}); +}); diff --git a/packages/core-ops/src/caps/findings.ts b/packages/core-ops/src/caps/findings.ts new file mode 100644 index 00000000..f8aeadb2 --- /dev/null +++ b/packages/core-ops/src/caps/findings.ts @@ -0,0 +1,116 @@ +/** + * `findingsCapability` — the shared reader/filter/projection behind the MCP + * `list_findings` tool and the CLI `codehub findings` command. + * + * Lifted verbatim from the byte-identical bodies of + * `mcp/src/tools/list-findings.ts` and `cli/src/commands/findings.ts` (audit + * findings D4/D7): the `listFindings` push-down (severity + ruleId narrowed at + * the storage tier), the TS post-finder (`severity==="none"`, `scanner`, and + * `filePath` substring), and the row projection through the one canonical + * `stringOr`. Each surface now maps `FindingsOutput` into its own transport. 
+ */ + +import type { Capability, CapabilityContext } from "../capability.js"; +import { stringOr } from "../string-or.js"; + +/** + * The validated, plain input `findingsCapability.execute` consumes. Each + * surface validates its own transport shape into this: the MCP tool via its + * SDK zod `inputSchema`, the CLI via coerced commander flags. `repo`/`repo_uri` + * are resolved to a concrete store by the surface BEFORE `execute` runs, so + * they are not read here — they live on the input only so a surface can pass + * its parsed args object through unchanged. + */ +export interface FindingsInput { + readonly repo?: string; + readonly repo_uri?: string; + readonly severity?: "error" | "warning" | "note" | "none"; + readonly scanner?: string; + readonly ruleId?: string; + readonly filePath?: string; + readonly limit?: number; +} + +/** One projected finding row — the plain shape both surfaces render. */ +export interface FindingRow { + readonly id: string; + readonly scanner: string; + readonly ruleId: string; + readonly severity: string; + readonly message: string; + readonly filePath: string; + readonly startLine?: number; + readonly endLine?: number; + readonly properties: Record; +} + +/** The applied filters, echoed back so presenters can label the output. */ +export interface FindingsFilters { + readonly severity?: string; + readonly scanner?: string; + readonly ruleId?: string; + readonly filePath?: string; +} + +export interface FindingsOutput { + readonly repoName: string; + readonly findings: readonly FindingRow[]; + readonly total: number; + readonly filters: FindingsFilters; +} + +export const findingsCapability: Capability = { + id: "findings", + async execute(input: FindingsInput, ctx: CapabilityContext): Promise { + const limit = input.limit ?? 500; + + // Push severity + ruleId into the storage tier; scanner + filePath + // substring + the `severity==="none"` case are applied in the TS + // post-finder below (we never pass `none` to listFindings). 
+ const findingsOpts: { + severity?: readonly ("note" | "warning" | "error")[]; + ruleId?: string; + limit?: number; + } = { limit }; + if ( + input.severity !== undefined && + (input.severity === "note" || input.severity === "warning" || input.severity === "error") + ) { + findingsOpts.severity = [input.severity]; + } + if (input.ruleId !== undefined) findingsOpts.ruleId = input.ruleId; + const all = await ctx.store.graph.listFindings(findingsOpts); + + const filtered = all.filter((f) => { + if (input.severity === "none" && f.severity !== "none") return false; + if (input.scanner !== undefined && f.scannerId !== input.scanner) return false; + if (input.filePath !== undefined && !f.filePath.includes(input.filePath)) return false; + return true; + }); + + const findings: FindingRow[] = filtered.map((f) => ({ + id: f.id, + scanner: stringOr(f.scannerId, "unknown"), + ruleId: stringOr(f.ruleId, ""), + severity: stringOr(f.severity, "note"), + message: stringOr(f.message, ""), + filePath: stringOr(f.filePath, ""), + properties: f.propertiesBag, + ...(typeof f.startLine === "number" && Number.isFinite(f.startLine) + ? { startLine: f.startLine } + : {}), + ...(typeof f.endLine === "number" && Number.isFinite(f.endLine) + ? { endLine: f.endLine } + : {}), + })); + + const filters: FindingsFilters = { + ...(input.severity !== undefined ? { severity: input.severity } : {}), + ...(input.scanner !== undefined ? { scanner: input.scanner } : {}), + ...(input.ruleId !== undefined ? { ruleId: input.ruleId } : {}), + ...(input.filePath !== undefined ? 
{ filePath: input.filePath } : {}), + }; + + return { repoName: ctx.repoName, findings, total: findings.length, filters }; + }, +}; diff --git a/packages/core-ops/src/caps/license-audit.test.ts b/packages/core-ops/src/caps/license-audit.test.ts new file mode 100644 index 00000000..4262f4ac --- /dev/null +++ b/packages/core-ops/src/caps/license-audit.test.ts @@ -0,0 +1,80 @@ +/** + * Unit tests for `licenseAuditCapability.execute` — the shared reader/classifier + * lifted from the MCP `license_audit` tool. Exercises `execute` directly against + * a fake `CapabilityStore`; the tier logic itself is covered exhaustively in + * `@opencodehub/analysis` `license-classify.test.ts`, so here we assert only the + * read + projection + hand-off to `classifyDependencies`. + */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { DependencyNode, NodeId } from "@opencodehub/core-types"; +import type { IGraphStore, ListDependenciesOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { type LicenseAuditInput, licenseAuditCapability } from "./license-audit.js"; + +function dep(over: Omit, "id"> & { id: string }): DependencyNode { + return { + kind: "Dependency", + name: over.id, + filePath: "package.json", + version: "1.0.0", + ecosystem: "npm", + lockfileSource: "package-lock.json", + ...over, + id: over.id as NodeId, + } as DependencyNode; +} + +function fakeStore(corpus: readonly DependencyNode[]): CapabilityStore { + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listDependencies") { + return async (_opts?: ListDependenciesOptions): Promise => + corpus; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in license-audit capability test`); + }, + }); + return { graph, temporal: {} as CapabilityStore["temporal"] }; +} + +async function run(corpus: readonly DependencyNode[], input: LicenseAuditInput = {}) { + const ctx: 
CapabilityContext = { store: fakeStore(corpus), repoName: "demo-repo" }; + return licenseAuditCapability.execute(input, ctx); +} + +test("license-audit: echoes repoName and classifies an all-clear set as OK", async () => { + const out = await run([ + dep({ id: "lodash", license: "MIT" }), + dep({ id: "axios", license: "Apache-2.0" }), + ]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.result.tier, "OK"); + assert.equal(out.result.summary.total, 2); + assert.equal(out.result.summary.flaggedCount, 0); +}); + +test("license-audit: missing license → UNKNOWN sentinel → WARN tier", async () => { + // `dep()` sets no default license, so `mystery` arrives with license absent — + // the projection's `stringOr(d.license, "UNKNOWN")` yields the UNKNOWN sentinel. + const out = await run([dep({ id: "mystery" }), dep({ id: "good", license: "MIT" })]); + assert.equal(out.result.tier, "WARN"); + assert.equal(out.result.flagged.unknown.length, 1); + assert.equal(out.result.flagged.unknown[0]?.name, "mystery"); +}); + +test("license-audit: a copyleft dep drives BLOCK", async () => { + const out = await run([ + dep({ id: "readline", license: "GPL-3.0" }), + dep({ id: "good", license: "MIT" }), + ]); + assert.equal(out.result.tier, "BLOCK"); + assert.equal(out.result.flagged.copyleft.length, 1); +}); + +test("license-audit: empty corpus classifies as OK with zero total", async () => { + const out = await run([]); + assert.equal(out.result.tier, "OK"); + assert.equal(out.result.summary.total, 0); +}); diff --git a/packages/core-ops/src/caps/license-audit.ts b/packages/core-ops/src/caps/license-audit.ts new file mode 100644 index 00000000..266389c0 --- /dev/null +++ b/packages/core-ops/src/caps/license-audit.ts @@ -0,0 +1,52 @@ +/** + * `licenseAuditCapability` — the shared reader/classifier behind the MCP + * `license_audit` tool (and, once the CLI adopts it, `codehub license-audit`). 
+ * + * Lifted verbatim from the body of `mcp/src/tools/license-audit.ts`: read every + * Dependency node, project each into a `DependencyRef` through the one canonical + * `stringOr`, then hand the set to `classifyDependencies` (the pure tier logic in + * `@opencodehub/analysis`). The surface maps `LicenseAuditOutput` into its own + * transport (text body + next_steps + staleness envelope). + */ + +import { + classifyDependencies, + type DependencyRef, + type LicenseAuditResult, +} from "@opencodehub/analysis"; +import type { Capability, CapabilityContext } from "../capability.js"; +import { stringOr } from "../string-or.js"; + +/** + * The validated, plain input `licenseAuditCapability.execute` consumes. + * `repo`/`repo_uri` are resolved to a concrete store by the surface BEFORE + * `execute` runs; they live on the input only so a surface can pass its parsed + * args object through unchanged. + */ +export interface LicenseAuditInput { + readonly repo?: string; + readonly repo_uri?: string; +} + +export interface LicenseAuditOutput { + readonly repoName: string; + readonly result: LicenseAuditResult; +} + +export const licenseAuditCapability: Capability = { + id: "license_audit", + async execute(_input: LicenseAuditInput, ctx: CapabilityContext): Promise { + const all = await ctx.store.graph.listDependencies(); + const deps: DependencyRef[] = all.map((d) => ({ + id: d.id, + name: d.name, + version: stringOr(d.version, "UNKNOWN"), + ecosystem: stringOr(d.ecosystem, "unknown"), + license: stringOr(d.license, "UNKNOWN"), + lockfileSource: stringOr(d.lockfileSource, d.filePath), + })); + + const result = classifyDependencies(deps); + return { repoName: ctx.repoName, result }; + }, +}; diff --git a/packages/core-ops/src/caps/project-profile.test.ts b/packages/core-ops/src/caps/project-profile.test.ts new file mode 100644 index 00000000..9d10519c --- /dev/null +++ b/packages/core-ops/src/caps/project-profile.test.ts @@ -0,0 +1,94 @@ +/** + * Unit tests for 
`projectProfileCapability.execute` — the shared singleton + * reader/decoder lifted from the MCP `project_profile` tool. Exercises `execute` + * directly against a fake `CapabilityStore`. + */ + +import assert from "node:assert/strict"; +import { test } from "node:test"; +import type { NodeId, ProjectProfileNode } from "@opencodehub/core-types"; +import type { IGraphStore, ListNodesByKindOptions } from "@opencodehub/storage"; +import type { CapabilityContext, CapabilityStore } from "../capability.js"; +import { projectProfileCapability } from "./project-profile.js"; + +function profileNode(over: Partial): ProjectProfileNode { + return { + kind: "ProjectProfile", + id: "ProjectProfile:.:profile" as NodeId, + name: "profile", + filePath: ".", + languages: [], + frameworks: [], + iacTypes: [], + apiContracts: [], + manifests: [], + srcDirs: [], + ...over, + } as ProjectProfileNode; +} + +function fakeStore(nodes: readonly ProjectProfileNode[]): CapabilityStore { + const graph = new Proxy({} as IGraphStore, { + get(_t, prop) { + if (prop === "listNodesByKind") { + return async ( + _kind: string, + _opts?: ListNodesByKindOptions, + ): Promise => nodes; + } + throw new Error(`unexpected IGraphStore.${String(prop)} in project-profile capability test`); + }, + }); + return { graph, temporal: {} as CapabilityStore["temporal"] }; +} + +async function run(nodes: readonly ProjectProfileNode[]) { + const ctx: CapabilityContext = { store: fakeStore(nodes), repoName: "demo-repo" }; + return projectProfileCapability.execute({}, ctx); +} + +test("project-profile: decodes arrays, echoes repoName, flags profileExists", async () => { + const out = await run([ + profileNode({ + languages: ["typescript", "python"], + frameworks: ["nextjs"], + iacTypes: ["terraform"], + apiContracts: ["openapi"], + manifests: ["package.json"], + srcDirs: ["src"], + }), + ]); + assert.equal(out.repoName, "demo-repo"); + assert.equal(out.profileExists, true); + assert.deepEqual([...out.profile.languages], 
["typescript", "python"]); + assert.deepEqual([...out.profile.frameworks], ["nextjs"]); + assert.deepEqual([...out.profile.iacTypes], ["terraform"]); + assert.deepEqual([...out.profile.apiContracts], ["openapi"]); + assert.equal(out.profile.frameworksDetected.length, 0, "absent frameworksDetected → empty array"); +}); + +test("project-profile: carries structured frameworksDetected when present", async () => { + const out = await run([ + profileNode({ + frameworks: ["nextjs"], + frameworksDetected: [ + { + name: "nextjs", + category: "meta", + variant: "app-router", + confidence: "deterministic", + evidence: [], + }, + ], + }), + ]); + assert.equal(out.profile.frameworksDetected.length, 1); + assert.equal(out.profile.frameworksDetected[0]?.variant, "app-router"); +}); + +test("project-profile: no node → profileExists false with empty arrays", async () => { + const out = await run([]); + assert.equal(out.profileExists, false); + assert.equal(out.profile.languages.length, 0); + assert.equal(out.profile.srcDirs.length, 0); +}); diff --git a/packages/core-ops/src/caps/project-profile.ts b/packages/core-ops/src/caps/project-profile.ts new file mode 100644 index 00000000..d798101e --- /dev/null +++ b/packages/core-ops/src/caps/project-profile.ts @@ -0,0 +1,65 @@ +/** + * `projectProfileCapability` — the shared reader behind the MCP + * `project_profile` tool (and, once the CLI adopts it, `codehub profile`). + * + * Lifted verbatim from the body of `mcp/src/tools/project-profile.ts`: read the + * singleton ProjectProfile node, decode every array column back into a plain + * array, and report whether the node existed at all (so the surface can nudge + * toward `codehub analyze --force`). The surface maps `ProjectProfileOutput` + * into its own transport (text body + next_steps + staleness envelope). 
+ */ + +import type { FrameworkDetection } from "@opencodehub/core-types"; +import type { Capability, CapabilityContext } from "../capability.js"; + +/** + * The validated, plain input `projectProfileCapability.execute` consumes. + * `repo`/`repo_uri` are resolved to a concrete store by the surface BEFORE + * `execute` runs; they live on the input only so a surface can pass its parsed + * args object through unchanged. + */ +export interface ProjectProfileInput { + readonly repo?: string; + readonly repo_uri?: string; +} + +export interface ProjectProfilePayload { + readonly languages: readonly string[]; + /** Flat-string framework view (backward-compat). */ + readonly frameworks: readonly string[]; + /** Structured framework detections with variant / version / confidence / parent. */ + readonly frameworksDetected: readonly FrameworkDetection[]; + readonly iacTypes: readonly string[]; + readonly apiContracts: readonly string[]; + readonly manifests: readonly string[]; + readonly srcDirs: readonly string[]; +} + +export interface ProjectProfileOutput { + readonly repoName: string; + /** Whether a ProjectProfile node was present (drives the surface's hint). */ + readonly profileExists: boolean; + readonly profile: ProjectProfilePayload; +} + +export const projectProfileCapability: Capability = { + id: "project_profile", + async execute( + _input: ProjectProfileInput, + ctx: CapabilityContext, + ): Promise { + const nodes = await ctx.store.graph.listNodesByKind("ProjectProfile", { limit: 1 }); + const profile = nodes[0]; + const payload: ProjectProfilePayload = { + languages: profile?.languages ? [...profile.languages] : [], + frameworks: profile?.frameworks ? [...profile.frameworks] : [], + frameworksDetected: profile?.frameworksDetected ? [...profile.frameworksDetected] : [], + iacTypes: profile?.iacTypes ? [...profile.iacTypes] : [], + apiContracts: profile?.apiContracts ? [...profile.apiContracts] : [], + manifests: profile?.manifests ? 
[...profile.manifests] : [], + srcDirs: profile?.srcDirs ? [...profile.srcDirs] : [], + }; + + return { repoName: ctx.repoName, profileExists: profile !== undefined, profile: payload }; + }, +}; diff --git a/packages/core-ops/src/index.ts b/packages/core-ops/src/index.ts new file mode 100644 index 00000000..f4a35424 --- /dev/null +++ b/packages/core-ops/src/index.ts @@ -0,0 +1,33 @@ +export type { Capability, CapabilityContext, CapabilityStore } from "./capability.js"; +export { + type ContextInput, + type ContextOutput, + type ContextProcessParticipation, + contextCapability, +} from "./caps/context.js"; +export { + type DependenciesFilters, + type DependenciesInput, + type DependenciesOutput, + type DependencyRow, + dependenciesCapability, +} from "./caps/dependencies.js"; +export { + type FindingRow, + type FindingsFilters, + type FindingsInput, + type FindingsOutput, + findingsCapability, +} from "./caps/findings.js"; +export { + type LicenseAuditInput, + type LicenseAuditOutput, + licenseAuditCapability, +} from "./caps/license-audit.js"; +export { + type ProjectProfileInput, + type ProjectProfileOutput, + type ProjectProfilePayload, + projectProfileCapability, +} from "./caps/project-profile.js"; +export { stringOr } from "./string-or.js"; diff --git a/packages/core-ops/src/string-or.ts b/packages/core-ops/src/string-or.ts new file mode 100644 index 00000000..564e4a10 --- /dev/null +++ b/packages/core-ops/src/string-or.ts @@ -0,0 +1,13 @@ +/** + * The one canonical `stringOr`. Coerces a value to a string: passes strings + * through, stringifies numbers/booleans, and falls back otherwise. + * + * This was copy-pasted byte-identically across the MCP tools and CLI commands + * (tech-debt audit finding D7 — 7 files). Capabilities and their adapters + * import it from here so a change lands once. 
+ */ +export function stringOr(v: unknown, fallback: string): string { + if (typeof v === "string") return v; + if (typeof v === "number" || typeof v === "boolean") return String(v); + return fallback; +} diff --git a/packages/core-ops/tsconfig.json b/packages/core-ops/tsconfig.json new file mode 100644 index 00000000..293f2428 --- /dev/null +++ b/packages/core-ops/tsconfig.json @@ -0,0 +1,14 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "composite": true + }, + "references": [ + { "path": "../core-types" }, + { "path": "../storage" }, + { "path": "../analysis" } + ], + "include": ["src/**/*", "test/**/*"] +} diff --git a/packages/core-types/src/index.ts b/packages/core-types/src/index.ts index 638f72d2..be351fc7 100644 --- a/packages/core-types/src/index.ts +++ b/packages/core-types/src/index.ts @@ -7,6 +7,7 @@ export type { EdgeId, MakeNodeIdOptions, NodeId, ParsedNodeId } from "./id.js"; export { makeEdgeId, makeNodeId, parseNodeId } from "./id.js"; export type { LanguageId } from "./language-id.js"; export { PROVENANCE_PREFIXES, SCIP_PROVENANCE_PREFIXES } from "./lsp-provenance.js"; +export { NODE_COLUMNS, RELATION_COLUMNS } from "./node-columns.js"; export type { AnnotationNode, ClassNode, diff --git a/packages/core-types/src/node-columns.ts b/packages/core-types/src/node-columns.ts new file mode 100644 index 00000000..82bf59c6 --- /dev/null +++ b/packages/core-types/src/node-columns.ts @@ -0,0 +1,145 @@ +/** + * Canonical column rosters for the polymorphic graph store — pure, + * dependency-free. + * + * These two lists are the SINGLE SOURCE OF TRUTH for the logical field + * vocabulary of the graph. 
They live in `@opencodehub/core-types` (the + * deepest, zero-runtime-dep package) so every downstream consumer stays in + * lock-step: + * - `@opencodehub/storage` re-exports {@link NODE_COLUMNS} from + * `column-encode.ts` as the encoder's canonical field ordering, which a + * community-fork `IGraphStore` adapter (AGE / Memgraph / Neo4j / Neptune) + * consumes when it stores the universal base as typed columns. + * - `@opencodehub/mcp` advertises both lists in the + * `codehub://repo/{name}/schema` resource so SQL-authoring agents see the + * full logical field vocabulary they may filter on. + * + * Keeping one copy here fixes the staleness class where a hand-maintained + * duplicate silently truncated the advertised roster. + */ + +/** + * Canonical field ordering for the polymorphic `nodes` table (73 entries). + * The shared reference a community-fork adapter (AGE / Memgraph / Neo4j / + * Neptune) consumes when it stores the universal base as typed columns. + * + * The in-tree `SqliteStore` (ADR 0019) stores only the universal base + * (`id, kind, name, file_path, start_line, end_line`) as typed columns and + * folds every remaining kind-specific field into a single canonical-JSON + * `payload` column, so adding a kind-specific field needs NO schema change + * there — it round-trips through `payload` automatically. The `[]`-vs-absent + * and `{}`-vs-absent distinctions are preserved by `canonicalJson` over + * `payload`, not by per-column encoding. + * + * Rules for a fork that DOES store a new field as a typed column: + * 1. Append to the END of this list — reordering rewrites every prepared + * statement parameter slot and breaks already-persisted graphs. + * 2. Append the writer in `nodeToColumns` (`@opencodehub/storage`). + * 3. Append the reader in the adapter's row decoder. + * 4. Update that adapter's CREATE TABLE DDL to keep the on-disk schema in + * lock step with this list. + * + * ORDER IS APPEND-ONLY AND LOAD-BEARING — never reorder. 
+ */ +export const NODE_COLUMNS: readonly string[] = [ + "id", + "kind", + "name", + "file_path", + "start_line", + "end_line", + "is_exported", + "signature", + "parameter_count", + "return_type", + "declared_type", + "owner", + "url", + "method", + "tool_name", + "content", + "content_hash", + "inferred_label", + "symbol_count", + "cohesion", + "keywords", + "entry_point_id", + "step_count", + "level", + "response_keys", + "description", + // Finding + "severity", + "rule_id", + "scanner_id", + "message", + "properties_bag", + // Dependency + "version", + "license", + "lockfile_source", + "ecosystem", + // Operation + "http_method", + "http_path", + "summary", + "operation_id", + // Contributor + "email_hash", + "email_plain", + // ProjectProfile + "languages_json", + "frameworks_json", + "iac_types_json", + "api_contracts_json", + "manifests_json", + "src_dirs_json", + // File ownership (H.5) + Community ownership (H.4) + "orphan_grade", + "is_orphan", + "truck_factor", + "ownership_drift_30d", + "ownership_drift_90d", + "ownership_drift_365d", + // v1.2 extensions (append-only). + "deadness", + "coverage_percent", + "covered_lines_json", + "cyclomatic_complexity", + "nesting_depth", + "nloc", + "halstead_volume", + "input_schema_json", + "partial_fingerprint", + "baseline_state", + "suppressed_json", + // Repo. + "origin_url", + "repo_uri", + "default_branch", + "commit_sha", + "index_time", + "repo_group", + "visibility", + "indexer", + "language_stats_json", +]; + +/** + * Logical column roster for the polymorphic `relations` (edges) table + * (7 entries) as advertised to SQL-authoring agents. + * + * These are LOGICAL names. The physical SQLite DDL names the endpoint columns + * `src`/`dst`, but the advertised/logical roster uses `from_id`/`to_id` — do + * not "fix" these to `src`/`dst`; that would change the schema resource's + * advertised output and break the honest logical vocabulary. 
+ */ +export const RELATION_COLUMNS: readonly string[] = [ + "id", + "from_id", + "to_id", + "type", + "confidence", + "reason", + "step", +]; diff --git a/packages/docs/astro.config.mjs b/packages/docs/astro.config.mjs index 96bb099b..dfd62e69 100644 --- a/packages/docs/astro.config.mjs +++ b/packages/docs/astro.config.mjs @@ -20,7 +20,7 @@ export default defineConfig({ starlight({ title: "OpenCodeHub", description: - "Apache-2.0 code intelligence graph + MCP server for AI coding agents. 30 tools, 15 GA languages, lbug graph + DuckDB temporal, WASM-only parsing, deterministic, offline-capable.", + "Apache-2.0 code intelligence graph + MCP server for AI coding agents. 29 tools, 15 GA languages, single-file SQLite storage, WASM-only parsing, deterministic, offline-capable.", logo: { src: "./src/assets/logo.svg", replacesTitle: false, @@ -48,7 +48,7 @@ export default defineConfig({ description: "Apache-2.0 code intelligence graph + MCP server for AI coding agents. Gives agents callers, callees, processes, and blast radius in one MCP tool call — local, offline-capable, deterministic.", details: - "OpenCodeHub indexes a repository into a hybrid structural + semantic knowledge graph and exposes it over the Model Context Protocol (MCP) to AI coding agents. The MCP server registers 30 tools across five families — exploration (list_repos, query, context, impact, detect_changes, rename, sql, signature), group / federation (group_list, group_query, group_status, group_contracts, group_cross_repo_links, group_sync), scan / findings / verdict (scan, list_findings, list_findings_delta, list_dead_code, remove_dead_code, license_audit, verdict, risk_trends), HTTP / routing (route_map, api_impact, shape_check, tool_map), and meta (project_profile, dependencies, owners, pack_codebase). The CLI binary is `codehub`. 
Runtime: Node 20, 22, or 24, pnpm 11, lbug graph store (graph.lbug) + DuckDB temporal sibling (temporal.duckdb), always both, no backend selector (ADR 0016), web-tree-sitter (WASM) is the only parse runtime with all 15 grammar `.wasm` blobs vendored at packages/ingestion/vendor/wasms/, 15 GA languages, SCIP indexers for TypeScript / TSX / JavaScript / Python / Go / Rust / Java / C# / C / C++ / Kotlin / Ruby. 19-scanner inventory. Apache-2.0 end to end. Repos are first-class graph nodes (`repo_uri`); the cross-repo `group_*` family fans out over named groups; AMBIGUOUS_REPO error envelope returns `choices[]` so a caller can retry deterministically.", + "OpenCodeHub indexes a repository into a hybrid structural + semantic knowledge graph and exposes it over the Model Context Protocol (MCP) to AI coding agents. The MCP server registers 29 tools across five families — exploration (list_repos, query, context, impact, detect_changes, sql, signature), group / federation (group_list, group_query, group_status, group_contracts, group_cross_repo_links, group_sync), scan / findings / verdict (scan, list_findings, list_findings_delta, list_dead_code, license_audit, verdict, change_pack, risk_trends), HTTP / routing (route_map, api_impact, shape_check, tool_map), and meta (project_profile, dependencies, owners, pack_codebase). The CLI binary is `codehub`. Runtime: Node 24.15+, pnpm 11, a single-file SQLite store (`/.codehub/store.sqlite`, via Node's built-in node:sqlite) that backs graph nodes, edges, embeddings, and the temporal tables — zero native storage bindings (ADR 0019), web-tree-sitter (WASM) is the only parse runtime with all 15 grammar `.wasm` blobs vendored at packages/ingestion/vendor/wasms/, 15 GA languages, SCIP indexers for TypeScript / TSX / JavaScript / Python / Go / Rust / Java / C# / C / C++ / Kotlin / Ruby. 19-scanner inventory. Apache-2.0 end to end. 
Repos are first-class graph nodes (`repo_uri`); the cross-repo `group_*` family fans out over named groups; AMBIGUOUS_REPO error envelope returns `choices[]` so a caller can retry deterministically.", promote: [ "start-here/**", "agents/**", @@ -80,7 +80,7 @@ export default defineConfig({ label: "agents", paths: ["agents/**", "mcp/**"], description: - "Agent-side reference: per-editor MCP setup, the 30-tool catalog, tool decision matrix, idiomatic prompts.", + "Agent-side reference: per-editor MCP setup, the 29-tool catalog, tool decision matrix, idiomatic prompts.", }, { label: "mcp", diff --git a/packages/docs/public/tool-catalog.json b/packages/docs/public/tool-catalog.json index 33d7b1b6..eac10b6a 100644 --- a/packages/docs/public/tool-catalog.json +++ b/packages/docs/public/tool-catalog.json @@ -1,14 +1,14 @@ { "$schema": "https://opencodehub.dev/schemas/tool-catalog-v1.json", "version": "1.0.0", - "description": "Machine-readable catalog of the 28 MCP tools the OpenCodeHub server registers. Every tool is read-only with respect to user source — no tool edits the working tree. Generated to be fetched by an AI coding agent that wants the catalog without scraping the docs.", + "description": "Machine-readable catalog of the 29 MCP tools the OpenCodeHub server registers. Every tool is read-only with respect to user source — no tool edits the working tree. Generated to be fetched by an AI coding agent that wants the catalog without scraping the docs.", "server": { "name": "opencodehub", "transport": "stdio", "launch_command": "codehub mcp", "capabilities": ["tools", "resources"] }, - "tool_count": 28, + "tool_count": 29, "families": { "exploration": "High-frequency code-graph tools.", "group": "Cross-repo federation tools (require a named group).", @@ -65,7 +65,7 @@ { "name": "sql", "family": "exploration", - "description": "Read-only SQL against the DuckDB temporal store (cochanges + symbol_summaries). 5-second timeout. 
The node/edge graph is queried via the typed tools or Cypher.", + "description": "Read-only SQL against the temporal store (cochanges + symbol_summaries). 5-second timeout. The node/edge graph is queried via the typed tools or Cypher.", "when_to_use": "Custom view of the temporal store (cochanges + symbol_summaries) that no other tool exposes.", "when_not_to_use": "A typed tool (context, impact, query) already covers the question, or you need the node/edge graph (reach it via the typed tools or Cypher).", "signature_sketch": "sql({query, repo?, repo_uri?}) -> {rows, row_count, next_steps}", @@ -188,6 +188,15 @@ "signature_sketch": "verdict({repo?, repo_uri?, base?, head?}) -> {tier, exit_code, reasons, signals}", "example": "verdict({base: 'main', head: 'HEAD'})" }, + { + "name": "change_pack", + "family": "scan", + "description": "Deterministic, diff-scoped context pack: changed symbols plus their upstream impacted subgraph, the 5-tier verdict, affected tests, and a token-cost estimate.", + "when_to_use": "Hand a CI agent everything a diff touches in one read-only payload.", + "when_not_to_use": "Whole-repo snapshot — call pack_codebase; plain merge gate — call verdict.", + "signature_sketch": "change_pack({repo?, repo_uri?, base?, head?, depth?, budget?}) -> {changed, impacted_subgraph, verdict, affected_tests, cost_estimate}", + "example": "change_pack({base: 'main', head: 'HEAD'})" + }, { "name": "risk_trends", "family": "scan", diff --git a/packages/docs/src/content/docs/agents/discovery-and-resources.mdx b/packages/docs/src/content/docs/agents/discovery-and-resources.mdx index 1d14c4e5..a0af99e1 100644 --- a/packages/docs/src/content/docs/agents/discovery-and-resources.mdx +++ b/packages/docs/src/content/docs/agents/discovery-and-resources.mdx @@ -57,7 +57,7 @@ submission status on the [MCP registries page](/opencodehub/agents/registries/). 
## Source of truth for tool inventory -The MCP server registers 28 tools at +The MCP server registers 29 tools at [`packages/mcp/src/server.ts`](https://github.com/theagenticguy/opencodehub/blob/main/packages/mcp/src/server.ts). Grep for `register[A-Z][a-zA-Z]+Tool\(server` to see the live list. If this site or any registry disagrees with the file, the file wins. diff --git a/packages/docs/src/content/docs/agents/editors/claude-code.mdx b/packages/docs/src/content/docs/agents/editors/claude-code.mdx index 907d7c6e..0353d36d 100644 --- a/packages/docs/src/content/docs/agents/editors/claude-code.mdx +++ b/packages/docs/src/content/docs/agents/editors/claude-code.mdx @@ -36,7 +36,7 @@ The `.mcp.json` shape: } ``` -`codehub mcp` runs the stdio MCP server. The 28 tools register under +`codehub mcp` runs the stdio MCP server. The 29 tools register under the `mcp__opencodehub__*` namespace. Every one is read-only with respect to your source — no tool edits the working tree. @@ -122,7 +122,7 @@ In a Claude Code session, ask: which OpenCodeHub tools do you see? ``` -The agent should list 28 tools, all under `mcp__opencodehub__*`. If it +The agent should list 29 tools, all under `mcp__opencodehub__*`. If it sees zero, the most common causes are: Claude Code wasn't restarted after `codehub init`, or `codehub` is not on PATH for the editor's process (try launching the editor from a shell that has `codehub` diff --git a/packages/docs/src/content/docs/agents/editors/cursor.mdx b/packages/docs/src/content/docs/agents/editors/cursor.mdx index 52709a08..010ba014 100644 --- a/packages/docs/src/content/docs/agents/editors/cursor.mdx +++ b/packages/docs/src/content/docs/agents/editors/cursor.mdx @@ -32,7 +32,7 @@ define the same server name. ``` That is the entire config. `codehub mcp` runs the stdio MCP server -and registers all 28 tools under `mcp__opencodehub__*`. +and registers all 29 tools under `mcp__opencodehub__*`. 
If `codehub` is not on your shell PATH (Cursor inherits the GUI app's environment, not your shell's), substitute the absolute path: @@ -55,7 +55,7 @@ Find the path with `which codehub` in your terminal. 1. Restart Cursor (the agent only loads MCP servers at startup). 2. Open the chat panel. 3. Ask: `which OpenCodeHub tools do you see?` -4. Expect 28 tools listed under `mcp__opencodehub__*`. +4. Expect 29 tools listed under `mcp__opencodehub__*`. If you see zero tools, check Cursor's MCP debug pane (Settings → MCP) for the server's stderr. The most common cause is `codehub` not being diff --git a/packages/docs/src/content/docs/agents/editors/opencode.mdx b/packages/docs/src/content/docs/agents/editors/opencode.mdx index 0241ff84..7fe38256 100644 --- a/packages/docs/src/content/docs/agents/editors/opencode.mdx +++ b/packages/docs/src/content/docs/agents/editors/opencode.mdx @@ -62,7 +62,7 @@ If you need env vars: 1. Restart OpenCode (or reload the workspace). 2. Open a chat session. 3. Ask: `which OpenCodeHub tools do you see?` -4. Expect 28 tools under `mcp__opencodehub__*`. +4. Expect 29 tools under `mcp__opencodehub__*`. OpenCode logs MCP server stderr to its dev console — open it if the server fails to register. diff --git a/packages/docs/src/content/docs/agents/editors/windsurf.mdx b/packages/docs/src/content/docs/agents/editors/windsurf.mdx index 0b0ffcda..07cdaec7 100644 --- a/packages/docs/src/content/docs/agents/editors/windsurf.mdx +++ b/packages/docs/src/content/docs/agents/editors/windsurf.mdx @@ -38,7 +38,7 @@ servers, add `codehub` as a sibling key under `mcpServers`. 1. Fully restart Windsurf — Cascade only loads MCP servers at boot. 2. Open Cascade in any project. 3. Ask: `which OpenCodeHub tools do you see?` -4. Expect 28 tools under `mcp__opencodehub__*`. +4. Expect 29 tools under `mcp__opencodehub__*`. If Cascade reports zero tools, check the MCP server status pane in Cascade's settings — failed servers list their stderr there. 
The diff --git a/packages/docs/src/content/docs/agents/index.mdx b/packages/docs/src/content/docs/agents/index.mdx index 323caa36..b4b326ae 100644 --- a/packages/docs/src/content/docs/agents/index.mdx +++ b/packages/docs/src/content/docs/agents/index.mdx @@ -9,7 +9,7 @@ import { Card, CardGrid, LinkCard } from "@astrojs/starlight/components"; OpenCodeHub gives an AI coding agent a code graph it can query: callers, callees, processes, blast radius, owners, scanner findings, and a 5-tier -PR verdict — all behind 28 MCP tools served by one local binary. The +PR verdict — all behind 29 MCP tools served by one local binary. The graph is built deterministically from your repo and stored next to it. Other docs sections answer "what is OCH" and "how is it built." This @@ -40,7 +40,7 @@ codehub init # writes .mcp.json + links the Claude Code plugin codehub analyze # first index — 30s to a few minutes ``` -Restart your editor. Your agent now has 28 MCP tools, all prefixed +Restart your editor. Your agent now has 29 MCP tools, all prefixed `mcp__opencodehub__*`. See [Install](/opencodehub/agents/install/) for the full path or jump to the per-editor card below. @@ -65,7 +65,7 @@ the full path or jump to the per-editor card below. =20.0.0`. `npm install -g @opencodehub/cli@latest` does zero native -builds and zero GitHub fetches. Supersedes ADR 0013 (parse runtime). +(`web-tree-sitter`) is now the only parse runtime on Node ≥24.15. All +15 grammar `.wasm` blobs are vendored at +`packages/ingestion/vendor/wasms/`. `npm install -g @opencodehub/cli@latest` +does zero native builds and zero GitHub fetches. Supersedes ADR 0013 +(parse runtime). [Read ADR 0015](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0015-wasm-only-parser-at-the-npm-distributed-boundary.md) -### ADR 0016 — DuckDB graph rip-out +### ADR 0016 — Graph-backend rip-out -Remove the DuckDB graph backend, the `CODEHUB_STORE` env var, the -backend probe, and the single-file `graph.duckdb` layout. 
The graph -tier is always `@ladybugdb/core` (`graph.lbug`); the temporal tier is -always DuckDB (`temporal.duckdb`); both files are written on every -`analyze`, with no selector. A missing graph binding hard-fails with -`GraphDbBindingError`. The segregated `IGraphStore` / `ITemporalStore` -interfaces stay as the community-fork adapter contract. +Removes the `CODEHUB_STORE` env var, the backend probe, and the +selector, settling storage on a two-file native pair with the segregated +`IGraphStore` / `ITemporalStore` interfaces preserved for community +forks. **Superseded by ADR 0019**, which collapses that pair into one +`store.sqlite` file and removes both native storage bindings. The +segregated interfaces it kept survive unchanged. [Read ADR 0016](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0016-duckdb-graph-rip.md) +### ADR 0018 — Cleanroom tool-name provenance + +Records the cleanroom provenance of the route / tool / contract tool +names, documenting the independent-derivation trail for each name. + +[Read ADR 0018](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0018-cleanroom-tool-name-provenance.md) + +### ADR 0019 — Single-file SQLite storage + +Collapses the entire index into one `/.codehub/store.sqlite` file +(WAL mode) via Node's built-in `node:sqlite` (`DatabaseSync`, enabled by +default on Node ≥24.15). One `SqliteStore` implements both `IGraphStore` +and `ITemporalStore`; `openStore()` returns that single instance as both +the `graph` and `temporal` views, so call sites use `store.graph.X()` / +`store.temporal.Y()` unchanged. Both native storage bindings are removed +and the write-only Parquet embeddings sidecar is dropped, so the +code-pack becomes an 8-item BOM and the install carries zero native +storage dependencies. Every platform is supported, including Windows +arm64 and Linux musl (Alpine). Supersedes ADR 0016 in its entirety; the +segregated interfaces stay as the community-fork escape hatch. 
+ +[Read ADR 0019](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) + +### ADR 0020 — Decision-equivalence supersedes byte-identity + +Makes decision-equivalence the pack contract and treats byte-identity as +a witness rather than the contract itself. Pairs with the pack +determinism spec. + +[Read ADR 0020](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0020-decision-equivalence-supersedes-byte-identity.md) + ### ADR 0017 — Drop detect-secrets, tune betterleaks Remove `detect-secrets` from the scanner fleet in favour of diff --git a/packages/docs/src/content/docs/architecture/determinism.md b/packages/docs/src/content/docs/architecture/determinism.md index e11d44f7..aebc64d4 100644 --- a/packages/docs/src/content/docs/architecture/determinism.md +++ b/packages/docs/src/content/docs/architecture/determinism.md @@ -33,7 +33,7 @@ Three concrete reasons: An input is: - Source tree contents at the current commit. -- Toolchain versions (Node 22 or 24, pnpm 11.x, tree-sitter grammars +- Toolchain versions (Node ≥24.15, pnpm 11.x, tree-sitter grammars pinned in `packages/ingestion/package.json`, SCIP indexer versions pinned in `.github/workflows/gym.yml` per ADR 0006). - OpenCodeHub version (the monorepo version pinned in @@ -45,9 +45,9 @@ Anything outside that list — wall-clock time, process ID, file-system inode ordering — must not influence the hash. The ingestion phases are pure: inputs in, relations out, no ambient state. -The `graphHash` invariant covers everything the graph store -(`graph.lbug`) owns; the temporal signals in the DuckDB sibling -(`temporal.duckdb`) are statistical and never enter the hash. A parity +The `graphHash` invariant covers the graph nodes and edges in +`store.sqlite`; the temporal signals in the same file (cochanges, +symbol summaries) are statistical and never enter the hash. A parity gate in CI asserts the invariant on every PR that touches the storage layer. 
@@ -120,9 +120,9 @@ bytes?" If the answer is not obviously yes, the phase is wrong. ## Related -- [ADR 0001 — Storage backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0001-storage-backend.md) — - "Deterministic writes given identical INSERT order" is a listed - positive of DuckDB vs. engines with random header UUIDs. +- [ADR 0019 — Single-file SQLite storage](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) — + the graphHash byte-identity gate (`sqlite-parity.test.ts`), which requires that a + rebuilt `KnowledgeGraph` hash identically to the original. - [ADR 0002 — Rust core deferred](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0002-rust-core-deferred.md) — calls out the "full vs incremental `graphHash` byte-identical" determinism CI gate explicitly. diff --git a/packages/docs/src/content/docs/architecture/embeddings.md b/packages/docs/src/content/docs/architecture/embeddings.md index 5ec0841e..10735b7e 100644 --- a/packages/docs/src/content/docs/architecture/embeddings.md +++ b/packages/docs/src/content/docs/architecture/embeddings.md @@ -1,6 +1,6 @@ --- title: Embeddings -description: Three backends in a priority cascade, three tiers keyed by a granularity discriminator, one HNSW index with filter-aware traversal. +description: Three backends in a priority cascade, three tiers keyed by a granularity discriminator, one embeddings table with exact brute-force cosine KNN. sidebar: order: 50 --- @@ -8,8 +8,8 @@ sidebar: Embeddings are optional. When enabled, the pipeline produces vectors at three granularities (symbol, file, community) from one of three backends (ONNX local, HTTP/OpenAI-compat, SageMaker) and persists -them in the graph backend's embeddings table served by one HNSW -index. This page covers the backend cascade, the tier model, the +them in the `embeddings` table in `store.sqlite`, searched by exact +brute-force cosine KNN.
This page covers the backend cascade, the tier model, the storage shape, and why `WHERE granularity='symbol'` does not collapse recall. @@ -123,39 +123,30 @@ embedded text when an LLM summary exists for the node. See [Summarization and fusion](/opencodehub/architecture/summarization-and-fusion/) for the formula. -## Single HNSW index +## Single embeddings table -The storage shape is deliberately simple: one embeddings table, one -HNSW index over the `vector` column, one `granularity` column as a -discriminator. All three tiers share this index. Granularity filtering -is pushed as `WHERE e.granularity IN (…)` into the index predicate, so -selective filters narrow the candidate set during traversal rather -than being applied after the fact. +The storage shape is deliberately simple: one `embeddings` table inside +`store.sqlite`, with the `vector` stored as a BLOB (BLOB-exact +`Float32Array`) and one `granularity` column as a discriminator. All +three tiers share this table. Granularity filtering is pushed as +`WHERE e.granularity IN (…)` into the query predicate, so selective +filters narrow the candidate set rather than being applied after the +fact. -## Filter-aware HNSW +## Filter-aware vector search -The graph backend's HNSW index supports filter-aware traversal — the -predicate is pushed into the graph walk so filters like +Vector search runs directly over the `embeddings` table with the +predicate applied in the SQL query, so filters like `WHERE language='python'` or `WHERE granularity='community'` actually return results. A naive post-filter walks the top-k by cosine distance and drops rows that fail the predicate, which collapses to -zero recall under selective filters; the OCH index avoids that by -construction. +zero recall under selective filters; querying with the predicate +inline avoids that by construction. -On the legacy DuckDB layout, the same property holds via the -`hnsw_acorn` community extension's ACORN-1 algorithm. 
If -`hnsw_acorn` fails to install or load (first-run requires network to -pull from the DuckDB community extension repo), the adapter falls -back to `vss` with a post-filter warning. If both fail, -`vectorExtension='none'` disables vector search entirely — queries -return zero rows plus a surfaced warning rather than crashing. - -## RaBitQ quantization - -`hnsw_acorn` supports RaBitQ quantization, documented at 21-30× -memory reduction versus fp32 vectors. It is a capability of the -extension rather than a separately-configured knob in OpenCodeHub — -enabling `hnsw_acorn` enables it. +The default is a brute-force KNN over the stored BLOB vectors, which is +exact and adds zero native dependencies. `node:sqlite` exposes a +`loadExtension` seam for `sqlite-vec` if brute-force is ever outgrown; +that is a deferred fast-follow, not a shipping requirement. ## Configuration knobs @@ -177,9 +168,10 @@ enabling `hnsw_acorn` enables it. remote-env-var-set + offline=true combination throws. A missing SageMaker endpoint with no env vars just picks ONNX — that is the intended cascade, not a failure. -- **`vectorExtension='none'` is a real state.** Queries return no - rows and surface an extension warning. This is the air-gapped / - offline / extension-broken state; it is not an exception. +- **No-embeddings is a real state.** When embeddings were never + computed (the default, or an air-gapped / offline run), vector search + returns no rows and surfaces a warning. This is expected, not an + exception; lexical BM25 search still works. - **Graph-hash independence.** The embeddings phase does not contribute to `graphHash` — embeddings are optional and probabilistic across backends. Gate 10 (the embeddings determinism @@ -191,10 +183,10 @@ enabling `hnsw_acorn` enables it. ## Further reading -- [ADR 0001 — Storage backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0001-storage-backend.md) - — why DuckDB + `hnsw_acorn`. 
+- [ADR 0019 — Single-file SQLite storage](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) + — where the `embeddings` table lives and why there is no native binding. - [ADR 0004 — Hierarchical embeddings](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0004-hierarchical-embeddings.md) - — one table, three granularities, one HNSW index. + — one table, three granularities, one discriminator column. - [Summarization and fusion](/opencodehub/architecture/summarization-and-fusion/) — where the symbol-tier text comes from. - Durable lesson: `api-patterns/sagemaker-embedder-backend.md` — diff --git a/packages/docs/src/content/docs/architecture/monorepo-map.md b/packages/docs/src/content/docs/architecture/monorepo-map.md index 1bce8010..cd85193d 100644 --- a/packages/docs/src/content/docs/architecture/monorepo-map.md +++ b/packages/docs/src/content/docs/architecture/monorepo-map.md @@ -22,14 +22,14 @@ package is a library imported by `cli`, `mcp`, `ingestion`, or | `@opencodehub/embedder` | `packages/embedder` | Deterministic ONNX embedder (`F2LLM-v2-80M`, 320-dim), modelId fingerprint, three-backend cascade. | | `@opencodehub/frameworks` | `packages/frameworks` | Five-stage framework detector (manifest → lockfile → config-AST → folder → import/SCIP) over a curated registry. | | `@opencodehub/ingestion` | `packages/ingestion` | The indexing pipeline (parse, resolve, scip-index, embeddings, communities, processes, summaries, ...). | -| `@opencodehub/mcp` | `packages/mcp` | The stdio MCP server, 28 tool registrations (all read-only with respect to user source), 7 resources, the error envelope, the staleness `_meta` block. | -| `@opencodehub/pack` | `packages/pack` | Deterministic 9-item code-pack BOM (the artifact attached to every release). 
| +| `@opencodehub/mcp` | `packages/mcp` | The stdio MCP server, 29 tool registrations (all read-only with respect to user source), 7 resources, the error envelope, the staleness `_meta` block. | +| `@opencodehub/pack` | `packages/pack` | Deterministic 8-item code-pack BOM (the artifact attached to every release). | | `@opencodehub/policy` | `packages/policy` | `opencodehub.policy.yaml` loader, validator, evaluator. | | `@opencodehub/sarif` | `packages/sarif` | SARIF 2.1.0 Zod schemas, merge + enrich, suppressions, baseline diffing. | | `@opencodehub/scanners` | `packages/scanners` | Nineteen scanner wrappers (semgrep, betterleaks, osv-scanner, bandit, biome, pip-audit, npm-audit, trivy, checkov, checkov-docker-compose, hadolint, tflint, spectral, ruff, grype, vulture, radon, ty, clamav). | | `@opencodehub/scip-ingest` | `packages/scip-ingest` | `.scip` protobuf reader + per-language indexer runners (TypeScript, Python, Go, Rust, Java, .NET, clang, Kotlin, Ruby). | | `@opencodehub/search` | `packages/search` | Hybrid BM25 + RRF search. | -| `@opencodehub/storage` | `packages/storage` | The `IGraphStore` / `ITemporalStore` interface segregation, the LadybugDB graph adapter and DuckDB temporal adapter, and `openStore()` that composes them. | +| `@opencodehub/storage` | `packages/storage` | The `IGraphStore` / `ITemporalStore` interface segregation, the `SqliteStore` class that implements both over one `store.sqlite` via `node:sqlite`, and `openStore()` that returns it as both views. | | `@opencodehub/summarizer` | `packages/summarizer` | Structured per-symbol summarizer (Haiku 4.5 via Bedrock Converse + Zod 4). | | `@opencodehub/wiki` | `packages/wiki` | Markdown wiki renderer (architecture, api-surface, dependency-map, ownership-map, risk-atlas) over the graph. | | `@opencodehub/docs` | `packages/docs` | This Starlight documentation site. | @@ -55,23 +55,24 @@ TypeScript project-references graph enforces this via `tsc --noEmit`. 
## Storage — interface segregation -`@opencodehub/storage` exposes two narrow interfaces — `IGraphStore` +`@opencodehub/storage` exposes two narrow interfaces: `IGraphStore` (graph workload: nodes, edges, embeddings, multi-hop traversal) and `ITemporalStore` (temporal workload: cochanges, summary cache). The -single shipping pair implements them: +single shipping class implements both: -- **LadybugDB graph store + DuckDB temporal store** — always. Two - artifacts on disk (`graph.lbug` + `temporal.duckdb`), backed by a - Cypher-emitting dialect for the graph half and DuckDB SQL for the - temporal half. `IGraphStore` lives only on `GraphDbStore`; - `DuckDbStore` implements `ITemporalStore` only; `openStore()` - composes them. There is no backend selector and no fallback (ADR - 0016) — a missing LadybugDB binding throws `GraphDbBindingError`. +- **`SqliteStore` over one `store.sqlite`** — always. One artifact on + disk (`.codehub/store.sqlite`, WAL mode) backed by Node's built-in + `node:sqlite`, holding nodes, edges, embeddings, the FTS5 index, and + the temporal tables. One `SqliteStore` implements both `IGraphStore` + and `ITemporalStore`; `openStore()` returns that one instance as both + the `graph` and `temporal` views. There is no backend selector, no + native binding, and no fallback (ADR 0019 removed both + `@ladybugdb/core` and `@duckdb/node-api`). See [Storage backend](/opencodehub/architecture/storage-backend/) for -how `openStore()` composes the pair and the community-adapter escape -hatch (AGE / Memgraph / Neo4j / Neptune via the segregated -interfaces). +how `openStore()` returns the single store as both views and the +community-adapter escape hatch (AGE / Memgraph / Neo4j / Neptune via +the segregated interfaces). 
## Related files diff --git a/packages/docs/src/content/docs/architecture/overview.md b/packages/docs/src/content/docs/architecture/overview.md index 43deb688..3ede11e5 100644 --- a/packages/docs/src/content/docs/architecture/overview.md +++ b/packages/docs/src/content/docs/architecture/overview.md @@ -1,6 +1,6 @@ --- title: Architecture overview -description: Six-phase pipeline from source tree to MCP — parse, resolve, augment, index, cluster, serve — backed by a graph-native store with deterministic outputs. +description: Six-phase pipeline from source tree to MCP — parse, resolve, augment, index, cluster, serve — backed by a single-file SQLite store with deterministic outputs. sidebar: order: 10 --- @@ -17,7 +17,7 @@ flowchart LR tree[Source tree] --> parse[Parse] parse --> resolve[Resolve] resolve --> augment[Augment
SCIP] - augment --> index[Index
BM25 + HNSW] + augment --> index[Index
BM25 + vector KNN] index --> cluster[Cluster
communities + processes] cluster --> serve[Serve
MCP] ``` @@ -26,33 +26,32 @@ Fifteen tree-sitter grammars produce a unified `ParseCapture` stream. Per-language resolvers turn captures into typed relations. SCIP indexers (TypeScript, Python, Go, Rust, Java, C#, C/C++, Kotlin, Ruby) upgrade heuristic edges to compiler-grade references where -available. The graph persists into LadybugDB, with DuckDB -carrying the temporal sibling. Communities and -processes are precomputed. An stdio MCP server with 28 tools answers +available. The whole index persists into one `store.sqlite` file via +Node's built-in `node:sqlite`. Communities and +processes are precomputed. An stdio MCP server with 29 tools answers agent queries. ## Where the data lives -The graph tier is always **LadybugDB** (`graph.lbug`); the temporal tier -is always **DuckDB** (`temporal.duckdb`). Both files live under -`.codehub/`. There is no selection knob, no probe, and no fallback — if -the `@ladybugdb/core` binding cannot load, `open()` throws -`GraphDbBindingError` and the operation aborts. See [Storage backend](/opencodehub/architecture/storage-backend/). +The entire index lives in one **`store.sqlite`** file (WAL mode) under +`.codehub/`, via Node's built-in `node:sqlite`. It holds graph nodes, +edges, embeddings, the FTS5 search index, and the temporal tables +(cochanges, summary cache). There is no selection knob, no native +binding, and no fallback: ADR 0019 removed both `@ladybugdb/core` and +`@duckdb/node-api`, leaving zero native storage bindings. See +[Storage backend](/opencodehub/architecture/storage-backend/). ```mermaid flowchart LR - subgraph lbug[".codehub/ (default)"] - nodes[(graph.lbug
nodes + edges)] - embed[(embeddings)] - temporal[(temporal.duckdb
cochanges, summary cache)] + subgraph store[".codehub/"] + db[(store.sqlite
nodes + edges + embeddings
+ cochanges, summary cache)] end - fts["BM25 over names + summaries"] --- nodes - hnsw["filter-aware HNSW"] --- embed - nodes -. round-trip parity .- temporal + fts["BM25 (FTS5) over names + summaries"] --- db + vec["vector search over embeddings"] --- db ``` -Embeddings live in the same physical store as the graph (one -`embeddings` table, one HNSW index, three granularities keyed by a +Embeddings live in the same `store.sqlite` file as the graph (one +`embeddings` table, three granularities keyed by a `granularity` discriminator). Findings reuse the `nodes` table with `kind='Finding'`. @@ -67,8 +66,8 @@ line+col, nodeType). Lines are 1-indexed, columns 0-indexed. Fifteen languages are registered via a compile-time exhaustive `satisfies Record` table: TypeScript, TSX, JavaScript, Python, Go, Rust, Java, C#, C, C++, Ruby, Kotlin, -Swift, PHP, Dart. The runtime is `web-tree-sitter` (WASM) — the only -parse runtime on Node 20, 22, and 24. There is no native parser and no +Swift, PHP, Dart. The runtime is `web-tree-sitter` (WASM), the only +parse runtime on Node ≥24.15. There is no native parser and no opt-in (ADR 0015). See [Parsing and resolution](/opencodehub/architecture/parsing-and-resolution/). @@ -103,16 +102,17 @@ and Ruby (scip-ruby). Pins live in `.github/workflows/gym.yml`. See [SCIP reconciliation](/opencodehub/architecture/scip-reconciliation/). -### 4. Index — BM25, HNSW, and scanners +### 4. Index — BM25, vector KNN, and scanners -One job: persist the graph into LadybugDB with search indexes wired up. +One job: persist the graph into `store.sqlite` with search indexes wired up. -- **BM25** — over symbol names, signatures, and summaries. -- **HNSW** — filter-aware, with the granularity discriminator pushed - into the predicate so all three tiers (symbol / file / community) - share one index without recall collapse. -- **Multi-hop traversal** — Cypher-emitting dialect on the LadybugDB - graph store. 
+- **BM25** — over symbol names, signatures, and summaries via an FTS5 + virtual table. +- **Vector search** — filter-aware, with the granularity discriminator + pushed into the predicate so all three tiers (symbol / file / + community) share one `embeddings` table without recall collapse. +- **Multi-hop traversal** — recursive CTEs over the `edges` table for + impact and blast-radius. Embeddings are optional, gated on `PipelineOptions.embeddings`. The backend cascade is SageMaker → HTTP / OpenAI-compatible → local ONNX. @@ -157,7 +157,7 @@ cheapest configuration that hits all three: `codehub analyze --offline` opens zero sockets. - **Deterministic.** Phases are pure: same inputs → same outputs, byte-identical `graphHash`. The `graphHash` invariant holds over the - LadybugDB graph tier. See + graph nodes and edges in `store.sqlite`. See [Determinism](/opencodehub/architecture/determinism/). - **Apache-2.0, every transitive dep on the permissive allowlist.** No BSL, no AGPL, no source-available engines in the core. See @@ -167,19 +167,23 @@ cheapest configuration that hits all three: | ADR | Topic | |---|---| -| 0001 | Storage backend selection — DuckDB + `hnsw_acorn` + `fts` (the v1.0 baseline). | +| 0001 | Storage backend selection — the v1.0 embedded baseline. **Superseded by later storage ADRs.** | | 0002 | Rust core deferred — v2.0 stays pure TypeScript. | -| 0004 | Hierarchical embeddings — one table, three granularities, filter-aware HNSW. | +| 0004 | Hierarchical embeddings — one table, three granularities, filter-aware vector search. | | 0005 | SCIP replaces LSP — compiler-grade edges without long-running language servers. | | 0006 | SCIP indexer CI pins — current version table per language. | | 0007–0010 | Artifact factory, document pattern, output conventions, dogfood findings. | -| 0011 | LadybugDB (phase-1) — graph-native backend behind the `IGraphStore` seam. | +| 0011 | Graph-native backend (phase-1) behind the `IGraphStore` seam. 
| | 0012 | Repo as a first-class graph node — `repo_uri`, group registry, `AMBIGUOUS_REPO` envelope. | -| 0013 (storage) | M7 default-flip + interface segregation. **Superseded by 0016.** | +| 0013 (storage) | M7 default-flip + interface segregation. **Superseded by 0019.** | | 0013 (parse) | WASM-default parse runtime, native opt-in. **Superseded by 0015.** | | 0014 | SCIP REFERENCES + TYPE_OF emission, embedder modelId stamping. | -| 0015 | WASM-only parser — `web-tree-sitter` is the only runtime on Node 20/22/24; native opt-in removed. | -| 0016 | DuckDB graph backend ripped out — LadybugDB graph + DuckDB temporal, both always present, no selection knob. | +| 0015 | WASM-only parser — `web-tree-sitter` is the only runtime on Node ≥24.15; native opt-in removed. | +| 0016 | Graph-backend rip-out, segregated interfaces preserved. **Superseded by 0019.** | +| 0017 | Drop detect-secrets — ship a tuned betterleaks default config. | +| 0018 | Cleanroom provenance of the route / tool / contract tool names. | +| 0019 | Single-file SQLite storage — one `store.sqlite` via `node:sqlite`; both native storage bindings removed. Supersedes 0016. | +| 0020 | Decision-equivalence is the pack contract; byte-identity is a witness, not the contract. | See [ADRs](/opencodehub/architecture/adrs/) for the full list. @@ -188,7 +192,8 @@ See [ADRs](/opencodehub/architecture/adrs/) for the full list. - [Monorepo map](/opencodehub/architecture/monorepo-map/) — every workspace package and what it owns. - [Storage backend](/opencodehub/architecture/storage-backend/) — the - graph + temporal interface segregation and the resolver. + single `store.sqlite` file and the `IGraphStore` / `ITemporalStore` + interface segregation. - [Cross-repo federation](/opencodehub/architecture/cross-repo-federation/) — `repo_uri`, the group registry, and the `AMBIGUOUS_REPO` envelope. 
- [Determinism](/opencodehub/architecture/determinism/) — the diff --git a/packages/docs/src/content/docs/architecture/parsing-and-resolution.md b/packages/docs/src/content/docs/architecture/parsing-and-resolution.md index 34b3cf1c..1c903eec 100644 --- a/packages/docs/src/content/docs/architecture/parsing-and-resolution.md +++ b/packages/docs/src/content/docs/architecture/parsing-and-resolution.md @@ -20,7 +20,7 @@ threads. Each file is hashed and the resulting `ParseCapture[]` is cached keyed on `(sha256, grammarSha, SCHEMA_VERSION)`, so a subsequent analyze with the same content skips tree-sitter entirely. -The runtime is `web-tree-sitter` (WASM) on Node 20, 22, and 24 — the +The runtime is `web-tree-sitter` (WASM) on Node ≥24.15, the only supported parse runtime. All 15 grammar `.wasm` blobs are vendored at `packages/ingestion/vendor/wasms/`, built from the grammar sources pinned in `package.json`; rebuild via `bash scripts/build-vendor-wasms.sh` diff --git a/packages/docs/src/content/docs/architecture/storage-backend.md b/packages/docs/src/content/docs/architecture/storage-backend.md index a7736834..ab901455 100644 --- a/packages/docs/src/content/docs/architecture/storage-backend.md +++ b/packages/docs/src/content/docs/architecture/storage-backend.md @@ -1,85 +1,108 @@ --- title: Storage backend -description: LadybugDB graph store + DuckDB temporal sibling, the IGraphStore / ITemporalStore segregation, how openStore composes them, and the community-adapter escape hatch. +description: One store.sqlite file backs the whole index via node:sqlite, the SqliteStore class that implements both IGraphStore and ITemporalStore, how openStore composes them, and the community-adapter escape hatch. sidebar: order: 25 --- -OpenCodeHub's storage layer is two narrow interfaces composed into one -store. The graph half is always LadybugDB; the temporal half is always -DuckDB. 
There is no backend selector, no probe, and no fallback layout -— `openStore()` composes a `GraphDbStore` (graph) with a `DuckDbStore` -(temporal) and returns both. If the LadybugDB binding fails to load, -`open()` throws `GraphDbBindingError` and the operation aborts. +OpenCodeHub's storage layer is two narrow interfaces implemented by one +class over one file. The entire index lives in a single +`/.codehub/store.sqlite` (WAL mode) via Node's built-in +`node:sqlite`. A single `SqliteStore` implements both `IGraphStore` and +`ITemporalStore`, and `openStore()` returns that one instance as both +the `graph` and `temporal` views. There is no backend selector, no +native binding to probe, and no fallback layout. ADR 0019 removed both +`@ladybugdb/core` and `@duckdb/node-api`, so there are zero native +storage bindings. ## The interfaces `@opencodehub/storage` exports two interfaces: - **`IGraphStore`** — graph workload. Nodes, edges, embeddings, - multi-hop traversal. Shape: properties + Cypher / Cypher-equivalent - query surface. + multi-hop traversal. - **`ITemporalStore`** — temporal workload. Cochanges, the symbol-summary cache. Statistical signals over git history that never enter `graphHash`. -Splitting the interfaces lets community adapters implement only the -half they have an engine for. A graph-only Neo4j adapter does not have -to handle cochange queries; the in-tree DuckDB temporal store does not -have to implement Cypher. `IGraphStore` lives only on `GraphDbStore`; -`DuckDbStore` implements `ITemporalStore` only — neither adapter +The interfaces stay segregated so a community adapter can implement only +the half it has an engine for. A graph-only Neo4j adapter does not have +to handle cochange queries, and a temporal-only adapter does not have to +implement graph traversal. In the shipping build, one `SqliteStore` implements both. 
ADR 0013 records the call-site refactor that routed 108 raw-SQL call sites across `analysis/`, `mcp/`, `pack/`, `wiki/`, -and `cli/` through the typed finders on the interfaces; ADR 0016 then -ripped the DuckDB graph adapter out entirely. +and `cli/` through the typed finders on the interfaces. -## The single pair that ships +## The single store that ships -### LadybugDB graph store + DuckDB temporal store +### One store.sqlite file, backed by node:sqlite -Two artifacts on disk, both always present after `codehub analyze`: +One artifact on disk, always present after `codehub analyze`: | File | Holds | |---|---| -| `/.codehub/graph.lbug` | Nodes, edges, embeddings, BM25 + HNSW indexes — everything `IGraphStore` owns. | -| `/.codehub/temporal.duckdb` | Cochanges, symbol-summary cache — everything `ITemporalStore` owns. | - -The graph half speaks Cypher natively and stores each edge kind in -its own physical layout — the part of the motivation that DuckDB's -polymorphic `relations` table could not match. The temporal half runs -columnar SQL aggregations over git history, where DuckDB is the right -engine. - -Embeddings live in `graph.lbug`. At pack time they stream from -`store.graph.listEmbeddings()` into a per-call DuckDB temp table on -`temporal.duckdb`, so the byte-identical `embeddings.parquet` sidecar -still works without a graph-tier round trip. +| `/.codehub/store.sqlite` | Nodes, edges, embeddings, BM25 (FTS5) indexes, and the temporal tables (cochanges, symbol-summary cache). The entire index. | + +`node:sqlite` (`DatabaseSync`, enabled by default on Node ≥24.15, the +engines floor) provides every primitive the store needs: BLOB storage +for `Float32Array` embeddings, recursive CTEs for graph traversal +(impact and blast-radius), WAL for crash-safe concurrent reads, and FTS5 +for BM25 search. It is in the standard library, so the store adds zero +install weight. 
+ +WAL companions `store.sqlite-wal` and `store.sqlite-shm` appear while a +writer is open and collapse back to the single file on +`wal_checkpoint(TRUNCATE)` at close. + +Embeddings live in the `embeddings` table inside `store.sqlite` +(BLOB-exact and directly queryable). At pack time they stream from +`store.graph.listEmbeddings()` straight into the code-pack; there is no +Parquet sidecar and no separate temporal file to round-trip through. + +## Schema + +- One generic **`nodes`** table: typed columns for the universal base + (`id, kind, name, file_path, start_line, end_line`) plus a JSON + `payload` overflow for the 37 kind-specific shapes, rehydrated on + read. Findings reuse this table with `kind='Finding'`. +- One polymorphic **`edges`** table keyed by the `(from, to, type, + step)` dedup tuple. +- An **FTS5** virtual table over node names, signatures, and + descriptions for `search`. +- **Recursive CTEs** for multi-hop traversal (impact and blast-radius). + +The `embeddings` table holds all three granularities (symbol / file / +community) keyed by a `granularity` discriminator, so one table serves +every tier. ## How the store is composed -`openStore({path})` always returns -`{graph: GraphDbStore, temporal: DuckDbStore, graphFile, temporalFile, close}`. +`openStore({path})` opens one `store.sqlite` and returns +`{graph, temporal, storeFile, close}`, where `graph` and `temporal` are +the same `SqliteStore` instance viewed through each interface. All +existing call sites keep working unchanged: `store.graph.X()` reaches +the graph surface, `store.temporal.Y()` reaches the temporal surface. There is no `backend` field on the result and no `backend?` option on -the input. The graph artifact is always `graph.lbug`; the temporal -artifact is always `temporal.duckdb`. The `CODEHUB_STORE` env var, the -dynamic-import probe of `@ladybugdb/core`, and the dual-artifact mtime -arbitration are all gone — removed in ADR 0016. 
If the LadybugDB -binding cannot load, `open()` throws `GraphDbBindingError`; there is no -DuckDB-as-graph fallback. `codehub doctor` hard-fails on a missing -binding (it warned and continued in the prior auto-probe era). - -## Why the segregation, in one example - -The clean motivation: cochange detection (the temporal-store workload) -runs over git history and produces frequency / co-edit scores. The -queries are columnar SQL aggregations that DuckDB is the right -engine for. The graph workload is a different shape — multi-hop -traversal across typed edge kinds — that benefits from a graph-native -engine. Segregating the two interfaces lets each backend specialize. +the input. The `CODEHUB_STORE` env var, the dynamic-import probe of +`@ladybugdb/core`, and the dual-artifact mtime arbitration are all gone. +`codehub doctor` drops the native-binding probes and gains a +`node:sqlite` builtin check: an import plus a WAL round-trip. There is +no native storage binding left to probe. + +## Why one file + +A single embedded file removes the native binding from the install hot +path. `npm i -g @opencodehub/cli` plus Node ≥24.15 is the whole install: +no Docker, no postinstall compile, no second process. Every platform is +supported, including Windows arm64 and Linux musl (Alpine), because there +is no per-platform prebuilt to match. The graph and temporal workloads +still map to distinct primitives inside SQLite: recursive CTEs for +multi-hop traversal across typed edge kinds, and SQL aggregations +for cochange frequency and co-edit scores over git history. ## Community adapters (escape hatch) -The two interfaces are deliberately narrow so a community adapter can +The two interfaces stay deliberately narrow so a community adapter can implement either independently. Candidates for `IGraphStore` adapters include: @@ -88,27 +111,31 @@ include: - **Neo4j** (the canonical Cypher engine). - **Neptune** (AWS managed Cypher / Gremlin). 
-OCH ships only the LadybugDB + DuckDB pair; it does not ship these -adapters. The seam is a deliberate escape hatch — a team that already -operates one of these engines can supply an `IGraphStore` adapter and -pair it with the in-tree DuckDB `ITemporalStore`. The conformance -suite (`assertIGraphStoreConformance`) and the parity harness in +OCH ships one `SqliteStore` that implements both interfaces; it does not +ship these adapters. The seam is a deliberate escape hatch: a team that +already operates one of these engines can supply an `IGraphStore` +adapter and pair it with a temporal implementation, or implement both on +one class. The conformance suite +(`assertIGraphStoreConformance`) and the parity harness in `packages/storage/src/test-utils/` stay precisely because they are the v1.0 contract these community adapters target. ADR 0013 names the four -candidates explicitly; ADR 0016 confirms the segregated interfaces -survive the DuckDB-graph rip-out for exactly this reason. +candidates explicitly, and ADR 0019 confirms the segregated interfaces +survive the move to a single store for exactly this reason. ## Determinism The `graphHash` invariant covers everything `IGraphStore` owns and is asserted by a CI gate on every PR that touches `packages/storage`. The -temporal signals in `temporal.duckdb` (cochanges, symbol summaries) -are statistical and never enter `graphHash`. +temporal signals in `store.sqlite` (cochanges, symbol summaries) are +statistical and never enter `graphHash`. The migration's hard gate was +that a `KnowledgeGraph` rebuilt from `listNodes({})` + `listEdges({})` +must hash byte-identically to the original; +`sqlite-parity.test.ts` proves it across small and mixed-kind fixtures. 
## See also - [ADR 0011 — LadybugDB graph backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0011-graph-db-backend.md) - [ADR 0013 — Storage default + interface segregation](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0013-m7-default-flip-and-abstraction.md) -- [ADR 0016 — Rip out the DuckDB graph backend](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0016-duckdb-graph-rip.md) +- [ADR 0019 — Single-file SQLite storage](https://github.com/theagenticguy/opencodehub/blob/main/docs/adr/0019-single-file-sqlite-storage.md) - [Configuration](/opencodehub/reference/configuration/) — env vars and on-disk layout. diff --git a/packages/docs/src/content/docs/architecture/summarization-and-fusion.md b/packages/docs/src/content/docs/architecture/summarization-and-fusion.md index ea182f5d..bea4306d 100644 --- a/packages/docs/src/content/docs/architecture/summarization-and-fusion.md +++ b/packages/docs/src/content/docs/architecture/summarization-and-fusion.md @@ -130,7 +130,7 @@ sequenceDiagram participant Summ as summarize phase participant Bedrock participant Emb as embeddings phase - participant HNSW as embeddings table + HNSW + participant Vec as embeddings table Summ->>Summ: filter by SCIP-trust Summ->>Summ: cache probe (nodeId, contentHash, promptVersion) @@ -141,7 +141,7 @@ sequenceDiagram end Summ->>Summ: persist SymbolSummaryRow Emb->>Emb: symbolText(node, summary, body) — fuse - Emb->>HNSW: upsert symbol-tier vector + Emb->>Vec: upsert symbol-tier vector ``` ## Cache-key discriminator diff --git a/packages/docs/src/content/docs/guides/indexing-a-repo.md b/packages/docs/src/content/docs/guides/indexing-a-repo.md index a5973c1d..a8163f31 100644 --- a/packages/docs/src/content/docs/guides/indexing-a-repo.md +++ b/packages/docs/src/content/docs/guides/indexing-a-repo.md @@ -9,13 +9,14 @@ sidebar: tree-sitter (and SCIP for every language with a pinned indexer — TypeScript, Python, Go, Rust, Java, C#, C/C++, Kotlin, 
Ruby), resolve imports and inheritance, detect processes and clusters, build BM25 -and HNSW indexes, and write everything to `.codehub/` under the repo +and vector indexes, and write everything to `.codehub/` under the repo root. -The graph half is always **LadybugDB** (`.codehub/graph.lbug`) and the -temporal sibling is always **DuckDB** (`.codehub/temporal.duckdb`). Both -files are written on every analyze — there is no backend knob and no -single-file fallback. See +The whole index lives in one **`store.sqlite`** file (WAL mode) under +`.codehub/`, via Node's built-in `node:sqlite`. It holds graph nodes, +edges, embeddings, and the temporal tables, and it is written on every +analyze. There is no backend knob and no native storage binding (ADR +0019). See [Storage backend](/opencodehub/architecture/storage-backend/). ## Basic indexing @@ -34,7 +35,7 @@ codehub analyze --embeddings ``` `--embeddings` computes symbol and optional file/community vectors and -writes them to the HNSW index. After this, `codehub query` fuses BM25 +writes them to the `embeddings` table. After this, `codehub query` fuses BM25 and vector results via reciprocal-rank fusion (RRF). Memory-constrained machines can use `--embeddings-int8` for quantised @@ -83,13 +84,13 @@ symbol participates in. The default granularity is `symbol`. ## What lives in `.codehub/` -Every index writes the same two-file layout — LadybugDB for the graph, -DuckDB for the temporal sibling: +Every index writes the same single-file layout: one `store.sqlite` via +Node's built-in `node:sqlite`: | Path | Purpose | |---|---| -| `graph.lbug` | LadybugDB graph store — symbols, edges, embeddings, BM25 + HNSW indexes. | -| `temporal.duckdb` | DuckDB sibling — cochanges, symbol-summary cache. | +| `store.sqlite` | The whole index (WAL mode) — symbols, edges, embeddings, the FTS5 search index, and the temporal tables (cochanges, symbol-summary cache). 
| +| `store.sqlite-wal` / `store.sqlite-shm` | WAL companions present while a writer is open; collapse into `store.sqlite` at close. | | `meta.json` | Index metadata (graph hash, node counts, CLI version, toolchain pins, embedder modelId). | | `scan.sarif` | SARIF scan output when `codehub scan` has run. | | `sbom.cyclonedx.json` / `sbom.spdx.json` | SBOMs when `codehub analyze --sbom` has run. | diff --git a/packages/docs/src/content/docs/guides/using-with-claude-code.md b/packages/docs/src/content/docs/guides/using-with-claude-code.md index 5cd6cb93..efb5e739 100644 --- a/packages/docs/src/content/docs/guides/using-with-claude-code.md +++ b/packages/docs/src/content/docs/guides/using-with-claude-code.md @@ -93,7 +93,7 @@ entries in `.mcp.json` are preserved. ## Next -- [MCP tools](/opencodehub/mcp/tools/) — the full catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the full catalogue of 29 tools Claude Code will see. - [MCP overview](/opencodehub/mcp/overview/) — server name, transport, envelope conventions. diff --git a/packages/docs/src/content/docs/guides/using-with-codex.md b/packages/docs/src/content/docs/guides/using-with-codex.md index a4e2d377..71bdc50c 100644 --- a/packages/docs/src/content/docs/guides/using-with-codex.md +++ b/packages/docs/src/content/docs/guides/using-with-codex.md @@ -65,5 +65,5 @@ MCP servers are left alone. ## Next -- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 29 tools Codex will see. diff --git a/packages/docs/src/content/docs/guides/using-with-opencode.md b/packages/docs/src/content/docs/guides/using-with-opencode.md index c1dee36f..9a7545e7 100644 --- a/packages/docs/src/content/docs/guides/using-with-opencode.md +++ b/packages/docs/src/content/docs/guides/using-with-opencode.md @@ -76,5 +76,5 @@ MCP servers configured there are left alone. 
## Next -- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 29 tools OpenCode will see. diff --git a/packages/docs/src/content/docs/guides/using-with-windsurf.md b/packages/docs/src/content/docs/guides/using-with-windsurf.md index 34bcb87e..0fa68ddc 100644 --- a/packages/docs/src/content/docs/guides/using-with-windsurf.md +++ b/packages/docs/src/content/docs/guides/using-with-windsurf.md @@ -76,5 +76,5 @@ are left alone. ## Next -- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 28 tools +- [MCP tools](/opencodehub/mcp/tools/) — the catalogue of 29 tools Windsurf will see. diff --git a/packages/docs/src/content/docs/index.mdx b/packages/docs/src/content/docs/index.mdx index 6efec178..57e25257 100644 --- a/packages/docs/src/content/docs/index.mdx +++ b/packages/docs/src/content/docs/index.mdx @@ -1,6 +1,6 @@ --- title: OpenCodeHub -description: Apache-2.0 code intelligence graph + MCP server for AI coding agents — 28 tools, 15 GA languages, deterministic, offline-capable. +description: Apache-2.0 code intelligence graph + MCP server for AI coding agents — 29 tools, 15 GA languages, deterministic, offline-capable. template: splash hero: tagline: Graph-aware impact, context, and query for an AI coding agent — local, deterministic, Apache-2.0. @@ -35,7 +35,7 @@ import { Card, CardGrid, LinkCard } from "@astrojs/starlight/components"; participating processes — without grep round-trips. - Hybrid BM25 + filter-aware HNSW search, results grouped by + Hybrid BM25 + brute-force vector KNN search, results grouped by execution-flow process. Fed by a typed graph, not a flat index. 
@@ -51,7 +51,7 @@ import { Card, CardGrid, LinkCard } from "@astrojs/starlight/components"; diff --git a/packages/docs/src/content/docs/mcp/overview.md b/packages/docs/src/content/docs/mcp/overview.md index b1f8a092..52209a8b 100644 --- a/packages/docs/src/content/docs/mcp/overview.md +++ b/packages/docs/src/content/docs/mcp/overview.md @@ -16,7 +16,7 @@ can connect to over stdio. - **Capabilities:** `tools` and `resources`. The server does not advertise `prompts` — the canned-prompts surface lives as Claude Code skills shipped by `plugins/opencodehub/` instead. -- **Tool count:** 28 (registered in `packages/mcp/src/server.ts`). Every +- **Tool count:** 29 (registered in `packages/mcp/src/server.ts`). Every tool is read-only with respect to user source — no tool edits the working tree. @@ -39,7 +39,7 @@ editor's native MCP config location. ## The four tool families -The 28 tools fall into four functional clusters plus a meta cluster. +The 29 tools fall into four functional clusters plus a meta cluster. The full per-tool catalog is in [MCP tools](/opencodehub/mcp/tools/). | Family | Tools | Count | @@ -91,7 +91,7 @@ Error responses instead carry `isError: true`, ## What the server exposes -- **28 tools** — exploration, federation, scan/findings, HTTP routing, +- **29 tools** — exploration, federation, scan/findings, HTTP routing, and metadata. All read-only with respect to user source. See [tools](/opencodehub/mcp/tools/). - **7 resources** — structured views over repos, clusters, and diff --git a/packages/docs/src/content/docs/mcp/resources.md b/packages/docs/src/content/docs/mcp/resources.md index 13632bdd..7d73f80a 100644 --- a/packages/docs/src/content/docs/mcp/resources.md +++ b/packages/docs/src/content/docs/mcp/resources.md @@ -1,6 +1,6 @@ --- title: MCP resources -description: The seven MCP resources the opencodehub server publishes alongside its 28 tools. +description: The seven MCP resources the opencodehub server publishes alongside its 29 tools. 
sidebar: order: 30 --- diff --git a/packages/docs/src/content/docs/mcp/tools.md b/packages/docs/src/content/docs/mcp/tools.md index 20110d72..7fb3eeb1 100644 --- a/packages/docs/src/content/docs/mcp/tools.md +++ b/packages/docs/src/content/docs/mcp/tools.md @@ -1,11 +1,11 @@ --- title: MCP tools -description: All 28 MCP tools the opencodehub server registers, grouped by functional family. Every tool is read-only with respect to user source. +description: All 29 MCP tools the opencodehub server registers, grouped by functional family. Every tool is read-only with respect to user source. sidebar: order: 20 --- -The `opencodehub` MCP server registers **28 tools**, imported and +The `opencodehub` MCP server registers **29 tools**, imported and invoked from `packages/mcp/src/server.ts`. The number is taken live from `buildServer()` at startup. Every tool is **read-only with respect to user source** — no tool edits the working tree. @@ -73,8 +73,8 @@ The high-frequency tools. Most agent loops live here. | | | |---|---| -| **Use when** | You need a custom view of the temporal store (`cochanges` + `symbol_summaries`) that no other tool exposes. Read-only. 5-second timeout. | -| **Avoid when** | A typed tool (`context`, `impact`, `query`) already covers the question, or you need the node/edge graph — that lives in `graph.lbug` and is reached via the typed tools or Cypher (ADR 0016), not this SQL path. | +| **Use when** | You need a custom view of `store.sqlite` that no other tool exposes. Everything is directly SQL-queryable: `nodes`, `edges`, `embeddings`, `cochanges`, `symbol_summaries`, and `store_meta` (ADR 0019); reach kind-specific fields via SQLite JSON1, `payload->>'$.field'`. Read-only. 5-second timeout. | +| **Avoid when** | A typed tool (`context`, `impact`, `query`) already covers the question. The typed tools stay the high-level path; the `cypher` arg is reserved for community-fork graph adapters and is not supported by the default backend. 
| | **Inputs** | `query` (required), `repo?`, `repo_uri?` | | **Returns** | `{ rows: [...], row_count, next_steps }` | @@ -141,7 +141,7 @@ form so a follow-up `AMBIGUOUS_REPO` retry can use it as input. | **Inputs** | `group` | | **Returns** | `{ group, contracts_written, cross_links_written, next_steps }` | -## Scan / findings / verdict (7) +## Scan / findings / verdict (8) `scan` is the only tool that spawns processes (`openWorldHint=true`). `verdict` exits 0/1/2/3 by tier — the canonical source of CI signal. @@ -194,6 +194,14 @@ form so a follow-up `AMBIGUOUS_REPO` retry can use it as input. | **Inputs** | `repo?`, `repo_uri?`, `base?` (default `main`), `head?` (default `HEAD`) | | **Returns** | `{ tier: "auto_merge" \| "single_review" \| "dual_review" \| "expert_review" \| "block", exit_code, reasons, signals }` | +### `change_pack` + +| | | +|---|---| +| **Use when** | A CI agent needs everything a diff touches in one deterministic, read-only payload: the changed symbols, their upstream impacted subgraph, the `verdict` tier, the affected tests, and a token-cost estimate. | +| **Inputs** | `repo?`, `repo_uri?`, `base?` (default `main`), `head?` (default `HEAD`), `depth?` (upstream traversal, default 4), `budget?` (context budget in heuristic tokens, default 100000) | +| **Returns** | `{ changed, impacted_subgraph, verdict, affected_tests, cost_estimate }` — the same `ChangePack` the CLI's `codehub change-pack --json` emits, snake-cased under `structuredContent`. | + ### `risk_trends` | | | diff --git a/packages/docs/src/content/docs/reference/cli.md b/packages/docs/src/content/docs/reference/cli.md index 0d267671..31291a20 100644 --- a/packages/docs/src/content/docs/reference/cli.md +++ b/packages/docs/src/content/docs/reference/cli.md @@ -13,7 +13,7 @@ unhandled throw writes `codehub: ` to stderr and sets ## `analyze` Index a repository. Runs the full pipeline: parse, resolve, cluster, -build BM25 + HNSW indexes, and write `.codehub/`. 
+build BM25 + vector indexes, and write `.codehub/`. ```bash title="usage" codehub analyze [path] @@ -23,7 +23,7 @@ codehub analyze [path] |---|---|---| | `--force` | off | Ignore the registry cache and re-run the pipeline. | | `--embeddings` | off | Compute semantic vectors. | -| `--embeddings-int8` | off | Quantise vectors to int8 (~23 MB weights). | +| `--embeddings-int8` | off | Use the int8 embedder variant (~81 MB) instead of fp32 (~321 MB). | | `--granularity ` | `symbol` | Any subset of `symbol,file,community`. | | `--embeddings-workers ` | `auto` | Size of the ONNX worker pool. | | `--embeddings-batch-size ` | 32 | Batch size per worker. | @@ -89,9 +89,11 @@ codehub setup | `--force` | off | Overwrite existing entries; re-download weights. | | `--undo` | off | Restore the most recent `.bak` next to each config. | | `--embeddings` | off | Download `F2LLM-v2-80M` ONNX weights (SHA256-pinned GitHub release asset). | -| `--int8` | off | Use the int8 weight variant (~81 MB) instead of fp32 (~321 MB). | +| `--int8` | off | Use the int8 weight variant (~92 MB) instead of fp32 (~332 MB). | | `--model-dir ` | — | Override the target directory for embedder weights. | | `--plugin` | off | Install the Claude Code plugin to `~/.claude/plugins/opencodehub/`. | +| `--scip ` | — | Install an external SCIP adapter binary: `clang`, `ruby`, `dotnet`, `kotlin`, or `all`. SHA256-pinned; `dotnet` requires .NET SDK 8+ on `PATH`. | +| `--cobol-proleap` | off | Build the `uwol/cobol-parser` library from source (`git clone` + `mvn install`) and compile the bridge wrapper. Requires `git`, `mvn`, and JDK 17+ on `PATH`. Installs under `~/.codehub/vendor/proleap/`. | ## `mcp` @@ -152,10 +154,17 @@ codehub pack [path] ## `code-pack` -Produce the deterministic 9-item code-pack BOM (manifest, skeleton, -file-tree, dependency list, top symbols, processes, routes, tools, -findings) sized to a token budget. This is the artifact attached to -every release and signed with cosign. 
+Produce the deterministic 8-item code-pack BOM sized to a token budget. +The BOM is `manifest.json` plus seven body items: skeleton, file-tree, +dependency list, ast-chunks, xrefs, findings, and licenses. A +consumer-facing `readme.md` ships alongside the BOM but is not part of +the manifest hash preimage. The pack is byte-identical given the same +`(commit, tokenizer, budget)`, and `packHash` names its on-disk +directory (`/.codehub/packs//`). + +The default engine is `pack` (the `@opencodehub/pack` BOM). `--engine +repomix` opts into the legacy single-file snapshot (a single output +file, `bomItemCount` of 1, no manifest). ```bash title="usage" codehub code-pack [path] @@ -164,6 +173,47 @@ codehub code-pack [path] | Flag | Default | Purpose | |---|---|---| | `--budget ` | 100000 | AST-chunker token budget. | +| `--tokenizer ` | `openai:o200k_base@tiktoken-0.8.0` | Tokenizer pin `:@`. | +| `--out-dir ` | `/.codehub/packs//` | Override the default output directory. | +| `--engine ` | `pack` | `pack` emits the 8-item BOM; `repomix` emits the legacy single-file snapshot. | +| `--explain-context` | off | After packing, print the context read-receipt (files indexed, lines, hash coverage, per-language breakdown) from `context-bom.json`. | +| `--json` | off | With `--explain-context` or `--variance-probe`, emit the result as JSON on stdout. | +| `--variance-probe ` | — | Measure the run-to-run answer variance an OCH pack removes from a coding agent. Loads the task file, generates the pack, runs the agent N times with vs. without the pack, and reports the dispersion delta plus token overhead. Agents run on Amazon Bedrock. On-demand only. | +| `--runs ` | 10 | With `--variance-probe`: runs per arm. | +| `--harness ` | both | With `--variance-probe`: restrict to one agent. | +| `--aws-region ` | inherited `AWS_REGION` | With `--variance-probe`: AWS region for Bedrock inference. 
| +| `--model-claude ` | `us.anthropic.claude-sonnet-4-6` | With `--variance-probe`: Claude Code Bedrock model / inference-profile id. | +| `--model-codex ` | `openai.gpt-5.5` | With `--variance-probe`: Codex Bedrock model id. | + +```bash title="example" +codehub code-pack . --budget 80000 --explain-context +``` + +## `replay` + +Assert two code-packs are decision-equivalent (spec 011 / ADR 0020): the +same files and byte ranges selected under the same budget, regardless of +incidental drift in `tokenCount`, pins, or chunk text. `packHash` equality +is the cheap witness; a `decisionHash` projection is the contract. The +verdict is one of `EQUIVALENT`, `DIVERGED`, `BUDGET_MISMATCH`, or +`CORRUPT`. On-demand, never a CI gate. + +```bash title="usage" +codehub replay --compare +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--compare ` | — | **Required.** Exactly two pack directories (`.codehub/packs//`) to compare. | +| `--json` | off | Emit the full replay record (verdict, `decisionHash`es, diff) as JSON on stdout. | +| `--budget-strict` | off | Treat a `BUDGET_MISMATCH` (different `--budget` between the packs) as a failure exit. | + +Exit codes: `EQUIVALENT` → 0, `BUDGET_MISMATCH` → 0 (or 1 with +`--budget-strict`), `DIVERGED` → 1, `CORRUPT` → 1. + +```bash title="example" +codehub replay --compare .codehub/packs/abc123 .codehub/packs/def456 --json +``` ## `query` @@ -259,6 +309,30 @@ codehub verdict Exit codes: `auto_merge=0`, `single_review=1`, `dual_review=1`, `expert_review=2`, `block=3`. +## `change-pack` + +Diff-scoped change-pack: the impacted subgraph, a PR verdict, affected +tests, and a cost estimate for one diff. CLI sibling of the `change_pack` +MCP tool, usable in CI without launching the MCP server. + +```bash title="usage" +codehub change-pack +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--base ` | `main` | Base git ref. | +| `--head ` | `HEAD` | Head git ref. 
| +| `--depth ` | 4 | Upstream traversal depth. | +| `--min-confidence ` | 0.7 | Traversal confidence floor, 0 to 1. | +| `--budget ` | 100000 | Context budget in heuristic tokens. | +| `--include-tests-in-subgraph` | off | Retain test nodes in the impacted subgraph. | +| `--json` | off | Structured envelope. | + +Exit codes are derived from the `verdict` tier: `auto_merge` / `single_review` → 0, +`dual_review` → 1, `expert_review` / `block` → 2. + ## `group` Cross-repo group management. @@ -317,9 +391,13 @@ codehub doctor | Flag | Default | Purpose | |---|---|---| -| `--skip-native` | off | Skip checks that require native bindings (duckdb / lbug — `@duckdb/node-api` and `@ladybugdb/core`). Parsing has no native binding; it is WASM-only (`web-tree-sitter`) and unaffected by this flag. | +| `--skip-native` | off | Skip the two probes that load a runtime module: the `node:sqlite` built-in WAL round-trip and the optional `onnxruntime-web` embedder (prebuilt WASM). The store has no native bindings, so this flag now governs only these two checks; it is kept for compatibility with CI sandboxes. Parsing is WASM-only (`web-tree-sitter`) and is never skipped. | +| `--strict` | off | Treat a missing SCIP indexer as a failure (exit 2), not a warning. For release / CI gates. Vendored WASM grammars fail in both modes. | | `--repoRoot ` | cwd | Repo root to probe. | +Exit codes: `0` all checks OK, `1` at least one warning, `2` at least +one failure. + ## `bench` Run the acceptance-gate bench suite and emit a dashboard. @@ -335,21 +413,28 @@ codehub bench ## `wiki` -Emit a Markdown wiki for the repo. +Emit a Markdown wiki for the repo under `--output`. Deterministic by +default; `--llm` routes top-ranked modules through the summarizer for +narrative prose. ```bash title="usage" -codehub wiki +codehub wiki --output ``` | Flag | Default | Purpose | |---|---|---| +| `--output ` | — | **Required.** Target directory for rendered pages. | | `--repo ` | current | Target repo. 
| | `--json` | off | Emit a JSON summary on stdout. | | `--offline` | off | Assert no network access (incompatible with `--llm`). | | `--llm` | off | Route top-ranked modules through the summarizer. | -| `--max-llm-calls ` | 0 (dry-run) | LLM call budget. | +| `--max-llm-calls ` | 0 (dry-run) | LLM call budget when `--llm` is set. | | `--llm-model ` | — | Override the Bedrock summary model id. | +```bash title="example" +codehub wiki --output docs/wiki +``` + ## `ci-init` Emit opinionated CI workflows. @@ -380,11 +465,17 @@ codehub augment ## `sql` -Read-only SQL against the **temporal store** — the DuckDB-backed `cochanges` and -`symbol_summaries` tables. 5-second timeout by default. The node/edge graph lives -in `graph.lbug` (see ADR 0016) and is **not** reachable from this SQL path; query -it via the typed tools (`query` / `context` / `impact`) or Cypher via the MCP `sql` -tool. +Read-only SQL against the single-file store, `/.codehub/store.sqlite` +(WAL, via Node's built-in `node:sqlite`, ADR 0019). Every table lives in +this one file and is directly queryable: `nodes`, `edges`, `embeddings`, +`cochanges`, `symbol_summaries`, and `store_meta`. Reach kind-specific +fields on `nodes` via SQLite JSON1, e.g. `payload->>'$.field'`. The guard +rejects any mutation. 5-second timeout by default. + +The typed tools (`query` / `context` / `impact`) remain the high-level +path for graph traversal. A `cypher` query path exists only as a reserved +escape hatch for community-fork graph adapters (AGE / Memgraph / Neo4j / +Neptune) and is not supported by the default backend. ```bash title="usage" codehub sql @@ -395,3 +486,189 @@ codehub sql | `--repo ` | current | Target repo. | | `--timeout ` | 5000 | Statement timeout. | | `--json` | off | Structured envelope. 
| + +```bash title="example" +codehub sql "SELECT id, name FROM nodes WHERE kind = 'Function' LIMIT 10" +``` + +## Read-only graph capabilities + +Each command below is a CLI sibling of an MCP tool, reusing the same +underlying reader against the single-file store. They run in CI without +launching the MCP server. + +## `findings` + +List SARIF `Finding` nodes (sibling of the MCP `list_findings` tool). + +```bash title="usage" +codehub findings +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--severity ` | — | Restrict to one SARIF severity: `error`, `warning`, `note`, or `none`. | +| `--scanner ` | — | Restrict to a single scanner id (e.g. `semgrep`). | +| `--rule-id ` | — | Restrict to a single rule id. | +| `--file-path ` | — | Substring filter on the finding's file path. | +| `--limit ` | 500 | Maximum findings to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub findings --severity error --scanner semgrep +``` + +## `dead-code` + +List dead and unreachable-export symbols (sibling of the MCP +`list_dead_code` tool). + +```bash title="usage" +codehub dead-code +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--file-path-pattern ` | — | Substring filter on each symbol's file path. | +| `--include-unreachable-exports` | off | Also include exported-but-unreferenced symbols. | +| `--limit ` | 100 | Maximum symbols to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub dead-code --include-unreachable-exports +``` + +## `license-audit` + +Classify `Dependency` nodes by license risk tier (sibling of the MCP +`license_audit` tool). + +```bash title="usage" +codehub license-audit +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--json` | off | Structured envelope. 
| + +```bash title="example" +codehub license-audit --json +``` + +## `project-profile` + +Show the detected project profile (sibling of the MCP `project_profile` +tool). + +```bash title="usage" +codehub project-profile +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub project-profile +``` + +## `risk-trends` + +Per-community risk trend plus a 30-day projection (sibling of the MCP +`risk_trends` tool). + +```bash title="usage" +codehub risk-trends +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub risk-trends --json +``` + +## `owners` + +List ranked `OWNED_BY` contributors for a node (sibling of the MCP +`owners` tool). + +```bash title="usage" +codehub owners +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--limit ` | 20 | Maximum contributors to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub owners src/auth/session.ts +``` + +## `route-map` + +Map HTTP routes to handlers and consumers (sibling of the MCP +`route_map` tool). + +```bash title="usage" +codehub route-map +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--route ` | — | Substring match against `Route.url` (e.g. `/api/users`). | +| `--method ` | — | Exact match against `Route.method` (e.g. `GET`). | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub route-map --route /api/users --method GET +``` + +## `api-impact` + +Score the blast radius of changing a `Route`'s contract (sibling of the +MCP `api_impact` tool). + +```bash title="usage" +codehub api-impact +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--route ` | — | Substring match against `Route.url`. 
| +| `--file ` | — | Substring match against `Route.filePath`. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub api-impact --route /api/users +``` + +## `dependencies` + +List external dependencies (sibling of the MCP `dependencies` tool). + +```bash title="usage" +codehub dependencies +``` + +| Flag | Default | Purpose | +|---|---|---| +| `--repo ` | current | Target repo. | +| `--ecosystem ` | — | Restrict to one ecosystem: `npm`, `pypi`, `go`, `cargo`, `maven`, or `nuget`. | +| `--file-path ` | — | Substring filter on the manifest / lockfile path. | +| `--limit ` | 500 | Maximum dependencies to return. | +| `--json` | off | Structured envelope. | + +```bash title="example" +codehub dependencies --ecosystem npm +``` diff --git a/packages/docs/src/content/docs/reference/configuration.md b/packages/docs/src/content/docs/reference/configuration.md index 50fe8e82..b9210ae3 100644 --- a/packages/docs/src/content/docs/reference/configuration.md +++ b/packages/docs/src/content/docs/reference/configuration.md @@ -14,25 +14,24 @@ mutate global state. ### Storage -The graph tier is always LadybugDB (`graph.lbug`) and the temporal tier -is always DuckDB (`temporal.duckdb`). There is no backend selector — the -`CODEHUB_STORE` env var was removed in ADR 0016 along with the probe and -the DuckDB-as-graph fallback. If the LadybugDB binding cannot load, -`open()` throws `GraphDbBindingError`. +The whole index lives in one `store.sqlite` file (WAL mode) via Node's +built-in `node:sqlite`. There is no backend selector: the `CODEHUB_STORE` +env var was removed and there is no native storage binding to probe (ADR +0019). Nothing fails for lack of a platform prebuilt. | Variable | Purpose | |---|---| | `CODEHUB_HOME` | Override `~/.codehub/` (where the registry, embedder weights, and global state live). 
| ADR 0013 (`docs/adr/0013-m7-default-flip-and-abstraction.md`) records -the `IGraphStore` / `ITemporalStore` interface segregation; ADR 0016 -(`docs/adr/0016-duckdb-graph-rip.md`) records the rip-out of the DuckDB -graph backend, the env var, and the resolver. +the `IGraphStore` / `ITemporalStore` interface segregation; ADR 0019 +(`docs/adr/0019-single-file-sqlite-storage.md`) records collapsing the +whole index into one `store.sqlite` and removing both native storage +bindings. ### Parse runtime -`web-tree-sitter` (WASM) is the only parse runtime on Node 20, 22, and -24. There is no env var or CLI flag to switch parsers — the native +`web-tree-sitter` (WASM) is the only parse runtime on Node ≥24.15. There is no env var or CLI flag to switch parsers — the native `tree-sitter` N-API addon was removed in 0.4.0. The CLI emits a one-shot stderr advisory if a stale legacy env var is set, then ignores it; consult the CHANGELOG and ADR 0015 for the variable name and @@ -70,13 +69,12 @@ When none of the above are set, the local ONNX backend ## On-disk layout: `.codehub/` `codehub analyze` writes everything under `/.codehub/`. The -layout is fixed: a LadybugDB graph file alongside a DuckDB temporal -file. +layout is fixed: one `store.sqlite` file backs the whole index. | Path | Purpose | |---|---| -| `graph.lbug` | LadybugDB graph store — nodes, edges, embeddings, BM25 + HNSW indexes. | -| `temporal.duckdb` | Sibling DuckDB file — temporal store (cochanges, symbol-summary cache). | +| `store.sqlite` | The whole index (WAL mode, `node:sqlite`) — nodes, edges, embeddings, the FTS5 search index, and the temporal tables (cochanges, symbol-summary cache). | +| `store.sqlite-wal` / `store.sqlite-shm` | WAL companions present while a writer is open; collapse into `store.sqlite` at close. | | `meta.json` | Index metadata: graph hash, node counts, CLI version, embedder model id. | | `scan.sarif` | SARIF output from `codehub scan`. 
| | `sbom.cyclonedx.json` / `sbom.spdx.json` | SBOMs when `codehub analyze --sbom` has run. | diff --git a/packages/docs/src/content/docs/reference/error-codes.md b/packages/docs/src/content/docs/reference/error-codes.md index c225d9c8..795e7d79 100644 --- a/packages/docs/src/content/docs/reference/error-codes.md +++ b/packages/docs/src/content/docs/reference/error-codes.md @@ -21,8 +21,7 @@ The canonical list lives at | `STALENESS` | The index lags `HEAD` far enough to mistrust results. | `codehub analyze` (or `--force`). | | `INVALID_INPUT` | A tool argument failed schema validation. | Correct the call; check required fields. | | `NOT_FOUND` | The target symbol, repo, or group does not exist. | Confirm the name; run `codehub list` for repos. | -| `DB_ERROR` | The graph store returned an error during the query. | Check `codehub doctor`; inspect `.codehub/graph.lbug`. | -| `GraphDbBindingError` | The `@ladybugdb/core` native binding could not load, so `open()` aborted. There is no DuckDB-as-graph fallback (ADR 0016). | Run `codehub doctor` (it hard-fails on a missing binding); confirm the lbug prebuilt target matches your platform, or build from source via `cmake-js`. | +| `DB_ERROR` | The store returned an error during the query. | Check `codehub doctor` (it runs a `node:sqlite` import + WAL round-trip); inspect `.codehub/store.sqlite`. | | `SCHEMA_MISMATCH` | The index was produced by a different CLI version with an incompatible schema. | `codehub analyze --force` to rebuild. | | `RATE_LIMITED` | A downstream service (embedder, summariser) rate-limited the request. | Retry with backoff; reduce concurrency. | | `INTERNAL` | Catch-all for unhandled exceptions reaching the tool boundary. | File an issue with the error `message`. | @@ -30,6 +29,11 @@ The canonical list lives at | `AMBIGUOUS_REPO` | More than one repo is indexed and neither `repo` nor `repo_uri` was supplied. | Retry with one of the `choices[].repo_uri` values. 
| | `EMBEDDER_MISMATCH` | The store was indexed by a different embedder than the one currently configured. | Re-index with the configured embedder, or pass the documented force flag. | +The historical `GraphDbBindingError` (a failed native graph-binding +load) no longer exists. ADR 0019 removed both native storage bindings +and moved the whole index into one `store.sqlite` file via the built-in +`node:sqlite`, so there is no binding to fail. + ## `AMBIGUOUS_REPO` envelope `AMBIGUOUS_REPO` is the most common error a federated client encounters. diff --git a/packages/docs/src/content/docs/reference/languages.md b/packages/docs/src/content/docs/reference/languages.md index 09d56627..6f36190e 100644 --- a/packages/docs/src/content/docs/reference/languages.md +++ b/packages/docs/src/content/docs/reference/languages.md @@ -41,7 +41,7 @@ ProLeap deep-parse. ## Parse runtime — WASM-only -The parse runtime is `web-tree-sitter` (WASM) on Node 20, 22, and 24. +The parse runtime is `web-tree-sitter` (WASM) on Node ≥24.15. WASM has no native ABI dependency, so it works on every supported Node version out of the box and `npm install -g @opencodehub/cli@latest` does zero native builds. diff --git a/packages/docs/src/content/docs/skills/codehub-code-pack.mdx b/packages/docs/src/content/docs/skills/codehub-code-pack.mdx index 73ee3809..32afbf3d 100644 --- a/packages/docs/src/content/docs/skills/codehub-code-pack.mdx +++ b/packages/docs/src/content/docs/skills/codehub-code-pack.mdx @@ -1,21 +1,21 @@ --- title: "codehub-code-pack" -description: "Deterministic 9-item code-pack BOM for a repo or group — byte-identical given the same (commit, tokenizer, budget)." +description: "Deterministic 8-item code-pack BOM for a repo or group — byte-identical given the same (commit, tokenizer, budget)." --- import { Aside } from "@astrojs/starlight/components"; Standalone skill. 
Surfaces the `pack_codebase` MCP tool to produce a -**deterministic, 9-item Bill of Materials (BOM)** at +**deterministic, 8-item Bill of Materials (BOM)** at `/.codehub/packs//` that is byte-identical given the same -`(commit, tokenizer, budget, chonkie_version, duckdb_version, -grammar_commits)`. The pack is the durable artifact agents hand to a +`(commit, tokenizer, budget, chonkie_version, grammar_commits)`. The +pack is the durable artifact agents hand to a long-context LLM, archive for later replay, or diff between commits to prove invariants did not change. @@ -42,7 +42,7 @@ model: sonnet requested in the pack. 4. `mcp__opencodehub__pack_codebase` with the default `engine: "pack"`. The tool resolves the output to `/.codehub/packs//` - and writes the 9 items plus `manifest.json`. + and writes the 8 items, including `manifest.json`. 5. Report back the `packHash`, the `determinismClass`, and the absolute output directory; name the cause when the class is `best_effort` or `degraded`. @@ -53,7 +53,7 @@ Run the single-repo flow per member of the named group, then emit a table of `(repoUri, packHash, determinismClass, outDir)`. `packHash` is per-repo, not per-group — a group pack is the union of the member BOMs. -## The 9-item BOM +## The 8-item BOM | # | File | Determinism contract | |---|------|----------------------| @@ -63,9 +63,12 @@ not per-group — a group pack is the union of the member BOMs. 
| 4 | `deps.jsonl` | `(ecosystem, name, version, id)` lexicographic ASC | | 5 | `ast-chunks.jsonl` | LF-normalized; degrades to line-split with `determinismClass: degraded` | | 6 | `xrefs.jsonl` | community rows first, then call rows | -| 7 | `embeddings.parquet` | OPTIONAL — absent entirely when no embeddings exist | -| 8 | `findings.jsonl` | severity rank then `ruleId` ASC | -| 9 | `licenses.md` + `readme.md` | alpha-sorted dependency list + manifest-derived header | +| 7 | `findings.jsonl` | severity rank then `ruleId` ASC | +| 8 | `licenses.md` + `readme.md` | alpha-sorted dependency list + manifest-derived header | + +Embeddings are not a separate BOM item. They live in the `embeddings` +table inside `store.sqlite`; the write-only Parquet sidecar that used to +be item 7 was dropped with DuckDB (ADR 0019), since nothing read it back. ## Determinism class diff --git a/packages/docs/src/content/docs/start-here/first-query.md b/packages/docs/src/content/docs/start-here/first-query.md index 9a548bce..06841e34 100644 --- a/packages/docs/src/content/docs/start-here/first-query.md +++ b/packages/docs/src/content/docs/start-here/first-query.md @@ -18,7 +18,7 @@ haven't linked the CLI, replace `codehub` with ## Hybrid search: `query` -`codehub query` fuses BM25 lexical search with HNSW vector search (when +`codehub query` fuses BM25 lexical search with brute-force vector KNN (when embeddings are present) to find symbols related to a natural-language concept. @@ -99,7 +99,7 @@ down` restricts to dependencies (who do I call), and `--direction both` ## Next - [MCP tools overview](/opencodehub/mcp/overview/) for the full server - capabilities (28 tools across exploration, federation, scan, HTTP, + capabilities (29 tools across exploration, federation, scan, HTTP, and meta). - [Using with Claude Code](/opencodehub/guides/using-with-claude-code/) to let the agent run these tools for you. 
diff --git a/packages/docs/src/content/docs/start-here/install.md b/packages/docs/src/content/docs/start-here/install.md index 901b6030..c6224139 100644 --- a/packages/docs/src/content/docs/start-here/install.md +++ b/packages/docs/src/content/docs/start-here/install.md @@ -11,16 +11,16 @@ sidebar: parity with the Linux dev path, but native Windows now works without the MSVC build chain because OpenCodeHub does no native compilation at install time. -- **Node.js:** Node 20, 22, or 24. The parse runtime is `web-tree-sitter` - (WASM) on every supported version — there is no native opt-in (ADR 0015). +- **Node.js:** Node ≥24.15. The store is Node's built-in `node:sqlite` + (`DatabaseSync`, enabled by default at that version) and the parse + runtime is `web-tree-sitter` (WASM) — there is no native opt-in (ADR 0015). ## Supported platforms -OpenCodeHub installs with **zero native compilation** — the parse runtime is -WASM, and the two native bindings (`@ladybugdb/core` for the graph store, -`@duckdb/node-api` for the temporal store) ship prebuilt per platform. The -graph store is the narrowest matrix and is **mandatory** (there is no -fallback), so its prebuilt coverage defines where OpenCodeHub runs: +OpenCodeHub installs with **zero native compilation and zero native +storage bindings** — the store is Node's built-in `node:sqlite` and the +parse runtime is WASM (ADR 0019). There is no per-platform prebuilt to +match, so **every platform is supported**: | Platform | Supported | |---|---| @@ -29,16 +29,14 @@ fallback), so its prebuilt coverage defines where OpenCodeHub runs: | Linux x64 (glibc — Debian/Ubuntu/RHEL) | ✅ | | Linux arm64 (glibc) | ✅ | | Windows x64 | ✅ | -| **Windows arm64** | ❌ no `@ladybugdb/core` prebuilt | -| **Linux musl (Alpine)** | ❌ no `@ladybugdb/core` prebuilt | - -On an unsupported platform the CLI fails fast with a `GraphDbBindingError` that -names the case. 
For containers, use a **glibc** base image (`node:22`, -`node:22-slim`, `debian`, `ubuntu`) rather than an Alpine/musl image -(`node:22-alpine`). Windows-on-ARM users should run under x64 emulation or WSL2 -with an x64/arm64-glibc Linux until upstream ships the missing prebuilts -(tracked upstream in `@ladybugdb/core`). -- **pnpm:** `>=10.0.0` (the workspace lockfile is generated with 10.33.2). +| Windows arm64 | ✅ | +| Linux musl (Alpine) | ✅ | + +There is no unsupported-platform failure mode: `npm install -g +@opencodehub/cli` plus Node ≥24.15 is the whole install. Any base +image works, including Alpine/musl (`node:24-alpine`) and Windows-on-ARM, +because nothing compiles and no native binding has to load. +- **pnpm:** `>=11.0.0` (the workspace lockfile is generated with 11.1.0). - **Python 3.12:** optional, only used by auxiliary tooling (the harness packages do not ship as runtime dependencies). Not required for the CLI or MCP server. @@ -58,7 +56,7 @@ pnpm -r build mise run cli:link # puts `codehub` on your PATH ``` -`mise install` activates the Node 22, pnpm 11.1.0, and Python 3.12 pins +`mise install` activates the Node 24, pnpm 11.1.0, and Python 3.12 pins from `mise.toml`. `pnpm install --frozen-lockfile` installs exactly the lockfile-pinned dependencies. `pnpm -r build` compiles every TypeScript package so the CLI entrypoint at `packages/cli/dist/index.js` is @@ -76,10 +74,10 @@ tarball globally. If you already manage Node and pnpm another way: -1. Install Node 20, 22, or 24 (`nvm install 22`, `fnm install 22`, or - from [nodejs.org](https://nodejs.org)). Every supported version uses - the same `web-tree-sitter` (WASM) parse runtime — there is no native - parser and no opt-in (ADR 0015). +1. Install Node ≥24.15 (`nvm install 24`, `fnm install 24`, or + from [nodejs.org](https://nodejs.org)). The store uses the built-in + `node:sqlite` and parsing uses `web-tree-sitter` (WASM) — there is no + native parser and no opt-in (ADR 0015). 2. 
Install pnpm `>=11.0.0` (`corepack enable pnpm`, or `npm install -g pnpm@11`). 3. Clone, build, and link: @@ -113,10 +111,11 @@ Then probe your environment: codehub doctor ``` -`codehub doctor` checks your Node version, pnpm version, native-module -bindings (the DuckDB and LadybugDB prebuilds — parsing is WASM-only, so -there is no native parser to probe), and writable paths in `~/.codehub/` -and `.codehub/`. It exits non-zero if anything looks off. +`codehub doctor` checks your Node version, pnpm version, the built-in +`node:sqlite` module (an import plus a WAL round-trip — there is no +native storage binding to probe, and parsing is WASM-only), and writable +paths in `~/.codehub/` and `.codehub/`. It exits non-zero if anything +looks off. :::note[Fallback for unlinked checkouts] If you cannot or will not link the CLI (locked-down CI images, a @@ -132,11 +131,11 @@ node packages/cli/dist/index.js doctor ## Optional environment toggles -Storage has no toggle — the graph tier is always LadybugDB -(`.codehub/graph.lbug`) and the temporal tier is always DuckDB -(`.codehub/temporal.duckdb`); both are written on every `analyze` and -there is no backend-selection env var (ADR 0016). Parsing has no toggle -either — `web-tree-sitter` (WASM) is the only runtime (ADR 0015). +Storage has no toggle: the whole index lands in one +`.codehub/store.sqlite` file (WAL mode) via the built-in `node:sqlite`, +written on every `analyze`, with no backend-selection env var and no +native binding (ADR 0019). Parsing has no toggle either: +`web-tree-sitter` (WASM) is the only runtime (ADR 0015). 
| Variable | Default | Effect | |---|---|---| diff --git a/packages/docs/src/content/docs/start-here/quick-start.md b/packages/docs/src/content/docs/start-here/quick-start.md index 5cd0d02d..5b6c8a6e 100644 --- a/packages/docs/src/content/docs/start-here/quick-start.md +++ b/packages/docs/src/content/docs/start-here/quick-start.md @@ -77,11 +77,11 @@ codehub setup --editors claude-code codehub analyze ``` -`analyze` writes the graph to `.codehub/` under the repo root and -registers the repo in `~/.codehub/registry.json`. The graph always lands -in `.codehub/graph.lbug` (LadybugDB) with `.codehub/temporal.duckdb` -alongside it; if `@ladybugdb/core` cannot load on the current platform, -analyze aborts with a `GraphDbBindingError` rather than falling back. +`analyze` writes the index to `.codehub/` under the repo root and +registers the repo in `~/.codehub/registry.json`. The whole index always +lands in one `.codehub/store.sqlite` file (WAL mode) via Node's built-in +`node:sqlite`; there is no native storage binding to load and no +platform where analyze fails for lack of a prebuilt (ADR 0019). Add `--embeddings` to compute semantic vectors for hybrid search, or `--offline` to guarantee zero network sockets. @@ -117,7 +117,7 @@ codehub impact validateUser --depth 2 - [Your first query](/opencodehub/start-here/first-query/) walks through `query`, `context`, and `impact` with sample output. -- [MCP tools](/opencodehub/mcp/tools/) lists all 28 tools the server +- [MCP tools](/opencodehub/mcp/tools/) lists all 29 tools the server exposes. - [Using with Claude Code](/opencodehub/guides/using-with-claude-code/) covers the plugin path (PreToolUse hooks) and the MCP-only path. 
diff --git a/packages/docs/src/content/docs/start-here/what-is-opencodehub.md b/packages/docs/src/content/docs/start-here/what-is-opencodehub.md index b899de8b..ffbccf89 100644 --- a/packages/docs/src/content/docs/start-here/what-is-opencodehub.md +++ b/packages/docs/src/content/docs/start-here/what-is-opencodehub.md @@ -28,12 +28,12 @@ where does this data flow. OpenCodeHub parses your repository with tree-sitter (15 GA languages, plus SCIP indexers for TypeScript, Python, Go, Rust, and Java), resolves imports and inheritance, and materialises a **typed symbol -graph**. That graph is stored in LadybugDB, a graph-native database, -with DuckDB carrying the temporal sibling (cochanges and the -symbol-summary cache). Both tiers are always present — there is no -backend toggle, and a failure to load the `@ladybugdb/core` binding -aborts the operation rather than falling back. BM25 lexical search and -filter-aware HNSW vector search sit on the same store. A local MCP +graph**. That graph is stored in one `store.sqlite` file via Node's +built-in `node:sqlite`, which also carries the temporal tables +(cochanges and the symbol-summary cache). There is no backend toggle and +no native storage binding: ADR 0019 removed both `@ladybugdb/core` and +`@duckdb/node-api`, so the whole index is one file. BM25 lexical search +and filter-aware vector search sit on the same store. A local MCP server exposes the graph to any agent that speaks Model Context Protocol. @@ -41,11 +41,11 @@ Protocol. 
flowchart LR A[Source tree] -->|tree-sitter parse| B[Symbol graph] B -->|resolve imports and MRO| C[Typed relations] - C -->|BM25 plus HNSW index| D[Hybrid graph store] + C -->|BM25 plus vector index| D[store.sqlite] C -->|detect communities and flows| E[Processes and clusters] D --> F[MCP server] E --> F - F -->|28 tools| G[AI coding agent] + F -->|29 tools| G[AI coding agent] ``` Clustering, execution-flow tracing, and blast-radius analysis all happen @@ -54,19 +54,19 @@ call, not ten round-trips. ## What you get in v1 -- **Graph-native storage.** LadybugDB is the graph tier and a dedicated - DuckDB sibling serves the temporal store. Both files (`graph.lbug` + - `temporal.duckdb`) are written on every index — no backend knob, no - fallback layout (ADR 0016). +- **Single-file storage.** One `store.sqlite` file (WAL mode) via Node's + built-in `node:sqlite` holds the whole index: graph nodes, edges, + embeddings, and the temporal tables. There is no backend knob and no + native storage binding (ADR 0019), so every platform is supported. - **Cross-repo federation.** Group several indexed repos with `codehub group` and query them through the `group_*` MCP tools. The repo is a first-class graph node and `repo_uri` carries through every cross-repo response, including the `AMBIGUOUS_REPO` envelope. - **Deterministic code-pack.** `pack_codebase` (MCP) and `codehub - code-pack` produce a reproducible 9-item BOM signed by the release + code-pack` produce a reproducible 8-item BOM signed by the release workflow. - **WASM-only parsing.** `web-tree-sitter` is the only parse runtime on - Node 20, 22, and 24, with all 15 grammar `.wasm` blobs vendored in the + Node ≥24.15, with all 15 grammar `.wasm` blobs vendored in the `@opencodehub/ingestion` tarball. `npm install -g @opencodehub/cli@latest` does zero native builds and zero GitHub fetches (ADR 0015). 
diff --git a/packages/ingestion/src/parse/unified-queries.ts b/packages/ingestion/src/parse/unified-queries.ts index a2e9cce0..edc68ccf 100644 --- a/packages/ingestion/src/parse/unified-queries.ts +++ b/packages/ingestion/src/parse/unified-queries.ts @@ -593,6 +593,20 @@ const DART_QUERY = ` ; --- mixins: with M1, M2 (lives inside a superclass node in Dart grammar) --- (mixins (type_identifier) @name @reference.mixin) + +; --- calls: intentionally NOT captured --- +; Dart's tree-sitter grammar has no invocation node (no call_expression / +; function_expression_invocation). A call is a flat sibling chain under the +; statement: an (identifier) or (unconditional_assignable_selector) followed by +; a separate (selector (argument_part)). The callee name is a PRECEDING SIBLING +; of the arguments node, which a tree-sitter query pattern cannot condition on. +; Any single-S-expression @reference.call is therefore either UNSOUND (matching +; (unconditional_assignable_selector (identifier)) also captures field READS such +; as obj.field as calls, and still misses bare calls) or unable to reach the +; callee name (matching the argument selector). A correct capture would need a +; bespoke extractCalls that walks from the arguments node to the preceding +; identifier -- deliberately not done. Dart's call graph is a documented +; grammar-precision gap, same class as the Rust/Swift SCIP gaps in the roadmap. `; // --------------------------------------------------------------------------- diff --git a/packages/ingestion/src/pipeline/incremental-determinism.test.ts b/packages/ingestion/src/pipeline/incremental-determinism.test.ts index 138fa7ad..ad936974 100644 --- a/packages/ingestion/src/pipeline/incremental-determinism.test.ts +++ b/packages/ingestion/src/pipeline/incremental-determinism.test.ts @@ -110,6 +110,31 @@ async function writeFixture(repo: string): Promise { "", ].join("\n"), ); + // A multi-dot TypeScript declaration file. 
Its `.d.ts` extension resolves + // to "typescript" by the LAST dot under the canonical detectLanguage (which + // the phases now route through) — the same result the deleted phase-local + // `lastIndexOf(".")` switches produced on a repo-relative path. Including it + // pins that the multi-dot handling does not perturb the graph hash. + await fs.writeFile( + path.join(repo, "shapes.d.ts"), + ["export interface Shape { readonly kind: string; }", ""].join("\n"), + ); + // A COBOL program. cross-file's deleted local switch OMITTED cobol, so a + // .cbl file used to resolve to `undefined`; the canonical detectLanguage + // returns "cobol". COBOL has no tree-sitter grammar and emits no + // defs/calls, so it carries no CALLS edges to re-resolve — the widening is + // inert. Its presence here proves that inertness at the graph-hash level. + await fs.writeFile( + path.join(repo, "REPORT.cbl"), + [ + " IDENTIFICATION DIVISION.", + " PROGRAM-ID. REPORT.", + " PROCEDURE DIVISION.", + " DISPLAY 'HELLO'.", + " STOP RUN.", + "", + ].join("\n"), + ); // Padding: 20 unrelated leaf files keep the total count high enough // that a single-file touch's closure stays well under the 30% valve. for (let i = 0; i < 20; i += 1) { diff --git a/packages/ingestion/src/pipeline/phases/accesses.ts b/packages/ingestion/src/pipeline/phases/accesses.ts index 7e88447a..03ce3904 100644 --- a/packages/ingestion/src/pipeline/phases/accesses.ts +++ b/packages/ingestion/src/pipeline/phases/accesses.ts @@ -31,8 +31,9 @@ * files; overage is dropped with a `warn` event. 
*/ -import type { GraphNode, NodeId } from "@opencodehub/core-types"; +import type { GraphNode, LanguageId, NodeId } from "@opencodehub/core-types"; import { makeNodeId } from "@opencodehub/core-types"; +import { detectLanguage } from "../../parse/language-detector.js"; import { getProvider } from "../../providers/registry.js"; import type { PipelineContext, PipelinePhase } from "../types.js"; import { CROSS_FILE_PHASE_NAME } from "./cross-file.js"; @@ -96,7 +97,7 @@ function runAccesses(ctx: PipelineContext, parse: ParseOutput): AccessesOutput { const files = [...parse.definitionsByFile.keys()].sort(); for (const filePath of files) { - const language = languageForFile(filePath, parse); + const language = languageForFile(filePath); if (language === undefined) continue; const provider = getProvider(language); if (provider.extractPropertyAccesses === undefined) continue; @@ -192,29 +193,26 @@ function runAccesses(ctx: PipelineContext, parse: ParseOutput): AccessesOutput { return { edgeCount, truncatedFiles, unresolvedCount }; } -function languageForFile( - filePath: string, - _parse: ParseOutput, -): "typescript" | "tsx" | "javascript" | "python" | undefined { - const idx = filePath.lastIndexOf("."); - if (idx < 0) return undefined; - const ext = filePath.slice(idx).toLowerCase(); - switch (ext) { - case ".ts": - case ".mts": - case ".cts": - return "typescript"; - case ".tsx": - return "tsx"; - case ".js": - case ".mjs": - case ".cjs": - case ".jsx": - return "javascript"; - case ".py": - case ".pyi": - return "python"; - default: - return undefined; +/** + * Languages the accesses phase supports (those whose providers expose an + * `extractPropertyAccesses` hook). We route file paths through the canonical + * {@link detectLanguage} and then whitelist to this narrow set, preserving + * the phase's original 4-language contract even though `detectLanguage` + * recognises more. 
+ */ +type AccessesLanguage = "typescript" | "tsx" | "javascript" | "python"; + +const ACCESSES_LANGUAGES: ReadonlySet = new Set([ + "typescript", + "tsx", + "javascript", + "python", +]); + +function languageForFile(filePath: string): AccessesLanguage | undefined { + const lang = detectLanguage(filePath); + if (lang !== undefined && ACCESSES_LANGUAGES.has(lang)) { + return lang as AccessesLanguage; } + return undefined; } diff --git a/packages/ingestion/src/pipeline/phases/cross-file.ts b/packages/ingestion/src/pipeline/phases/cross-file.ts index 4e06f6fc..84596896 100644 --- a/packages/ingestion/src/pipeline/phases/cross-file.ts +++ b/packages/ingestion/src/pipeline/phases/cross-file.ts @@ -29,6 +29,7 @@ */ import { makeNodeId, type NodeId } from "@opencodehub/core-types"; +import { detectLanguage } from "../../parse/language-detector.js"; import { getProvider } from "../../providers/registry.js"; import { CONFIDENCE_BY_TIER, @@ -36,11 +37,7 @@ import { type SymbolIndex, } from "../../providers/resolution/context.js"; import type { PipelineContext, PipelinePhase } from "../types.js"; -import { - buildFilePathLookup, - partitionPriorEdges, - resolveIncrementalView, -} from "./incremental-helper.js"; +import { carryForwardEdges, resolveIncrementalView } from "./incremental-helper.js"; import { INCREMENTAL_SCOPE_PHASE_NAME } from "./incremental-scope.js"; import { ORM_PHASE_NAME } from "./orm.js"; import { PARSE_PHASE_NAME, type ParseOutput } from "./parse.js"; @@ -86,29 +83,10 @@ function runCrossFile(ctx: PipelineContext, parse: ParseOutput): CrossFileOutput // a full run at the same commit. Determinism gate: see // `packages/ingestion/src/pipeline/incremental-determinism.test.ts`. 
const view = resolveIncrementalView(ctx); - if ( - view.active && - view.previousGraph?.edges !== undefined && - view.previousGraph.nodes !== undefined - ) { - const filePathByNodeId = buildFilePathLookup(view.previousGraph.nodes); - const carried = partitionPriorEdges( - view.previousGraph.edges, - filePathByNodeId, - view.closure, - new Set(["CALLS"]), - ); - for (const e of carried) { - if (e.confidence <= CONFIDENCE_BY_TIER.global) continue; - ctx.graph.addEdge({ - from: e.from, - to: e.to, - type: e.type, - confidence: e.confidence, - ...(e.reason !== undefined ? { reason: e.reason } : {}), - }); - } - } + carryForwardEdges(ctx, view, { + edgeTypes: ["CALLS"], + minConfidence: CONFIDENCE_BY_TIER.global, + }); // ---- 1. Build the import graph restricted to files we actually parsed. - // In incremental mode we narrow the SCC+upgrade walk to closure files; @@ -215,9 +193,7 @@ function reresolveCallsForFile( } const fileNodeId = makeNodeId("File", filePath, filePath); - const language = parse.definitionsByFile.has(filePath) - ? inferLanguageFromFile(filePath) - : undefined; + const language = parse.definitionsByFile.has(filePath) ? 
detectLanguage(filePath) : undefined; if (language === undefined) return { upgraded: 0, stillUnresolved: 0 }; const provider = getProvider(language); @@ -256,84 +232,6 @@ function reresolveCallsForFile( return { upgraded, stillUnresolved }; } -function inferLanguageFromFile( - filePath: string, -): - | "typescript" - | "tsx" - | "javascript" - | "python" - | "go" - | "rust" - | "java" - | "csharp" - | "c" - | "cpp" - | "ruby" - | "kotlin" - | "swift" - | "php" - | "dart" - | undefined { - const idx = filePath.lastIndexOf("."); - if (idx < 0) return undefined; - const ext = filePath.slice(idx).toLowerCase(); - switch (ext) { - case ".ts": - case ".mts": - case ".cts": - return "typescript"; - case ".tsx": - return "tsx"; - case ".js": - case ".mjs": - case ".cjs": - case ".jsx": - return "javascript"; - case ".py": - case ".pyi": - return "python"; - case ".go": - return "go"; - case ".rs": - return "rust"; - case ".java": - return "java"; - case ".cs": - return "csharp"; - case ".c": - case ".h": - // .h is ambiguous between C/C++; default to C. A dedicated C++ header - // detector can upgrade the classification later. - return "c"; - case ".cpp": - case ".cc": - case ".cxx": - case ".hpp": - case ".hh": - case ".hxx": - return "cpp"; - case ".rb": - return "ruby"; - case ".kt": - case ".kts": - return "kotlin"; - case ".swift": - return "swift"; - case ".php": - case ".php3": - case ".php4": - case ".php5": - case ".php7": - case ".phtml": - return "php"; - case ".dart": - return "dart"; - default: - return undefined; - } -} - /** * Recover a File's relative path from a `File::` node id. Returns * undefined for non-file ids. 
diff --git a/packages/ingestion/src/pipeline/phases/incremental-helper.ts b/packages/ingestion/src/pipeline/phases/incremental-helper.ts index 65bd682a..8af0a186 100644 --- a/packages/ingestion/src/pipeline/phases/incremental-helper.ts +++ b/packages/ingestion/src/pipeline/phases/incremental-helper.ts @@ -162,3 +162,70 @@ export function buildFilePathLookup(priorNodes: readonly GraphNode[]): ReadonlyM for (const n of priorNodes) m.set(n.id, n.filePath); return m; } + +/** + * Options for {@link carryForwardEdges}. + */ +export interface CarryForwardEdgesOptions { + /** + * The edge types eligible for carry-forward. Only prior edges of one of + * these types whose BOTH endpoints resolve outside the closure are replayed. + */ + readonly edgeTypes: readonly string[]; + /** + * When set, prior edges with `confidence <= minConfidence` are skipped + * rather than carried forward. crossFile uses this to drop global-tier + * (0.5-floor) CALLS edges, which the post-parse graph already carries; mro + * omits it entirely (all its heritage edges are high-confidence). + */ + readonly minConfidence?: number; +} + +/** + * Replay prior-graph edges whose anchors are outside the closure into + * `ctx.graph`. This is the incremental carry-forward preamble shared verbatim + * by the crossFile and mro phases: guard the incremental view, build the + * node→filePath lookup, partition the prior edges by the given types, then + * re-emit each carried edge with its ORIGINAL field values. + * + * Emission order is irrelevant to the graph hash — `orderedEdges()` re-sorts + * before hashing and `addEdge` dedupes deterministically (see + * `core-types/src/graph-hash.ts`) — so this extraction is a pure code motion: + * the emitted `from/to/type/confidence/reason` are copied unchanged from the + * prior edge. Guarded end-to-end by `incremental-determinism.test.ts`. 
+ * + * A no-op when `view.active` is false or the prior graph lacks a node/edge + * snapshot — callers may invoke it unconditionally. + */ +export function carryForwardEdges( + ctx: PipelineContext, + view: IncrementalScopeView, + options: CarryForwardEdgesOptions, +): void { + if ( + !view.active || + view.previousGraph?.edges === undefined || + view.previousGraph.nodes === undefined + ) { + return; + } + const filePathByNodeId = buildFilePathLookup(view.previousGraph.nodes); + const carried = partitionPriorEdges( + view.previousGraph.edges, + filePathByNodeId, + view.closure, + new Set(options.edgeTypes), + ); + for (const e of carried) { + if (options.minConfidence !== undefined && e.confidence <= options.minConfidence) { + continue; + } + ctx.graph.addEdge({ + from: e.from, + to: e.to, + type: e.type, + confidence: e.confidence, + ...(e.reason !== undefined ? { reason: e.reason } : {}), + }); + } +} diff --git a/packages/ingestion/src/pipeline/phases/language-detection-parity.test.ts b/packages/ingestion/src/pipeline/phases/language-detection-parity.test.ts new file mode 100644 index 00000000..2d8702a3 --- /dev/null +++ b/packages/ingestion/src/pipeline/phases/language-detection-parity.test.ts @@ -0,0 +1,122 @@ +/** + * Extension-parity pin for the pipeline phases' language detection. + * + * The cross-file and mro phases used to each carry a private + * `inferLanguageFromFile` extension→language switch (a verbatim clone of the + * `EXTENSION_MAP` in `parse/language-detector.ts`). Those copies were deleted + * in favour of the canonical {@link detectLanguage}. This test locks the + * contract that the swap preserved every call-site's post-switch behaviour: + * + * 1. Every extension the two local switches handled resolves to the SAME + * `LanguageId` under `detectLanguage` (both switches covered the same + * 15 languages; neither of them had a cobol case). + * 2.
cobol (.cbl/.cob/.cpy) is now covered — this is the one intentional + * widening (both local switches silently omitted it). The + * determinism gate in `incremental-determinism.test.ts` proves the + * widening does not move the graph hash. + * 3. Multi-dot paths (e.g. `.d.ts`) resolve by the LAST extension, matching + * the local switches' `lastIndexOf(".")` behaviour on realistic + * repo-relative paths. + * + * If someone changes `EXTENSION_MAP` and drops one of these mappings, this + * pin fails loudly at the phase boundary rather than silently altering which + * provider a file resolves to during cross-file re-resolution. + */ + +import { strict as assert } from "node:assert"; +import { describe, it } from "node:test"; +import type { LanguageId } from "@opencodehub/core-types"; +import { detectLanguage } from "../../parse/language-detector.js"; + +/** + * The exact extension→LanguageId contract the deleted `inferLanguageFromFile` + * switch in `mro.ts` returned (15 languages, no cobol). The cross-file copy + * handled the same extensions; cobol is pinned separately below. + */ +const MRO_SWITCH_CONTRACT: ReadonlyArray = [ + [".ts", "typescript"], + [".mts", "typescript"], + [".cts", "typescript"], + [".tsx", "tsx"], + [".js", "javascript"], + [".mjs", "javascript"], + [".cjs", "javascript"], + [".jsx", "javascript"], + [".py", "python"], + [".pyi", "python"], + [".go", "go"], + [".rs", "rust"], + [".java", "java"], + [".cs", "csharp"], + [".c", "c"], + [".h", "c"], + [".cpp", "cpp"], + [".cc", "cpp"], + [".cxx", "cpp"], + [".hpp", "cpp"], + [".hh", "cpp"], + [".hxx", "cpp"], + [".rb", "ruby"], + [".kt", "kotlin"], + [".kts", "kotlin"], + [".swift", "swift"], + [".php", "php"], + [".php3", "php"], + [".php4", "php"], + [".php5", "php"], + [".php7", "php"], + [".phtml", "php"], + [".dart", "dart"], +]; + +/** Extensions cross-file's inline union OMITTED — now covered (the widening).
*/ +const COBOL_WIDENING: ReadonlyArray = [ + [".cbl", "cobol"], + [".cob", "cobol"], + [".cpy", "cobol"], +]; + +describe("pipeline language-detection parity (detectLanguage replaces the local switches)", () => { + it("reproduces the deleted mro inferLanguageFromFile switch for every extension", () => { + for (const [ext, expected] of MRO_SWITCH_CONTRACT) { + assert.equal( + detectLanguage(`src/file${ext}`), + expected, + `detectLanguage drifted from the mro switch on ${ext}`, + ); + // Case-insensitivity — the local switch lower-cased the extension. + assert.equal( + detectLanguage(`src/file${ext.toUpperCase()}`), + expected, + `detectLanguage lost case-insensitivity on ${ext}`, + ); + } + }); + + it("covers cobol — the one extension cross-file's local switch omitted", () => { + for (const [ext, expected] of COBOL_WIDENING) { + assert.equal( + detectLanguage(`src/PROG${ext}`), + expected, + `cobol widening regressed on ${ext}`, + ); + } + }); + + it("resolves multi-dot paths by the last extension (matches the local switches on repo-relative paths)", () => { + // The deleted switches used `lastIndexOf(".")`; detectLanguage uses + // `lastExtension()`. On realistic repo-relative files they agree. + assert.equal(detectLanguage("src/types.d.ts"), "typescript"); + assert.equal(detectLanguage("src/a.test.ts"), "typescript"); + assert.equal(detectLanguage("src/App.stories.tsx"), "tsx"); + assert.equal(detectLanguage("copybooks/ACCT.rec.cpy"), "cobol"); + }); + + it("returns undefined for extensions no switch handled", () => { + // Every call site treats undefined as "skip this file", so unknown + // extensions must not resolve to a provider. 
+ assert.equal(detectLanguage("README.md"), undefined); + assert.equal(detectLanguage("data.json"), undefined); + assert.equal(detectLanguage("Makefile"), undefined); + }); +}); diff --git a/packages/ingestion/src/pipeline/phases/mro.ts b/packages/ingestion/src/pipeline/phases/mro.ts index f87440e8..58268cea 100644 --- a/packages/ingestion/src/pipeline/phases/mro.ts +++ b/packages/ingestion/src/pipeline/phases/mro.ts @@ -15,17 +15,13 @@ */ import type { NodeId } from "@opencodehub/core-types"; +import { detectLanguage } from "../../parse/language-detector.js"; import { getProvider } from "../../providers/registry.js"; import { MroConflictError } from "../../providers/resolution/c3.js"; import { getMroStrategy } from "../../providers/resolution/mro.js"; -import type { LanguageId } from "../../providers/types.js"; import type { PipelineContext, PipelinePhase } from "../types.js"; import { CROSS_FILE_PHASE_NAME } from "./cross-file.js"; -import { - buildFilePathLookup, - partitionPriorEdges, - resolveIncrementalView, -} from "./incremental-helper.js"; +import { carryForwardEdges, resolveIncrementalView } from "./incremental-helper.js"; import { INCREMENTAL_SCOPE_PHASE_NAME } from "./incremental-scope.js"; import { STRUCTURE_PHASE_NAME } from "./structure.js"; @@ -61,28 +57,9 @@ function runMro(ctx: PipelineContext): MroOutput { // forward for the rest keeps the graph hash byte-identical to a full // run at the same commit (see `incremental-determinism.test.ts`). 
const view = resolveIncrementalView(ctx); - if ( - view.active && - view.previousGraph?.edges !== undefined && - view.previousGraph.nodes !== undefined - ) { - const filePathByNodeId = buildFilePathLookup(view.previousGraph.nodes); - const carried = partitionPriorEdges( - view.previousGraph.edges, - filePathByNodeId, - view.closure, - new Set(["METHOD_OVERRIDES", "METHOD_IMPLEMENTS"]), - ); - for (const e of carried) { - ctx.graph.addEdge({ - from: e.from, - to: e.to, - type: e.type, - confidence: e.confidence, - ...(e.reason !== undefined ? { reason: e.reason } : {}), - }); - } - } + carryForwardEdges(ctx, view, { + edgeTypes: ["METHOD_OVERRIDES", "METHOD_IMPLEMENTS"], + }); // ---- Collect owner nodes + their methods + their heritage. ------------- // @@ -148,7 +125,7 @@ function runMro(ctx: PipelineContext): MroOutput { // indirect ancestors through this lookup, but belt-and-suspenders. linearizationCache.set(id, [id as string]); const filePath = ownerFilePath.get(id) ?? ""; - const lang = inferLanguageFromFile(filePath); + const lang = detectLanguage(filePath); if (lang === undefined) { linearizationCache.set(id, [id as string]); return [id as string]; @@ -187,7 +164,7 @@ function runMro(ctx: PipelineContext): MroOutput { for (const id of ownerIds) { const ownerPath = ownerFilePath.get(id) ?? 
""; - const lang = inferLanguageFromFile(ownerPath); + const lang = detectLanguage(ownerPath); if (lang === undefined) continue; const provider = getProvider(lang); if (provider.mroStrategy === "none") continue; @@ -259,63 +236,3 @@ function runMro(ctx: PipelineContext): MroOutput { return { overridesCount, implementsCount, conflictCount }; } - -function inferLanguageFromFile(filePath: string): LanguageId | undefined { - const idx = filePath.lastIndexOf("."); - if (idx < 0) return undefined; - const ext = filePath.slice(idx).toLowerCase(); - switch (ext) { - case ".ts": - case ".mts": - case ".cts": - return "typescript"; - case ".tsx": - return "tsx"; - case ".js": - case ".mjs": - case ".cjs": - case ".jsx": - return "javascript"; - case ".py": - case ".pyi": - return "python"; - case ".go": - return "go"; - case ".rs": - return "rust"; - case ".java": - return "java"; - case ".cs": - return "csharp"; - case ".c": - case ".h": - // .h is ambiguous between C/C++; default to C. A dedicated C++ header - // detector can upgrade the classification later. 
- return "c"; - case ".cpp": - case ".cc": - case ".cxx": - case ".hpp": - case ".hh": - case ".hxx": - return "cpp"; - case ".rb": - return "ruby"; - case ".kt": - case ".kts": - return "kotlin"; - case ".swift": - return "swift"; - case ".php": - case ".php3": - case ".php4": - case ".php5": - case ".php7": - case ".phtml": - return "php"; - case ".dart": - return "dart"; - default: - return undefined; - } -} diff --git a/packages/ingestion/src/providers/c.ts b/packages/ingestion/src/providers/c.ts index 3bc9a31a..ed73625b 100644 --- a/packages/ingestion/src/providers/c.ts +++ b/packages/ingestion/src/providers/c.ts @@ -1,10 +1,10 @@ import type { NodeKind } from "@opencodehub/core-types"; -import type { ParseCapture } from "../parse/types.js"; import { + type DefinitionsConfig, + extractCallsGeneric, + extractDefinitionsGeneric, getLine, - innermostEnclosingDef, - isInside, - pairDefinitionsWithNames, + kindFromMap, } from "./extract-helpers.js"; import type { ExtractedCall, @@ -51,90 +51,22 @@ const C_DEF_KIND_MAP: Readonly> = { "definition.macro": "Macro", }; -function extractCDefinitions(input: ExtractDefinitionsInput): readonly ExtractedDefinition[] { - const { filePath, captures, sourceText } = input; - const paired = pairDefinitionsWithNames(captures); - const defCaptures = captures.filter((c) => c.tag.startsWith("definition.")); - const out: ExtractedDefinition[] = []; - - for (const { def, name } of paired) { - const kind = C_DEF_KIND_MAP[def.tag]; - if (kind === undefined) continue; - - let owner: string | undefined; - const ownerDef = innermostEnclosingDef(def, defCaptures); - if (ownerDef !== undefined) { - const ownerPaired = paired.find((p) => p.def === ownerDef); - if (ownerPaired !== undefined) owner = ownerPaired.name.text; - } - - const qualifiedName = owner !== undefined ? 
`${owner}.${name.text}` : name.text; - const headerLine = getLine(sourceText, def.startLine); - const isStatic = /\bstatic\b/.test(headerLine); - // C convention: identifiers prefixed with `_` tend to be internal; we - // respect both `static` and leading-underscore as non-exported. - const isExported = !isStatic && !name.text.startsWith("_"); +const C_DEFS_CONFIG: DefinitionsConfig = { + kindFor: kindFromMap(C_DEF_KIND_MAP), + // C convention: identifiers prefixed with `_` tend to be internal; we + // respect both `static` (file-scoped) and leading-underscore as non-exported. + isExported: ({ name, def, sourceText }) => + !/\bstatic\b/.test(getLine(sourceText, def.startLine)) && !name.startsWith("_"), +}; - out.push({ - kind, - name: name.text, - qualifiedName, - filePath, - startLine: def.startLine, - endLine: def.endLine, - isExported, - ...(owner !== undefined ? { owner } : {}), - }); - } - return out; +function extractCDefinitions(input: ExtractDefinitionsInput): readonly ExtractedDefinition[] { + return extractDefinitionsGeneric(input, C_DEFS_CONFIG); } function extractCCalls(input: ExtractCallsInput): readonly ExtractedCall[] { - const { filePath, captures, definitions } = input; - const defCaptures = captures.filter((c) => c.tag.startsWith("definition.")); - const callRefs = captures.filter((c) => c.tag === "reference.call"); - const out: ExtractedCall[] = []; - - for (const ref of callRefs) { - const innerName = findNameInside(captures, ref); - const calleeName = innerName?.text ?? ref.text; - - const enclosingDef = innermostEnclosingDef(ref, defCaptures); - const callerQualifiedName = enclosingDef - ? 
qualifiedForCapture(enclosingDef, definitions) - : ""; - - out.push({ - callerQualifiedName, - calleeName, - filePath, - startLine: ref.startLine, - }); - } - return out; -} - -function findNameInside( - captures: readonly ParseCapture[], - outer: ParseCapture, -): ParseCapture | undefined { - let best: ParseCapture | undefined; - for (const c of captures) { - if (c.tag !== "name") continue; - if (!isInside(c, outer)) continue; - if (best === undefined || c.startLine < best.startLine) best = c; - } - return best; -} - -function qualifiedForCapture( - def: ParseCapture, - definitions: readonly ExtractedDefinition[], -): string { - for (const d of definitions) { - if (d.startLine === def.startLine) return d.qualifiedName; - } - return ""; + // C emits NO receiver: qualified calls (`ns::foo()`) are a C++ concept, not + // applicable here. Omitting `inferReceiver` means `calleeOwner` is never set. + return extractCallsGeneric(input); } /** diff --git a/packages/ingestion/src/providers/characterization-golden.ts b/packages/ingestion/src/providers/characterization-golden.ts new file mode 100644 index 00000000..c58358fe --- /dev/null +++ b/packages/ingestion/src/providers/characterization-golden.ts @@ -0,0 +1,130 @@ +/** + * GENERATED characterization golden — DO NOT hand-edit. + * + * This file is the committed, byte-stable snapshot consumed by + * `characterization.test.ts`. It is a compiled-in `const` (not a JSON asset + * read at runtime) so the test resolves it from `dist` with a plain import, + * dodging `import.meta.url` path-offset fragility on bundle collapse. + * + * Each entry maps a `LanguageId` to the `canonicalJson(...)` string of that + * language's SORTED extractor output (see the test for the sort key), one + * string per core extractor. 
Full-value equality against these strings is the + * safety net for the extractor-generic refactor: any drift in a hash-relevant + * field (calleeOwner / qualifiedName / startLine / owner / …) changes the + * canonical string and fails the test with a per-language, per-extractor diff. + * + * To regenerate (ONLY for a deliberate, reviewed behavior change): + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion build + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion test + * The test rewrites THIS file's `GOLDEN` literal, then re-asserts against it. + */ + +import type { LanguageId } from "@opencodehub/core-types"; + +/** Per-extractor canonical-JSON snapshots for one language. */ +export interface ExtractorSnapshot { + readonly definitions: string; + readonly calls: string; + readonly heritage: string; + readonly imports: string; +} + +// biome-ignore format: generated literal — leave the regenerator's formatting intact. +export const GOLDEN: Record = { + "typescript": { + "definitions": 
"[{\"endLine\":15,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.greet\",\"startLine\":12},{\"endLine\":18,\"filePath\":\"greeter.ts\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.log\",\"startLine\":16},{\"endLine\":19,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Welcomer\",\"qualifiedName\":\"Welcomer\",\"startLine\":10},{\"endLine\":21,\"filePath\":\"greeter.ts\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"MESSAGE\",\"qualifiedName\":\"MESSAGE\",\"startLine\":21},{\"endLine\":24,\"filePath\":\"greeter.ts\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"w\",\"owner\":\"run\",\"qualifiedName\":\"run.w\",\"startLine\":24},{\"endLine\":26,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":23},{\"endLine\":8,\"filePath\":\"greeter.ts\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":6}]", + "calls": "[{\"calleeName\":\"debug\",\"calleeOwner\":\"Logger\",\"callerQualifiedName\":\"Welcomer.log\",\"filePath\":\"greeter.ts\",\"startLine\":17},{\"calleeName\":\"greet\",\"calleeOwner\":\"w\",\"callerQualifiedName\":\"run\",\"filePath\":\"greeter.ts\",\"startLine\":25},{\"calleeName\":\"log\",\"calleeOwner\":\"this\",\"callerQualifiedName\":\"Welcomer.greet\",\"filePath\":\"greeter.ts\",\"startLine\":13}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"greeter.ts\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":6},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"greeter.ts\",\"parentName\":\"Greeter\",\"relation\":\"IMPLEMENTS\",\"startLine\":10}]", + "imports": 
"[{\"filePath\":\"greeter.ts\",\"importedNames\":[\"Logger\"],\"kind\":\"named\",\"source\":\"./logger.js\"},{\"filePath\":\"greeter.ts\",\"importedNames\":[\"other\"],\"kind\":\"named\",\"source\":\"./mixed\"},{\"filePath\":\"greeter.ts\",\"isWildcard\":true,\"kind\":\"namespace\",\"localAlias\":\"util\",\"source\":\"./util\"},{\"filePath\":\"greeter.ts\",\"kind\":\"default\",\"localAlias\":\"defaultExport\",\"source\":\"./mixed\"}]" + }, + "tsx": { + "definitions": "[{\"endLine\":10,\"filePath\":\"page.tsx\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"label\",\"owner\":\"Greeting\",\"qualifiedName\":\"Greeting.label\",\"startLine\":10},{\"endLine\":12,\"filePath\":\"page.tsx\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"Greeting\",\"qualifiedName\":\"Greeting\",\"startLine\":9},{\"endLine\":17,\"filePath\":\"page.tsx\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"render\",\"owner\":\"Page\",\"qualifiedName\":\"Page.render\",\"startLine\":15},{\"endLine\":18,\"filePath\":\"page.tsx\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Page\",\"qualifiedName\":\"Page\",\"startLine\":14},{\"endLine\":7,\"filePath\":\"page.tsx\",\"isExported\":false,\"kind\":\"Interface\",\"name\":\"Props\",\"qualifiedName\":\"Props\",\"startLine\":5}]", + "calls": "[{\"calleeName\":\"format\",\"calleeOwner\":\"svc\",\"callerQualifiedName\":\"Greeting\",\"filePath\":\"page.tsx\",\"startLine\":10}]", + "heritage": "[{\"childQualifiedName\":\"Page\",\"filePath\":\"page.tsx\",\"parentName\":\"React.Component\",\"relation\":\"EXTENDS\",\"startLine\":14}]", + "imports": "[{\"filePath\":\"page.tsx\",\"importedNames\":[\"Button\"],\"kind\":\"named\",\"source\":\"./button.js\"},{\"filePath\":\"page.tsx\",\"kind\":\"default\",\"localAlias\":\"React\",\"source\":\"react\"}]" + }, + "javascript": { + "definitions": 
"[{\"endLine\":16,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":12},{\"endLine\":17,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":11},{\"endLine\":20,\"filePath\":\"esm.js\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"g\",\"owner\":\"run\",\"qualifiedName\":\"run.g\",\"startLine\":20},{\"endLine\":22,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":19},{\"endLine\":26,\"filePath\":\"esm.js\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_internalHelper\",\"qualifiedName\":\"_internalHelper\",\"startLine\":24},{\"endLine\":8,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":6},{\"endLine\":9,\"filePath\":\"esm.js\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":5}]", + "calls": "[{\"calleeName\":\"debug\",\"calleeOwner\":\"Logger\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"esm.js\",\"startLine\":14},{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"esm.js\",\"startLine\":21},{\"calleeName\":\"hello\",\"calleeOwner\":\"this\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"esm.js\",\"startLine\":13}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"esm.js\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":11}]", + "imports": "[{\"filePath\":\"esm.js\",\"importedNames\":[\"Logger\"],\"kind\":\"named\",\"source\":\"./logger.js\"},{\"filePath\":\"esm.js\",\"kind\":\"default\",\"localAlias\":\"defaultExport\",\"source\":\"./default.js\"}]" + }, + "python": { + "definitions": 
"[{\"endLine\":12,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":10},{\"endLine\":12,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Base\",\"qualifiedName\":\"Base.greet\",\"startLine\":11},{\"endLine\":17,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":15},{\"endLine\":20,\"filePath\":\"mod.py\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"_private\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter._private\",\"startLine\":19},{\"endLine\":20,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":14},{\"endLine\":23,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Variable\",\"name\":\"g\",\"owner\":\"run\",\"qualifiedName\":\"run.g\",\"startLine\":23},{\"endLine\":24,\"filePath\":\"mod.py\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":22},{\"endLine\":7,\"filePath\":\"mod.py\",\"isConst\":true,\"isExported\":true,\"kind\":\"Const\",\"name\":\"MAX_RETRY\",\"qualifiedName\":\"MAX_RETRY\",\"startLine\":7},{\"endLine\":8,\"filePath\":\"mod.py\",\"isConst\":false,\"isExported\":false,\"kind\":\"Const\",\"name\":\"_internal_version\",\"qualifiedName\":\"_internal_version\",\"startLine\":8}]", + "calls": 
"[{\"calleeName\":\"Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"mod.py\",\"startLine\":23},{\"calleeName\":\"getenv\",\"calleeOwner\":\"os\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"mod.py\",\"startLine\":16},{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"mod.py\",\"startLine\":24},{\"calleeName\":\"greet\",\"calleeOwner\":\"self\",\"callerQualifiedName\":\"Greeter._private\",\"filePath\":\"mod.py\",\"startLine\":20},{\"calleeName\":\"super\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"mod.py\",\"startLine\":17},{\"calleeName\":\"super\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"mod.py\",\"startLine\":17}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"mod.py\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":14},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"mod.py\",\"parentName\":\"Mixin\",\"relation\":\"EXTENDS\",\"startLine\":14}]", + "imports": "[{\"filePath\":\"mod.py\",\"importedNames\":[\"List\",\"Opt\"],\"kind\":\"named\",\"source\":\"typing\"},{\"filePath\":\"mod.py\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"utils\"},{\"filePath\":\"mod.py\",\"kind\":\"namespace\",\"localAlias\":\"np\",\"source\":\"numpy\"},{\"filePath\":\"mod.py\",\"kind\":\"namespace\",\"source\":\"os\"}]" + }, + "go": { + "definitions": 
"[{\"endLine\":1,\"filePath\":\"greet.go\",\"isExported\":false,\"kind\":\"Module\",\"name\":\"greet\",\"qualifiedName\":\"greet\",\"startLine\":1},{\"endLine\":11,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":9},{\"endLine\":15,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Speaker\",\"qualifiedName\":\"Speaker\",\"startLine\":13},{\"endLine\":19,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Const\",\"name\":\"MaxGreet\",\"qualifiedName\":\"MaxGreet\",\"startLine\":17},{\"endLine\":23,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"Greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.Greet\",\"startLine\":21},{\"endLine\":28,\"filePath\":\"greet.go\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":25},{\"endLine\":30,\"filePath\":\"greet.go\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"Exported\",\"qualifiedName\":\"Exported\",\"startLine\":30},{\"endLine\":31,\"filePath\":\"greet.go\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"internal\",\"qualifiedName\":\"internal\",\"startLine\":31}]", + "calls": "[{\"calleeName\":\"Greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.go\",\"startLine\":27},{\"calleeName\":\"Sprintf\",\"calleeOwner\":\"fmt\",\"callerQualifiedName\":\"Greeter.Greet\",\"filePath\":\"greet.go\",\"startLine\":22},{\"calleeName\":\"ToLower\",\"calleeOwner\":\"str\",\"callerQualifiedName\":\"Greeter.Greet\",\"filePath\":\"greet.go\",\"startLine\":22}]", + "heritage": "[]", + "imports": "[{\"filePath\":\"greet.go\",\"kind\":\"package-wildcard\",\"localAlias\":\".\",\"source\":\"errors\"},{\"filePath\":\"greet.go\",\"kind\":\"package-wildcard\",\"localAlias\":\"str\",\"source\":\"strings\"},{\"filePath\":\"greet.go\",\"kind\":\"package-wildcard\",\"source\":\"fmt\"}]" + }, + "rust": { + 
"definitions": "[{\"endLine\":13,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":11},{\"endLine\":19,\"filePath\":\"lib.rs\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":16},{\"endLine\":25,\"filePath\":\"lib.rs\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":23},{\"endLine\":28,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Const\",\"name\":\"DEFAULT\",\"qualifiedName\":\"DEFAULT\",\"startLine\":28},{\"endLine\":30,\"filePath\":\"lib.rs\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"internal\",\"qualifiedName\":\"internal\",\"startLine\":30},{\"endLine\":34,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":31},{\"endLine\":9,\"filePath\":\"lib.rs\",\"isExported\":true,\"kind\":\"Trait\",\"name\":\"Greet\",\"qualifiedName\":\"Greet\",\"startLine\":7}]", + "calls": "[{\"calleeName\":\"debug\",\"calleeOwner\":\"Logger\",\"callerQualifiedName\":\"Greeter.log\",\"filePath\":\"lib.rs\",\"startLine\":24},{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"lib.rs\",\"startLine\":33},{\"calleeName\":\"log\",\"calleeOwner\":\"self\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"lib.rs\",\"startLine\":17},{\"calleeName\":\"to_string\",\"calleeOwner\":\"\\\"world\\\"\",\"callerQualifiedName\":\"run\",\"filePath\":\"lib.rs\",\"startLine\":32}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"lib.rs\",\"parentName\":\"Greet\",\"relation\":\"IMPLEMENTS\",\"startLine\":15}]", + "imports": 
"[{\"filePath\":\"lib.rs\",\"importedNames\":[\"HashMap\",\"Sorted\"],\"kind\":\"named\",\"source\":\"std::collections\"},{\"filePath\":\"lib.rs\",\"importedNames\":[\"Logger\"],\"kind\":\"named\",\"source\":\"crate::logger\"},{\"filePath\":\"lib.rs\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"crate::util\"},{\"filePath\":\"lib.rs\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"self::public_api\"}]" + }, + "java": { + "definitions": "[{\"endLine\":10,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":8},{\"endLine\":14,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":12},{\"endLine\":21,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Constructor\",\"name\":\"Welcomer\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Welcomer\",\"startLine\":19},{\"endLine\":25,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.greet\",\"startLine\":23},{\"endLine\":30,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"run\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.run\",\"startLine\":27},{\"endLine\":31,\"filePath\":\"Welcomer.java\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Welcomer\",\"qualifiedName\":\"Welcomer\",\"startLine\":16},{\"endLine\":33,\"filePath\":\"Welcomer.java\",\"isExported\":false,\"kind\":\"Class\",\"name\":\"Internal\",\"qualifiedName\":\"Internal\",\"startLine\":33},{\"endLine\":9,\"filePath\":\"Welcomer.java\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":9}]", + "calls": 
"[{\"calleeName\":\"greet\",\"callerQualifiedName\":\"Welcomer.run\",\"filePath\":\"Welcomer.java\",\"startLine\":28},{\"calleeName\":\"greet\",\"callerQualifiedName\":\"Welcomer.run\",\"filePath\":\"Welcomer.java\",\"startLine\":29},{\"calleeName\":\"println\",\"calleeOwner\":\"System.out\",\"callerQualifiedName\":\"Welcomer.run\",\"filePath\":\"Welcomer.java\",\"startLine\":29}]", + "heritage": "[{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.java\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":16},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.java\",\"parentName\":\"Greeter\",\"relation\":\"IMPLEMENTS\",\"startLine\":16},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.java\",\"parentName\":\"Runnable\",\"relation\":\"IMPLEMENTS\",\"startLine\":16}]", + "imports": "[{\"filePath\":\"Welcomer.java\",\"importedNames\":[\"List\"],\"kind\":\"named\",\"source\":\"java.util\"},{\"filePath\":\"Welcomer.java\",\"importedNames\":[\"PI\"],\"kind\":\"named\",\"source\":\"java.lang.Math\"},{\"filePath\":\"Welcomer.java\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"java.util.concurrent\"}]" + }, + "csharp": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"Welcomer.cs\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"Greet\",\"owner\":\"IGreeter\",\"qualifiedName\":\"IGreeter.Greet\",\"startLine\":10},{\"endLine\":11,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"IGreeter\",\"qualifiedName\":\"IGreeter\",\"startLine\":8},{\"endLine\":16,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":13},{\"endLine\":25,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Constructor\",\"name\":\"Welcomer\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Welcomer\",\"startLine\":22},{\"endLine\":30,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"Greet\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Greet\",\"startLine\":27},{\"endLine\":35,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"Dispose\",\"owner\":\"Welcomer\",\"qualifiedName\":\"Welcomer.Dispose\",\"startLine\":32},{\"endLine\":36,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Welcomer\",\"qualifiedName\":\"Welcomer\",\"startLine\":18},{\"endLine\":38,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Record\",\"name\":\"Pair\",\"qualifiedName\":\"Pair\",\"startLine\":38},{\"endLine\":40,\"filePath\":\"Welcomer.cs\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"Point\",\"qualifiedName\":\"Point\",\"startLine\":40},{\"endLine\":42,\"filePath\":\"Welcomer.cs\",\"isExported\":false,\"kind\":\"Class\",\"name\":\"Hidden\",\"qualifiedName\":\"Hidden\",\"startLine\":42},{\"endLine\":43,\"filePath\":\"Welcomer.cs\",\"isExported\":false,\"kind\":\"Namespace\",\"name\":\"App.Greet\",\"qualifiedName\":\"App.Greet\",\"startLine\":6}]", + "calls": "[{\"calleeName\":\"WriteLine\",\"calleeOwner\":\"Console\",\"callerQualifiedName\":\"Welcomer.Dispose\",\"filePath\":\"Welcomer.cs\",\"startLine\":34}]", + "heritage": 
"[{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.cs\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":18},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.cs\",\"parentName\":\"IDisposable\",\"relation\":\"IMPLEMENTS\",\"startLine\":18},{\"childQualifiedName\":\"Welcomer\",\"filePath\":\"Welcomer.cs\",\"parentName\":\"IGreeter\",\"relation\":\"IMPLEMENTS\",\"startLine\":18}]", + "imports": "[{\"filePath\":\"Welcomer.cs\",\"kind\":\"namespace\",\"localAlias\":\"Json\",\"source\":\"Newtonsoft.Json\"},{\"filePath\":\"Welcomer.cs\",\"kind\":\"namespace\",\"source\":\"System\"},{\"filePath\":\"Welcomer.cs\",\"kind\":\"namespace\",\"source\":\"System.Collections.Generic\"}]" + }, + "c": { + "definitions": "[{\"endLine\":13,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Enum\",\"name\":\"Status\",\"owner\":\"Status\",\"qualifiedName\":\"Status.Status\",\"startLine\":10},{\"endLine\":13,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Typedef\",\"name\":\"Status\",\"qualifiedName\":\"Status\",\"startLine\":10},{\"endLine\":19,\"filePath\":\"user.c\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"reset_counter\",\"qualifiedName\":\"reset_counter\",\"startLine\":17},{\"endLine\":26,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"register_user\",\"qualifiedName\":\"register_user\",\"startLine\":21},{\"endLine\":32,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"main\",\"qualifiedName\":\"main\",\"startLine\":28},{\"endLine\":8,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Struct\",\"name\":\"User\",\"owner\":\"User\",\"qualifiedName\":\"User.User\",\"startLine\":5},{\"endLine\":8,\"filePath\":\"user.c\",\"isExported\":true,\"kind\":\"Typedef\",\"name\":\"User\",\"qualifiedName\":\"User\",\"startLine\":5}]", + "calls": 
"[{\"calleeName\":\"printf\",\"callerQualifiedName\":\"register_user\",\"filePath\":\"user.c\",\"startLine\":24},{\"calleeName\":\"register_user\",\"callerQualifiedName\":\"main\",\"filePath\":\"user.c\",\"startLine\":29},{\"calleeName\":\"reset_counter\",\"callerQualifiedName\":\"main\",\"filePath\":\"user.c\",\"startLine\":30}]", + "heritage": "[]", + "imports": "[{\"filePath\":\"user.c\",\"kind\":\"package-wildcard\",\"source\":\"stdio.h\"},{\"filePath\":\"user.c\",\"kind\":\"package-wildcard\",\"source\":\"user.h\"}]" + }, + "cpp": { + "definitions": "[{\"endLine\":10,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":7},{\"endLine\":14,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"mix\",\"owner\":\"Mixin\",\"qualifiedName\":\"Mixin.mix\",\"startLine\":14},{\"endLine\":15,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Mixin\",\"qualifiedName\":\"Mixin\",\"startLine\":12},{\"endLine\":19,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"Greeter\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.Greeter\",\"startLine\":19},{\"endLine\":20,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":20},{\"endLine\":23,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":17},{\"endLine\":31,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":25},{\"endLine\":33,\"filePath\":\"greet.cpp\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_internalHelper\",\"qualifiedName\":\"_internalHelper\",\"startLine\":33},{\"endLine\":35,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Namespace\",\"name\":\"auth\",\"qualifiedName\":\"auth\",\"startLine\":5
},{\"endLine\":9,\"filePath\":\"greet.cpp\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":9}]", + "calls": "[{\"calleeName\":\"hello\",\"calleeOwner\":\"Base\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.cpp\",\"startLine\":30},{\"calleeName\":\"hello\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.cpp\",\"startLine\":27},{\"calleeName\":\"hello\",\"calleeOwner\":\"ptr\",\"callerQualifiedName\":\"run\",\"filePath\":\"greet.cpp\",\"startLine\":29}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"greet.cpp\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":17},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"greet.cpp\",\"parentName\":\"Mixin\",\"relation\":\"EXTENDS\",\"startLine\":17}]", + "imports": "[{\"filePath\":\"greet.cpp\",\"kind\":\"package-wildcard\",\"source\":\"db.h\"},{\"filePath\":\"greet.cpp\",\"kind\":\"package-wildcard\",\"source\":\"string\"}]" + }, + "ruby": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":6},{\"endLine\":13,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"log\",\"qualifiedName\":\"log\",\"startLine\":13},{\"endLine\":14,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Module\",\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":12},{\"endLine\":22,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.greet\",\"startLine\":19},{\"endLine\":26,\"filePath\":\"auth.rb\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"_private\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter._private\",\"startLine\":24},{\"endLine\":27,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":16},{\"endLine\":28,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Module\",\"name\":\"Auth\",\"qualifiedName\":\"Auth\",\"startLine\":5},{\"endLine\":33,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":30},{\"endLine\":9,\"filePath\":\"auth.rb\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"greet\",\"owner\":\"Base\",\"qualifiedName\":\"Base.greet\",\"startLine\":7}]", + "calls": "[{\"calleeName\":\"greet\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"auth.rb\",\"startLine\":32},{\"calleeName\":\"greet\",\"callerQualifiedName\":\"Greeter._private\",\"filePath\":\"auth.rb\",\"startLine\":25},{\"calleeName\":\"log\",\"callerQualifiedName\":\"Greeter.greet\",\"filePath\":\"auth.rb\",\"startLine\":20},{\"calleeName\":\"new\",\"calleeOwner\":\"Auth::Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"auth.rb\",\"startLine\":31}]", + "heritage": 
"[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.rb\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":16},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.rb\",\"parentName\":\"Logger\",\"relation\":\"IMPLEMENTS\",\"startLine\":17}]", + "imports": "[{\"filePath\":\"auth.rb\",\"kind\":\"named\",\"source\":\"./session\"},{\"filePath\":\"auth.rb\",\"kind\":\"package-wildcard\",\"source\":\"digest\"}]" + }, + "kotlin": { + "definitions": "[{\"endLine\":12,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":12},{\"endLine\":13,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":11},{\"endLine\":19,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":16},{\"endLine\":23,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":21},{\"endLine\":24,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":15},{\"endLine\":27,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Property\",\"name\":\"g\",\"owner\":\"run\",\"qualifiedName\":\"run.g\",\"startLine\":27},{\"endLine\":29,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":26},{\"endLine\":31,\"filePath\":\"Auth.kt\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_privateHelper\",\"qualifiedName\":\"_privateHelper\",\"startLine\":31},{\"endLine\":8,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Logger\",\"qualifiedName\":\"Logger.log\",\"startLine\":8},{\"endLine\":9,\"filePath\":\"Auth.kt\",\"isExported\":true,\"kind\":\"Class\"
,\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":7}]", + "calls": "[{\"calleeName\":\"Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.kt\",\"startLine\":27},{\"calleeName\":\"hello\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.kt\",\"startLine\":28},{\"calleeName\":\"hello\",\"calleeOwner\":\"super\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.kt\",\"startLine\":18},{\"calleeName\":\"log\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.kt\",\"startLine\":17},{\"calleeName\":\"println\",\"callerQualifiedName\":\"Greeter.log\",\"filePath\":\"Auth.kt\",\"startLine\":22}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.kt\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":15},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.kt\",\"parentName\":\"Logger\",\"relation\":\"EXTENDS\",\"startLine\":15}]", + "imports": "[{\"filePath\":\"Auth.kt\",\"isWildcard\":true,\"kind\":\"package-wildcard\",\"source\":\"kotlin.collections\"},{\"filePath\":\"Auth.kt\",\"kind\":\"named\",\"source\":\"java.util.UUID\"}]" + }, + "swift": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":8},{\"endLine\":22,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":19},{\"endLine\":26,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":24},{\"endLine\":27,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":12},{\"endLine\":32,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":29},{\"endLine\":34,\"filePath\":\"Auth.swift\",\"isExported\":false,\"kind\":\"Function\",\"name\":\"_privateHelper\",\"qualifiedName\":\"_privateHelper\",\"startLine\":34},{\"endLine\":6,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":4},{\"endLine\":9,\"filePath\":\"Auth.swift\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":9}]", + "calls": "[{\"calleeName\":\"Greeter\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.swift\",\"startLine\":30},{\"calleeName\":\"hello\",\"calleeOwner\":\"g\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.swift\",\"startLine\":31},{\"calleeName\":\"hello\",\"calleeOwner\":\"super\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.swift\",\"startLine\":21},{\"calleeName\":\"log\",\"callerQualifiedName\":\"Greeter.hello\",\"filePath\":\"Auth.swift\",\"startLine\":20},{\"calleeName\":\"print\",\"callerQualifiedName\":\"Greeter.log\",\"filePath\":\"Auth.swift\",\"startLine\":25}]", + "heritage": 
"[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.swift\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":12},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.swift\",\"parentName\":\"Logger\",\"relation\":\"EXTENDS\",\"startLine\":12}]", + "imports": "[{\"filePath\":\"Auth.swift\",\"kind\":\"named\",\"source\":\"Foundation\"}]" + }, + "php": { + "definitions": "[{\"endLine\":10,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Interface\",\"name\":\"Authenticatable\",\"qualifiedName\":\"Authenticatable\",\"startLine\":7},{\"endLine\":14,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"touch\",\"owner\":\"Timestamps\",\"qualifiedName\":\"Timestamps.touch\",\"startLine\":14},{\"endLine\":15,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Trait\",\"name\":\"Timestamps\",\"qualifiedName\":\"Timestamps\",\"startLine\":12},{\"endLine\":19,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":19},{\"endLine\":2,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Namespace\",\"name\":\"Auth\",\"qualifiedName\":\"Auth\",\"startLine\":2},{\"endLine\":20,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":17},{\"endLine\":31,\"filePath\":\"Auth.php\",\"isExported\":false,\"kind\":\"Method\",\"name\":\"__construct\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.__construct\",\"startLine\":28},{\"endLine\":38,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"login\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.login\",\"startLine\":33},{\"endLine\":39,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":22},{\"endLine\":45,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\"
:\"run\",\"startLine\":41},{\"endLine\":9,\"filePath\":\"Auth.php\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"login\",\"owner\":\"Authenticatable\",\"qualifiedName\":\"Authenticatable.login\",\"startLine\":9}]", + "calls": "[{\"calleeName\":\"hello\",\"calleeOwner\":\"$this\",\"callerQualifiedName\":\"Greeter.login\",\"filePath\":\"Auth.php\",\"startLine\":35},{\"calleeName\":\"hello\",\"calleeOwner\":\"Base\",\"callerQualifiedName\":\"Greeter.login\",\"filePath\":\"Auth.php\",\"startLine\":36},{\"calleeName\":\"login\",\"calleeOwner\":\"$g\",\"callerQualifiedName\":\"run\",\"filePath\":\"Auth.php\",\"startLine\":44}]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.php\",\"parentName\":\"Authenticatable\",\"relation\":\"IMPLEMENTS\",\"startLine\":22},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.php\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":22},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"Auth.php\",\"parentName\":\"Timestamps\",\"relation\":\"IMPLEMENTS\",\"startLine\":24}]", + "imports": "[{\"filePath\":\"Auth.php\",\"kind\":\"named\",\"source\":\"Psr/Log/LoggerInterface\"},{\"filePath\":\"Auth.php\",\"kind\":\"named\",\"source\":\"Timestamps\"},{\"filePath\":\"Auth.php\",\"kind\":\"package-wildcard\",\"source\":\"config.php\"}]" + }, + "dart": { + "definitions": 
"[{\"endLine\":10,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"touch\",\"qualifiedName\":\"touch\",\"startLine\":10},{\"endLine\":10,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"touch\",\"qualifiedName\":\"touch\",\"startLine\":10},{\"endLine\":11,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Trait\",\"name\":\"Timestamps\",\"qualifiedName\":\"Timestamps\",\"startLine\":9},{\"endLine\":14,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":14},{\"endLine\":14,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Base\",\"qualifiedName\":\"Base.hello\",\"startLine\":14},{\"endLine\":15,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Base\",\"qualifiedName\":\"Base\",\"startLine\":13},{\"endLine\":23,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":23},{\"endLine\":23,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"hello\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.hello\",\"startLine\":23},{\"endLine\":29,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":29},{\"endLine\":29,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Greeter\",\"qualifiedName\":\"Greeter.log\",\"startLine\":29},{\"endLine\":32,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Greeter\",\"qualifiedName\":\"Greeter\",\"startLine\":17},{\"endLine\":34,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Function\",\"name\":\"run\",\"qualifiedName\":\"run\",\"startLine\":34},{\"endLine\":39,\"filePath\":\"auth.dart\",\"isExporte
d\":false,\"kind\":\"Function\",\"name\":\"_privateHelper\",\"qualifiedName\":\"_privateHelper\",\"startLine\":39},{\"endLine\":6,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Method\",\"name\":\"log\",\"owner\":\"Logger\",\"qualifiedName\":\"Logger.log\",\"startLine\":6},{\"endLine\":7,\"filePath\":\"auth.dart\",\"isExported\":true,\"kind\":\"Class\",\"name\":\"Logger\",\"qualifiedName\":\"Logger\",\"startLine\":5}]", + "calls": "[]", + "heritage": "[{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.dart\",\"parentName\":\"Base\",\"relation\":\"EXTENDS\",\"startLine\":17},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.dart\",\"parentName\":\"Logger\",\"relation\":\"IMPLEMENTS\",\"startLine\":17},{\"childQualifiedName\":\"Greeter\",\"filePath\":\"auth.dart\",\"parentName\":\"Timestamps\",\"relation\":\"IMPLEMENTS\",\"startLine\":17}]", + "imports": "[{\"filePath\":\"auth.dart\",\"kind\":\"namespace\",\"localAlias\":\"meta\",\"source\":\"package:meta/meta.dart\"},{\"filePath\":\"auth.dart\",\"kind\":\"namespace\",\"source\":\"dart:io\"}]" + }, + "cobol": { + "definitions": "[]", + "calls": "[]", + "heritage": "[]", + "imports": "[]" + } +}; diff --git a/packages/ingestion/src/providers/characterization.test.ts b/packages/ingestion/src/providers/characterization.test.ts new file mode 100644 index 00000000..bcd6a14d --- /dev/null +++ b/packages/ingestion/src/providers/characterization.test.ts @@ -0,0 +1,780 @@ +/** + * Provider extractor characterization ("golden") harness. + * + * WHY THIS EXISTS + * --------------- + * The per-language provider tests (`swift.test.ts`, `csharp.test.ts`, …) are + * STRUCTURAL: they assert set-membership ("defs include `Greeter`"), so they do + * NOT lock hash-relevant VALUES like `calleeOwner`, `qualifiedName`, `startLine`, + * or `owner`. 
A refactor that collapses the per-provider `extractCalls` / + * `extractHeritage` implementations into shared generics could silently drift one + * of those fields and still pass every existing test — but change the downstream + * `graphHash`. + * + * This harness closes that gap. For all 16 registered providers × the 4 core + * extractors, it snapshots the FULL canonical-JSON output over a representative + * fixture and asserts byte-equality against a committed golden + * (`characterization-golden.ts`). It fails loudly with a per-language, per-extractor + * diff on any value drift. + * + * DESIGN + * ------ + * - Fixtures reuse the exact `FIXTURE` string each per-language `*.test.ts` defines + * (representative; chosen to exercise defs/calls/heritage/imports, though a few + * slots are legitimately empty in the golden: go and c heritage, dart calls). cobol has no + * tree-sitter grammar, so it is NOT routed through the ParsePool — its provider + * ignores inputs and returns [] for every extractor; we snapshot the empty arrays. + * - Each extractor output array is sorted by `canonicalJson(element)` before + * snapshotting. That is a stable TOTAL order, so a pure emission-order refactor + * does NOT false-positive, while any VALUE drift changes an element's canonical + * string and IS caught. + * - Coverage is a tripwire: the harness asserts it snapshotted exactly + * `listProviders().length` languages, so adding a provider forces a golden update. + * + * REGENERATING THE GOLDEN (deliberate, reviewed behavior changes ONLY) + * -------------------------------------------------------------------- + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion build + * UPDATE_CHARACTERIZATION=1 pnpm --filter @opencodehub/ingestion test + * The env flag rewrites `src/providers/characterization-golden.ts` from the live + * extractor output, then STILL asserts against the just-written values. Without the + * flag the golden is never mutated — the test is read-only.
+ */ + +import { strict as assert } from "node:assert"; +import { writeFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { after, before, describe, it } from "node:test"; +import { canonicalJson, type LanguageId } from "@opencodehub/core-types"; +import { ParsePool } from "../parse/worker-pool.js"; +import { type ExtractorSnapshot, GOLDEN } from "./characterization-golden.js"; +import { getProvider, listProviders } from "./registry.js"; +import { type ParsedFixture, parseFixture } from "./test-helpers.js"; + +/** + * One representative fixture per language. The 15 tree-sitter languages reuse the + * verbatim `FIXTURE` string from their existing behavior test (each already + * exercises definitions + calls + heritage + imports). cobol gets a small program + * whose captures are empty (no grammar), which its stub provider maps to []. + */ +const FIXTURES: Record = { + typescript: { + path: "greeter.ts", + source: ` +import { Logger } from "./logger.js"; +import * as util from "./util"; +import defaultExport, { other } from "./mixed"; + +export interface Greeter extends Base { + greet(name: string): string; +} + +export abstract class Welcomer implements Greeter { + private banner: string; + public greet(name: string): string { + this.log(name); + return "hi " + name; + } + private log(msg: string): void { + Logger.debug(msg); + } +} + +export const MESSAGE = "welcome"; + +export function run(): void { + const w = new Welcomer(); + w.greet("world"); +} +`, + }, + tsx: { + path: "page.tsx", + source: ` +import React from "react"; +import { Button } from "./button.js"; + +interface Props { + name: string; +} + +export function Greeting(props: Props) { + const label = svc.format(props.name); + return