From c67b2c503271df3c2c7946be0917106297d66835 Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Wed, 27 May 2026 13:21:11 +0200 Subject: [PATCH 01/10] feat(manifest/bazel): merge checked-in maven_install.json files for sub-workspace discovery The existing bazel-query discovery path only inspects MODULE.bazel / WORKSPACE at the invocation cwd. Ruleset repos with per-example sub-workspaces (rules_kotlin/examples, rules_js/examples, rules_rust, rules_python) declare additional Maven artifacts in nested MODULE.bazel projects with their own maven_install.json lockfiles. Those files were silently dropped, leaving the CLI's SBOM a strict subset of what the server-side depscan parser already returns from the same tree. Add a walker that finds every checked-in maven_install.json under cwd (pruning .git, node_modules, .socket-auto-manifest, and Bazel's bazel-* convenience symlinks into <output_base>), parses each via the existing parseUnsortedDepsJson v2-lockfile path, and merges the artifacts into the SBOM after the bazel-query extraction step. Merge is keyed by mavenCoordinates so the root workspace's lockfile (which bazel-query already extracts) does not double-count; conflicting group:artifact versions across sub-workspaces continue to surface as the existing loud-failure error in normalizeToMavenInstallJson. Verified against bazel-bench/oss/rules_kotlin: walker now surfaces all 10 examples/*/maven_install.json files and merges 393 unique artifacts into the SBOM beyond what the root @kotlin_rules_maven discovery returns. No regression on tink-java (0 lockfiles) or protobuf (1 root lockfile, deduped against bazel-query's @maven extraction). 
--- .../bazel/bazel-lockfile-discovery.mts | 196 ++++++++++++++ .../bazel/bazel-lockfile-discovery.test.mts | 241 ++++++++++++++++++ .../manifest/bazel/extract_bazel_to_maven.mts | 33 +++ 3 files changed, 470 insertions(+) create mode 100644 src/commands/manifest/bazel/bazel-lockfile-discovery.mts create mode 100644 src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts diff --git a/src/commands/manifest/bazel/bazel-lockfile-discovery.mts b/src/commands/manifest/bazel/bazel-lockfile-discovery.mts new file mode 100644 index 000000000..eabef8860 --- /dev/null +++ b/src/commands/manifest/bazel/bazel-lockfile-discovery.mts @@ -0,0 +1,196 @@ +/** + * Find and parse checked-in `maven_install.json` files anywhere under the + * invocation cwd. This is the sub-workspace discovery path: rules_kotlin (and + * any ruleset-style repo with per-example `MODULE.bazel` projects under + * `examples/`) declares Maven artifacts in per-example lockfiles that the + * root-MODULE.bazel-only bazel-query path never sees. The server-side + * depscan parser already merges these files unconditionally; the CLI matches + * that semantics here so its SBOM is not a strict subset of the server's. + * + * Security gates: each lockfile read is size-capped, the walk prunes dirs + * that are obviously not Bazel workspaces (.git, node_modules, the + * .socket-auto-manifest output dir), and the walk skips Bazel's `bazel-*` + * convenience symlinks so we never recurse into (which can + * be tens of GiB and contains generated copies of the same lockfiles). + */ + +import { readFileSync, readdirSync, statSync } from 'node:fs' +import path from 'node:path' + +import { logger } from '@socketsecurity/registry/lib/logger' + +import { parseUnsortedDepsJson } from './bazel-build-parser.mts' + +import type { ExtractedArtifact } from './bazel-build-parser.mts' + +// Cap any single checked-in lockfile read at 1 GiB. 
Matches the cap that +// extract_bazel_to_maven uses for generated `unsorted_deps.json` files so a +// hostile or malformed lockfile cannot OOM the CLI. +const MAX_LOCKFILE_BYTES = 1024 * 1024 * 1024 +// Hard ceiling on number of lockfiles we will surface. Real monorepos have +// well under 50; this cap is a guard against pathological inputs. +const MAX_LOCKFILES = 256 +// Hard ceiling on directory walk depth. Real workspaces nest <8 deep; the +// cap protects against pathological symlink loops that slipped past the +// `bazel-*` prefix prune. +const MAX_WALK_DEPTH = 16 +// Directory basenames the walk refuses to descend into. None of these +// contain Bazel workspaces, and node_modules / .git can be enormous. +const PRUNE_DIR_NAMES = new Set([ + '.git', + '.hg', + '.idea', + '.pnpm-store', + '.socket-auto-manifest', + '.svn', + '.vscode', + 'node_modules', +]) +// Directory basename prefixes the walk refuses to descend into. Bazel's +// `bazel-out`, `bazel-bin`, `bazel-testlogs`, and `bazel-` +// convenience symlinks all point into the output_base, which contains +// generated copies of the same lockfiles plus tens of GiB of build output. +const PRUNE_DIR_PREFIXES = ['bazel-'] + +// Walks the tree rooted at `cwd` and returns absolute paths to every +// checked-in `maven_install.json` file the walk reaches before hitting the +// MAX_LOCKFILES cap. Output is sorted for determinism. +export function findCheckedInMavenLockfiles( + cwd: string, + verbose?: boolean, +): string[] { + const out: string[] = [] + // Tuple stack: [absolute dir, depth from cwd]. 
+ const stack: Array<[string, number]> = [[cwd, 0]] + while (stack.length) { + if (out.length >= MAX_LOCKFILES) { + if (verbose) { + logger.log( + `[VERBOSE] subworkspace: hit MAX_LOCKFILES cap (${MAX_LOCKFILES}); truncating walk`, + ) + } + break + } + const next = stack.pop() + if (!next) { + break + } + const { 0: dir, 1: depth } = next + let entries + try { + entries = readdirSync(dir, { withFileTypes: true }) + } catch { + continue + } + for (const entry of entries) { + const name = entry.name + if (entry.isFile() && name === 'maven_install.json') { + out.push(path.join(dir, name)) + continue + } + if (!entry.isDirectory()) { + continue + } + if (depth + 1 > MAX_WALK_DEPTH) { + continue + } + if (PRUNE_DIR_NAMES.has(name)) { + continue + } + let pruned = false + for (const prefix of PRUNE_DIR_PREFIXES) { + if (name.startsWith(prefix)) { + pruned = true + break + } + } + if (pruned) { + continue + } + stack.push([path.join(dir, name), depth + 1]) + } + } + return out.sort() +} + +// Reads a single checked-in `maven_install.json` and returns its artifacts. +// Defers parsing to `parseUnsortedDepsJson`, which already handles the +// rules_jvm_external v2 lockfile shape (the canonical checked-in form) as +// well as the legacy artifact-array shape. Tags each artifact with a +// synthetic `sourceRepo` derived from the lockfile's path relative to cwd +// so downstream dep-label resolution does not collide a lockfile-derived +// rule with a bazel-query-derived rule of the same name in a different +// sub-workspace. 
+export function readCheckedInMavenLockfile( + file: string, + cwd: string, + verbose?: boolean, +): ExtractedArtifact[] { + let size: number + try { + size = statSync(file).size + } catch { + return [] + } + if (size > MAX_LOCKFILE_BYTES) { + if (verbose) { + logger.log( + `[VERBOSE] subworkspace: skip oversized lockfile ${file} (${size} bytes; cap ${MAX_LOCKFILE_BYTES})`, + ) + } + return [] + } + let json: string + try { + json = readFileSync(file, 'utf8') + } catch { + return [] + } + const artifacts = parseUnsortedDepsJson(json) + const relPath = path.relative(cwd, file) + // Use the directory containing the lockfile (relative to cwd) so two + // lockfiles in different sub-workspaces get distinct synthetic repo tags. + // For the root-cwd lockfile, relative dir is '' and the tag collapses to + // `lockfile:.` — harmless and still distinct from real repo names. + const repoTag = `lockfile:${path.dirname(relPath) || '.'}` + const out: ExtractedArtifact[] = [] + for (const a of artifacts) { + out.push({ ...a, sourceRepo: a.sourceRepo ?? repoTag }) + } + return out +} + +// Convenience composition: find all checked-in lockfiles under cwd, parse +// each, and return a flat list of artifacts deduplicated by +// `mavenCoordinates`. The dedup is intentionally coarse: two lockfiles in +// different sub-workspaces that pin the same `group:artifact:version` +// contribute one entry; conflicting versions for the same `group:artifact` +// are NOT resolved here and will surface as the existing `Conflicting +// versions for ...` error in `normalizeToMavenInstallJson` downstream +// (preserving today's loud-failure behavior for genuine version conflicts). 
+export function discoverAllCheckedInMavenArtifacts( + cwd: string, + verbose?: boolean, +): { artifacts: ExtractedArtifact[]; lockfilePaths: string[] } { + const lockfilePaths = findCheckedInMavenLockfiles(cwd, verbose) + const seenCoords = new Set() + const artifacts: ExtractedArtifact[] = [] + for (const file of lockfilePaths) { + const fromFile = readCheckedInMavenLockfile(file, cwd, verbose) + let merged = 0 + for (const a of fromFile) { + if (seenCoords.has(a.mavenCoordinates)) { + continue + } + seenCoords.add(a.mavenCoordinates) + artifacts.push(a) + merged += 1 + } + if (verbose) { + logger.log( + `[VERBOSE] subworkspace: lockfile ${path.relative(cwd, file)} contributed ${merged} new artifact(s) (file had ${fromFile.length})`, + ) + } + } + return { artifacts, lockfilePaths } +} diff --git a/src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts b/src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts new file mode 100644 index 000000000..209ced27c --- /dev/null +++ b/src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts @@ -0,0 +1,241 @@ +import { + mkdirSync, + mkdtempSync, + rmSync, + symlinkSync, + writeFileSync, +} from 'node:fs' +import os from 'node:os' +import path from 'node:path' + +import { afterEach, beforeEach, describe, expect, it } from 'vitest' + +import { + discoverAllCheckedInMavenArtifacts, + findCheckedInMavenLockfiles, + readCheckedInMavenLockfile, +} from './bazel-lockfile-discovery.mts' + +// Minimal v2-lockfile shape (the canonical checked-in rules_jvm_external +// `maven_install.json`). We write distinct group:artifact:version triples per +// fixture so the merge logic has something measurable to dedupe. 
+function v2Lockfile(entries: Record): string { + const artifacts: Record< + string, + { shasums: { jar: string }; version: string } + > = {} + for (const [groupArtifact, version] of Object.entries(entries)) { + artifacts[groupArtifact] = { + shasums: { jar: 'a'.repeat(64) }, + version, + } + } + return JSON.stringify({ artifacts, dependencies: {} }) +} + +describe('bazel-lockfile-discovery', () => { + let tmp: string + + beforeEach(() => { + tmp = mkdtempSync(path.join(os.tmpdir(), 'sock-bazel-lock-')) + }) + + afterEach(() => { + rmSync(tmp, { recursive: true, force: true }) + }) + + describe('findCheckedInMavenLockfiles', () => { + it('finds lockfiles at root and arbitrary depth', () => { + writeFileSync(path.join(tmp, 'maven_install.json'), v2Lockfile({})) + mkdirSync(path.join(tmp, 'examples', 'dagger'), { recursive: true }) + writeFileSync( + path.join(tmp, 'examples', 'dagger', 'maven_install.json'), + v2Lockfile({}), + ) + mkdirSync(path.join(tmp, 'examples', 'android', 'nested'), { + recursive: true, + }) + writeFileSync( + path.join(tmp, 'examples', 'android', 'nested', 'maven_install.json'), + v2Lockfile({}), + ) + const found = findCheckedInMavenLockfiles(tmp).map(p => + path.relative(tmp, p), + ) + expect(found).toEqual([ + 'examples/android/nested/maven_install.json', + 'examples/dagger/maven_install.json', + 'maven_install.json', + ]) + }) + + it('prunes node_modules / .git / .socket-auto-manifest', () => { + for (const dir of ['node_modules', '.git', '.socket-auto-manifest']) { + mkdirSync(path.join(tmp, dir, 'sub'), { recursive: true }) + writeFileSync( + path.join(tmp, dir, 'sub', 'maven_install.json'), + v2Lockfile({}), + ) + } + // Sanity: a tracked lockfile alongside the pruned dirs is still found. 
+ writeFileSync(path.join(tmp, 'maven_install.json'), v2Lockfile({})) + const found = findCheckedInMavenLockfiles(tmp).map(p => + path.relative(tmp, p), + ) + expect(found).toEqual(['maven_install.json']) + }) + + it('prunes bazel-* convenience symlinks (output_base)', () => { + // Simulate Bazel's `bazel-out` symlink pointing at a directory that + // contains a generated copy of the same lockfile. The walk must skip + // it; otherwise discovery would surface generated lockfiles from + // (tens of GiB of bazel state). + const fakeOutputBase = mkdtempSync( + path.join(os.tmpdir(), 'sock-fake-outbase-'), + ) + try { + mkdirSync(path.join(fakeOutputBase, 'external', 'maven'), { + recursive: true, + }) + writeFileSync( + path.join(fakeOutputBase, 'external', 'maven', 'maven_install.json'), + v2Lockfile({ 'com.example:generated': '1.0' }), + ) + symlinkSync(fakeOutputBase, path.join(tmp, 'bazel-out')) + writeFileSync( + path.join(tmp, 'maven_install.json'), + v2Lockfile({ 'com.example:checkedin': '1.0' }), + ) + const found = findCheckedInMavenLockfiles(tmp).map(p => + path.relative(tmp, p), + ) + expect(found).toEqual(['maven_install.json']) + } finally { + rmSync(fakeOutputBase, { recursive: true, force: true }) + } + }) + }) + + describe('readCheckedInMavenLockfile', () => { + it('parses a v2 lockfile and tags sourceRepo with the relative dir', () => { + mkdirSync(path.join(tmp, 'examples', 'dagger'), { recursive: true }) + const file = path.join(tmp, 'examples', 'dagger', 'maven_install.json') + writeFileSync( + file, + v2Lockfile({ + 'com.google.dagger:dagger': '2.50', + 'com.google.guava:guava': '33.0.0-jre', + }), + ) + const artifacts = readCheckedInMavenLockfile(file, tmp) + expect(artifacts).toHaveLength(2) + const coords = artifacts.map(a => a.mavenCoordinates).sort() + expect(coords).toEqual([ + 'com.google.dagger:dagger:2.50', + 'com.google.guava:guava:33.0.0-jre', + ]) + for (const a of artifacts) { + expect(a.sourceRepo).toBe('lockfile:examples/dagger') + 
} + }) + + it('tags the root-cwd lockfile as lockfile:.', () => { + const file = path.join(tmp, 'maven_install.json') + writeFileSync(file, v2Lockfile({ 'com.example:a': '1.0' })) + const artifacts = readCheckedInMavenLockfile(file, tmp) + expect(artifacts).toHaveLength(1) + expect(artifacts[0]?.sourceRepo).toBe('lockfile:.') + }) + + it('returns [] on malformed JSON without throwing', () => { + const file = path.join(tmp, 'maven_install.json') + writeFileSync(file, '{not valid json') + expect(readCheckedInMavenLockfile(file, tmp)).toEqual([]) + }) + }) + + describe('discoverAllCheckedInMavenArtifacts', () => { + it('merges artifacts from every lockfile and dedupes by coordinates', () => { + // Root lockfile pins guava 33. + writeFileSync( + path.join(tmp, 'maven_install.json'), + v2Lockfile({ 'com.google.guava:guava': '33.0.0-jre' }), + ) + // Sub-workspace A pins guava 33 (duplicate) AND dagger 2.50. + mkdirSync(path.join(tmp, 'examples', 'dagger'), { recursive: true }) + writeFileSync( + path.join(tmp, 'examples', 'dagger', 'maven_install.json'), + v2Lockfile({ + 'com.google.dagger:dagger': '2.50', + 'com.google.guava:guava': '33.0.0-jre', + }), + ) + // Sub-workspace B pins compose 1.6. + mkdirSync(path.join(tmp, 'examples', 'jetpack_compose'), { + recursive: true, + }) + writeFileSync( + path.join(tmp, 'examples', 'jetpack_compose', 'maven_install.json'), + v2Lockfile({ 'androidx.compose.ui:ui': '1.6.0' }), + ) + const { artifacts, lockfilePaths } = + discoverAllCheckedInMavenArtifacts(tmp) + expect(lockfilePaths).toHaveLength(3) + const coords = artifacts.map(a => a.mavenCoordinates).sort() + // Guava appears once even though it's pinned in two lockfiles. 
+ expect(coords).toEqual([ + 'androidx.compose.ui:ui:1.6.0', + 'com.google.dagger:dagger:2.50', + 'com.google.guava:guava:33.0.0-jre', + ]) + }) + + it('emits the rules_kotlin shape: 1 root + several per-example lockfiles, strict superset', () => { + // Stand-in for rules_kotlin's layout: a small root lockfile plus per- + // example lockfiles that each declare some unique artifacts. The test + // asserts the strict-superset property — merged artifact count is + // greater than any single lockfile's count. + writeFileSync( + path.join(tmp, 'maven_install.json'), + v2Lockfile( + Object.fromEntries( + Array.from({ length: 70 }, (_, i) => [ + `org.jetbrains.kotlin:lib-${i}`, + '1.9.0', + ]), + ), + ), + ) + for (const example of [ + 'android', + 'anvil', + 'dagger', + 'jetpack_compose', + 'ksp', + 'multiplex', + 'plugin', + ]) { + mkdirSync(path.join(tmp, 'examples', example), { recursive: true }) + writeFileSync( + path.join(tmp, 'examples', example, 'maven_install.json'), + v2Lockfile( + Object.fromEntries( + Array.from({ length: 73 }, (_, i) => [ + `com.example.${example}:lib-${i}`, + '1.0', + ]), + ), + ), + ) + } + const { artifacts, lockfilePaths } = + discoverAllCheckedInMavenArtifacts(tmp) + expect(lockfilePaths).toHaveLength(8) + // Root has 70 unique; each of 7 examples has 73 unique disjoint sets. + expect(artifacts.length).toBe(70 + 7 * 73) + // Strict-superset of the root alone (which is what the CLI returns + // today without sub-workspace discovery). 
+ expect(artifacts.length).toBeGreaterThan(70) + }) + }) +}) diff --git a/src/commands/manifest/bazel/extract_bazel_to_maven.mts b/src/commands/manifest/bazel/extract_bazel_to_maven.mts index 334b116db..ab3c8ebf9 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_maven.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_maven.mts @@ -15,6 +15,7 @@ import { parseUnsortedDepsJson, } from './bazel-build-parser.mts' import { ensureJavaOnPath } from './bazel-java-shim.mts' +import { discoverAllCheckedInMavenArtifacts } from './bazel-lockfile-discovery.mts' import { validateOutputBase } from './bazel-output-base-check.mts' import { provisionPythonShim } from './bazel-python-shim.mts' import { @@ -430,6 +431,38 @@ export async function extractBazelToMaven( logger.info(`@${repo}: ${artifacts.length} artifact(s)`) } + // Step 5b: merge checked-in `maven_install.json` files found anywhere + // under cwd. The root-only bazel-query path above never sees per-example + // sub-workspace lockfiles (rules_kotlin, rules_js, rules_rust, etc. all + // declare additional Maven artifacts in `examples/*/MODULE.bazel` + // projects with their own lockfiles), so without this merge the CLI + // emits a strict subset of what depscan's server-side parser already + // returns. Dedup by `mavenCoordinates` so the root workspace's lockfile + // — which bazel-query already extracted — does not double-count. 
+ const seenCoords = new Set( + allArtifacts.map(a => a.mavenCoordinates), + ) + const { artifacts: lockfileArtifacts, lockfilePaths } = + discoverAllCheckedInMavenArtifacts(cwd, verbose) + let mergedFromLockfiles = 0 + for (const a of lockfileArtifacts) { + if (seenCoords.has(a.mavenCoordinates)) { + continue + } + seenCoords.add(a.mavenCoordinates) + allArtifacts.push(a) + mergedFromLockfiles += 1 + } + if (mergedFromLockfiles > 0) { + logger.info( + `Sub-workspace discovery: merged ${mergedFromLockfiles} additional artifact(s) from ${lockfilePaths.length} checked-in maven_install.json file(s).`, + ) + } else if (verbose) { + logger.log( + `[VERBOSE] subworkspace: no additional artifacts beyond bazel-query (${lockfilePaths.length} lockfile(s) examined)`, + ) + } + // Step 6: normalize to maven_install.json shape. const normalized = normalizeToMavenInstallJson(allArtifacts) From b06749e1604a955ac87831ef90d3ec964b515439 Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 13:46:18 +0200 Subject: [PATCH 02/10] =?UTF-8?q?refactor(manifest/bazel):=20delete=20chec?= =?UTF-8?q?ked-in=20lockfile=20discovery=20=E2=80=94=20server=20walker=20a?= =?UTF-8?q?lready=20covers=20it?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CLI was walking the tree for **/maven_install.json and **/*_maven_install.json lockfiles and merging them into its output. The server-side scan walker matches the same pattern natively via getReportSupportedFiles, so the CLI re-reading these files duplicated work and produced output that was a strict subset of what the walker already saw when the scan was uploaded. Removes: - bazel-lockfile-discovery.mts (196 lines) - bazel-lockfile-discovery.test.mts (241 lines) - extract_bazel_to_maven step 5b (33 lines): the merge-back-into-allArtifacts loop The .socket-auto-manifest/maven_install.json the CLI emits is still picked up by the same walker — that composition stays intact. 
After this change the CLI emits only what running bazel produces (the complement of the walker's lockfile coverage). --- .../bazel/bazel-lockfile-discovery.mts | 196 -------------- .../bazel/bazel-lockfile-discovery.test.mts | 241 ------------------ .../manifest/bazel/extract_bazel_to_maven.mts | 33 --- 3 files changed, 470 deletions(-) delete mode 100644 src/commands/manifest/bazel/bazel-lockfile-discovery.mts delete mode 100644 src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts diff --git a/src/commands/manifest/bazel/bazel-lockfile-discovery.mts b/src/commands/manifest/bazel/bazel-lockfile-discovery.mts deleted file mode 100644 index eabef8860..000000000 --- a/src/commands/manifest/bazel/bazel-lockfile-discovery.mts +++ /dev/null @@ -1,196 +0,0 @@ -/** - * Find and parse checked-in `maven_install.json` files anywhere under the - * invocation cwd. This is the sub-workspace discovery path: rules_kotlin (and - * any ruleset-style repo with per-example `MODULE.bazel` projects under - * `examples/`) declares Maven artifacts in per-example lockfiles that the - * root-MODULE.bazel-only bazel-query path never sees. The server-side - * depscan parser already merges these files unconditionally; the CLI matches - * that semantics here so its SBOM is not a strict subset of the server's. - * - * Security gates: each lockfile read is size-capped, the walk prunes dirs - * that are obviously not Bazel workspaces (.git, node_modules, the - * .socket-auto-manifest output dir), and the walk skips Bazel's `bazel-*` - * convenience symlinks so we never recurse into (which can - * be tens of GiB and contains generated copies of the same lockfiles). 
- */ - -import { readFileSync, readdirSync, statSync } from 'node:fs' -import path from 'node:path' - -import { logger } from '@socketsecurity/registry/lib/logger' - -import { parseUnsortedDepsJson } from './bazel-build-parser.mts' - -import type { ExtractedArtifact } from './bazel-build-parser.mts' - -// Cap any single checked-in lockfile read at 1 GiB. Matches the cap that -// extract_bazel_to_maven uses for generated `unsorted_deps.json` files so a -// hostile or malformed lockfile cannot OOM the CLI. -const MAX_LOCKFILE_BYTES = 1024 * 1024 * 1024 -// Hard ceiling on number of lockfiles we will surface. Real monorepos have -// well under 50; this cap is a guard against pathological inputs. -const MAX_LOCKFILES = 256 -// Hard ceiling on directory walk depth. Real workspaces nest <8 deep; the -// cap protects against pathological symlink loops that slipped past the -// `bazel-*` prefix prune. -const MAX_WALK_DEPTH = 16 -// Directory basenames the walk refuses to descend into. None of these -// contain Bazel workspaces, and node_modules / .git can be enormous. -const PRUNE_DIR_NAMES = new Set([ - '.git', - '.hg', - '.idea', - '.pnpm-store', - '.socket-auto-manifest', - '.svn', - '.vscode', - 'node_modules', -]) -// Directory basename prefixes the walk refuses to descend into. Bazel's -// `bazel-out`, `bazel-bin`, `bazel-testlogs`, and `bazel-` -// convenience symlinks all point into the output_base, which contains -// generated copies of the same lockfiles plus tens of GiB of build output. -const PRUNE_DIR_PREFIXES = ['bazel-'] - -// Walks the tree rooted at `cwd` and returns absolute paths to every -// checked-in `maven_install.json` file the walk reaches before hitting the -// MAX_LOCKFILES cap. Output is sorted for determinism. -export function findCheckedInMavenLockfiles( - cwd: string, - verbose?: boolean, -): string[] { - const out: string[] = [] - // Tuple stack: [absolute dir, depth from cwd]. 
- const stack: Array<[string, number]> = [[cwd, 0]] - while (stack.length) { - if (out.length >= MAX_LOCKFILES) { - if (verbose) { - logger.log( - `[VERBOSE] subworkspace: hit MAX_LOCKFILES cap (${MAX_LOCKFILES}); truncating walk`, - ) - } - break - } - const next = stack.pop() - if (!next) { - break - } - const { 0: dir, 1: depth } = next - let entries - try { - entries = readdirSync(dir, { withFileTypes: true }) - } catch { - continue - } - for (const entry of entries) { - const name = entry.name - if (entry.isFile() && name === 'maven_install.json') { - out.push(path.join(dir, name)) - continue - } - if (!entry.isDirectory()) { - continue - } - if (depth + 1 > MAX_WALK_DEPTH) { - continue - } - if (PRUNE_DIR_NAMES.has(name)) { - continue - } - let pruned = false - for (const prefix of PRUNE_DIR_PREFIXES) { - if (name.startsWith(prefix)) { - pruned = true - break - } - } - if (pruned) { - continue - } - stack.push([path.join(dir, name), depth + 1]) - } - } - return out.sort() -} - -// Reads a single checked-in `maven_install.json` and returns its artifacts. -// Defers parsing to `parseUnsortedDepsJson`, which already handles the -// rules_jvm_external v2 lockfile shape (the canonical checked-in form) as -// well as the legacy artifact-array shape. Tags each artifact with a -// synthetic `sourceRepo` derived from the lockfile's path relative to cwd -// so downstream dep-label resolution does not collide a lockfile-derived -// rule with a bazel-query-derived rule of the same name in a different -// sub-workspace. 
-export function readCheckedInMavenLockfile( - file: string, - cwd: string, - verbose?: boolean, -): ExtractedArtifact[] { - let size: number - try { - size = statSync(file).size - } catch { - return [] - } - if (size > MAX_LOCKFILE_BYTES) { - if (verbose) { - logger.log( - `[VERBOSE] subworkspace: skip oversized lockfile ${file} (${size} bytes; cap ${MAX_LOCKFILE_BYTES})`, - ) - } - return [] - } - let json: string - try { - json = readFileSync(file, 'utf8') - } catch { - return [] - } - const artifacts = parseUnsortedDepsJson(json) - const relPath = path.relative(cwd, file) - // Use the directory containing the lockfile (relative to cwd) so two - // lockfiles in different sub-workspaces get distinct synthetic repo tags. - // For the root-cwd lockfile, relative dir is '' and the tag collapses to - // `lockfile:.` — harmless and still distinct from real repo names. - const repoTag = `lockfile:${path.dirname(relPath) || '.'}` - const out: ExtractedArtifact[] = [] - for (const a of artifacts) { - out.push({ ...a, sourceRepo: a.sourceRepo ?? repoTag }) - } - return out -} - -// Convenience composition: find all checked-in lockfiles under cwd, parse -// each, and return a flat list of artifacts deduplicated by -// `mavenCoordinates`. The dedup is intentionally coarse: two lockfiles in -// different sub-workspaces that pin the same `group:artifact:version` -// contribute one entry; conflicting versions for the same `group:artifact` -// are NOT resolved here and will surface as the existing `Conflicting -// versions for ...` error in `normalizeToMavenInstallJson` downstream -// (preserving today's loud-failure behavior for genuine version conflicts). 
-export function discoverAllCheckedInMavenArtifacts( - cwd: string, - verbose?: boolean, -): { artifacts: ExtractedArtifact[]; lockfilePaths: string[] } { - const lockfilePaths = findCheckedInMavenLockfiles(cwd, verbose) - const seenCoords = new Set() - const artifacts: ExtractedArtifact[] = [] - for (const file of lockfilePaths) { - const fromFile = readCheckedInMavenLockfile(file, cwd, verbose) - let merged = 0 - for (const a of fromFile) { - if (seenCoords.has(a.mavenCoordinates)) { - continue - } - seenCoords.add(a.mavenCoordinates) - artifacts.push(a) - merged += 1 - } - if (verbose) { - logger.log( - `[VERBOSE] subworkspace: lockfile ${path.relative(cwd, file)} contributed ${merged} new artifact(s) (file had ${fromFile.length})`, - ) - } - } - return { artifacts, lockfilePaths } -} diff --git a/src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts b/src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts deleted file mode 100644 index 209ced27c..000000000 --- a/src/commands/manifest/bazel/bazel-lockfile-discovery.test.mts +++ /dev/null @@ -1,241 +0,0 @@ -import { - mkdirSync, - mkdtempSync, - rmSync, - symlinkSync, - writeFileSync, -} from 'node:fs' -import os from 'node:os' -import path from 'node:path' - -import { afterEach, beforeEach, describe, expect, it } from 'vitest' - -import { - discoverAllCheckedInMavenArtifacts, - findCheckedInMavenLockfiles, - readCheckedInMavenLockfile, -} from './bazel-lockfile-discovery.mts' - -// Minimal v2-lockfile shape (the canonical checked-in rules_jvm_external -// `maven_install.json`). We write distinct group:artifact:version triples per -// fixture so the merge logic has something measurable to dedupe. 
-function v2Lockfile(entries: Record): string { - const artifacts: Record< - string, - { shasums: { jar: string }; version: string } - > = {} - for (const [groupArtifact, version] of Object.entries(entries)) { - artifacts[groupArtifact] = { - shasums: { jar: 'a'.repeat(64) }, - version, - } - } - return JSON.stringify({ artifacts, dependencies: {} }) -} - -describe('bazel-lockfile-discovery', () => { - let tmp: string - - beforeEach(() => { - tmp = mkdtempSync(path.join(os.tmpdir(), 'sock-bazel-lock-')) - }) - - afterEach(() => { - rmSync(tmp, { recursive: true, force: true }) - }) - - describe('findCheckedInMavenLockfiles', () => { - it('finds lockfiles at root and arbitrary depth', () => { - writeFileSync(path.join(tmp, 'maven_install.json'), v2Lockfile({})) - mkdirSync(path.join(tmp, 'examples', 'dagger'), { recursive: true }) - writeFileSync( - path.join(tmp, 'examples', 'dagger', 'maven_install.json'), - v2Lockfile({}), - ) - mkdirSync(path.join(tmp, 'examples', 'android', 'nested'), { - recursive: true, - }) - writeFileSync( - path.join(tmp, 'examples', 'android', 'nested', 'maven_install.json'), - v2Lockfile({}), - ) - const found = findCheckedInMavenLockfiles(tmp).map(p => - path.relative(tmp, p), - ) - expect(found).toEqual([ - 'examples/android/nested/maven_install.json', - 'examples/dagger/maven_install.json', - 'maven_install.json', - ]) - }) - - it('prunes node_modules / .git / .socket-auto-manifest', () => { - for (const dir of ['node_modules', '.git', '.socket-auto-manifest']) { - mkdirSync(path.join(tmp, dir, 'sub'), { recursive: true }) - writeFileSync( - path.join(tmp, dir, 'sub', 'maven_install.json'), - v2Lockfile({}), - ) - } - // Sanity: a tracked lockfile alongside the pruned dirs is still found. 
- writeFileSync(path.join(tmp, 'maven_install.json'), v2Lockfile({})) - const found = findCheckedInMavenLockfiles(tmp).map(p => - path.relative(tmp, p), - ) - expect(found).toEqual(['maven_install.json']) - }) - - it('prunes bazel-* convenience symlinks (output_base)', () => { - // Simulate Bazel's `bazel-out` symlink pointing at a directory that - // contains a generated copy of the same lockfile. The walk must skip - // it; otherwise discovery would surface generated lockfiles from - // (tens of GiB of bazel state). - const fakeOutputBase = mkdtempSync( - path.join(os.tmpdir(), 'sock-fake-outbase-'), - ) - try { - mkdirSync(path.join(fakeOutputBase, 'external', 'maven'), { - recursive: true, - }) - writeFileSync( - path.join(fakeOutputBase, 'external', 'maven', 'maven_install.json'), - v2Lockfile({ 'com.example:generated': '1.0' }), - ) - symlinkSync(fakeOutputBase, path.join(tmp, 'bazel-out')) - writeFileSync( - path.join(tmp, 'maven_install.json'), - v2Lockfile({ 'com.example:checkedin': '1.0' }), - ) - const found = findCheckedInMavenLockfiles(tmp).map(p => - path.relative(tmp, p), - ) - expect(found).toEqual(['maven_install.json']) - } finally { - rmSync(fakeOutputBase, { recursive: true, force: true }) - } - }) - }) - - describe('readCheckedInMavenLockfile', () => { - it('parses a v2 lockfile and tags sourceRepo with the relative dir', () => { - mkdirSync(path.join(tmp, 'examples', 'dagger'), { recursive: true }) - const file = path.join(tmp, 'examples', 'dagger', 'maven_install.json') - writeFileSync( - file, - v2Lockfile({ - 'com.google.dagger:dagger': '2.50', - 'com.google.guava:guava': '33.0.0-jre', - }), - ) - const artifacts = readCheckedInMavenLockfile(file, tmp) - expect(artifacts).toHaveLength(2) - const coords = artifacts.map(a => a.mavenCoordinates).sort() - expect(coords).toEqual([ - 'com.google.dagger:dagger:2.50', - 'com.google.guava:guava:33.0.0-jre', - ]) - for (const a of artifacts) { - expect(a.sourceRepo).toBe('lockfile:examples/dagger') - 
} - }) - - it('tags the root-cwd lockfile as lockfile:.', () => { - const file = path.join(tmp, 'maven_install.json') - writeFileSync(file, v2Lockfile({ 'com.example:a': '1.0' })) - const artifacts = readCheckedInMavenLockfile(file, tmp) - expect(artifacts).toHaveLength(1) - expect(artifacts[0]?.sourceRepo).toBe('lockfile:.') - }) - - it('returns [] on malformed JSON without throwing', () => { - const file = path.join(tmp, 'maven_install.json') - writeFileSync(file, '{not valid json') - expect(readCheckedInMavenLockfile(file, tmp)).toEqual([]) - }) - }) - - describe('discoverAllCheckedInMavenArtifacts', () => { - it('merges artifacts from every lockfile and dedupes by coordinates', () => { - // Root lockfile pins guava 33. - writeFileSync( - path.join(tmp, 'maven_install.json'), - v2Lockfile({ 'com.google.guava:guava': '33.0.0-jre' }), - ) - // Sub-workspace A pins guava 33 (duplicate) AND dagger 2.50. - mkdirSync(path.join(tmp, 'examples', 'dagger'), { recursive: true }) - writeFileSync( - path.join(tmp, 'examples', 'dagger', 'maven_install.json'), - v2Lockfile({ - 'com.google.dagger:dagger': '2.50', - 'com.google.guava:guava': '33.0.0-jre', - }), - ) - // Sub-workspace B pins compose 1.6. - mkdirSync(path.join(tmp, 'examples', 'jetpack_compose'), { - recursive: true, - }) - writeFileSync( - path.join(tmp, 'examples', 'jetpack_compose', 'maven_install.json'), - v2Lockfile({ 'androidx.compose.ui:ui': '1.6.0' }), - ) - const { artifacts, lockfilePaths } = - discoverAllCheckedInMavenArtifacts(tmp) - expect(lockfilePaths).toHaveLength(3) - const coords = artifacts.map(a => a.mavenCoordinates).sort() - // Guava appears once even though it's pinned in two lockfiles. 
- expect(coords).toEqual([ - 'androidx.compose.ui:ui:1.6.0', - 'com.google.dagger:dagger:2.50', - 'com.google.guava:guava:33.0.0-jre', - ]) - }) - - it('emits the rules_kotlin shape: 1 root + several per-example lockfiles, strict superset', () => { - // Stand-in for rules_kotlin's layout: a small root lockfile plus per- - // example lockfiles that each declare some unique artifacts. The test - // asserts the strict-superset property — merged artifact count is - // greater than any single lockfile's count. - writeFileSync( - path.join(tmp, 'maven_install.json'), - v2Lockfile( - Object.fromEntries( - Array.from({ length: 70 }, (_, i) => [ - `org.jetbrains.kotlin:lib-${i}`, - '1.9.0', - ]), - ), - ), - ) - for (const example of [ - 'android', - 'anvil', - 'dagger', - 'jetpack_compose', - 'ksp', - 'multiplex', - 'plugin', - ]) { - mkdirSync(path.join(tmp, 'examples', example), { recursive: true }) - writeFileSync( - path.join(tmp, 'examples', example, 'maven_install.json'), - v2Lockfile( - Object.fromEntries( - Array.from({ length: 73 }, (_, i) => [ - `com.example.${example}:lib-${i}`, - '1.0', - ]), - ), - ), - ) - } - const { artifacts, lockfilePaths } = - discoverAllCheckedInMavenArtifacts(tmp) - expect(lockfilePaths).toHaveLength(8) - // Root has 70 unique; each of 7 examples has 73 unique disjoint sets. - expect(artifacts.length).toBe(70 + 7 * 73) - // Strict-superset of the root alone (which is what the CLI returns - // today without sub-workspace discovery). 
- expect(artifacts.length).toBeGreaterThan(70) - }) - }) -}) diff --git a/src/commands/manifest/bazel/extract_bazel_to_maven.mts b/src/commands/manifest/bazel/extract_bazel_to_maven.mts index ab3c8ebf9..334b116db 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_maven.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_maven.mts @@ -15,7 +15,6 @@ import { parseUnsortedDepsJson, } from './bazel-build-parser.mts' import { ensureJavaOnPath } from './bazel-java-shim.mts' -import { discoverAllCheckedInMavenArtifacts } from './bazel-lockfile-discovery.mts' import { validateOutputBase } from './bazel-output-base-check.mts' import { provisionPythonShim } from './bazel-python-shim.mts' import { @@ -431,38 +430,6 @@ export async function extractBazelToMaven( logger.info(`@${repo}: ${artifacts.length} artifact(s)`) } - // Step 5b: merge checked-in `maven_install.json` files found anywhere - // under cwd. The root-only bazel-query path above never sees per-example - // sub-workspace lockfiles (rules_kotlin, rules_js, rules_rust, etc. all - // declare additional Maven artifacts in `examples/*/MODULE.bazel` - // projects with their own lockfiles), so without this merge the CLI - // emits a strict subset of what depscan's server-side parser already - // returns. Dedup by `mavenCoordinates` so the root workspace's lockfile - // — which bazel-query already extracted — does not double-count. 
- const seenCoords = new Set( - allArtifacts.map(a => a.mavenCoordinates), - ) - const { artifacts: lockfileArtifacts, lockfilePaths } = - discoverAllCheckedInMavenArtifacts(cwd, verbose) - let mergedFromLockfiles = 0 - for (const a of lockfileArtifacts) { - if (seenCoords.has(a.mavenCoordinates)) { - continue - } - seenCoords.add(a.mavenCoordinates) - allArtifacts.push(a) - mergedFromLockfiles += 1 - } - if (mergedFromLockfiles > 0) { - logger.info( - `Sub-workspace discovery: merged ${mergedFromLockfiles} additional artifact(s) from ${lockfilePaths.length} checked-in maven_install.json file(s).`, - ) - } else if (verbose) { - logger.log( - `[VERBOSE] subworkspace: no additional artifacts beyond bazel-query (${lockfilePaths.length} lockfile(s) examined)`, - ) - } - // Step 6: normalize to maven_install.json shape. const normalized = normalizeToMavenInstallJson(allArtifacts) From 875dc93d48f9ab6444ae8b5360bd2958e66bb45e Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 14:03:08 +0200 Subject: [PATCH 03/10] feat(manifest/bazel): add workspace walker for nested-workspace discovery `findWorkspaceRoots` walks the tree from cwd and returns every directory containing MODULE.bazel / WORKSPACE / WORKSPACE.bazel. Monorepos host multiple workspace roots (e.g. examples/*/MODULE.bazel, or mobile/MODULE.bazel under an otherwise non-Bazel root); the per-workspace algorithm in the orchestrator runs once per discovered root. Pruning matches the previous lockfile walker: skip the usual non-workspace directories (.git, node_modules, .socket-auto-manifest, etc.), Bazel's `bazel-*` output_base symlinks (so we never recurse into tens of GiB of generated state), and `dist*` build-output directories. Caps `MAX_WALK_DEPTH` and `MAX_WORKSPACE_ROOTS` guard against pathological inputs and symlink loops. Pure-function module with no Bazel calls; unit tests use a tmpdir fixture tree and cover the root-only, nested, prune, symlink, and sort-determinism cases. 
--- .../manifest/bazel/bazel-workspace-walk.mts | 118 +++++++++++++++++ .../bazel/bazel-workspace-walk.test.mts | 124 ++++++++++++++++++ 2 files changed, 242 insertions(+) create mode 100644 src/commands/manifest/bazel/bazel-workspace-walk.mts create mode 100644 src/commands/manifest/bazel/bazel-workspace-walk.test.mts diff --git a/src/commands/manifest/bazel/bazel-workspace-walk.mts b/src/commands/manifest/bazel/bazel-workspace-walk.mts new file mode 100644 index 000000000..2e1f66ed8 --- /dev/null +++ b/src/commands/manifest/bazel/bazel-workspace-walk.mts @@ -0,0 +1,118 @@ +/** + * Walk the directory tree rooted at `cwd` and return every directory that + * looks like a Bazel workspace root — i.e. contains `MODULE.bazel`, + * `WORKSPACE`, or `WORKSPACE.bazel`. Real monorepos host multiple roots + * (e.g. `envoy/mobile/MODULE.bazel`, rules_kotlin's per-example + * `examples/*/MODULE.bazel`); the per-workspace algorithm in the + * orchestrator runs once per discovered root. + * + * Pruning matches the now-deleted `bazel-lockfile-discovery.mts`: skip + * directories that obviously aren't Bazel workspaces (`.git`, `node_modules`, + * `.socket-auto-manifest`, etc.) and Bazel's `bazel-*` convenience symlinks + * that point into the Bazel output_base (tens of GiB of generated state). Also + * prunes `dist*` build-output directories. + */ + +import { readdirSync } from 'node:fs' +import path from 'node:path' + +import { logger } from '@socketsecurity/registry/lib/logger' + +// Hard ceiling on number of workspace roots we will surface. Real monorepos +// have well under 50; this cap is a guard against pathological inputs. +const MAX_WORKSPACE_ROOTS = 256 +// Hard ceiling on directory walk depth. Real workspaces nest <8 deep; the +// cap protects against pathological symlink loops that slipped past the +// `bazel-*` prefix prune. +const MAX_WALK_DEPTH = 16 +// Directory basenames the walk refuses to descend into. None of these +// contain Bazel workspaces, and node_modules / .git can be enormous. 
+const PRUNE_DIR_NAMES = new Set([ + '.git', + '.hg', + '.idea', + '.pnpm-store', + '.socket-auto-manifest', + '.svn', + '.vscode', + 'node_modules', +]) +// Directory basename prefixes the walk refuses to descend into. Bazel's +// `bazel-out`, `bazel-bin`, `bazel-testlogs`, and per-workspace `bazel-NAME` +// convenience symlinks all point into the output_base. `dist`-prefixed +// directories are build artefacts, not workspaces. +const PRUNE_DIR_PREFIXES = ['bazel-', 'dist'] +// Files whose presence promotes a directory to a workspace root. +const WORKSPACE_MARKER_FILES = new Set([ + 'MODULE.bazel', + 'WORKSPACE', + 'WORKSPACE.bazel', +]) + +// Walks the tree rooted at `cwd` and returns absolute paths to every +// directory that contains at least one workspace marker file. Output is +// sorted for determinism. +export function findWorkspaceRoots(cwd: string, verbose?: boolean): string[] { + const out: string[] = [] + // Tuple stack: [absolute dir, depth from cwd]. + const stack: Array<[string, number]> = [[cwd, 0]] + while (stack.length) { + if (out.length >= MAX_WORKSPACE_ROOTS) { + if (verbose) { + logger.log( + `[VERBOSE] workspace walker: hit MAX_WORKSPACE_ROOTS cap (${MAX_WORKSPACE_ROOTS}); truncating walk`, + ) + } + break + } + const next = stack.pop() + if (!next) { + break + } + const { 0: dir, 1: depth } = next + let entries + try { + entries = readdirSync(dir, { withFileTypes: true }) + } catch { + continue + } + // First pass: detect whether this dir is itself a workspace root. + let isWorkspaceRoot = false + for (const entry of entries) { + if (entry.isFile() && WORKSPACE_MARKER_FILES.has(entry.name)) { + isWorkspaceRoot = true + break + } + } + if (isWorkspaceRoot) { + out.push(dir) + } + // Second pass: schedule descents. We descend regardless of whether the + // current dir is itself a root — nested workspaces are common in + // monorepos (root MODULE.bazel + examples/*/MODULE.bazel). 
+ if (depth + 1 > MAX_WALK_DEPTH) { + continue + } + for (const entry of entries) { + if (!entry.isDirectory()) { + continue + } + const name = entry.name + if (PRUNE_DIR_NAMES.has(name)) { + continue + } + let pruned = false + for (const prefix of PRUNE_DIR_PREFIXES) { + if (name.startsWith(prefix)) { + pruned = true + break + } + } + if (pruned) { + continue + } + stack.push([path.join(dir, name), depth + 1]) + } + } + return out.sort() +} diff --git a/src/commands/manifest/bazel/bazel-workspace-walk.test.mts b/src/commands/manifest/bazel/bazel-workspace-walk.test.mts new file mode 100644 index 000000000..99307b1af --- /dev/null +++ b/src/commands/manifest/bazel/bazel-workspace-walk.test.mts @@ -0,0 +1,124 @@ +import { + mkdirSync, + mkdtempSync, + rmSync, + symlinkSync, + writeFileSync, +} from 'node:fs' +import os from 'node:os' +import path from 'node:path' + +import { afterEach, beforeEach, describe, expect, it } from 'vitest' + +import { findWorkspaceRoots } from './bazel-workspace-walk.mts' + +function touch(file: string): void { + mkdirSync(path.dirname(file), { recursive: true }) + writeFileSync(file, '') +} + +describe('bazel-workspace-walk', () => { + let tmp: string + + beforeEach(() => { + tmp = mkdtempSync(path.join(os.tmpdir(), 'sock-bazel-walk-')) + }) + + afterEach(() => { + rmSync(tmp, { recursive: true, force: true }) + }) + + describe('findWorkspaceRoots', () => { + it('returns the root when only the root has MODULE.bazel', () => { + touch(path.join(tmp, 'MODULE.bazel')) + expect(findWorkspaceRoots(tmp)).toEqual([tmp]) + }) + + it('detects WORKSPACE and WORKSPACE.bazel as root markers', () => { + touch(path.join(tmp, 'WORKSPACE')) + expect(findWorkspaceRoots(tmp)).toEqual([tmp]) + rmSync(path.join(tmp, 'WORKSPACE')) + touch(path.join(tmp, 'WORKSPACE.bazel')) + expect(findWorkspaceRoots(tmp)).toEqual([tmp]) + }) + + it('finds nested workspaces at arbitrary depth', () => { + touch(path.join(tmp, 'MODULE.bazel')) + touch(path.join(tmp, 'examples', 
'dagger', 'MODULE.bazel')) + touch(path.join(tmp, 'examples', 'android', 'nested', 'WORKSPACE.bazel')) + const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) + expect(found).toEqual([ + '', + 'examples/android/nested', + 'examples/dagger', + ]) + }) + + it('returns [] when there is no workspace root', () => { + writeFileSync(path.join(tmp, 'README.md'), '') + expect(findWorkspaceRoots(tmp)).toEqual([]) + }) + + it('prunes .git / node_modules / .socket-auto-manifest', () => { + touch(path.join(tmp, 'MODULE.bazel')) + // Sub-MODULE.bazel files inside pruned dirs must not be surfaced. + for (const dir of ['node_modules', '.git', '.socket-auto-manifest']) { + touch(path.join(tmp, dir, 'sub', 'MODULE.bazel')) + } + const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) + expect(found).toEqual(['']) + }) + + it('prunes bazel-* convenience symlinks', () => { + // Simulate `bazel-out` pointing at a directory that contains a copy of + // MODULE.bazel. The walk must skip it; otherwise discovery would + // surface generated workspaces from the Bazel output_base. 
+ const fakeOutputBase = mkdtempSync( + path.join(os.tmpdir(), 'sock-fake-outbase-'), + ) + try { + mkdirSync(path.join(fakeOutputBase, 'external', 'maven'), { + recursive: true, + }) + touch(path.join(fakeOutputBase, 'external', 'maven', 'MODULE.bazel')) + symlinkSync(fakeOutputBase, path.join(tmp, 'bazel-out')) + touch(path.join(tmp, 'MODULE.bazel')) + const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) + expect(found).toEqual(['']) + } finally { + rmSync(fakeOutputBase, { recursive: true, force: true }) + } + }) + + it('prunes dist* build-output directories', () => { + touch(path.join(tmp, 'MODULE.bazel')) + touch(path.join(tmp, 'dist', 'MODULE.bazel')) + touch(path.join(tmp, 'distribution', 'MODULE.bazel')) + const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) + expect(found).toEqual(['']) + }) + + it('returns absolute, sorted paths', () => { + touch(path.join(tmp, 'z', 'MODULE.bazel')) + touch(path.join(tmp, 'a', 'MODULE.bazel')) + touch(path.join(tmp, 'm', 'MODULE.bazel')) + const found = findWorkspaceRoots(tmp) + expect(found).toEqual([ + path.join(tmp, 'a'), + path.join(tmp, 'm'), + path.join(tmp, 'z'), + ]) + // Absolute. + for (const p of found) { + expect(path.isAbsolute(p)).toBe(true) + } + }) + + it('handles an unreadable directory by skipping it (no throw)', () => { + touch(path.join(tmp, 'MODULE.bazel')) + // Reference a path that does not exist as cwd; the walker must not + // throw — it should return [] (no entries to read). + expect(findWorkspaceRoots(path.join(tmp, 'nope'))).toEqual([]) + }) + }) +}) From cb699ddb70aee0430d00d3e1fccb151239648fad Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 14:06:30 +0200 Subject: [PATCH 04/10] refactor(manifest/bazel): replace Starlark regex with show_extension + probe primitives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop all static parsing of MODULE.bazel / WORKSPACE / *.bzl sources. 
Bazel itself sees those files via `mod show_extension` and `cquery`; the CLI no longer needs to interpret Starlark. `parseShowExtensionOutput` consumes the text-format report from bazel mod show_extension @rules_jvm_external//:extensions.bzl%maven and returns the hub repos (items annotated with `(imported by ...)`). Generated per-artifact bullets are skipped; `DEBUG:` / `WARNING:` lines are tolerated; the parser stops at the next `## ` section header so multi-extension reports don't cross-contaminate. `classifyProbeResult` turns a raw probe outcome into a tri-state status: - populated: code=0 + non-empty stdout - empty: code=1 + "no targets found beneath" - not-defined: code=1 + "No repository visible" / "no such package", or code=0 + empty stdout (WORKSPACE-mode silent miss) The orchestrator treats `empty` and `not-defined` uniformly as skips; the distinction is preserved for the sidecar status report. `CONVENTIONAL_MAVEN_REPO_NAMES` exposes the names the legacy WORKSPACE path probes (`maven`, `maven_install`, `maven_dev`, `unpinned_maven`, `maven_unpinned`). Extra repo names passed via `--bazel-maven-repo=NAME` are appended by the orchestrator (sibling todo). Deleted exports: `parseMavenRepoCandidates`, `parseVisibleRepoCandidates`, `validateMavenRepo`, `discoverMavenRepos`. Their replacements live in the new primitives above; the orchestrator rewrite that wires them up lands in a follow-up layer. `extract_bazel_to_maven.mts` does not typecheck in this intermediate state — fixed in the orchestrator commit. Tests cover the parser fixture (hub vs generated, separator variants, multi-section reports), the tri-state classifier (every documented input), and the verbose-logging contract for `probeCandidate`. 
--- .../manifest/bazel/bazel-repo-discovery.mts | 455 ++++++------------ .../bazel/bazel-repo-discovery.test.mts | 426 ++++++---------- 2 files changed, 297 insertions(+), 584 deletions(-) diff --git a/src/commands/manifest/bazel/bazel-repo-discovery.mts b/src/commands/manifest/bazel/bazel-repo-discovery.mts index 8d13542a3..539c8e6ef 100644 --- a/src/commands/manifest/bazel/bazel-repo-discovery.mts +++ b/src/commands/manifest/bazel/bazel-repo-discovery.mts @@ -1,345 +1,166 @@ -import { existsSync, readFileSync, readdirSync, statSync } from 'node:fs' -import path from 'node:path' - +/** + * Maven hub repo discovery for `socket manifest bazel`. + * + * - Bzlmod path: `bazel mod show_extension @rules_jvm_external//:extensions.bzl%maven` + * emits a text-format report listing every repo the maven extension generated; + * `parseShowExtensionOutput` extracts the names of hub repos (items annotated + * with `(imported by ...)`) and skips generated per-artifact repos. + * - Legacy WORKSPACE path: probe a fixed list of conventional Maven hub names + * (plus any names the customer passed via `--bazel-maven-repo=`). Each probe + * is classified into `populated` / `empty` / `not-defined`; the orchestrator + * keeps only the `populated` candidates. + * + * No Starlark source is read by this module. All semantic interpretation + * comes from Bazel itself (`mod show_extension`, `cquery`). + */ import { logger } from '@socketsecurity/registry/lib/logger' -import { getErrorCause } from '../../../utils/errors.mts' - -// Maximum size (bytes) we will read for any single Bazel workspace file. -// Prevents DoS via maliciously large MODULE.bazel / WORKSPACE / .bzl files. -const MAX_WORKSPACE_FILE_BYTES = 5 * 1024 * 1024 - -// Maximum candidate count we will return (deduped) before truncating. -// Real repos have <20; this is a hard ceiling against pathological inputs. -const MAX_CANDIDATES = 256 - -// Regex strategy: anchored, bounded character classes, no nested quantifiers. 
-// Match `use_repo(maven, "X", "Y", ...)` with a bounded arg-list window to -// avoid catastrophic backtracking on hostile input. - -// Bzlmod use_repo(maven, "name1", "name2"...). -// Bounded: matches up to ~4KB of arg list to avoid catastrophic backtracking. -const USE_REPO_RE = /use_repo\s*\(\s*maven\s*,([^)]{0,4096})\)/g -const BAZEL_REPO_NAME_PATTERN = '[A-Za-z0-9._+-]{1,129}' -const BAZEL_REPO_NAME_RE = new RegExp(`^${BAZEL_REPO_NAME_PATTERN}$`) -// Quoted-name extractor inside the captured argument blob. -const QUOTED_NAME_RE = new RegExp(`"(${BAZEL_REPO_NAME_PATTERN})"`, 'g') - -// Legacy maven_install(name = "X", ...) on a single statement. -// Match the name= keyword arg specifically; bounded. -const MAVEN_INSTALL_NAME_RE = new RegExp( - `maven_install\\s*\\([^)]{0,8192}?\\bname\\s*=\\s*"(${BAZEL_REPO_NAME_PATTERN})"`, - 'g', -) -const MAVEN_COORDINATES_MARKER_RE = /\bmaven_coordinates\s*=/ - -// Reads file contents, refusing files that exceed MAX_WORKSPACE_FILE_BYTES. -// Returns null when the file is missing, oversized, or unreadable. -function safeReadFile(file: string): string | null { - if (!existsSync(file)) { - return null - } - try { - const stat = statSync(file) - if (stat.size > MAX_WORKSPACE_FILE_BYTES) { - return null - } - return readFileSync(file, 'utf8') - } catch { - return null - } -} - -// Walks workspace root for legacy Starlark sources we can scan: WORKSPACE -// (and WORKSPACE.bazel) plus top-level .bzl files. Non-recursive by design; -// Phase 1 explicitly avoids static Starlark parsing at depth. -function listLegacyStarlarkFiles(cwd: string): string[] { - const files: string[] = [] - const candidates = ['WORKSPACE', 'WORKSPACE.bazel'] - for (const c of candidates) { - const p = path.join(cwd, c) - if (existsSync(p)) { - files.push(p) - } - } - // Top-level .bzl files only. 
- try { - for (const entry of readdirSync(cwd)) { - if (entry.endsWith('.bzl')) { - files.push(path.join(cwd, entry)) - } - } - } catch { - // Ignore unreadable cwd. - } - return files -} - -// Returns deduplicated, sorted list of items, capped at MAX_CANDIDATES. -function uniqueSorted(items: string[]): string[] { - const seen = new Set() - const out: string[] = [] - for (const item of items) { - if (!seen.has(item)) { - seen.add(item) - out.push(item) - if (out.length >= MAX_CANDIDATES) { - break - } - } - } - return out.sort() -} - -function apparentNameFromJsonValue(value: unknown): string | undefined { - if (!value || typeof value !== 'object') { - return undefined - } - const obj = value as Record - const direct = obj['apparentName'] ?? obj['apparent_name'] - if (typeof direct === 'string') { - return direct - } - for (const nested of Object.values(obj)) { - const found = apparentNameFromJsonValue(nested) - if (found) { - return found - } - } - return undefined +export type ProbeResult = { + code: number + stdout: string + stderr: string } -function apparentNamesFromRepoMapping(value: unknown): string[] { - if (!value || typeof value !== 'object' || Array.isArray(value)) { +export type RepoProbe = (repoName: string) => Promise + +export type ProbeStatus = 'populated' | 'empty' | 'not-defined' + +// Conventional Maven hub names rules_jvm_external sets up under +// WORKSPACE-mode invocations. Probing each one is cheap (a failed visibility +// lookup never triggers a `repository_rule` fetch) so the orchestrator can +// try them all without paying the cost of a real cquery on undefined repos. +export const CONVENTIONAL_MAVEN_REPO_NAMES: readonly string[] = [ + 'maven', + 'maven_install', + 'maven_dev', + 'unpinned_maven', + 'maven_unpinned', +] + +// Pattern Bazel emits when a probed repo name isn't visible to the main +// module. Used to distinguish `not-defined` (skip silently) from `empty` +// (the repo exists but has no targets). 
Tolerant of either single- or +// double-quote styles Bazel has used across versions. +const NOT_VISIBLE_STDERR_RE = + /No repository visible as ['"]?@?[A-Za-z0-9._+-]+['"]? from/ +// Other "repo isn't analyzable" patterns Bazel emits, especially under +// WORKSPACE mode and on Bazel 6.x. They all map to `not-defined`. +const NO_SUCH_PACKAGE_STDERR_RE = /no such package ['"`]?@/ +// Pattern emitted when a repo IS visible / defined but yields no targets. +// `--keep_going` plus `'no targets found beneath'` is the empty-but-defined +// signature. The orchestrator treats `empty` and `not-defined` uniformly +// as skips, but the distinction is preserved in the sidecar status report. +const NO_TARGETS_STDERR_RE = /no targets found beneath/i +// Anchor for the maven extension's section header in +// `bazel mod show_extension` output. Tolerant of the canonical-name form +// Bazel uses across versions (`@@rules_jvm_external+`, `@@rules_jvm_external~`, +// or any future separator) and of trailing whitespace. +const SHOW_EXT_SECTION_HEADER_RE = + /^## @@?[A-Za-z0-9._+~-]+\/\/:extensions\.bzl%maven:\s*$/m +// Bullet within `Fetched repositories:` that names a hub repo (one with an +// `(imported by ...)` annotation). Bullets without that annotation are +// generated per-artifact repos and are skipped. +const FETCHED_HUB_BULLET_RE = + /^ {2}- (?\S+) \(imported by (?[^)]+)\)\s*$/ + +// Pure parser for `bazel mod show_extension @rules_jvm_external//:extensions.bzl%maven` +// stdout. Returns the names of hub repos listed under `Fetched repositories:` +// — i.e. items annotated with `(imported by ...)`. Generated per-artifact +// repos (no annotation) are skipped. Output is deduplicated and sorted. +// Tolerant of `DEBUG:` / `WARNING:` lines from Bazel; the section header +// `## @@rules_jvm_external+//:extensions.bzl%maven:` (or another canonical-name variant) is the anchor. 
+export function parseShowExtensionOutput(stdout: string): string[] { + const headerMatch = SHOW_EXT_SECTION_HEADER_RE.exec(stdout) + if (!headerMatch) { return [] } - const candidates: string[] = [] - for (const [name, canonicalName] of Object.entries(value)) { - if (name.startsWith('@') || typeof canonicalName !== 'string') { - continue - } - if (BAZEL_REPO_NAME_RE.test(name)) { - candidates.push(name) - } - } - return candidates -} - -function normalizeRepoName(name: string): string | undefined { - const repo = name.startsWith('@') ? name.slice(1) : name - return BAZEL_REPO_NAME_RE.test(repo) ? repo : undefined -} - -// Parse `bazel mod dump_repo_mapping "" --output=json` output. Also accept the -// older streamed jsonproto shape in case older Bazel versions or fixtures still -// return repository records with apparentName fields. -export function parseVisibleRepoCandidates(output: string): string[] { - const candidates: string[] = [] - for (const line of output.split(/\r?\n/)) { - const trimmed = line.trim() - if (!trimmed) { - continue - } - try { - const parsed = JSON.parse(trimmed) as unknown - candidates.push(...apparentNamesFromRepoMapping(parsed)) - const apparentName = apparentNameFromJsonValue(parsed) - if (apparentName) { - const repo = normalizeRepoName(apparentName) - if (repo) { - candidates.push(repo) - } - } - } catch { - // Ignore malformed lines; caller will fall back to static discovery when - // no usable visible repo names are found. - } - } - return uniqueSorted(candidates) -} - -// Step 1: parse candidate Maven repo names from Bzlmod and legacy entry points. -export function parseMavenRepoCandidates( - cwd: string, - verbose?: boolean, -): string[] { - const candidates: string[] = [] - - // Bzlmod path: parse MODULE.bazel for use_repo(maven, ...). 
- const moduleBazel = path.join(cwd, 'MODULE.bazel') - const moduleContent = safeReadFile(moduleBazel) - if (moduleContent) { - const bzlmodHits: string[] = [] - for (const m of moduleContent.matchAll(USE_REPO_RE)) { - const argBlob = m[1] ?? '' - for (const n of argBlob.matchAll(QUOTED_NAME_RE)) { - bzlmodHits.push(n[1] as string) - } - } - candidates.push(...bzlmodHits) - if (verbose) { - logger.log( - '[VERBOSE] discovery: scanned', - moduleBazel, - `(${bzlmodHits.length} use_repo match(es))`, - ) - } - } else if (verbose) { - logger.log( - '[VERBOSE] discovery:', - moduleBazel, - 'not present (skipping bzlmod scan)', - ) - } - - // Legacy path: scan WORKSPACE + top-level .bzl files for maven_install(name=...). - const legacyFiles = listLegacyStarlarkFiles(cwd) - if (verbose) { - logger.log( - '[VERBOSE] discovery: legacy files considered:', - legacyFiles.length ? legacyFiles : '(none)', - ) + const tail = stdout.slice(headerMatch.index + headerMatch[0].length) + // Find the `Fetched repositories:` line within the section. + const fetchedIdx = tail.indexOf('\nFetched repositories:') + if (fetchedIdx === -1) { + return [] } - for (const file of legacyFiles) { - const content = safeReadFile(file) - if (!content) { + const afterFetched = tail.slice(fetchedIdx + '\nFetched repositories:'.length) + const seen = new Set() + for (const line of afterFetched.split(/\r?\n/)) { + // Stop at the next `## ` section header (some Bazel versions print + // multiple extensions in one report). + if (line.startsWith('## ')) { + break + } + // Empty line is fine; bullet that doesn't match is fine (it's an + // un-imported generated artifact repo) — skip it. 
+ const match = FETCHED_HUB_BULLET_RE.exec(line) + if (!match || !match.groups) { continue } - const fileHits: string[] = [] - for (const m of content.matchAll(MAVEN_INSTALL_NAME_RE)) { - fileHits.push(m[1] as string) - } - candidates.push(...fileHits) - if (verbose) { - logger.log( - '[VERBOSE] discovery: scanned', - file, - `(${fileHits.length} maven_install name match(es))`, - ) + const name = match.groups['name'] + if (name && !seen.has(name)) { + seen.add(name) } } - - const deduped = uniqueSorted(candidates) - if (verbose) { - logger.log('[VERBOSE] discovery: candidate set (pre-seed):', deduped) - } - return deduped + return [...seen].sort() } -export type RepoProbe = ( - repoName: string, -) => Promise<{ stdout: string; code: number }> - -export type ValidationResult = { - valid: boolean - // Probe stdout — populated whenever the probe was reachable, even when - // validation rejects the repo. Empty string when the probe itself threw. - stdout: string +// Classify a raw probe result into one of three states. The probe contract +// is whatever the runner (layer 4) emits — typically a lightweight +// `cquery '@//...' --keep_going --output=label`. Distinguishing +// `empty` from `not-defined` lets the sidecar status report explain to the +// customer why a particular candidate was skipped; the orchestrator itself +// treats both as no-ops. +export function classifyProbeResult(result: ProbeResult): ProbeStatus { + // A successful probe with any stdout means the repo exists AND has at + // least one target — populated. + if (result.code === 0 && result.stdout.trim().length > 0) { + return 'populated' + } + // Code 1 with the "no repository visible" message → undefined. + if ( + result.code !== 0 && + (NOT_VISIBLE_STDERR_RE.test(result.stderr) || + NO_SUCH_PACKAGE_STDERR_RE.test(result.stderr)) + ) { + return 'not-defined' + } + // Code 1 with the "no targets" message → defined but empty. 
+ if (result.code !== 0 && NO_TARGETS_STDERR_RE.test(result.stderr)) { + return 'empty' + } + // Code 0 with empty stdout: WORKSPACE-mode probes do this when the repo + // name isn't declared (Exp 5c). Treat as not-defined. + if (result.code === 0) { + return 'not-defined' + } + // Code 1 with no recognizable message: be conservative and call it + // not-defined so the orchestrator skips it without erroring the workspace. + return 'not-defined' } -// Step 2: validate a candidate by running the probe and confirming -// `maven_coordinates=` appears in stdout (the marker emitted by jvm_import / -// aar_import rules generated by rules_jvm_external). Returns the probe -// stdout alongside the verdict so the caller can cache it and reuse it -// instead of running an identical extraction query. -export async function validateMavenRepo( +// Convenience: probe a single candidate and return its classified status, +// with optional verbose logging. Pure orchestration around `probe` + +// `classifyProbeResult`; isolated so the test suite can exercise the +// logging contract independently of the runner implementation. +export async function probeCandidate( repoName: string, probe: RepoProbe, verbose?: boolean, -): Promise { +): Promise { + let result: ProbeResult try { - const result = await probe(repoName) - if (result.code !== 0) { - if (verbose) { - logger.log( - `[VERBOSE] discovery: probe @${repoName}: REJECT (code=${result.code})`, - ) - } - return { valid: false, stdout: result.stdout } - } - const valid = MAVEN_COORDINATES_MARKER_RE.test(result.stdout) - if (verbose) { - logger.log( - `[VERBOSE] discovery: probe @${repoName}:`, - valid - ? 
'ACCEPT (maven_coordinates marker found)' - : 'REJECT (no maven_coordinates marker in probe stdout)', - ) - } - return { valid, stdout: result.stdout } + result = await probe(repoName) } catch (e) { if (verbose) { logger.log( - `[VERBOSE] discovery: probe @${repoName}: REJECT (probe threw):`, - getErrorCause(e), + `[VERBOSE] discovery: probe @${repoName}: not-defined (probe threw: ${ + e instanceof Error ? e.message : String(e) + })`, ) } - return { valid: false, stdout: '' } - } -} - -// The default maven_install repo name when no explicit `name=` is given. -// Included as a seed so repos that define maven_install in a subdirectory -// .bzl file (not scanned by parseMavenRepoCandidates) are still discovered. -const DEFAULT_MAVEN_REPO_SEED = 'maven' - -// Composition: parse, then validate each candidate; return validated subset -// as a Map keyed by repo name with the validated probe stdout as value. -// Map iteration order matches insertion order, so callers that just want -// the list of repo names can call `Array.from(repos.keys())`. Callers that -// want to skip re-running the same `bazel query` during extraction can read -// the cached stdout off the Map and parse it directly. -// -// Always seeds with the default `@maven` repo name so repos whose -// maven_install is defined in a sub-directory .bzl file (not reachable by -// the top-level static scan) can still be discovered via probe validation. -export async function discoverMavenRepos( - cwd: string, - probe: RepoProbe, - nativeCandidates?: string[], - verbose?: boolean, -): Promise> { - const parsed = - nativeCandidates && nativeCandidates.length - ? nativeCandidates - : parseMavenRepoCandidates(cwd, verbose) - if (verbose) { - logger.log( - '[VERBOSE] discovery: candidate source:', - nativeCandidates && nativeCandidates.length - ? 
`bzlmod visible-repos (${nativeCandidates.length})` - : `static parse (${parsed.length})`, - ) - } - // Seed with the default repo name first (so it appears first in output if - // validated). Dedup via Set before validation. - const seen = new Set([DEFAULT_MAVEN_REPO_SEED]) - const candidates: string[] = [DEFAULT_MAVEN_REPO_SEED] - for (const c of parsed) { - if (!seen.has(c)) { - seen.add(c) - candidates.push(c) - } - } - if (verbose) { - logger.log( - '[VERBOSE] discovery: candidate set to probe (seed-first, deduped):', - candidates, - ) - } - const validated = new Map() - for (const c of candidates) { - // eslint-disable-next-line no-await-in-loop - const result = await validateMavenRepo(c, probe, verbose) - if (result.valid) { - validated.set(c, result.stdout) - } + return 'not-defined' } + const status = classifyProbeResult(result) if (verbose) { - logger.log( - '[VERBOSE] discovery: validated repos:', - Array.from(validated.keys()), - ) + logger.log(`[VERBOSE] discovery: probe @${repoName}: ${status}`) } - return validated + return status } diff --git a/src/commands/manifest/bazel/bazel-repo-discovery.test.mts b/src/commands/manifest/bazel/bazel-repo-discovery.test.mts index 5755388df..8a7845fd6 100644 --- a/src/commands/manifest/bazel/bazel-repo-discovery.test.mts +++ b/src/commands/manifest/bazel/bazel-repo-discovery.test.mts @@ -1,247 +1,188 @@ -import { mkdtempSync, rmSync, writeFileSync } from 'node:fs' -import os from 'node:os' -import path from 'node:path' -import { fileURLToPath } from 'node:url' - import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' import { logger } from '@socketsecurity/registry/lib/logger' import { - discoverMavenRepos, - parseMavenRepoCandidates, - parseVisibleRepoCandidates, - validateMavenRepo, + CONVENTIONAL_MAVEN_REPO_NAMES, + classifyProbeResult, + parseShowExtensionOutput, + probeCandidate, } from './bazel-repo-discovery.mts' -import type { RepoProbe } from './bazel-repo-discovery.mts' - -const 
__filename = fileURLToPath(import.meta.url) -const __dirname = path.dirname(__filename) - -// from src/commands/manifest/bazel/ to repo root is four levels up, then into -// test/fixtures/manifest-bazel. -const FIXTURES = path.join( - __dirname, - '..', - '..', - '..', - '..', - 'test', - 'fixtures', - 'manifest-bazel', -) - -const acceptingProbe: RepoProbe = async () => ({ - stdout: - 'jvm_import(\n name = "guava",\n maven_coordinates = "com.google.guava:guava:33.0.0-jre",\n)', - code: 0, -}) +import type { + ProbeResult, + ProbeStatus, + RepoProbe, +} from './bazel-repo-discovery.mts' -const compactAcceptingProbe: RepoProbe = async () => ({ - stdout: - 'jvm_import(\n name = "guava",\n maven_coordinates="com.google.guava:guava:33.0.0-jre",\n)', +// Truncated text-format report Bazel 8.4.2 emits on tink-java for +// `bazel mod show_extension @rules_jvm_external//:extensions.bzl%maven`. +// The headline shape: a `## @@//:extensions.bzl%maven:` header, +// blank line, then `Fetched repositories:` and a bullet list. Hub repos +// carry `(imported by ...)`; generated artifact repos don't. 
+const TINK_SHOW_EXTENSION_FIXTURE = `DEBUG: irrelevant noise +WARNING: also irrelevant + +## @@rules_jvm_external+//:extensions.bzl%maven: + +Fetched repositories: + - android_ide_common_30_1_3 (imported by rules_android@0.6.6) + - maven (imported by , bazel_worker_java@0.0.4, protobuf@32.1) + - rules_android_maven (imported by rules_android@0.6.6) + - rules_jvm_external_deps (imported by rules_jvm_external@6.7) + - stardoc_maven (imported by stardoc@0.7.2) + - unpinned_rules_jvm_external_deps (imported by rules_jvm_external@6.7) + - aopalliance_aopalliance_1_0 + - aopalliance_aopalliance_jar_sources_1_0 + - androidx_annotation_annotation +` + +const probeResult = (over: Partial = {}): ProbeResult => ({ code: 0, + stdout: '', + stderr: '', + ...over, }) -const rejectingProbe: RepoProbe = async () => ({ stdout: '', code: 0 }) - -const failingProbe: RepoProbe = async () => ({ stdout: '', code: 1 }) - -const throwingProbe: RepoProbe = async () => { - throw new Error('bazel exploded') -} - -const selectiveProbe: RepoProbe = async name => - name === 'maven' - ? { stdout: 'maven_coordinates=foo', code: 0 } - : { stdout: '', code: 0 } - describe('bazel-repo-discovery', () => { - describe('parseMavenRepoCandidates', () => { - it('parses single use_repo from bzlmod-only', () => { - expect( - parseMavenRepoCandidates(path.join(FIXTURES, 'bzlmod-only')), - ).toEqual(['maven']) - }) - - it('parses multiple names from multi-repo-bzlmod', () => { - expect( - parseMavenRepoCandidates( - path.join(FIXTURES, 'multi-repo-bzlmod'), - ).sort(), - ).toEqual(['maven', 'maven_test'].sort()) + describe('parseShowExtensionOutput', () => { + it('extracts hub repos with (imported by ...) annotations only', () => { + // The 6 hub repos in the fixture are the ones with annotations; + // generated per-artifact repos (no annotation) are skipped. 
+ expect(parseShowExtensionOutput(TINK_SHOW_EXTENSION_FIXTURE)).toEqual([ + 'android_ide_common_30_1_3', + 'maven', + 'rules_android_maven', + 'rules_jvm_external_deps', + 'stardoc_maven', + 'unpinned_rules_jvm_external_deps', + ]) }) - it('recovers custom name from custom-name-bzlmod', () => { + it('returns [] when the maven section is missing', () => { expect( - parseMavenRepoCandidates(path.join(FIXTURES, 'custom-name-bzlmod')), - ).toEqual(['maven_rules_kotlin_example']) + parseShowExtensionOutput( + 'DEBUG: noise\n\n## @@other//:extensions.bzl%other:\n\nFetched repositories:\n - foo (imported by )\n', + ), + ).toEqual([]) }) - it('parses maven_install name from legacy WORKSPACE', () => { + it('returns [] when Fetched repositories: is absent', () => { expect( - parseMavenRepoCandidates(path.join(FIXTURES, 'legacy-only')), - ).toEqual(['maven']) + parseShowExtensionOutput( + '## @@rules_jvm_external+//:extensions.bzl%maven:\n\nOther stuff\n', + ), + ).toEqual([]) }) - it('parses maven_install name from sibling .bzl file (legacy-with-load)', () => { - expect( - parseMavenRepoCandidates(path.join(FIXTURES, 'legacy-with-load')), - ).toEqual(['maven_legacy_app']) + it('stops at the next section header (multiple extensions in one report)', () => { + const input = + '## @@rules_jvm_external+//:extensions.bzl%maven:\n\nFetched repositories:\n - maven (imported by )\n - other (imported by foo)\n\n## @@rules_python+//:extensions.bzl%pip:\n\nFetched repositories:\n - pypi (imported by )\n' + expect(parseShowExtensionOutput(input)).toEqual(['maven', 'other']) }) - it('parses repo names containing hyphens and dots from static sources', () => { - const dir = mkdtempSync(path.join(os.tmpdir(), 'bazel-repos-')) - try { - writeFileSync( - path.join(dir, 'MODULE.bazel'), - 'use_repo(maven, "maven-prod", "third.party.maven")\n', - ) - writeFileSync( - path.join(dir, 'WORKSPACE'), - 'maven_install(name = "legacy-maven.prod", artifacts = [])\n', - ) - - 
expect(parseMavenRepoCandidates(dir)).toEqual([ - 'legacy-maven.prod', - 'maven-prod', - 'third.party.maven', - ]) - } finally { - rmSync(dir, { recursive: true, force: true }) + it('tolerates canonical-name separator variants (~ and +)', () => { + for (const sep of ['+', '~']) { + const input = `## @@rules_jvm_external${sep}//:extensions.bzl%maven:\n\nFetched repositories:\n - maven (imported by )\n` + expect(parseShowExtensionOutput(input)).toEqual(['maven']) } }) - it('returns empty array on a directory without bazel markers', () => { - // Use the fixtures root itself: no MODULE.bazel/WORKSPACE there. - expect(parseMavenRepoCandidates(FIXTURES)).toEqual([]) + it('deduplicates if the same hub appears twice (defensive)', () => { + const input = + '## @@rules_jvm_external+//:extensions.bzl%maven:\n\nFetched repositories:\n - maven (imported by )\n - maven (imported by foo)\n' + expect(parseShowExtensionOutput(input)).toEqual(['maven']) }) }) - describe('parseVisibleRepoCandidates', () => { - it('parses apparent repo names from dump_repo_mapping JSON output', () => { - const output = JSON.stringify({ - '': '', - '@invalid': 'canonical-invalid', - bazel_tools: 'bazel_tools', - maven: 'rules_jvm_external~~maven~maven', - 'maven-prod': 'rules_jvm_external~~maven~prod', - pypi: 'rules_python~~pip~pypi', - 'third.party.maven': 'rules_jvm_external~~maven~third_party', - }) - - expect(parseVisibleRepoCandidates(output)).toEqual([ - 'bazel_tools', - 'maven', - 'maven-prod', - 'pypi', - 'third.party.maven', - ]) - }) - - it('parses apparent repo names from streamed jsonproto output', () => { - const output = [ - JSON.stringify({ - repository: { - apparentName: '@maven', - canonicalName: 'rules_jvm_external~maven~maven', - }, - }), - JSON.stringify({ - repository: { - apparent_name: 'maven_rules_kotlin_example', - canonical_name: 'rules_jvm_external~maven~custom', - }, - }), - JSON.stringify({ - repository: { - apparentName: '@maven-prod', - canonicalName: 
'rules_jvm_external~maven~prod', - }, - }), - JSON.stringify({ - repository: { - apparentName: 'third.party.maven', - canonicalName: 'rules_jvm_external~maven~third_party', - }, - }), - 'not json', - ].join('\n') - - expect(parseVisibleRepoCandidates(output)).toEqual([ - 'maven', - 'maven-prod', - 'maven_rules_kotlin_example', - 'third.party.maven', - ]) + describe('classifyProbeResult', () => { + it('classifies code=0 + non-empty stdout as populated', () => { + expect( + classifyProbeResult( + probeResult({ code: 0, stdout: '@maven//:guava\n' }), + ), + ).toBe('populated') }) - }) - describe('validateMavenRepo', () => { - it('accepts when probe stdout contains spaced maven_coordinates output', async () => { - const r = await validateMavenRepo('maven', acceptingProbe) - expect(r.valid).toBe(true) - expect(r.stdout).toContain('maven_coordinates') + it('classifies code=1 + "No repository visible" stderr as not-defined', () => { + expect( + classifyProbeResult( + probeResult({ + code: 1, + stderr: + "ERROR: No repository visible as '@nonexistent_repo_xyz' from main repository\n", + }), + ), + ).toBe('not-defined') }) - it('accepts when probe stdout contains compact maven_coordinates output', async () => { - const r = await validateMavenRepo('maven', compactAcceptingProbe) - expect(r.valid).toBe(true) - expect(r.stdout).toContain('maven_coordinates') + it('classifies code=1 + "no targets found beneath" stderr as empty', () => { + expect( + classifyProbeResult( + probeResult({ + code: 1, + stderr: + "WARNING: Evaluation of query \"@maven_install//...\" failed: no targets found beneath ''\n", + }), + ), + ).toBe('empty') }) - it('rejects when probe stdout lacks maven_coordinates=', async () => { - expect((await validateMavenRepo('not_maven', rejectingProbe)).valid).toBe( - false, - ) + it('classifies code=0 + empty stdout (WORKSPACE-mode silent miss) as not-defined', () => { + expect( + classifyProbeResult(probeResult({ code: 0, stdout: '' })), + ).toBe('not-defined') }) 
- it('rejects on non-zero exit code', async () => { + it('classifies code=1 + unrecognized stderr conservatively as not-defined', () => { expect( - (await validateMavenRepo('also_not_maven', failingProbe)).valid, - ).toBe(false) + classifyProbeResult( + probeResult({ code: 1, stderr: 'some other failure\n' }), + ), + ).toBe('not-defined') }) - it('rejects when probe throws', async () => { - expect((await validateMavenRepo('crash', throwingProbe)).valid).toBe( - false, - ) + it('classifies code=1 + "no such package" stderr as not-defined', () => { + expect( + classifyProbeResult( + probeResult({ + code: 1, + stderr: "ERROR: no such package '@unknown_repo//'\n", + }), + ), + ).toBe('not-defined') }) }) - describe('discoverMavenRepos', () => { - it('returns parsed candidates that the probe validates, with cached probe stdout', async () => { - // multi-repo-bzlmod parses to ['maven', 'maven_test']; the accepting probe - // validates both. The returned Map carries the probe stdout for each. - const result = await discoverMavenRepos( - path.join(FIXTURES, 'multi-repo-bzlmod'), - acceptingProbe, - ) - expect(Array.from(result.keys()).sort()).toEqual( - ['maven', 'maven_test'].sort(), - ) - for (const stdout of result.values()) { - expect(stdout).toContain('maven_coordinates') - } + describe('probeCandidate', () => { + it('returns the classified status from a probe', async () => { + const probe: RepoProbe = async () => ({ + code: 0, + stdout: '@maven//:guava\n', + stderr: '', + }) + expect(await probeCandidate('maven', probe)).toBe('populated') }) - it('uses native visible repo candidates instead of static parsing when provided', async () => { - const result = await discoverMavenRepos( - path.join(FIXTURES, 'multi-repo-bzlmod'), - acceptingProbe, - ['native_maven'], + it('returns not-defined when the probe throws', async () => { + const probe: RepoProbe = async () => { + throw new Error('bazel exploded') + } + expect(await probeCandidate('crash', probe)).toBe( + 
'not-defined', ) - expect(Array.from(result.keys())).toEqual(['maven', 'native_maven']) }) + }) - it('filters out candidates the probe rejects', async () => { - // Probe accepts only when repo name === 'maven'; rejects 'maven_test'. - const result = await discoverMavenRepos( - path.join(FIXTURES, 'multi-repo-bzlmod'), - selectiveProbe, - ) - expect(Array.from(result.keys())).toEqual(['maven']) + describe('CONVENTIONAL_MAVEN_REPO_NAMES', () => { + it('includes the documented set', () => { + expect(CONVENTIONAL_MAVEN_REPO_NAMES).toEqual([ + 'maven', + 'maven_install', + 'maven_dev', + 'unpinned_maven', + 'maven_unpinned', + ]) }) }) @@ -262,83 +203,34 @@ describe('bazel-repo-discovery', () => { .join('\n') } - it('parseMavenRepoCandidates stays silent when verbose is unset', () => { - parseMavenRepoCandidates(path.join(FIXTURES, 'multi-repo-bzlmod')) + it('probeCandidate stays silent without verbose', async () => { + const probe: RepoProbe = async () => ({ + code: 0, + stdout: '@maven//:x\n', + stderr: '', + }) + await probeCandidate('maven', probe) expect(logSpy).not.toHaveBeenCalled() }) - it('parseMavenRepoCandidates emits scanned-files + candidate set when verbose=true', () => { - parseMavenRepoCandidates(path.join(FIXTURES, 'multi-repo-bzlmod'), true) - const text = loggedLines() - expect(text).toContain('discovery: scanned') - expect(text).toContain('MODULE.bazel') - expect(text).toContain('use_repo match') - expect(text).toContain('candidate set (pre-seed)') + it('probeCandidate logs the status under verbose', async () => { + const probe: RepoProbe = async () => ({ + code: 0, + stdout: '@maven//:x\n', + stderr: '', + }) + await probeCandidate('maven', probe, true) + expect(loggedLines()).toMatch(/probe @maven:\s*populated/) }) - it('validateMavenRepo logs ACCEPT under verbose', async () => { - await validateMavenRepo('maven', acceptingProbe, true) + it('probeCandidate logs the throw reason under verbose', async () => { + const probe: RepoProbe = async () => { 
+ throw new Error('bazel exploded') + } + await probeCandidate('crash', probe, true) expect(loggedLines()).toMatch( - /probe @maven:\s*ACCEPT \(maven_coordinates marker found\)/, - ) - }) - - it('validateMavenRepo logs REJECT (no marker) under verbose', async () => { - await validateMavenRepo('not_maven', rejectingProbe, true) - expect(loggedLines()).toMatch(/probe @not_maven:\s*REJECT/) - }) - - it('validateMavenRepo logs REJECT (probe threw) under verbose', async () => { - await validateMavenRepo('crash', throwingProbe, true) - expect(loggedLines()).toMatch(/probe @crash:\s*REJECT \(probe threw\)/) - }) - - it('discoverMavenRepos propagates verbose into the full pipeline', async () => { - await discoverMavenRepos( - path.join(FIXTURES, 'multi-repo-bzlmod'), - selectiveProbe, - undefined, - true, + /probe @crash:\s*not-defined \(probe threw: bazel exploded\)/, ) - const text = loggedLines() - // Candidate-source label. - expect(text).toContain('candidate source: static parse') - // Seeded-and-deduped candidate set log. - expect(text).toContain('candidate set to probe') - // Per-candidate probe verdicts. - expect(text).toMatch(/probe @maven:\s*ACCEPT/) - expect(text).toMatch(/probe @maven_test:\s*REJECT/) - // Final validated set. - expect(text).toContain('validated repos') - }) - }) - - describe('DoS guard', () => { - it('completes parse on 1MB pathological input within 1s', () => { - // Synthesize a 1MB Bzlmod-shaped file in a tmp dir and feed it through - // parseMavenRepoCandidates. Exercises the bounded USE_REPO_RE + - // QUOTED_NAME_RE windows. - const dir = mkdtempSync(path.join(os.tmpdir(), 'bazel-discover-')) - try { - // Build the fixture content in a single pass (avoid O(n^2) join-in-loop). - const lines: string[] = [] - let totalLen = 0 - while (totalLen < 1_000_000) { - const line = 'use_repo(maven, "x_' + lines.length + '")' - lines.push(line) - // Plus 1 for the eventual newline separator. 
- totalLen += line.length + 1 - } - writeFileSync(path.join(dir, 'MODULE.bazel'), lines.join('\n')) - const start = process.hrtime.bigint() - const result = parseMavenRepoCandidates(dir) - const elapsed = process.hrtime.bigint() - start - expect(elapsed).toBeLessThan(1_000_000_000n) - // Verify the cap kicks in (length is bounded by MAX_CANDIDATES). - expect(result.length).toBeLessThanOrEqual(256) - } finally { - rmSync(dir, { recursive: true, force: true }) - } }) }) }) From 887f86a205b07d07817257c9e8f57a20b72fcc5f Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 14:11:29 +0200 Subject: [PATCH 05/10] refactor(manifest/bazel): switch Maven path to show_extension + tri-state probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bazel-query-runner now centralises startup-flag construction so every spawn — query, cquery, mod show_extension, mod dump_repo_mapping — threads `--bazel-rc`, `--output_user_root`, and `--output_base` consistently. The new optional `outputUserRoot` field on `BazelQueryOptions` is the Maven path's hook for per-invocation server isolation; the orchestrator (next commit) mkdtemp's a fresh path and will reap the server via `bazel shutdown` + `rm -rf` on success and on timeout, so timed-out servers no longer leak across CLI invocations. Add `runBazelModShowMavenExtension`: invokes bazel mod show_extension @rules_jvm_external//:extensions.bzl%maven to enumerate Maven hubs directly from the rules_jvm_external extension report, replacing the over-enumerating `dump_repo_mapping` surface on the Maven path. `runBazelModShowVisibleRepos` is kept around for the legacy PyPI extractor, which has not been rescoped yet. Replace the Maven-side `buildProbeFor` (which emitted a kind-only `kind("jvm_import rule|aar_import rule", @repo//:*)` query) with `buildMavenProbeFor`, a lightweight `cquery '@//... 
--output=label --keep_going'` presence check whose result feeds the new tri-state classifier in bazel-repo-discovery. Kind-only filtering missed POM-only / native / AAR-without-aar_import artefacts and any future rules_jvm_external rule shape; the metadata filter is now applied by the per-repo extraction cquery (next layer), not by the probe. Update `buildPypiProbeFor`'s return shape to include stderr so it satisfies the new `RepoProbe` type contract. Move `parseVisibleRepoCandidates` and the `ValidationResult` type into bazel-pypi-discovery (their only remaining consumer); the Maven module no longer carries dump_repo_mapping-shaped code. Tests cover the new argv shapes for every spawn surface, the outputUserRoot startup-flag placement (before subcommand), the Maven probe argv (cquery + @repo//... + --output=label + --keep_going), and the full result-triple propagation (code/stdout/stderr) that the tri-state classifier needs. --- .../manifest/bazel/bazel-pypi-discovery.mts | 90 +++++++- .../manifest/bazel/bazel-query-runner.mts | 196 ++++++++++++------ .../bazel/bazel-query-runner.test.mts | 128 ++++++++++-- .../manifest/bazel/extract_bazel_to_pypi.mts | 2 +- 4 files changed, 332 insertions(+), 84 deletions(-) diff --git a/src/commands/manifest/bazel/bazel-pypi-discovery.mts b/src/commands/manifest/bazel/bazel-pypi-discovery.mts index e92561cef..3b62610e8 100644 --- a/src/commands/manifest/bazel/bazel-pypi-discovery.mts +++ b/src/commands/manifest/bazel/bazel-pypi-discovery.mts @@ -5,7 +5,95 @@ import { logger } from '@socketsecurity/registry/lib/logger' import { getErrorCause } from '../../../utils/errors.mts' -import type { RepoProbe, ValidationResult } from './bazel-repo-discovery.mts' +import type { RepoProbe } from './bazel-repo-discovery.mts' + +// Result shape returned by `validatePypiHub`. Kept local to the PyPI module +// since validation here is hub-alias-marker based (different from the +// Maven-side tri-state classifier). 
+export type ValidationResult = { + valid: boolean + // Probe stdout — populated whenever the probe was reachable, even when + // validation rejects the hub. Empty string when the probe itself threw. + stdout: string +} + +// PyPI-only repo-name predicate (Bazel apparent-name grammar). +const PYPI_REPO_NAME_PATTERN = '[A-Za-z0-9._+-]{1,129}' +const PYPI_REPO_NAME_RE = new RegExp(`^${PYPI_REPO_NAME_PATTERN}$`) + +function pypiApparentNameFromJsonValue(value: unknown): string | undefined { + if (!value || typeof value !== 'object') { + return undefined + } + const obj = value as Record + const direct = obj['apparentName'] ?? obj['apparent_name'] + if (typeof direct === 'string') { + return direct + } + for (const nested of Object.values(obj)) { + const found = pypiApparentNameFromJsonValue(nested) + if (found) { + return found + } + } + return undefined +} + +function pypiApparentNamesFromRepoMapping(value: unknown): string[] { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return [] + } + const candidates: string[] = [] + for (const [name, canonicalName] of Object.entries(value)) { + if (name.startsWith('@') || typeof canonicalName !== 'string') { + continue + } + if (PYPI_REPO_NAME_RE.test(name)) { + candidates.push(name) + } + } + return candidates +} + +function pypiNormalizeRepoName(name: string): string | undefined { + const repo = name.startsWith('@') ? name.slice(1) : name + return PYPI_REPO_NAME_RE.test(repo) ? repo : undefined +} + +// Parse `bazel mod dump_repo_mapping "" --output=json` output. Also accepts +// the older streamed jsonproto shape (apparentName / apparent_name records). +// PyPI-only; the Maven path consumes `bazel mod show_extension` instead. 
+export function parseVisibleRepoCandidates(output: string): string[] { + const seen = new Set() + const candidates: string[] = [] + for (const line of output.split(/\r?\n/)) { + const trimmed = line.trim() + if (!trimmed) { + continue + } + try { + const parsed = JSON.parse(trimmed) as unknown + for (const c of pypiApparentNamesFromRepoMapping(parsed)) { + if (!seen.has(c)) { + seen.add(c) + candidates.push(c) + } + } + const apparentName = pypiApparentNameFromJsonValue(parsed) + if (apparentName) { + const repo = pypiNormalizeRepoName(apparentName) + if (repo && !seen.has(repo)) { + seen.add(repo) + candidates.push(repo) + } + } + } catch { + // Skip malformed lines; caller falls back to static discovery when no + // usable visible repo names are found. + } + } + return candidates.sort() +} // Maximum size (bytes) we will read for any single Bazel workspace file. // Prevents DoS via maliciously large MODULE.bazel / WORKSPACE / .bzl files. diff --git a/src/commands/manifest/bazel/bazel-query-runner.mts b/src/commands/manifest/bazel/bazel-query-runner.mts index 34300d487..2d6361de3 100644 --- a/src/commands/manifest/bazel/bazel-query-runner.mts +++ b/src/commands/manifest/bazel/bazel-query-runner.mts @@ -12,6 +12,13 @@ export type BazelQueryOptions = { bazelRc?: string bazelFlags?: string bazelOutputBase?: string + // Per-invocation `--output_user_root` for server isolation. When set, all + // argv builders inject it as a startup flag so a timed-out Bazel server + // can be reaped via `bazel --output_user_root= shutdown` + `rm -rf` + // without disturbing the user's shared output_user_root. The Maven + // orchestrator mkdtemp's a fresh path per invocation; the legacy PyPI + // path may leave it unset for now. 
+ outputUserRoot?: string env?: NodeJS.ProcessEnv verbose?: boolean } @@ -39,17 +46,28 @@ export function splitBazelFlags(flags: string | undefined): string[] { return flags.split(/\s+/).filter(Boolean) } -function buildBazelModShowVisibleReposArgv(opts: BazelQueryOptions): string[] { +// Build the shared startup-flag prefix for any bazel invocation. Centralised +// so `--output_user_root` propagates to every spawn — principle 7 of the +// Maven design requires per-invocation server isolation across query, +// cquery, and `bazel mod` commands alike. +function buildStartupFlags(opts: BazelQueryOptions): string[] { const startup: string[] = [] if (opts.bazelRc) { startup.push(`--bazelrc=${opts.bazelRc}`) } + if (opts.outputUserRoot) { + startup.push(`--output_user_root=${opts.outputUserRoot}`) + } if (opts.bazelOutputBase) { startup.push(`--output_base=${opts.bazelOutputBase}`) } + return startup +} + +function buildBazelModShowVisibleReposArgv(opts: BazelQueryOptions): string[] { const userFlags = splitBazelFlags(opts.bazelFlags) return [ - ...startup, + ...buildStartupFlags(opts), 'mod', 'dump_repo_mapping', '', @@ -58,17 +76,23 @@ function buildBazelModShowVisibleReposArgv(opts: BazelQueryOptions): string[] { ] } +function buildBazelModShowMavenExtensionArgv( + opts: BazelQueryOptions, +): string[] { + const userFlags = splitBazelFlags(opts.bazelFlags) + return [ + ...buildStartupFlags(opts), + 'mod', + 'show_extension', + '@rules_jvm_external//:extensions.bzl%maven', + ...userFlags, + ] +} + function buildBazelModShowPipExtensionArgv(opts: BazelQueryOptions): string[] { - const startup: string[] = [] - if (opts.bazelRc) { - startup.push(`--bazelrc=${opts.bazelRc}`) - } - if (opts.bazelOutputBase) { - startup.push(`--output_base=${opts.bazelOutputBase}`) - } const userFlags = splitBazelFlags(opts.bazelFlags) return [ - ...startup, + ...buildStartupFlags(opts), 'mod', 'show_extension', '@rules_python//python/extensions:pip.bzl%pip', @@ -84,18 +108,11 @@ function 
buildBazelArgv( ): string[] { // Startup flags MUST precede the `query` subcommand. // Bazel argv shape: query --output= - const startup: string[] = [] - if (opts.bazelRc) { - startup.push(`--bazelrc=${opts.bazelRc}`) - } - if (opts.bazelOutputBase) { - startup.push(`--output_base=${opts.bazelOutputBase}`) - } // Keep query output stable and avoid updating Bazel lockfiles while extracting. const queryFlags = ['--lockfile_mode=off', '--noshow_progress'] const userFlags = splitBazelFlags(opts.bazelFlags) return [ - ...startup, + ...buildStartupFlags(opts), 'query', ...queryFlags, ...opts.invocationFlags, @@ -105,6 +122,29 @@ function buildBazelArgv( ] } +// Lightweight presence-check cquery used by the tri-state probe classifier. +// `--keep_going --output=label` keeps it fast even on partial-analysis +// repos and avoids paying for `--output=jsonproto` plus +// `--proto:output_rule_attrs` (which the heavier metadata extraction in +// `bazel-cquery.mts` needs but the probe does not). +function buildBazelProbeCqueryArgv( + repoName: string, + opts: BazelQueryOptions, +): string[] { + const userFlags = splitBazelFlags(opts.bazelFlags) + return [ + ...buildStartupFlags(opts), + 'cquery', + '--lockfile_mode=off', + '--noshow_progress', + ...opts.invocationFlags, + `@${repoName}//...`, + '--output=label', + '--keep_going', + ...userFlags, + ] +} + function stringField(value: unknown): string { return typeof value === 'string' ? value : '' } @@ -229,15 +269,11 @@ export async function runBazelQuery( } } -/** - * Bzlmod-native visible repository enumeration. This is only a candidate - * source; callers must still validate each returned apparent repo name with a - * semantic query for generated ecosystem rules. 
- */ -export async function runBazelModShowVisibleRepos( +async function runBazelOneShot( + argv: string[], opts: BazelQueryOptions, + step: string, ): Promise { - const argv = buildBazelModShowVisibleReposArgv(opts) if (opts.verbose) { logger.log('[VERBOSE] Executing:', opts.bin, ', args:', argv) } @@ -259,70 +295,100 @@ export async function runBazelModShowVisibleRepos( durationMs: Date.now() - startedAt, opts, result, - step: 'bazel mod dump_repo_mapping', + step, }) return result } /** - * Bzlmod-native rules_python pip extension usage inspection. This is the - * authoritative source for root-module pip.parse metadata when Bazel supports - * the command; callers keep bounded static parsing as fallback. + * Bzlmod-native visible repository enumeration. NOTE: only consumed by the + * legacy PyPI path; the Maven path uses `runBazelModShowMavenExtension` + * instead because `dump_repo_mapping` over-enumerates apparent names that + * are not Maven hubs. + */ +export async function runBazelModShowVisibleRepos( + opts: BazelQueryOptions, +): Promise { + return runBazelOneShot( + buildBazelModShowVisibleReposArgv(opts), + opts, + 'bazel mod dump_repo_mapping', + ) +} + +/** + * Bzlmod-native Maven hub enumeration via the rules_jvm_external maven + * extension. The text-format report lists every repo the extension + * generated; `parseShowExtensionOutput` (bazel-repo-discovery.mts) + * extracts the hubs from the `Fetched repositories:` section. + */ +export async function runBazelModShowMavenExtension( + opts: BazelQueryOptions, +): Promise { + return runBazelOneShot( + buildBazelModShowMavenExtensionArgv(opts), + opts, + 'bazel mod show_extension rules_jvm_external maven', + ) +} + +/** + * Bzlmod-native rules_python pip extension usage inspection. Used by the + * PyPI path; kept here since the argv shape is identical to the maven + * variant modulo the extension target. 
*/ export async function runBazelModShowPipExtension( opts: BazelQueryOptions, ): Promise { - const argv = buildBazelModShowPipExtensionArgv(opts) - if (opts.verbose) { - logger.log('[VERBOSE] Executing:', opts.bin, ', args:', argv) - } - const startedAt = Date.now() - let result: BazelQueryResult - try { - const output = await spawn(opts.bin, argv, { - cwd: opts.cwd, - timeout: BAZEL_QUERY_TIMEOUT_MS, - ...(opts.env ? { env: opts.env } : {}), - }) - const { code, stderr, stdout } = output - result = { code, stdout, stderr } - } catch (e) { - result = normalizeSpawnError(e) - } - logBazelTrace({ - argv, - durationMs: Date.now() - startedAt, + return runBazelOneShot( + buildBazelModShowPipExtensionArgv(opts), opts, - result, - step: 'bazel mod show_extension rules_python pip', - }) - return result + 'bazel mod show_extension rules_python pip', + ) } /** - * Build a `RepoProbe` (compatible with bazel-repo-discovery) bound to opts. - * Used by `discoverMavenRepos` to validate candidate Maven repo - * names against the running workspace. + * Build a `RepoProbe` (compatible with bazel-repo-discovery's tri-state + * classifier) bound to opts. Runs the lightweight presence-check cquery + * `@//... --output=label --keep_going` — cheap enough to attempt + * every conventional Maven hub name without triggering `repository_rule` + * fetches on undefined names (Exp 3). 
*/ -export function buildProbeFor(opts: BazelQueryOptions): RepoProbe { +export function buildMavenProbeFor(opts: BazelQueryOptions): RepoProbe { return async (repoName: string) => { - const queryStr = `kind("jvm_import rule|aar_import rule", @${repoName}//:*)` - const result = await runBazelQuery(queryStr, opts) - return { stdout: result.stdout, code: result.code } + const argv = buildBazelProbeCqueryArgv(repoName, opts) + const result = await runBazelOneShot( + argv, + opts, + `bazel cquery probe @${repoName}`, + ) + return { code: result.code, stdout: result.stdout, stderr: result.stderr } } } /** * Build a `RepoProbe` for validating pip hub candidates. - * Queries the hub for package targets (e.g. `@//...`) and returns - * stdout so the caller can check for `:pkg` labels or alias rules. - * Does NOT require `pypi_name=` tags in the hub output, because those - * tags live on spoke repos, not the hub alias layer. + * Queries the hub for package targets (e.g. `@//...`) and returns the + * full result triple so the caller can check for `:pkg` labels or alias + * rules. Does NOT require `pypi_name=` tags in the hub output, because + * those tags live on spoke repos, not the hub alias layer. */ export function buildPypiProbeFor(opts: BazelQueryOptions): RepoProbe { return async (hubName: string) => { const queryStr = `@${hubName}//...` const result = await runBazelQuery(queryStr, opts) - return { stdout: result.stdout, code: result.code } + return { code: result.code, stdout: result.stdout, stderr: result.stderr } } } + +// Re-exported for direct test access — useful when asserting on argv shape +// without spawning. Returns the exact argv `runBazelModShowMavenExtension` +// would pass to spawn. 
+export const _internalArgvBuilders = { + buildBazelArgv, + buildBazelModShowMavenExtensionArgv, + buildBazelModShowPipExtensionArgv, + buildBazelModShowVisibleReposArgv, + buildBazelProbeCqueryArgv, + buildStartupFlags, +} diff --git a/src/commands/manifest/bazel/bazel-query-runner.test.mts b/src/commands/manifest/bazel/bazel-query-runner.test.mts index 15cd2411f..ff9bda425 100644 --- a/src/commands/manifest/bazel/bazel-query-runner.test.mts +++ b/src/commands/manifest/bazel/bazel-query-runner.test.mts @@ -19,8 +19,9 @@ import { logger } from '@socketsecurity/registry/lib/logger' import { spawn } from '@socketsecurity/registry/lib/spawn' import { - buildProbeFor, + buildMavenProbeFor, buildPypiProbeFor, + runBazelModShowMavenExtension, runBazelModShowPipExtension, runBazelModShowVisibleRepos, runBazelQuery, @@ -40,7 +41,7 @@ describe('runBazelQuery', () => { }) it('builds the standard query argv shape', async () => { - await runBazelQuery('kind(jvm_import, @maven//:*)', { + await runBazelQuery('attr("tags", ".+", @maven//:*)', { bin: '/usr/local/bin/bazel', cwd: '/repo', invocationFlags: [], @@ -51,7 +52,7 @@ describe('runBazelQuery', () => { expect(argv[0]).toBe('query') expect(argv).toContain('--lockfile_mode=off') expect(argv).toContain('--noshow_progress') - expect(argv).toContain('kind(jvm_import, @maven//:*)') + expect(argv).toContain('attr("tags", ".+", @maven//:*)') expect(argv).toContain('--output=build') }) @@ -69,6 +70,20 @@ describe('runBazelQuery', () => { ) }) + it('forwards outputUserRoot as a startup flag BEFORE the subcommand', async () => { + await runBazelQuery('q', { + bin: 'bazel', + cwd: '/r', + invocationFlags: [], + outputUserRoot: '/tmp/socket-bazel-xyz', + }) + const argv = mocked.mock.calls[0]![1] as string[] + expect(argv).toContain('--output_user_root=/tmp/socket-bazel-xyz') + expect( + argv.indexOf('--output_user_root=/tmp/socket-bazel-xyz'), + ).toBeLessThan(argv.indexOf('query')) + }) + it('forwards bazelOutputBase as a startup flag 
BEFORE query', async () => { await runBazelQuery('q', { bin: 'bazel', @@ -217,6 +232,50 @@ describe('runBazelQuery', () => { }) }) +describe('runBazelModShowMavenExtension', () => { + const mocked = vi.mocked(spawn) + + beforeEach(() => { + mocked.mockReset() + // @ts-ignore — narrow return shape for the test's purposes. + mocked.mockResolvedValue({ + code: 0, + stdout: '## @@rules_jvm_external+//:extensions.bzl%maven:\n', + stderr: '', + }) + }) + + it('uses the rules_jvm_external maven extension target', async () => { + await runBazelModShowMavenExtension({ + bin: 'bazel', + cwd: '/repo', + invocationFlags: [], + }) + const argv = mocked.mock.calls[0]![1] as string[] + expect(argv).toEqual([ + 'mod', + 'show_extension', + '@rules_jvm_external//:extensions.bzl%maven', + ]) + }) + + it('threads outputUserRoot ahead of the subcommand', async () => { + await runBazelModShowMavenExtension({ + bin: 'bazel', + cwd: '/repo', + invocationFlags: [], + outputUserRoot: '/tmp/socket-bazel-abc', + }) + const argv = mocked.mock.calls[0]![1] as string[] + expect(argv).toEqual([ + '--output_user_root=/tmp/socket-bazel-abc', + 'mod', + 'show_extension', + '@rules_jvm_external//:extensions.bzl%maven', + ]) + }) +}) + describe('runBazelModShowVisibleRepos', () => { const mocked = vi.mocked(spawn) @@ -232,7 +291,6 @@ describe('runBazelModShowVisibleRepos', () => { cwd: '/repo', invocationFlags: [], }) - const argv = mocked.mock.calls[0]![1] as string[] expect(argv).toEqual(['mod', 'dump_repo_mapping', '', '--output=json']) expect(argv).not.toContain('--all_visible_repos') @@ -255,7 +313,6 @@ describe('runBazelModShowPipExtension', () => { cwd: '/repo', invocationFlags: [], }) - const argv = mocked.mock.calls[0]![1] as string[] expect(argv).toEqual([ 'mod', @@ -266,7 +323,7 @@ describe('runBazelModShowPipExtension', () => { }) }) -describe('buildProbeFor', () => { +describe('buildMavenProbeFor', () => { const mocked = vi.mocked(spawn) beforeEach(() => { @@ -274,25 +331,62 @@ 
describe('buildProbeFor', () => { // @ts-ignore — narrow return shape for the test's purposes. mocked.mockResolvedValue({ code: 0, - stdout: 'jvm_import(\n maven_coordinates="g:a:1",\n)', + stdout: '@maven//:foo\n@maven//:bar\n', stderr: '', }) }) - it('builds the probe query for a repo name', async () => { - const probe = buildProbeFor({ + it('builds the lightweight presence-check cquery for a repo name', async () => { + const probe = buildMavenProbeFor({ bin: 'bazel', cwd: '/r', invocationFlags: [], }) const result = await probe('my_maven_repo') const argv = mocked.mock.calls[0]![1] as string[] - expect(argv).toContain( - 'kind("jvm_import rule|aar_import rule", @my_maven_repo//:*)', - ) + expect(argv).toContain('cquery') + expect(argv).toContain('@my_maven_repo//...') + expect(argv).toContain('--output=label') + expect(argv).toContain('--keep_going') expect(result).toEqual({ - stdout: expect.stringContaining('maven_coordinates'), code: 0, + stdout: '@maven//:foo\n@maven//:bar\n', + stderr: '', + }) + }) + + it('threads outputUserRoot into the probe argv', async () => { + const probe = buildMavenProbeFor({ + bin: 'bazel', + cwd: '/r', + invocationFlags: [], + outputUserRoot: '/tmp/x', + }) + await probe('maven') + const argv = mocked.mock.calls[0]![1] as string[] + expect(argv[0]).toBe('--output_user_root=/tmp/x') + expect(argv).toContain('@maven//...') + }) + + it('returns the full result triple including stderr (tri-state classifier needs it)', async () => { + // @ts-ignore — narrow return shape for the test's purposes. 
+ mocked.mockResolvedValueOnce({ + code: 1, + stdout: '', + stderr: + "ERROR: No repository visible as '@nope' from main repository\n", + }) + const probe = buildMavenProbeFor({ + bin: 'bazel', + cwd: '/r', + invocationFlags: [], + }) + const result = await probe('nope') + expect(result).toEqual({ + code: 1, + stdout: '', + stderr: + "ERROR: No repository visible as '@nope' from main repository\n", }) }) }) @@ -320,12 +414,13 @@ describe('buildPypiProbeFor', () => { const argv = mocked.mock.calls[0]![1] as string[] expect(argv).toContain('@pypi//...') expect(result).toEqual({ - stdout: expect.stringContaining('@pypi//requests:pkg'), code: 0, + stdout: expect.stringContaining('@pypi//requests:pkg'), + stderr: '', }) }) - it('returns non-zero code when the hub has no :pkg targets', async () => { + it('returns the full triple when the hub has no :pkg targets', async () => { mocked.mockReset() // @ts-ignore — narrow return shape for the test's purposes. mocked.mockResolvedValue({ @@ -339,7 +434,6 @@ describe('buildPypiProbeFor', () => { invocationFlags: [], }) const result = await probe('empty_hub') - expect(result.code).toBe(0) - expect(result.stdout).toBe('') + expect(result).toEqual({ code: 0, stdout: '', stderr: '' }) }) }) diff --git a/src/commands/manifest/bazel/extract_bazel_to_pypi.mts b/src/commands/manifest/bazel/extract_bazel_to_pypi.mts index c23f4fe6b..95385df42 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_pypi.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_pypi.mts @@ -25,7 +25,7 @@ import { runBazelModShowVisibleRepos, runBazelQuery, } from './bazel-query-runner.mts' -import { parseVisibleRepoCandidates } from './bazel-repo-discovery.mts' +import { parseVisibleRepoCandidates } from './bazel-pypi-discovery.mts' import { detectWorkspaceMode, getBazelInvocationFlags, From 203b59c5f151aa7dfec7f48f7cbde51a383b40f0 Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 14:14:38 +0200 Subject: [PATCH 06/10] feat(manifest/bazel): 
per-repo metadata cquery with jsonproto parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `runMetadataCqueryForRepo` executes the per-repo extraction cquery and returns a structured outcome (`ok` / `partial` / `timeout` / `empty` / `error`) so the orchestrator can populate sidecar status without custom error plumbing per call site. The cquery target expression is the union of three predicates — `attr("tags", "\bmaven_coordinates=", ...)`, `attr("maven_coordinates", ".+", ...)`, and `attr("maven_url", ".+", ...)`. That matches rules_jvm_external's `jvm_import` / `aar_import` shapes, Bazel-native `java_library` with direct `maven_coordinates`, and POM-only / source-jar shapes that carry only `maven_url`. Word-boundary `\b` in the tags predicate prevents matches on values like `pre_maven_coordinates=fake`. `parseCqueryJsonproto` is defensive about the jsonproto encoding: dispatches on `attribute[].type`, accepts both camelCase (`stringValue`, `stringListValue`) and snake_case (`string_value`, `string_list_value`) payload keys, and tolerates both the Bazel 5+ envelope shape (`{ "results": [{ "target": {...} }] }`) and the older per-line streamed shape. Coordinate extraction prefers the direct `maven_coordinates` attribute; falls back to scanning `tags` for `maven_coordinates=G:A:V`. Provenance lands in `sourceRepo` as `<workspaceRelPath>:<repoName>` (or just `<repoName>` at the root), so the orchestrator's dedup can attribute artifacts back to their discovery site. Timeout handling: spawn rejections with `timedOut` / `killed` / `SIGTERM` / `SIGKILL` map to `status: 'timeout'`. The runner does NOT delete the outputUserRoot — server lifecycle (reap via `bazel shutdown` + `rm -rf`) is the orchestrator's concern so that a single tempdir can hold multiple per-repo runs. Also widen `ExtractedArtifact.ruleKind` from the literal `'jvm_import' | 'aar_import'` union to `string`. 
The legacy text-format parsers only ever set those two values, but the metadata cquery returns whatever `ruleClass` Bazel reports (`java_library`, `kt_jvm_import`, any future rules_jvm_external rule). Existing consumers only read the field diagnostically; nothing else changes. Tests cover the parser (envelope, per-line stream, snake_case fallback, direct-vs-tag preference, missing-coordinate skip, empty input), the argv builder (target expression union, startup-flag placement, `--bazel-flag` placement, invocationFlags order), and the runner's status classification including the spawn-timeout branch. --- .../manifest/bazel/bazel-build-parser.mts | 7 +- src/commands/manifest/bazel/bazel-cquery.mts | 349 +++++++++++++++ .../manifest/bazel/bazel-cquery.test.mts | 412 ++++++++++++++++++ 3 files changed, 767 insertions(+), 1 deletion(-) create mode 100644 src/commands/manifest/bazel/bazel-cquery.mts create mode 100644 src/commands/manifest/bazel/bazel-cquery.test.mts diff --git a/src/commands/manifest/bazel/bazel-build-parser.mts b/src/commands/manifest/bazel/bazel-build-parser.mts index af30345b7..7090eafd6 100644 --- a/src/commands/manifest/bazel/bazel-build-parser.mts +++ b/src/commands/manifest/bazel/bazel-build-parser.mts @@ -9,8 +9,13 @@ * the input string. */ +// `ruleKind` is the rule class the artifact came from. Legacy text-format +// parsers only emit 'jvm_import' / 'aar_import' (the kinds rules_jvm_external +// historically generated); the metadata cquery in bazel-cquery.mts emits +// whatever `ruleClass` jsonproto reports — `java_library`, `kt_jvm_import`, +// any future rules_jvm_external rule — so the type is open. 
export type ExtractedArtifact = { - ruleKind: 'jvm_import' | 'aar_import' + ruleKind: string ruleName: string mavenCoordinates: string sourceRepo?: string | undefined diff --git a/src/commands/manifest/bazel/bazel-cquery.mts b/src/commands/manifest/bazel/bazel-cquery.mts new file mode 100644 index 000000000..819cce39c --- /dev/null +++ b/src/commands/manifest/bazel/bazel-cquery.mts @@ -0,0 +1,349 @@ +/** + * Per-repo metadata cquery + jsonproto parser for the Maven path. + * + * Pipeline: + * 1. Build a cquery argv targeting `attr("tags", "\bmaven_coordinates=", + * @<repo>//...)` plus union variants for direct `maven_coordinates` / + * `maven_url` attributes. `--output=jsonproto` + + * `--proto:output_rule_attrs=tags,maven_coordinates,maven_url` keeps the + * payload small. + * 2. Spawn under a caller-supplied `outputUserRoot` so the orchestrator can + * reap the server cleanly (`bazel --output_user_root=<dir> shutdown` + * followed by `rm -rf`). The runner itself never deletes anything — + * server lifecycle is the orchestrator's concern. + * 3. Parse the jsonproto stream defensively: dispatch on `attribute[].type` + * and accept both camelCase (`stringValue`, `stringListValue`) and + * snake_case (`string_value`, `string_list_value`) payload keys. + * 4. Extract the maven coordinate from the direct `maven_coordinates` attr + * when present, else scan `tags` for `maven_coordinates=`. + * 5. Tag every artifact with `<workspaceRelPath>:<repoName>` provenance + * (bare `<repoName>` for the root workspace) via `sourceRepo`. 
+ */ +import { spawn } from '@socketsecurity/registry/lib/spawn' + +import { splitBazelFlags } from './bazel-query-runner.mts' + +import type { ExtractedArtifact } from './bazel-build-parser.mts' +import type { BazelQueryOptions } from './bazel-query-runner.mts' + +export type CqueryStatus = 'ok' | 'partial' | 'timeout' | 'empty' | 'error' + +export type CqueryRepoResult = { + repoName: string + workspaceRelPath: string + status: CqueryStatus + artifacts: ExtractedArtifact[] + stderr: string + durationMs: number +} + +export type RunMetadataCqueryArgs = { + repoName: string + workspaceRoot: string + // Provenance label (e.g. "examples/dagger"). Empty string for the root + // workspace. Embedded in each artifact's `sourceRepo` as + // `workspace:+repo:`. + workspaceRelPath: string + // Per-repo timeout in milliseconds. 60s default for auto-manifest; + // 120s for explicit invocation. Orchestrator picks; runner just enforces. + timeoutMs: number + opts: BazelQueryOptions +} + +// Maven coordinate token: `g:a:v` (3 parts) or `g:a:v:classifier` / +// `g:a:packaging:v` (4-part rules_jvm_external shapes). Tolerant of dots, +// dashes, plus, underscores in any part. +const MAVEN_COORD_TAG_RE = /^maven_coordinates=(.+)$/ + +// Build the metadata cquery target expression for one repo. The union of +// three predicates picks up artifacts that: +// - encode the coordinate in the conventional `tags = ["maven_coordinates=..."]` +// list (rules_jvm_external's emission for `jvm_import` and friends), +// - declare the coordinate as a direct `maven_coordinates` attribute +// (Bazel-native java_library / kt_jvm_import shape), or +// - declare a `maven_url` (POM-only and source-jar shapes that omit the +// coordinates tag but still represent a Maven artefact). 
+function buildMetadataCqueryExpr(repoName: string): string { + const r = `@${repoName}//...` + // The `\b` boundary in the tags predicate prevents matches on tag values + // like `pre_maven_coordinates=fake`; see todo 2 acceptance test (10). + return [ + `attr("tags", "\\bmaven_coordinates=", ${r})`, + `attr("maven_coordinates", ".+", ${r})`, + `attr("maven_url", ".+", ${r})`, + ].join(' union ') +} + +// Build the full cquery argv for a per-repo metadata cquery. Exposed for +// argv-shape unit tests without touching `spawn`. +export function buildMetadataCqueryArgv( + repoName: string, + opts: BazelQueryOptions, +): string[] { + const startup: string[] = [] + if (opts.bazelRc) { + startup.push(`--bazelrc=${opts.bazelRc}`) + } + if (opts.outputUserRoot) { + startup.push(`--output_user_root=${opts.outputUserRoot}`) + } + if (opts.bazelOutputBase) { + startup.push(`--output_base=${opts.bazelOutputBase}`) + } + const userFlags = splitBazelFlags(opts.bazelFlags) + return [ + ...startup, + 'cquery', + '--lockfile_mode=off', + '--noshow_progress', + ...opts.invocationFlags, + buildMetadataCqueryExpr(repoName), + '--output=jsonproto', + '--proto:output_rule_attrs=tags,maven_coordinates,maven_url', + '--keep_going', + ...userFlags, + ] +} + +type JsonprotoAttribute = { + name?: string + type?: string + stringValue?: string + string_value?: string + stringListValue?: string[] + string_list_value?: string[] +} + +type JsonprotoRule = { + name?: string + ruleClass?: string + rule_class?: string + attribute?: JsonprotoAttribute[] +} + +type JsonprotoTarget = { + type?: string + rule?: JsonprotoRule +} + +type JsonprotoEnvelope = { + // Bazel 5+ wraps the stream in `{ "results": [ { "target": {...} } ] }`; + // older shapes streamed one target per line. Accept either. 
+ results?: Array<{ target?: JsonprotoTarget }> +} + +function readStringAttr(attr: JsonprotoAttribute): string | undefined { + if (attr.type !== 'STRING') { + return undefined + } + if (typeof attr.stringValue === 'string') { + return attr.stringValue + } + if (typeof attr.string_value === 'string') { + return attr.string_value + } + return undefined +} + +function readStringListAttr(attr: JsonprotoAttribute): string[] | undefined { + if (attr.type !== 'STRING_LIST') { + return undefined + } + if (Array.isArray(attr.stringListValue)) { + return attr.stringListValue + } + if (Array.isArray(attr.string_list_value)) { + return attr.string_list_value + } + return undefined +} + +// Extract the maven coordinate from a rule's attributes. Prefers the direct +// `maven_coordinates` attribute (Bazel-native shape); falls back to scanning +// `tags` for a `maven_coordinates=` entry (rules_jvm_external shape). +// Returns undefined if neither yields a non-empty value. +function extractMavenCoordinate( + rule: JsonprotoRule, +): { coord: string; url?: string | undefined } | undefined { + let coord: string | undefined + let url: string | undefined + for (const attr of rule.attribute ?? []) { + if (attr.name === 'maven_coordinates') { + const direct = readStringAttr(attr) + if (direct && direct.length) { + coord = direct + } + } else if (attr.name === 'maven_url') { + const direct = readStringAttr(attr) + if (direct && direct.length) { + url = direct + } + } else if (attr.name === 'tags') { + const tags = readStringListAttr(attr) + if (tags) { + for (const tag of tags) { + const m = MAVEN_COORD_TAG_RE.exec(tag) + if (m && !coord) { + coord = m[1] + } + } + } + } + } + if (!coord) { + return undefined + } + return url ? { coord, url } : { coord } +} + +// Strip the leading `@//:` prefix from a fully-qualified target label +// to recover the bare rule name (e.g. `com_google_guava_guava`). 
+function ruleNameFromLabel(label: string): string { + const colon = label.lastIndexOf(':') + return colon >= 0 ? label.slice(colon + 1) : label +} + +// Pure parser for the jsonproto cquery stream. Returns one +// `ExtractedArtifact` per rule with a recoverable maven coordinate. The +// `sourceRepo` field carries `:` provenance +// when a workspace path was provided; otherwise just the repo name. +export function parseCqueryJsonproto( + stdout: string, + repoName: string, + workspaceRelPath: string, +): ExtractedArtifact[] { + if (!stdout.trim()) { + return [] + } + // Bazel 5+ emits a single JSON envelope; older versions stream one target + // per line. Try envelope-first, then fall back to per-line. + const targets: JsonprotoTarget[] = [] + try { + const parsed = JSON.parse(stdout) as JsonprotoEnvelope + if (parsed.results) { + for (const r of parsed.results) { + if (r.target) { + targets.push(r.target) + } + } + } + } catch { + // Fall through to per-line scanning. + } + if (!targets.length) { + for (const line of stdout.split(/\r?\n/)) { + const trimmed = line.trim() + if (!trimmed) { + continue + } + try { + const parsed = JSON.parse(trimmed) as JsonprotoTarget + if (parsed?.rule) { + targets.push(parsed) + } + } catch { + // Skip malformed lines. + } + } + } + const provenance = workspaceRelPath + ? `${workspaceRelPath}:${repoName}` + : repoName + const out: ExtractedArtifact[] = [] + for (const target of targets) { + if (target.type && target.type !== 'RULE') { + continue + } + const rule = target.rule + if (!rule || !rule.name) { + continue + } + const extracted = extractMavenCoordinate(rule) + if (!extracted) { + continue + } + const ruleKind = rule.ruleClass ?? rule.rule_class ?? 'unknown' + out.push({ + deps: [], + mavenCoordinates: extracted.coord, + ruleKind, + ruleName: ruleNameFromLabel(rule.name), + sourceRepo: provenance, + ...(extracted.url ? { mavenUrl: extracted.url } : {}), + }) + } + return out +} + +// Classify the runner's raw outcome. 
Non-zero exit with `--keep_going` is a +// `partial` (some target analysis failed; the successful subset is still in +// stdout). Zero exit with no parsed artefacts is `empty`. Spawn timeout is +// signalled separately; this helper handles the post-spawn case. +function classifyCqueryOutcome( + code: number, + artifactCount: number, +): CqueryStatus { + if (code === 0) { + return artifactCount > 0 ? 'ok' : 'empty' + } + // --keep_going treats partial-analysis failures with non-zero exit but + // still yields the successful subset on stdout. Anything we parsed is + // worth keeping. + return artifactCount > 0 ? 'partial' : 'error' +} + +// Spawn the per-repo metadata cquery, parse the result, and return a +// structured outcome. On spawn timeout, return `status: 'timeout'` so the +// orchestrator can reap the server (`bazel --output_user_root= +// shutdown` + `rm -rf`) before moving on. +export async function runMetadataCqueryForRepo( + args: RunMetadataCqueryArgs, +): Promise { + const { opts, repoName, timeoutMs, workspaceRelPath, workspaceRoot } = args + const argv = buildMetadataCqueryArgv(repoName, opts) + const startedAt = Date.now() + try { + const result = await spawn(opts.bin, argv, { + cwd: workspaceRoot, + timeout: timeoutMs, + ...(opts.env ? { env: opts.env } : {}), + }) + const { code, stderr, stdout } = result + const artifacts = parseCqueryJsonproto(stdout, repoName, workspaceRelPath) + return { + artifacts, + durationMs: Date.now() - startedAt, + repoName, + status: classifyCqueryOutcome(code, artifacts.length), + stderr, + workspaceRelPath, + } + } catch (e) { + const err = e as { + code?: unknown + killed?: unknown + signal?: unknown + stderr?: unknown + stdout?: unknown + timedOut?: unknown + } + const stdout = typeof err.stdout === 'string' ? err.stdout : '' + const stderr = typeof err.stderr === 'string' ? 
err.stderr : '' + const timedOut = + err.timedOut === true || + err.killed === true || + err.signal === 'SIGTERM' || + err.signal === 'SIGKILL' + const artifacts = stdout + ? parseCqueryJsonproto(stdout, repoName, workspaceRelPath) + : [] + return { + artifacts, + durationMs: Date.now() - startedAt, + repoName, + status: timedOut ? 'timeout' : 'error', + stderr, + workspaceRelPath, + } + } +} diff --git a/src/commands/manifest/bazel/bazel-cquery.test.mts b/src/commands/manifest/bazel/bazel-cquery.test.mts new file mode 100644 index 000000000..05149e222 --- /dev/null +++ b/src/commands/manifest/bazel/bazel-cquery.test.mts @@ -0,0 +1,412 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest' + +vi.mock('@socketsecurity/registry/lib/spawn', () => ({ + spawn: vi.fn(), +})) + +import { spawn } from '@socketsecurity/registry/lib/spawn' + +import { + buildMetadataCqueryArgv, + parseCqueryJsonproto, + runMetadataCqueryForRepo, +} from './bazel-cquery.mts' + +// Sample envelope shape Bazel 5+ emits: `{ "results": [ { "target": {...} } ] }`. +// Two rules: one with `tags`/`maven_coordinates` (rules_jvm_external shape) +// and one with the direct `maven_coordinates` attr only (Bazel-native shape). 
+const ENVELOPE_FIXTURE = JSON.stringify({ + results: [ + { + target: { + type: 'RULE', + rule: { + name: '@maven//:androidx_annotation_annotation', + ruleClass: 'jvm_import', + attribute: [ + { + name: 'maven_coordinates', + type: 'STRING', + stringValue: 'androidx.annotation:annotation:1.8.2', + }, + { + name: 'maven_url', + type: 'STRING', + stringValue: + 'https://maven.google.com/androidx/annotation/annotation/1.8.2/annotation-1.8.2.jar', + }, + { + name: 'tags', + type: 'STRING_LIST', + stringListValue: [ + 'maven_coordinates=androidx.annotation:annotation:1.8.2', + 'maven_repository=https://maven.google.com', + ], + }, + ], + }, + }, + }, + { + target: { + type: 'RULE', + rule: { + name: '@maven//:plain_lib', + ruleClass: 'java_library', + attribute: [ + { + name: 'tags', + type: 'STRING_LIST', + stringListValue: ['maven_coordinates=com.example:plain:1.0'], + }, + ], + }, + }, + }, + ], +}) + +describe('buildMetadataCqueryArgv', () => { + it('builds the union expression and the documented flag set', () => { + const argv = buildMetadataCqueryArgv('maven', { + bin: 'bazel', + cwd: '/repo', + invocationFlags: [], + }) + expect(argv).toContain('cquery') + expect(argv).toContain('--output=jsonproto') + expect(argv).toContain('--proto:output_rule_attrs=tags,maven_coordinates,maven_url') + expect(argv).toContain('--keep_going') + expect(argv).toContain('--lockfile_mode=off') + const expr = argv.find(a => a.includes('attr("tags"')) + expect(expr).toContain('attr("tags", "\\bmaven_coordinates=", @maven//...)') + expect(expr).toContain('attr("maven_coordinates", ".+", @maven//...)') + expect(expr).toContain('attr("maven_url", ".+", @maven//...)') + }) + + it('threads outputUserRoot, bazelRc, and bazelOutputBase as startup flags before cquery', () => { + const argv = buildMetadataCqueryArgv('maven', { + bin: 'bazel', + cwd: '/r', + invocationFlags: [], + bazelRc: '/etc/bazel.rc', + outputUserRoot: '/tmp/socket-bazel-1', + bazelOutputBase: '/tmp/output-base', + }) + 
expect(argv[0]).toBe('--bazelrc=/etc/bazel.rc') + expect(argv[1]).toBe('--output_user_root=/tmp/socket-bazel-1') + expect(argv[2]).toBe('--output_base=/tmp/output-base') + expect(argv[3]).toBe('cquery') + }) + + it('appends user --bazel-flag args AFTER the standard cquery flags', () => { + const argv = buildMetadataCqueryArgv('maven', { + bin: 'bazel', + cwd: '/r', + invocationFlags: [], + bazelFlags: '--config=ci --repo_env=SCALA_VERSION=2.13.18', + }) + const cqueryIdx = argv.indexOf('cquery') + const userIdx = argv.indexOf('--config=ci') + expect(userIdx).toBeGreaterThan(cqueryIdx) + expect(argv).toContain('--repo_env=SCALA_VERSION=2.13.18') + }) + + it('includes invocationFlags between subcommand and target expression', () => { + const argv = buildMetadataCqueryArgv('maven', { + bin: 'bazel', + cwd: '/r', + invocationFlags: ['--noenable_bzlmod', '--enable_workspace'], + }) + expect(argv).toContain('--noenable_bzlmod') + expect(argv).toContain('--enable_workspace') + }) +}) + +describe('parseCqueryJsonproto', () => { + it('parses Bazel-5+ envelope shape and returns one artifact per rule', () => { + const out = parseCqueryJsonproto(ENVELOPE_FIXTURE, 'maven', '') + expect(out).toHaveLength(2) + const first = out[0]! + expect(first.mavenCoordinates).toBe('androidx.annotation:annotation:1.8.2') + expect(first.mavenUrl).toBe( + 'https://maven.google.com/androidx/annotation/annotation/1.8.2/annotation-1.8.2.jar', + ) + expect(first.ruleKind).toBe('jvm_import') + expect(first.ruleName).toBe('androidx_annotation_annotation') + expect(first.sourceRepo).toBe('maven') + + const second = out[1]! 
+ expect(second.mavenCoordinates).toBe('com.example:plain:1.0') + expect(second.ruleKind).toBe('java_library') + expect(second.ruleName).toBe('plain_lib') + }) + + it('emits workspace:+repo: provenance via sourceRepo when workspaceRelPath is set', () => { + const out = parseCqueryJsonproto( + ENVELOPE_FIXTURE, + 'maven', + 'examples/dagger', + ) + expect(out[0]?.sourceRepo).toBe('examples/dagger:maven') + }) + + it('falls back to snake_case payload keys (string_value, string_list_value)', () => { + const snakeCase = JSON.stringify({ + results: [ + { + target: { + type: 'RULE', + rule: { + name: '@maven//:snake_case_artifact', + rule_class: 'kt_jvm_import', + attribute: [ + { + name: 'tags', + type: 'STRING_LIST', + string_list_value: ['maven_coordinates=com.example:snake:2.0'], + }, + ], + }, + }, + }, + ], + }) + const out = parseCqueryJsonproto(snakeCase, 'maven', '') + expect(out).toHaveLength(1) + expect(out[0]?.mavenCoordinates).toBe('com.example:snake:2.0') + expect(out[0]?.ruleKind).toBe('kt_jvm_import') + }) + + it('falls back to per-line jsonproto stream when envelope is absent', () => { + const streamed = [ + JSON.stringify({ + type: 'RULE', + rule: { + name: '@maven//:a', + ruleClass: 'jvm_import', + attribute: [ + { + name: 'maven_coordinates', + type: 'STRING', + stringValue: 'g:a:1', + }, + ], + }, + }), + JSON.stringify({ + type: 'RULE', + rule: { + name: '@maven//:b', + ruleClass: 'jvm_import', + attribute: [ + { + name: 'maven_coordinates', + type: 'STRING', + stringValue: 'g:b:2', + }, + ], + }, + }), + ].join('\n') + const out = parseCqueryJsonproto(streamed, 'maven', '') + expect(out.map(a => a.mavenCoordinates)).toEqual(['g:a:1', 'g:b:2']) + }) + + it('skips rules with no recoverable maven coordinate', () => { + const noCoord = JSON.stringify({ + results: [ + { + target: { + type: 'RULE', + rule: { + name: '@maven//:no_coord', + ruleClass: 'java_library', + attribute: [ + { + name: 'tags', + type: 'STRING_LIST', + stringListValue: 
['some_other_tag=value'], + }, + ], + }, + }, + }, + ], + }) + expect(parseCqueryJsonproto(noCoord, 'maven', '')).toEqual([]) + }) + + it('prefers the direct maven_coordinates attr over the tag fallback', () => { + const conflicting = JSON.stringify({ + results: [ + { + target: { + type: 'RULE', + rule: { + name: '@maven//:dual', + ruleClass: 'jvm_import', + attribute: [ + { + name: 'maven_coordinates', + type: 'STRING', + stringValue: 'g:direct:1', + }, + { + name: 'tags', + type: 'STRING_LIST', + stringListValue: ['maven_coordinates=g:via_tag:2'], + }, + ], + }, + }, + }, + ], + }) + const out = parseCqueryJsonproto(conflicting, 'maven', '') + expect(out[0]?.mavenCoordinates).toBe('g:direct:1') + }) + + it('returns [] on empty stdout', () => { + expect(parseCqueryJsonproto('', 'maven', '')).toEqual([]) + expect(parseCqueryJsonproto(' \n\n', 'maven', '')).toEqual([]) + }) +}) + +describe('runMetadataCqueryForRepo', () => { + const mocked = vi.mocked(spawn) + + beforeEach(() => { + mocked.mockReset() + }) + + it('returns status:ok with parsed artifacts on a clean run', async () => { + // @ts-ignore — narrow return shape for the test. + mocked.mockResolvedValueOnce({ + code: 0, + stdout: ENVELOPE_FIXTURE, + stderr: '', + }) + const r = await runMetadataCqueryForRepo({ + opts: { bin: 'bazel', cwd: '/r', invocationFlags: [] }, + repoName: 'maven', + timeoutMs: 60_000, + workspaceRelPath: '', + workspaceRoot: '/r', + }) + expect(r.status).toBe('ok') + expect(r.artifacts).toHaveLength(2) + expect(r.stderr).toBe('') + }) + + it('returns status:empty when stdout has no parsed artifacts on exit 0', async () => { + // @ts-ignore — narrow return shape for the test. 
+ mocked.mockResolvedValueOnce({ code: 0, stdout: '', stderr: '' }) + const r = await runMetadataCqueryForRepo({ + opts: { bin: 'bazel', cwd: '/r', invocationFlags: [] }, + repoName: 'maven', + timeoutMs: 60_000, + workspaceRelPath: '', + workspaceRoot: '/r', + }) + expect(r.status).toBe('empty') + expect(r.artifacts).toEqual([]) + }) + + it('returns status:partial when --keep_going emits non-zero but still parses targets', async () => { + // Bazel: exit 1 + "Analysis succeeded for only 118 of 122 top-level targets" + // is the normal --keep_going outcome. + // @ts-ignore — narrow return shape for the test. + mocked.mockResolvedValueOnce({ + code: 1, + stdout: ENVELOPE_FIXTURE, + stderr: 'WARNING: analysis failed for some targets\n', + }) + const r = await runMetadataCqueryForRepo({ + opts: { bin: 'bazel', cwd: '/r', invocationFlags: [] }, + repoName: 'maven', + timeoutMs: 60_000, + workspaceRelPath: '', + workspaceRoot: '/r', + }) + expect(r.status).toBe('partial') + expect(r.artifacts).toHaveLength(2) + }) + + it('returns status:error on non-zero exit with no parsed targets', async () => { + // @ts-ignore — narrow return shape for the test. 
+ mocked.mockResolvedValueOnce({ + code: 1, + stdout: '', + stderr: 'ERROR: something broke\n', + }) + const r = await runMetadataCqueryForRepo({ + opts: { bin: 'bazel', cwd: '/r', invocationFlags: [] }, + repoName: 'maven', + timeoutMs: 60_000, + workspaceRelPath: '', + workspaceRoot: '/r', + }) + expect(r.status).toBe('error') + expect(r.artifacts).toEqual([]) + }) + + it('returns status:timeout when spawn rejects with timedOut=true', async () => { + mocked.mockRejectedValueOnce( + Object.assign(new Error('command timed out'), { + code: null, + timedOut: true, + stderr: '', + stdout: '', + }), + ) + const r = await runMetadataCqueryForRepo({ + opts: { bin: 'bazel', cwd: '/r', invocationFlags: [] }, + repoName: 'maven', + timeoutMs: 60_000, + workspaceRelPath: '', + workspaceRoot: '/r', + }) + expect(r.status).toBe('timeout') + expect(r.artifacts).toEqual([]) + }) + + it('returns status:timeout when spawn signals SIGTERM/SIGKILL', async () => { + mocked.mockRejectedValueOnce( + Object.assign(new Error('killed'), { + signal: 'SIGTERM', + stderr: '', + stdout: '', + }), + ) + const r = await runMetadataCqueryForRepo({ + opts: { bin: 'bazel', cwd: '/r', invocationFlags: [] }, + repoName: 'maven', + timeoutMs: 60_000, + workspaceRelPath: '', + workspaceRoot: '/r', + }) + expect(r.status).toBe('timeout') + }) + + it('passes workspaceRoot as cwd and outputUserRoot as startup flag', async () => { + // @ts-ignore — narrow return shape for the test. + mocked.mockResolvedValueOnce({ code: 0, stdout: '', stderr: '' }) + await runMetadataCqueryForRepo({ + opts: { + bin: 'bazel', + cwd: '/anywhere', + invocationFlags: [], + outputUserRoot: '/tmp/socket-bazel-xyz', + }, + repoName: 'maven', + timeoutMs: 60_000, + workspaceRelPath: '', + workspaceRoot: '/repo/sub', + }) + const call = mocked.mock.calls[0]! 
+ expect(call[2]).toMatchObject({ cwd: '/repo/sub', timeout: 60_000 }) + const argv = call[1] as string[] + expect(argv).toContain('--output_user_root=/tmp/socket-bazel-xyz') + }) +}) From 2e172f823f61c24d413f35ab016fd538e4df3b01 Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 14:18:00 +0200 Subject: [PATCH 07/10] refactor(manifest/bazel): orchestrator wraps the per-workspace algorithm in a tree walk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `extractBazelToMaven` now walks the scan root for every workspace (MODULE.bazel / WORKSPACE / WORKSPACE.bazel) and runs the per-workspace extraction algorithm in each one. Monorepos like rules_kotlin (examples//MODULE.bazel) and projects with mobile sub-workspaces (mobile/MODULE.bazel under a non-Bazel root) are no longer silently dropped to the root-only path. Per workspace: 1. Detect Bzlmod vs WORKSPACE mode. 2. Discover candidate Maven hubs: - Bzlmod: bazel mod show_extension @rules_jvm_external//:extensions.bzl%maven, parsed via parseShowExtensionOutput. - WORKSPACE (or Bzlmod fallback): probe the conventional names (maven, maven_install, maven_dev, unpinned_maven, maven_unpinned) plus any customer-supplied extras via the tri-state classifier. 3. Per populated candidate: run the metadata cquery (`attr("tags", "\bmaven_coordinates=", @//...)` ∪ direct `maven_coordinates` / `maven_url` attrs) and accept the parsed artefacts. 4. Aggregate, then dedup across workspaces by full Maven coordinate. Server isolation is now invariant: every Bazel invocation runs under a per-CLI-call --output_user_root=. On per-repo cquery timeout the orchestrator reaps the server (`bazel shutdown`) and `rm -rf`'s the tempdir, then mints a fresh one for subsequent repos — a single bad hub no longer cascades into the rest of the run. The finally-block cleanup reaps every tempdir that was minted, including the last one. 
Sidecar `manifest-status.json` lands beside the synthesized `maven_install.json`. Each entry records the repo's classified status (ok / partial / timeout / empty / error), artifact count, and duration, so the server side can surface partial results to the customer. The top-level `complete` flag is set to `false` iff any repo timed out. Deleted: the unsorted_deps.json fast path (`extractFromOneRepo`, `bazelExternalDir`, `isForceQueryFallbackEnabled` env knob) — the metadata cquery returns the same GAVs the fast path used to recover, without depending on bazel-out symlinks or generated artifacts. Deleted: the lockfile merge (already done in a previous commit on this branch). Deleted: the kind-only probe and dump_repo_mapping enumeration. The orchestrator's `ExtractBazelOptions` now accepts `extraMavenRepoNames` (legacy WORKSPACE non-conventional hub names) and `perRepoTimeoutMs` (per-repo cquery cap). The CLI flag wiring lands in a sibling commit; existing call sites continue to pass the same fields they did before. Existing `extract_bazel_to_maven.test.mts` is pinned to the old unsorted_deps fast path and is replaced wholesale in the next commit (test layer). 
--- .../manifest/bazel/extract_bazel_to_maven.mts | 512 ++++++++++-------- 1 file changed, 291 insertions(+), 221 deletions(-) diff --git a/src/commands/manifest/bazel/extract_bazel_to_maven.mts b/src/commands/manifest/bazel/extract_bazel_to_maven.mts index 334b116db..42a892519 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_maven.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_maven.mts @@ -2,37 +2,42 @@ import { existsSync, promises as fs, mkdirSync, - readFileSync, - realpathSync, + mkdtempSync, } from 'node:fs' +import os from 'node:os' import path from 'node:path' import { logger } from '@socketsecurity/registry/lib/logger' +import { spawn } from '@socketsecurity/registry/lib/spawn' import { resolveBazelBinary } from './bazel-bin-detect.mts' -import { - parseBazelBuildOutput, - parseUnsortedDepsJson, -} from './bazel-build-parser.mts' +import { runMetadataCqueryForRepo } from './bazel-cquery.mts' import { ensureJavaOnPath } from './bazel-java-shim.mts' import { validateOutputBase } from './bazel-output-base-check.mts' import { provisionPythonShim } from './bazel-python-shim.mts' import { - buildProbeFor, - runBazelModShowVisibleRepos, + buildMavenProbeFor, + runBazelModShowMavenExtension, } from './bazel-query-runner.mts' import { - discoverMavenRepos, - parseVisibleRepoCandidates, + CONVENTIONAL_MAVEN_REPO_NAMES, + parseShowExtensionOutput, + probeCandidate, } from './bazel-repo-discovery.mts' import { detectWorkspaceMode, getBazelInvocationFlags, } from './bazel-workspace-detect.mts' +import { findWorkspaceRoots } from './bazel-workspace-walk.mts' import { getErrorCause } from '../../../utils/errors.mts' +import type { + CqueryRepoResult, + CqueryStatus, +} from './bazel-cquery.mts' import type { ExtractedArtifact } from './bazel-build-parser.mts' import type { BazelQueryOptions } from './bazel-query-runner.mts' +import type { WorkspaceMode } from './bazel-workspace-detect.mts' export type ExtractBazelOptions = { bazelFlags: string | 
undefined @@ -42,9 +47,18 @@ export type ExtractBazelOptions = { cwd: string // Optional env override used for python-shim PATH augmentation. env?: NodeJS.ProcessEnv + // Customer-supplied Maven hub names augmenting the auto-discovery + // candidate set. Wired in by the `--bazel-maven-repo=` flag for + // legacy WORKSPACE workspaces whose hubs use non-conventional names + // (or custom Bzlmod extensions `mod show_extension` doesn't enumerate). + extraMavenRepoNames?: string[] | undefined out: string // Use the auto-manifest sibling directory instead of writing directly to `out`. outLayout?: 'flat' + // Per-repo cquery timeout in milliseconds. Auto-manifest default is 60s + // (the orchestrator's job is to not stall the wider scan); explicit + // invocations may bump it. + perRepoTimeoutMs?: number | undefined verbose: boolean } @@ -53,8 +67,33 @@ export type ExtractBazelResult = { manifestPath?: string | undefined noEcosystemFound?: boolean | undefined ok: boolean + // Path to the per-invocation status sidecar describing which repos + // produced artefacts and which timed out / were empty. Useful for the + // server-side to surface partial results to the customer. + statusPath?: string | undefined +} + +type SidecarRepoEntry = { + name: string + status: CqueryStatus + artifactCount: number + durationMs: number +} + +type SidecarWorkspaceEntry = { + relPath: string + mode: { bzlmod: boolean; workspace: boolean } + repos: SidecarRepoEntry[] +} + +type Sidecar = { + complete: boolean + workspaces: SidecarWorkspaceEntry[] } +const DEFAULT_PER_REPO_TIMEOUT_MS = 60_000 +const REAP_TIMEOUT_MS = 10_000 + type CoordPair = { groupArtifact: string; version: string } // Splits "g:a:v" -> { groupArtifact: "g:a", version: "v" }. @@ -171,15 +210,11 @@ export function normalizeToMavenInstallJson( } // Dependency keys in maven_install.json use "g:a" (no version), // matching the canonical rules_jvm_external lockfile shape. 
- // Only emit an entry when there are actual dependencies (lockfile omits - // artifacts with an empty dep list). const depKey = split.groupArtifact const depCoords = dependencySets.get(depKey) ?? new Set() for (const depLabel of a.deps) { - // First try our rule-label lookup (the common case for --output=build text). const c = depLabelToCoord(depLabel, labelToCoord) if (c) { - // c is "g:a:v"; strip the version to produce "g:a" per lockfile shape. const cs = splitCoord(c) depCoords.add(cs ? cs.groupArtifact : c) } else if ( @@ -187,9 +222,6 @@ export function normalizeToMavenInstallJson( !depLabel.startsWith('@') && !depLabel.startsWith(':') ) { - // unsorted_deps.json deps may be "g:a:v" in older files or - // "g:a" in v2 lock-file-shaped maps. Strip only when a version is - // present. const parts = depLabel.split(':') depCoords.add( parts.length >= 3 ? parts.slice(0, -1).join(':') : depLabel, @@ -206,127 +238,103 @@ export function normalizeToMavenInstallJson( return out } -// Resolves the bazel `external/` dir for the given workspace. -// -// Bazel's `bazel-out/` convenience symlink points at -// `/execroot//bazel-out/`; the `external/` dir we -// want is at `/external/`. `path.join` is purely lexical and -// would collapse `bazel-out/..` to the cwd itself, which is the wrong place -// Resolve the symlink at the filesystem level and walk up to -// `` instead. -function bazelExternalDir( - cwd: string, - outputBase: string | undefined, -): string | null { - if (outputBase) { - return path.join(outputBase, 'external') - } - const bazelOutLink = path.join(cwd, 'bazel-out') - if (!existsSync(bazelOutLink)) { - return null - } - try { - // realpath follows symlinks: ...//execroot//bazel-out - const real = realpathSync(bazelOutLink) - // Walk up bazel-out -> -> execroot -> , then into external/. 
- return path.join(real, '..', '..', '..', 'external') - } catch { - return null - } -} - -// Internal diagnostic: when truthy, skip the unsorted_deps.json fast path -// and force the bazel-query regex fallback. Used by bazel-bench to -// deterministically exercise parseBazelBuildOutput on every CI run. Truthy -// values are '1', 'true', 'yes' (case-insensitive); anything else (unset, -// '', '0', 'false') is treated as off. Not exposed as a user-facing CLI -// flag, so it is read here rather than added to constants.mts. -function isForceQueryFallbackEnabled(): boolean { - const raw = process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] - if (!raw) { - return false +// Cross-workspace dedup keyed on the full Maven coordinate string +// (`g:a:v[:classifier]`). The metadata cquery emits one entry per rule, +// so the same `androidx.annotation:annotation:1.8.2` can show up in +// `examples/dagger/@maven` and `examples/ksp/@maven` in rules_kotlin — +// downstream only needs it once. +function dedupArtifactsByCoord( + artifacts: ExtractedArtifact[], +): ExtractedArtifact[] { + const seen = new Set() + const out: ExtractedArtifact[] = [] + for (const a of artifacts) { + if (seen.has(a.mavenCoordinates)) { + continue + } + seen.add(a.mavenCoordinates) + out.push(a) } - const normalized = raw.toLowerCase() - return normalized === '1' || normalized === 'true' || normalized === 'yes' + return out } -// Tries `external//unsorted_deps.json` first; falls back to parsing the -// probe stdout the caller already captured during discovery. Discovery runs -// the same `kind("jvm_import rule|aar_import rule", @//:*)` query that -// extraction needs, so reusing its stdout skips one bazel-query invocation -// per repo on the unpinned path (where unsorted_deps.json isn't on disk). -async function extractFromOneRepo( - repoName: string, +// Build the per-workspace candidate Maven hub list. 
Bzlmod mode prefers +// `bazel mod show_extension`; WORKSPACE mode (and Bzlmod fallback when +// show_extension yields nothing) probes the conventional names plus any +// customer-supplied extras. Returns the list in discovery order. +async function discoverCandidatesForWorkspace( + workspaceRoot: string, + mode: WorkspaceMode, queryOpts: BazelQueryOptions, - cachedProbeStdout: string, -): Promise { - const verbose = queryOpts.verbose - // unsorted_deps.json lives under the bazel external dir. - // When --output_base is set, it's under that; otherwise under the workspace's - // bazel-out symlink (resolved via realpath, NOT lexical path.join — the - // lexical form would collapse `bazel-out/..` to cwd and miss the file). - const externalDir = bazelExternalDir(queryOpts.cwd, queryOpts.bazelOutputBase) - if (verbose) { - logger.log( - `[VERBOSE] @${repoName}: external dir:`, - externalDir ?? '(unresolved — bazel-out symlink absent)', - ) - } - const forceFallback = isForceQueryFallbackEnabled() - if (forceFallback && verbose) { - logger.log( - `[VERBOSE] @${repoName}: SOCKET_BAZEL_FORCE_QUERY_FALLBACK set; skipping unsorted_deps.json fast path.`, - ) - } - const candidates = forceFallback - ? [] - : externalDir - ? [path.join(externalDir, repoName, 'unsorted_deps.json')] - : [] - for (const c of candidates) { - if (existsSync(c)) { - // Bound the read to 1GB to prevent OOM on hostile content while allowing large real-world lockfiles. 
- // eslint-disable-next-line no-await-in-loop - const stat = await fs.stat(c) - if (stat.size > 1024 * 1024 * 1024) { - logger.warn( - `Skipping oversized ${c} (${stat.size} bytes); falling back to cached probe stdout.`, + extras: readonly string[], + verbose: boolean, +): Promise { + const candidates: string[] = [] + if (mode.bzlmod) { + const extResult = await runBazelModShowMavenExtension(queryOpts) + if (extResult.code === 0) { + candidates.push(...parseShowExtensionOutput(extResult.stdout)) + if (verbose) { + logger.log( + `[VERBOSE] workspace ${workspaceRoot}: show_extension yielded`, + candidates, ) - break - } - const json = readFileSync(c, 'utf8') - const parsed = parseUnsortedDepsJson(json) - if (parsed.length) { - if (verbose) { - logger.log( - `[VERBOSE] @${repoName}: source=unsorted_deps.json (${c}, ${parsed.length} artifact(s))`, - ) - } - return parsed.map(a => ({ ...a, sourceRepo: repoName })) } } else if (verbose) { - logger.log(`[VERBOSE] @${repoName}: unsorted_deps.json miss at`, c) + logger.log( + `[VERBOSE] workspace ${workspaceRoot}: show_extension failed (code=${extResult.code}); falling back to conventional probe`, + ) } } - // Reuse the probe stdout that discovery already captured for this repo. - // The probe ran exactly this query during validation and only validated - // repos with code === 0 make it into the cache, so retry is unnecessary - // — if the probe was flaky, the repo wouldn't be in the map. - if (!cachedProbeStdout) { - logger.warn( - `No cached probe stdout for @${repoName}; skipping. (This shouldn't happen — discovery should have populated it.)`, - ) - return [] + // Probe conventional names + extras for any candidate not already + // discovered. WORKSPACE mode relies entirely on the probe; Bzlmod + // mode uses it as a defensive fallback (e.g. custom Maven extensions + // mod show_extension doesn't enumerate). 
+ const seen = new Set(candidates) + const probe = buildMavenProbeFor(queryOpts) + const toProbe = [...CONVENTIONAL_MAVEN_REPO_NAMES, ...extras].filter( + name => !seen.has(name), + ) + for (const name of toProbe) { + // eslint-disable-next-line no-await-in-loop + const status = await probeCandidate(name, probe, verbose) + if (status === 'populated') { + candidates.push(name) + seen.add(name) + } } - if (verbose) { - logger.log( - `[VERBOSE] @${repoName}: source=cached probe stdout (${cachedProbeStdout.length} bytes)`, + return candidates +} + +// Best-effort reap of a Bazel server. Spawned with a short timeout so +// a wedged server can't itself hang the cleanup; failures are swallowed +// because the caller will `rm -rf` the output_user_root regardless. +async function reapBazelServer( + bin: string, + outputUserRoot: string, +): Promise { + try { + await spawn( + bin, + [`--output_user_root=${outputUserRoot}`, 'shutdown'], + { timeout: REAP_TIMEOUT_MS }, ) + } catch { + // Server may already be dead, or shutdown itself timed out — the + // tempdir removal below is sufficient cleanup. + } +} + +async function removeTempdir(dir: string): Promise { + try { + await fs.rm(dir, { recursive: true, force: true }) + } catch { + // Best effort. The next CLI invocation lands a fresh tempdir. } - return parseBazelBuildOutput(cachedProbeStdout).map(a => ({ - ...a, - sourceRepo: repoName, - })) +} + +function makeOutputUserRoot(): string { + return mkdtempSync(path.join(os.tmpdir(), 'socket-bazel-')) } export async function extractBazelToMaven( @@ -341,102 +349,160 @@ export async function extractBazelToMaven( } logger.groupEnd() + const perRepoTimeoutMs = + opts.perRepoTimeoutMs ?? DEFAULT_PER_REPO_TIMEOUT_MS + const extras = opts.extraMavenRepoNames ?? [] + + // Validate config + ensure toolchains BEFORE we mint a tempdir. + let bin: string + let baseEnv: NodeJS.ProcessEnv | undefined try { - // Validate caller-provided Bazel filesystem settings before invoking Bazel. 
if (opts.bazelOutputBase) { validateOutputBase(opts.bazelOutputBase, opts.cwd) } - // Java must be available before rules_jvm_external/Coursier runs; - // python shim follows so its augmented PATH inherits the JDK prefix. ensureJavaOnPath() const shim = await provisionPythonShim() - const baseEnv = shim.augmentedEnv ?? opts.env + baseEnv = shim.augmentedEnv ?? opts.env + bin = await resolveBazelBinary(opts.bin) + } catch (e) { + logger.fail(`Unexpected error in bazel2maven: ${getErrorCause(e)}`) + if (verbose) { + logger.group('[VERBOSE] error:') + logger.log(e) + logger.groupEnd() + } + return { artifactCount: 0, ok: false } + } + logger.info(`Using bazel: ${bin}`) - // Step 1: workspace detection. - const mode = detectWorkspaceMode(cwd) - logger.info( - `Workspace mode: bzlmod=${mode.bzlmod} workspace=${mode.workspace}`, + // Track every output_user_root we mint so we can reap them all in + // the cleanup pass, even if a per-repo timeout forced a re-mint. + let outputUserRoot = makeOutputUserRoot() + const mintedRoots: string[] = [outputUserRoot] + if (verbose) { + logger.log( + `[VERBOSE] initial --output_user_root=${outputUserRoot} (will be reaped on completion)`, ) - const invocationFlags = getBazelInvocationFlags(mode) + } - // Step 2: bazel binary resolution. - const bin = await resolveBazelBinary(opts.bin) - logger.info(`Using bazel: ${bin}`) - if (verbose) { - logger.log('[VERBOSE] resolved options:', { - bin, - bazelRc: opts.bazelRc ?? '(unset)', - bazelOutputBase: opts.bazelOutputBase ?? '(unset)', - bazelFlags: opts.bazelFlags ?? '(unset)', - invocationFlags, - }) - } + const sidecar: Sidecar = { complete: true, workspaces: [] } + const allArtifacts: ExtractedArtifact[] = [] - // Step 3: build the shared query options object. - const queryOpts: BazelQueryOptions = { - bin, - cwd, - invocationFlags, - ...(opts.bazelRc ? { bazelRc: opts.bazelRc } : {}), - ...(opts.bazelFlags ? { bazelFlags: opts.bazelFlags } : {}), - ...(opts.bazelOutputBase - ? 
{ bazelOutputBase: opts.bazelOutputBase } - : {}), - ...(baseEnv ? { env: baseEnv } : {}), - verbose, + try { + const workspaceRoots = findWorkspaceRoots(cwd, verbose) + if (!workspaceRoots.length) { + logger.warn( + `No Bazel workspace found at ${cwd} or beneath (looked for MODULE.bazel / WORKSPACE / WORKSPACE.bazel).`, + ) + return { artifactCount: 0, noEcosystemFound: true, ok: false } + } + if (verbose) { + logger.log( + `[VERBOSE] discovered ${workspaceRoots.length} workspace root(s):`, + workspaceRoots, + ) } - // Step 4: discover validated Maven repos via the two-step recipe. - // Bzlmod has a native visible-repository surface; prefer that over static - // MODULE.bazel parsing and keep bounded parsing as the legacy/fallback path. - let nativeCandidates: string[] | undefined - if (mode.bzlmod) { - const visibleRepos = await runBazelModShowVisibleRepos(queryOpts) - if (visibleRepos.code === 0) { - nativeCandidates = parseVisibleRepoCandidates(visibleRepos.stdout) + for (const workspaceRoot of workspaceRoots) { + const relPath = path.relative(cwd, workspaceRoot) + let mode: WorkspaceMode + try { + mode = detectWorkspaceMode(workspaceRoot) + } catch (e) { if (verbose) { logger.log( - '[VERBOSE] Bzlmod visible repo candidates:', - nativeCandidates, + `[VERBOSE] workspace ${workspaceRoot}: detect failed (${getErrorCause(e)}); skipping`, ) } - } else if (verbose) { - logger.log( - '[VERBOSE] bazel mod show_repo failed; falling back to static candidate parsing:', - visibleRepos.stderr, - ) + continue } - } - // Returns Map so extraction can reuse the probe - // output and skip running an identical bazel-query a second time. 
- const probe = buildProbeFor(queryOpts) - const repos = await discoverMavenRepos( - cwd, - probe, - nativeCandidates, - verbose, - ) - const repoNames = Array.from(repos.keys()) - logger.info( - `Discovered ${repos.size} Maven repo(s): ${repoNames.join(', ') || '(none)'}`, - ) + logger.info( + `Workspace ${relPath || '.'}: bzlmod=${mode.bzlmod} workspace=${mode.workspace}`, + ) + const invocationFlags = getBazelInvocationFlags(mode) + const buildQueryOpts = ( + userRoot: string, + spawnCwd: string, + ): BazelQueryOptions => ({ + bin, + cwd: spawnCwd, + invocationFlags, + outputUserRoot: userRoot, + ...(opts.bazelRc ? { bazelRc: opts.bazelRc } : {}), + ...(opts.bazelFlags ? { bazelFlags: opts.bazelFlags } : {}), + ...(opts.bazelOutputBase + ? { bazelOutputBase: opts.bazelOutputBase } + : {}), + ...(baseEnv ? { env: baseEnv } : {}), + verbose, + }) - // Step 5: extract artifacts from each repo (preferring unsorted_deps.json). - const allArtifacts: ExtractedArtifact[] = [] - for (const [repo, probeStdout] of repos) { // eslint-disable-next-line no-await-in-loop - const artifacts = await extractFromOneRepo(repo, queryOpts, probeStdout) - allArtifacts.push(...artifacts) - logger.info(`@${repo}: ${artifacts.length} artifact(s)`) - } + const candidates = await discoverCandidatesForWorkspace( + workspaceRoot, + mode, + buildQueryOpts(outputUserRoot, workspaceRoot), + extras, + verbose, + ) + logger.info( + `Workspace ${relPath || '.'}: discovered ${candidates.length} Maven repo(s): ${ + candidates.join(', ') || '(none)' + }`, + ) + const wsEntry: SidecarWorkspaceEntry = { + mode: { bzlmod: mode.bzlmod, workspace: mode.workspace }, + relPath, + repos: [], + } - // Step 6: normalize to maven_install.json shape. 
- const normalized = normalizeToMavenInstallJson(allArtifacts) + for (const repoName of candidates) { + // eslint-disable-next-line no-await-in-loop + const result: CqueryRepoResult = await runMetadataCqueryForRepo({ + opts: buildQueryOpts(outputUserRoot, workspaceRoot), + repoName, + timeoutMs: perRepoTimeoutMs, + workspaceRelPath: relPath, + workspaceRoot, + }) + wsEntry.repos.push({ + artifactCount: result.artifacts.length, + durationMs: result.durationMs, + name: repoName, + status: result.status, + }) + allArtifacts.push(...result.artifacts) + if (result.status === 'ok' || result.status === 'partial') { + logger.info( + `@${repoName}: ${result.artifacts.length} artifact(s) (status=${result.status})`, + ) + } else if (result.status === 'timeout') { + logger.warn( + `@${repoName}: cquery timed out after ${perRepoTimeoutMs}ms; reaping server`, + ) + sidecar.complete = false + // eslint-disable-next-line no-await-in-loop + await reapBazelServer(bin, outputUserRoot) + // eslint-disable-next-line no-await-in-loop + await removeTempdir(outputUserRoot) + outputUserRoot = makeOutputUserRoot() + mintedRoots.push(outputUserRoot) + if (verbose) { + logger.log( + `[VERBOSE] minted fresh --output_user_root=${outputUserRoot} after timeout`, + ) + } + } else if (verbose) { + logger.log( + `[VERBOSE] @${repoName}: status=${result.status} (no artifacts)`, + ) + } + } + sidecar.workspaces.push(wsEntry) + } - // Step 7: write outputs. - // Standalone output writes directly to `out`; auto-manifest uses a sibling directory - // to avoid colliding with a repo's checked-in rules_jvm_external lockfile and - // to avoid repo-root gitignore patterns such as `/maven_install.json`. + const deduped = dedupArtifactsByCoord(allArtifacts) + const normalized = normalizeToMavenInstallJson(deduped) const layout = opts.outLayout ?? 'standalone' const manifestDir = layout === 'flat' ? 
path.join(out, '.socket-auto-manifest') : out @@ -447,21 +513,23 @@ export async function extractBazelToMaven( JSON.stringify(normalized, null, 2), 'utf8', ) + const statusPath = path.join(manifestDir, 'manifest-status.json') + await fs.writeFile(statusPath, JSON.stringify(sidecar, null, 2), 'utf8') if (verbose) { logger.log('[VERBOSE] outputs:', { - artifactCount: allArtifacts.length, - generatedManifest: path.relative(out, manifestPath), + artifactCount: deduped.length, + complete: sidecar.complete, layout, - manifest: manifestPath, - mavenRepos: repoNames, - tool: 'socket manifest bazel', - workspace: { bzlmod: mode.bzlmod, legacyWorkspace: mode.workspace }, + manifestPath, + statusPath, + workspaceCount: sidecar.workspaces.length, }) } - if (!allArtifacts.length) { - if (!repos.size) { + const anyRepos = sidecar.workspaces.some(w => w.repos.length > 0) + if (!deduped.length) { + if (!anyRepos) { if (verbose) { logger.info( 'No Maven artifacts extracted. failureCategory=no-supported-ecosystem', @@ -472,29 +540,24 @@ export async function extractBazelToMaven( manifestPath, noEcosystemFound: true, ok: false, + statusPath, } } logger.fail( - `Discovered Maven repo(s) ${repoNames.join(', ')} but extracted zero artifacts. failureCategory=ecosystem-detected-but-empty`, + 'Discovered Maven repo(s) but extracted zero artifacts. 
failureCategory=ecosystem-detected-but-empty', ) - return { - artifactCount: 0, - manifestPath, - ok: false, - } + return { artifactCount: 0, manifestPath, ok: false, statusPath } } logger.success( - `Wrote ${allArtifacts.length} artifact(s) to ${path.relative(cwd, manifestPath)}.`, + `Wrote ${deduped.length} artifact(s) to ${path.relative(cwd, manifestPath)}.`, ) return { - artifactCount: allArtifacts.length, + artifactCount: deduped.length, manifestPath, - ok: true, + ok: sidecar.complete, + statusPath, } } catch (e) { - // Always surface the error message; users should not have to - // re-run a multi-minute bazel build with --verbose just to see whether - // the failure was a missing dependency, permission error, or network blip. logger.fail(`Unexpected error in bazel2maven: ${getErrorCause(e)}`) if (verbose) { logger.group('[VERBOSE] error:') @@ -504,5 +567,12 @@ export async function extractBazelToMaven( logger.info('Re-run with --verbose for the full stack.') } return { artifactCount: 0, ok: false } + } finally { + for (const dir of mintedRoots) { + // eslint-disable-next-line no-await-in-loop + await reapBazelServer(bin, dir) + // eslint-disable-next-line no-await-in-loop + await removeTempdir(dir) + } } } From 931436592e679cd42ecaa5ec800e753f0e3c461a Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 14:23:09 +0200 Subject: [PATCH 08/10] test(manifest/bazel): rewrite orchestrator tests for the per-workspace pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous tests pinned the legacy unsorted_deps.json fast path, kind-only probes, and dump_repo_mapping enumeration. 
The new tests mock the orchestrator's three external collaborators — findWorkspaceRoots, runBazelModShowMavenExtension, runMetadataCqueryForRepo — and assert on the contract that matters: end-to-end Bzlmod and WORKSPACE-mode flows, the per-repo cquery loop, cross-workspace coordinate dedup, the timeout → re-mint loop, sidecar `manifest-status.json` shape, and `extraMavenRepoNames` threading. Pure-function `normalizeToMavenInstallJson` keeps a focused trio of unit tests (dedup, version-conflict, sha256-preservation). The fixture-driven .socket.facts.json non-emission assertion stays so the Maven-path-vs-facts-path invariant is exercised. Also patch the PyPI test mock: parseVisibleRepoCandidates moved from bazel-repo-discovery to bazel-pypi-discovery in a previous commit, so the test's vi.mock now mirrors the actual export surface. The probe fixture grows a `stderr` field to match the new RepoProbe contract. --- .../bazel/extract_bazel_to_maven.test.mts | 811 ++++++++---------- .../bazel/extract_bazel_to_pypi.test.mts | 4 +- 2 files changed, 354 insertions(+), 461 deletions(-) diff --git a/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts b/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts index 4d43c1da5..9d7f9821f 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts @@ -3,7 +3,6 @@ import { mkdirSync, mkdtempSync, readFileSync, - readdirSync, rmSync, writeFileSync, } from 'node:fs' @@ -12,157 +11,174 @@ import path from 'node:path' import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' -// Mock the helpers BEFORE importing the orchestrator. 
-vi.mock('./bazel-workspace-detect.mts', () => ({ - detectWorkspaceMode: vi.fn(), - getBazelInvocationFlags: vi.fn(() => []), -})) +import type { CqueryRepoResult } from './bazel-cquery.mts' +import type { ExtractedArtifact } from './bazel-build-parser.mts' + +// Mock collaborators BEFORE importing the orchestrator. The orchestrator +// composes pure-function discovery + the metadata cquery + a workspace +// walker; mocking these lets us drive end-to-end behaviour without a +// real Bazel toolchain. vi.mock('./bazel-bin-detect.mts', () => ({ resolveBazelBinary: vi.fn(async () => '/usr/local/bin/bazel'), })) -vi.mock('./bazel-repo-discovery.mts', () => ({ - discoverMavenRepos: vi.fn(), - parseVisibleRepoCandidates: vi.fn(() => []), - parseMavenRepoCandidates: vi.fn(), - validateMavenRepo: vi.fn(), -})) -const { probe } = vi.hoisted(() => ({ - probe: async () => ({ code: 0, stdout: 'maven_coordinates=' }), -})) -vi.mock('./bazel-query-runner.mts', () => ({ - buildProbeFor: vi.fn(() => probe), - runBazelModShowVisibleRepos: vi.fn(async () => ({ - code: 0, - stderr: '', - stdout: '', - })), - runBazelQuery: vi.fn(), -})) -// Mock hardening helpers so unit tests run without real fs/network side-effects. vi.mock('./bazel-output-base-check.mts', () => ({ validateOutputBase: vi.fn(), })) +vi.mock('./bazel-java-shim.mts', () => ({ + ensureJavaOnPath: vi.fn(), +})) vi.mock('./bazel-python-shim.mts', () => ({ provisionPythonShim: vi.fn(async () => ({ augmentedEnv: undefined, shimDir: undefined, })), })) -// ensureJavaOnPath now throws when java is missing; unit tests must not -// depend on the host having a JDK installed. 
-vi.mock('./bazel-java-shim.mts', () => ({ - ensureJavaOnPath: vi.fn(), +vi.mock('./bazel-workspace-detect.mts', () => ({ + detectWorkspaceMode: vi.fn(), + getBazelInvocationFlags: vi.fn(() => []), +})) +vi.mock('./bazel-workspace-walk.mts', () => ({ + findWorkspaceRoots: vi.fn(), +})) +vi.mock('./bazel-query-runner.mts', () => ({ + buildMavenProbeFor: vi.fn(() => async (_: string) => ({ + code: 1, + stdout: '', + stderr: "ERROR: No repository visible as '@x' from main repository\n", + })), + runBazelModShowMavenExtension: vi.fn(), +})) +vi.mock('./bazel-repo-discovery.mts', async () => { + // Preserve `CONVENTIONAL_MAVEN_REPO_NAMES` + `probeCandidate` while + // overriding `parseShowExtensionOutput` with a spy. + const actual = await vi.importActual< + typeof import('./bazel-repo-discovery.mts') + >('./bazel-repo-discovery.mts') + return { + ...actual, + parseShowExtensionOutput: vi.fn(actual.parseShowExtensionOutput), + } +}) +vi.mock('./bazel-cquery.mts', () => ({ + runMetadataCqueryForRepo: vi.fn(), +})) +// Quiet the spawn calls reapBazelServer makes during cleanup. 
+vi.mock('@socketsecurity/registry/lib/spawn', () => ({ + spawn: vi.fn(async () => ({ code: 0, stdout: '', stderr: '' })), })) -import { validateOutputBase } from './bazel-output-base-check.mts' -import { discoverMavenRepos } from './bazel-repo-discovery.mts' +import { runMetadataCqueryForRepo } from './bazel-cquery.mts' +import { + buildMavenProbeFor, + runBazelModShowMavenExtension, +} from './bazel-query-runner.mts' +import { parseShowExtensionOutput } from './bazel-repo-discovery.mts' import { detectWorkspaceMode } from './bazel-workspace-detect.mts' +import { findWorkspaceRoots } from './bazel-workspace-walk.mts' import { extractBazelToMaven, normalizeToMavenInstallJson, } from './extract_bazel_to_maven.mts' -const FIXTURES = path.join( - import.meta.dirname, - '..', - '..', - '..', - '..', - 'test', - 'fixtures', - 'manifest-bazel', - 'query-output', -) +const mkResult = (over: Partial): CqueryRepoResult => ({ + artifacts: [], + durationMs: 0, + repoName: 'maven', + status: 'ok', + stderr: '', + workspaceRelPath: '', + ...over, +}) -// Walk a directory recursively and return all file paths. 
-function walk(dir: string): string[] { - const acc: string[] = [] - for (const e of readdirSync(dir, { withFileTypes: true })) { - const p = path.join(dir, e.name) - if (e.isDirectory()) { - acc.push(...walk(p)) - } else { - acc.push(p) - } - } - return acc -} +const mkArt = ( + coord: string, + ruleName: string, + over: Partial = {}, +): ExtractedArtifact => ({ + deps: [], + mavenCoordinates: coord, + ruleKind: 'jvm_import', + ruleName, + sourceRepo: 'maven', + ...over, +}) + +const SHOW_EXT_HUB_ONLY = `## @@rules_jvm_external+//:extensions.bzl%maven: + +Fetched repositories: + - maven (imported by ) +` describe('extractBazelToMaven', () => { let tmp: string beforeEach(() => { - tmp = mkdtempSync(path.join(os.tmpdir(), 'bazel-extract-')) + tmp = mkdtempSync(path.join(os.tmpdir(), 'sock-bazel-x2m-')) vi.mocked(detectWorkspaceMode).mockReturnValue({ bzlmod: true, workspace: false, }) - process.exitCode = 0 + vi.mocked(findWorkspaceRoots).mockReturnValue([tmp]) + vi.mocked(runBazelModShowMavenExtension).mockResolvedValue({ + code: 0, + stdout: SHOW_EXT_HUB_ONLY, + stderr: '', + }) + vi.mocked(parseShowExtensionOutput).mockClear() + vi.mocked(runBazelModShowMavenExtension).mockClear() + vi.mocked(runMetadataCqueryForRepo).mockReset() + vi.mocked(buildMavenProbeFor).mockReset() + vi.mocked(buildMavenProbeFor).mockReturnValue(async () => ({ + code: 1, + stdout: '', + stderr: "ERROR: No repository visible as '@x' from main repository\n", + })) }) afterEach(() => { rmSync(tmp, { recursive: true, force: true }) - vi.resetAllMocks() - process.exitCode = 0 }) - it('dedupes exact duplicate coordinates without failing', () => { - const manifest = normalizeToMavenInstallJson([ - { - ruleKind: 'jvm_import', - ruleName: 'com_example_demo', - mavenCoordinates: 'com.example:demo:1.0.0', - deps: [], - }, - { - ruleKind: 'jvm_import', - ruleName: 'com_example_demo', - mavenCoordinates: 'com.example:demo:1.0.0', - deps: [], - }, - ]) - - 
expect(Object.keys(manifest.artifacts)).toEqual(['com.example:demo']) - expect(manifest.artifacts['com.example:demo']).toEqual({ - shasums: {}, - version: '1.0.0', - }) - }) - - it('fails on duplicate label suffixes when dependency resolution is ambiguous', () => { - expect(() => - normalizeToMavenInstallJson([ - { - ruleKind: 'jvm_import', - ruleName: 'root', - mavenCoordinates: 'com.example:root:1.0.0', - deps: [':shared_rule_name'], - }, - { - ruleKind: 'jvm_import', - ruleName: 'shared_rule_name', - mavenCoordinates: 'com.one:lib:1.0.0', - deps: [], - }, - { - ruleKind: 'jvm_import', - ruleName: 'shared_rule_name', - mavenCoordinates: 'com.two:lib:1.0.0', - deps: [], - }, - ]), - ).toThrow(/Ambiguous Bazel dependency label :shared_rule_name/) - }) - - it('writes maven_install.json directly under out without a summary sidecar', async () => { - const sample = readFileSync( - path.join(FIXTURES, 'jvm-import-sample.txt'), - 'utf8', + function readManifest(out: string): unknown { + return JSON.parse( + readFileSync( + path.join(out, '.socket-auto-manifest', 'maven_install.json'), + 'utf8', + ), ) - vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([['maven', sample]]), + } + + function readSidecar(out: string): { + complete: boolean + workspaces: Array<{ + relPath: string + mode: { bzlmod: boolean; workspace: boolean } + repos: Array<{ + name: string + status: string + artifactCount: number + durationMs: number + }> + }> + } { + return JSON.parse( + readFileSync( + path.join(out, '.socket-auto-manifest', 'manifest-status.json'), + 'utf8', + ), ) + } + it('extracts a single Bzlmod workspace end-to-end', async () => { + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ + artifacts: [ + mkArt('com.google.guava:guava:33.0.0-jre', 'com_google_guava_guava'), + mkArt('androidx.annotation:annotation:1.8.2', 'androidx_annotation'), + ], + repoName: 'maven', + }), + ) const result = await extractBazelToMaven({ bazelFlags: undefined, 
bazelOutputBase: undefined, @@ -170,43 +186,30 @@ describe('extractBazelToMaven', () => { bin: undefined, cwd: tmp, out: tmp, + outLayout: 'flat', verbose: false, }) - expect(result).toEqual({ + expect(result.ok).toBe(true) + expect(result.artifactCount).toBe(2) + const manifest = readManifest(tmp) as { + artifacts: Record + } + expect(Object.keys(manifest.artifacts).sort()).toEqual([ + 'androidx.annotation:annotation', + 'com.google.guava:guava', + ]) + const sidecar = readSidecar(tmp) + expect(sidecar.complete).toBe(true) + expect(sidecar.workspaces).toHaveLength(1) + expect(sidecar.workspaces[0]!.repos[0]).toMatchObject({ artifactCount: 2, - manifestPath: path.join(tmp, 'maven_install.json'), - ok: true, + name: 'maven', + status: 'ok', }) - - const manifestText = readFileSync( - path.join(tmp, 'maven_install.json'), - 'utf8', - ) - const manifest = JSON.parse(manifestText) - expect(manifest.artifacts['com.google.guava:guava']).toEqual({ - shasums: { jar: expect.stringMatching(/^9408c2c4/) }, - version: '33.0.0-jre', - }) - // Per the canonical rules_jvm_external maven_install.json shape (see - // normalizeToMavenInstallJson), dependency keys and values use "g:a" - // (no version) — matching rules_jvm_external lockfile output. 
- expect(manifest.dependencies['com.google.guava:guava']).toContain( - 'com.google.guava:failureaccess', - ) - - expect(existsSync(path.join(tmp, 'socket-bazel-summary.json'))).toBe(false) - expect(existsSync(path.join(tmp, '_whole_repo'))).toBe(false) }) - it('writes outputs to .socket-auto-manifest/ when outLayout is "flat"', async () => { - const sample = readFileSync( - path.join(FIXTURES, 'jvm-import-sample.txt'), - 'utf8', - ) - vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([['maven', sample]]), - ) - + it('returns noEcosystemFound when no workspace roots are discovered', async () => { + vi.mocked(findWorkspaceRoots).mockReturnValue([]) const result = await extractBazelToMaven({ bazelFlags: undefined, bazelOutputBase: undefined, @@ -217,40 +220,14 @@ describe('extractBazelToMaven', () => { outLayout: 'flat', verbose: false, }) - expect(result).toEqual({ - artifactCount: 2, - manifestPath: path.join( - tmp, - '.socket-auto-manifest', - 'maven_install.json', - ), - ok: true, - }) - - // Manifest lands inside the sibling dir. - expect( - existsSync(path.join(tmp, '.socket-auto-manifest', 'maven_install.json')), - ).toBe(true) - expect( - existsSync( - path.join(tmp, '.socket-auto-manifest', 'socket-bazel-summary.json'), - ), - ).toBe(false) - // Neither output bleeds into / itself nor a _whole_repo/ wrapper. 
- expect(existsSync(path.join(tmp, 'maven_install.json'))).toBe(false) - expect(existsSync(path.join(tmp, 'socket-bazel-summary.json'))).toBe(false) - expect(existsSync(path.join(tmp, '_whole_repo'))).toBe(false) + expect(result.ok).toBe(false) + expect(result.noEcosystemFound).toBe(true) }) - it('writes NO .socket.facts.json files anywhere under out', async () => { - const sample = readFileSync( - path.join(FIXTURES, 'jvm-import-sample.txt'), - 'utf8', + it('reports detected-but-empty when discovered repos extract zero artifacts', async () => { + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ artifacts: [], status: 'empty', repoName: 'maven' }), ) - vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([['maven', sample]]), - ) - const result = await extractBazelToMaven({ bazelFlags: undefined, bazelOutputBase: undefined, @@ -258,19 +235,41 @@ describe('extractBazelToMaven', () => { bin: undefined, cwd: tmp, out: tmp, + outLayout: 'flat', verbose: false, }) - - const files = walk(tmp) - expect( - files.find(f => path.basename(f) === '.socket.facts.json'), - ).toBeUndefined() - expect(result.ok).toBe(true) + expect(result.ok).toBe(false) + expect(result.noEcosystemFound).toBeUndefined() }) - it('reports noEcosystemFound without mutating process.exitCode when no repos discovered', async () => { - vi.mocked(discoverMavenRepos).mockResolvedValue(new Map()) - + it('dedups artifacts across multiple workspaces by full Maven coordinate', async () => { + const nested = path.join(tmp, 'examples', 'dagger') + mkdirSync(nested, { recursive: true }) + vi.mocked(findWorkspaceRoots).mockReturnValue([tmp, nested]) + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ + artifacts: [ + mkArt('com.google.guava:guava:33.0.0-jre', 'com_google_guava_guava'), + ], + repoName: 'maven', + workspaceRelPath: '', + }), + ) + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ + artifacts: [ + // Same coord as the root 
workspace — must be deduped. + mkArt('com.google.guava:guava:33.0.0-jre', 'com_google_guava_guava', { + sourceRepo: 'examples/dagger:maven', + }), + mkArt('com.google.dagger:dagger:2.50', 'com_google_dagger_dagger', { + sourceRepo: 'examples/dagger:maven', + }), + ], + repoName: 'maven', + workspaceRelPath: 'examples/dagger', + }), + ) const result = await extractBazelToMaven({ bazelFlags: undefined, bazelOutputBase: undefined, @@ -278,30 +277,46 @@ describe('extractBazelToMaven', () => { bin: undefined, cwd: tmp, out: tmp, + outLayout: 'flat', verbose: false, }) + expect(result.artifactCount).toBe(2) + const manifest = readManifest(tmp) as { + artifacts: Record + } + expect(Object.keys(manifest.artifacts).sort()).toEqual([ + 'com.google.dagger:dagger', + 'com.google.guava:guava', + ]) + const sidecar = readSidecar(tmp) + expect(sidecar.workspaces.map(w => w.relPath)).toEqual([ + '', + 'examples/dagger', + ]) + }) - expect(process.exitCode).toBe(0) - expect(result).toEqual({ - artifactCount: 0, - manifestPath: path.join(tmp, 'maven_install.json'), - noEcosystemFound: true, - ok: false, + it('marks the sidecar complete:false on per-repo timeout and keeps going', async () => { + // Two candidates: first times out, second succeeds. The orchestrator + // re-mints --output_user_root after the timeout. + vi.mocked(runBazelModShowMavenExtension).mockResolvedValue({ + code: 0, + stdout: `## @@rules_jvm_external+//:extensions.bzl%maven: + +Fetched repositories: + - maven (imported by ) + - maven_dev (imported by ) +`, + stderr: '', }) - // Empty manifest is still written. 
- const manifestText = readFileSync( - path.join(tmp, 'maven_install.json'), - 'utf8', + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ artifacts: [], status: 'timeout', repoName: 'maven' }), ) - const manifest = JSON.parse(manifestText) - expect(manifest.artifacts).toEqual({}) - }) - - it('reports hard failure when discovered repos extract zero artifacts', async () => { - vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([['maven', '# no parseable rules\n']]), + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ + artifacts: [mkArt('com.example:after:1.0', 'after')], + repoName: 'maven_dev', + }), ) - const result = await extractBazelToMaven({ bazelFlags: undefined, bazelOutputBase: undefined, @@ -309,319 +324,197 @@ describe('extractBazelToMaven', () => { bin: undefined, cwd: tmp, out: tmp, + outLayout: 'flat', + perRepoTimeoutMs: 60_000, verbose: false, }) - - expect(result).toEqual({ - artifactCount: 0, - manifestPath: path.join(tmp, 'maven_install.json'), - ok: false, - }) - expect(result.noEcosystemFound).toBeUndefined() + expect(result.ok).toBe(false) + expect(result.artifactCount).toBe(1) + const sidecar = readSidecar(tmp) + expect(sidecar.complete).toBe(false) + expect(sidecar.workspaces[0]!.repos.map(r => r.status)).toEqual([ + 'timeout', + 'ok', + ]) }) - it('iterates each discovered repo independently when one has no parseable rules', async () => { - const sample = readFileSync( - path.join(FIXTURES, 'jvm-import-sample.txt'), - 'utf8', - ) - // First repo's probe stdout has the canonical sample (2 artifacts). - // Second repo's probe stdout has no parseable jvm_import / aar_import - // blocks, so the parser yields 0 artifacts for it — the iteration must - // still surface the first repo's results. 
- vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([ - ['maven', sample], - ['maven_test', '# no rules here\n'], - ]), + it('threads extraMavenRepoNames into the candidate list (WORKSPACE mode)', async () => { + vi.mocked(detectWorkspaceMode).mockReturnValue({ + bzlmod: false, + workspace: true, + }) + // Probe accepts only `my_jars`; conventional names all return not-defined. + vi.mocked(buildMavenProbeFor).mockReturnValue(async (name: string) => { + if (name === 'my_jars') { + return { code: 0, stdout: '@my_jars//:foo\n', stderr: '' } + } + return { + code: 1, + stdout: '', + stderr: "ERROR: No repository visible as '@x' from main repository\n", + } + }) + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ + artifacts: [mkArt('com.example:custom:1.0', 'custom')], + repoName: 'my_jars', + }), ) - const result = await extractBazelToMaven({ bazelFlags: undefined, bazelOutputBase: undefined, bazelRc: undefined, bin: undefined, cwd: tmp, + extraMavenRepoNames: ['my_jars'], out: tmp, + outLayout: 'flat', verbose: false, }) - - const manifest = JSON.parse( - readFileSync(path.join(tmp, 'maven_install.json'), 'utf8'), + expect(result.ok).toBe(true) + expect(result.artifactCount).toBe(1) + expect(runMetadataCqueryForRepo).toHaveBeenCalledTimes(1) + expect(vi.mocked(runMetadataCqueryForRepo).mock.calls[0]![0]).toMatchObject( + { repoName: 'my_jars' }, ) - // Only the successful repo's artifacts (2); maven_test was skipped. - expect(Object.keys(manifest.artifacts)).toHaveLength(2) - expect(result).toEqual({ - artifactCount: 2, - manifestPath: path.join(tmp, 'maven_install.json'), - ok: true, - }) + // show_extension must NOT be called in pure WORKSPACE mode. 
+ expect(runBazelModShowMavenExtension).not.toHaveBeenCalled() }) - it('returns failure without mutating process.exitCode when one group:artifact has conflicting versions', async () => { - const conflictingStdout = [ - 'jvm_import(', - ' name = "com_example_demo_v1",', - ' maven_coordinates = "com.example:demo:1.0.0",', - ')', - 'jvm_import(', - ' name = "com_example_demo_v2",', - ' maven_coordinates = "com.example:demo:2.0.0",', - ')', - ].join('\n') - vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([['maven', conflictingStdout]]), + it('writes manifest-status.json beside maven_install.json in flat layout', async () => { + vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( + mkResult({ + artifacts: [mkArt('com.example:a:1.0', 'a')], + repoName: 'maven', + }), ) - - const result = await extractBazelToMaven({ + await extractBazelToMaven({ bazelFlags: undefined, bazelOutputBase: undefined, bazelRc: undefined, bin: undefined, cwd: tmp, out: tmp, + outLayout: 'flat', verbose: false, }) - - expect(process.exitCode).toBe(0) - expect(result).toEqual({ - artifactCount: 0, - ok: false, - }) - expect(existsSync(path.join(tmp, 'maven_install.json'))).toBe(false) + expect( + existsSync(path.join(tmp, '.socket-auto-manifest', 'maven_install.json')), + ).toBe(true) + expect( + existsSync( + path.join(tmp, '.socket-auto-manifest', 'manifest-status.json'), + ), + ).toBe(true) }) +}) - it('calls validateOutputBase when bazelOutputBase is set', async () => { - vi.mocked(discoverMavenRepos).mockResolvedValue(new Map()) - await extractBazelToMaven({ - bazelFlags: undefined, - bazelOutputBase: tmp, - bazelRc: undefined, - bin: undefined, - cwd: tmp, - out: tmp, - verbose: false, - }) - // validateOutputBase is mocked; verify it was called with the provided path. 
- expect(vi.mocked(validateOutputBase)).toHaveBeenCalledWith(tmp, tmp) +describe('normalizeToMavenInstallJson', () => { + it('dedupes exact duplicate coordinates without failing', () => { + const result = normalizeToMavenInstallJson([ + { + deps: [], + mavenCoordinates: 'com.google.guava:guava:33.0.0-jre', + ruleKind: 'jvm_import', + ruleName: 'com_google_guava_guava', + }, + { + deps: [], + mavenCoordinates: 'com.google.guava:guava:33.0.0-jre', + ruleKind: 'jvm_import', + ruleName: 'com_google_guava_guava', + }, + ]) + expect(Object.keys(result.artifacts)).toEqual(['com.google.guava:guava']) }) - it('propagates verbose into discovery and emits resolved-options / outputs diagnostics', async () => { - const sample = readFileSync( - path.join(FIXTURES, 'jvm-import-sample.txt'), - 'utf8', - ) - vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([['maven', sample]]), - ) - const { logger } = await import('@socketsecurity/registry/lib/logger') - const logSpy = vi.spyOn(logger, 'log').mockImplementation(() => logger) - - try { - await extractBazelToMaven({ - bazelFlags: undefined, - bazelOutputBase: undefined, - bazelRc: undefined, - bin: undefined, - cwd: tmp, - out: tmp, - verbose: true, - }) - - const text = logSpy.mock.calls - .map(args => - args - .map(a => (typeof a === 'string' ? a : JSON.stringify(a))) - .join(' '), - ) - .join('\n') - // Resolved-options block — names a few known load-bearing fields. - expect(text).toContain('[VERBOSE] resolved options:') - expect(text).toContain('bin') - expect(text).toContain('bazelRc') - expect(text).toContain('bazelOutputBase') - // Outputs block names manifest path and extracted summary fields. - expect(text).toContain('[VERBOSE] outputs:') - expect(text).toContain('manifest') - expect(text).toContain('artifactCount') - expect(text).toContain('generatedManifest') - expect(text).toContain('mavenRepos') - - // Discovery was called with verbose=true as the 4th positional. 
The - // 3rd positional reflects whatever parseVisibleRepoCandidates returned - // (an empty array in this mocked setup). - expect(vi.mocked(discoverMavenRepos)).toHaveBeenCalledWith( - expect.any(String), - expect.any(Function), - expect.any(Array), - true, - ) - } finally { - logSpy.mockRestore() - } + it('fails on conflicting versions for the same group:artifact', () => { + expect(() => + normalizeToMavenInstallJson([ + { + deps: [], + mavenCoordinates: 'com.example:lib:1.0', + ruleKind: 'jvm_import', + ruleName: 'a', + }, + { + deps: [], + mavenCoordinates: 'com.example:lib:2.0', + ruleKind: 'jvm_import', + ruleName: 'b', + }, + ]), + ).toThrow(/Conflicting versions/) }) -}) -describe('SOCKET_BAZEL_FORCE_QUERY_FALLBACK', () => { - // These tests pit two parsers against each other by giving each a - // coordinate the other does not produce, then assert which one ran by - // checking which coordinate landed in the manifest. - // - unsorted_deps.json (fast path) → `com.example:from-json:9.9.9` - // - cached probe stdout (regex fallback) → `com.example:from-regex:1.0.0` - const FAST_PATH_JSON = JSON.stringify({ - artifacts: [ + it('preserves the first artifact’s sha256 when subsequent dupes lack one', () => { + const result = normalizeToMavenInstallJson([ { - coordinates: 'com.example:from-json:9.9.9', - url: 'https://example.invalid/from-json-9.9.9.jar', - sha256: - '1111111111111111111111111111111111111111111111111111111111111111', deps: [], + mavenCoordinates: 'com.example:lib:1.0', + mavenSha256: 'a'.repeat(64), + ruleKind: 'jvm_import', + ruleName: 'a', }, - ], + { + deps: [], + mavenCoordinates: 'com.example:lib:1.0', + ruleKind: 'jvm_import', + ruleName: 'a', + }, + ]) + expect(result.artifacts['com.example:lib']?.shasums.jar).toBe('a'.repeat(64)) }) +}) - const FALLBACK_PROBE_STDOUT = [ - 'jvm_import(', - ' name = "com_example_from_regex",', - ' jars = ["@maven//:from-regex-1.0.0.jar"],', - ' maven_coordinates = "com.example:from-regex:1.0.0",', - ' deps = 
[],', - ')', - '', - ].join('\n') - +describe('fixture-driven write-output', () => { let tmp: string - let originalEnv: string | undefined beforeEach(() => { - tmp = mkdtempSync(path.join(os.tmpdir(), 'bazel-extract-fallback-')) - // Place unsorted_deps.json under /external/maven/. - // This is what bazelExternalDir resolves to when bazelOutputBase is set. - const externalRepoDir = path.join(tmp, 'external', 'maven') - mkdirSync(externalRepoDir, { recursive: true }) - writeFileSync( - path.join(externalRepoDir, 'unsorted_deps.json'), - FAST_PATH_JSON, - 'utf8', - ) + tmp = mkdtempSync(path.join(os.tmpdir(), 'sock-bazel-write-')) vi.mocked(detectWorkspaceMode).mockReturnValue({ bzlmod: true, workspace: false, }) - vi.mocked(discoverMavenRepos).mockResolvedValue( - new Map([['maven', FALLBACK_PROBE_STDOUT]]), + vi.mocked(findWorkspaceRoots).mockReturnValue([tmp]) + vi.mocked(runBazelModShowMavenExtension).mockResolvedValue({ + code: 0, + stdout: SHOW_EXT_HUB_ONLY, + stderr: '', + }) + vi.mocked(runMetadataCqueryForRepo).mockReset() + vi.mocked(runMetadataCqueryForRepo).mockResolvedValue( + mkResult({ + artifacts: [mkArt('com.example:lib:1.0', 'lib')], + repoName: 'maven', + }), ) - originalEnv = process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] - process.exitCode = 0 }) afterEach(() => { - if (originalEnv === undefined) { - delete process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] - } else { - process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] = originalEnv - } rmSync(tmp, { recursive: true, force: true }) - vi.resetAllMocks() - process.exitCode = 0 }) - it('uses the unsorted_deps.json fast path when the env var is unset', async () => { - delete process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] - - const result = await extractBazelToMaven({ - bazelFlags: undefined, - bazelOutputBase: tmp, - bazelRc: undefined, - bin: undefined, - cwd: tmp, - out: tmp, - verbose: false, - }) - - expect(result.ok).toBe(true) - const manifest = JSON.parse( - readFileSync(path.join(tmp, 
'maven_install.json'), 'utf8'), - ) - // The JSON parser ran: from-json coord is present, from-regex is absent. - expect(manifest.artifacts['com.example:from-json']).toBeDefined() - expect(manifest.artifacts['com.example:from-regex']).toBeUndefined() - }) - - it('skips the unsorted_deps.json fast path and uses the regex fallback when the env var is "1"', async () => { - process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] = '1' - - const result = await extractBazelToMaven({ - bazelFlags: undefined, - bazelOutputBase: tmp, - bazelRc: undefined, - bin: undefined, - cwd: tmp, - out: tmp, - verbose: false, - }) - - expect(result.ok).toBe(true) - const manifest = JSON.parse( - readFileSync(path.join(tmp, 'maven_install.json'), 'utf8'), - ) - // The regex parser ran: from-regex coord is present, from-json is absent. - expect(manifest.artifacts['com.example:from-regex']).toBeDefined() - expect(manifest.artifacts['com.example:from-json']).toBeUndefined() - }) - - it.each([ - ['unset', undefined], - ['empty string', ''], - ['"0"', '0'], - ['"false"', 'false'], - ])('treats %s as falsy and uses the fast path', async (_label, value) => { - if (value === undefined) { - delete process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] - } else { - process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] = value - } - - const result = await extractBazelToMaven({ - bazelFlags: undefined, - bazelOutputBase: tmp, - bazelRc: undefined, - bin: undefined, - cwd: tmp, - out: tmp, - verbose: false, - }) - - expect(result.ok).toBe(true) - const manifest = JSON.parse( - readFileSync(path.join(tmp, 'maven_install.json'), 'utf8'), - ) - expect(manifest.artifacts['com.example:from-json']).toBeDefined() - expect(manifest.artifacts['com.example:from-regex']).toBeUndefined() - }) - - it.each([ - ['"1"', '1'], - ['"true"', 'true'], - ['"YES"', 'YES'], - ])('treats %s as truthy and forces the fallback', async (_label, value) => { - process.env['SOCKET_BAZEL_FORCE_QUERY_FALLBACK'] = value - - const result = await 
extractBazelToMaven({ + it('does not emit any .socket.facts.json file (Maven path is BOM-only)', async () => { + const outDir = path.join(tmp, 'out') + mkdirSync(outDir, { recursive: true }) + // Sanity: ensure unrelated files in out/ are not touched. + writeFileSync(path.join(outDir, 'README.md'), '') + await extractBazelToMaven({ bazelFlags: undefined, - bazelOutputBase: tmp, + bazelOutputBase: undefined, bazelRc: undefined, bin: undefined, cwd: tmp, - out: tmp, + out: outDir, + outLayout: 'flat', verbose: false, }) - - expect(result.ok).toBe(true) - const manifest = JSON.parse( - readFileSync(path.join(tmp, 'maven_install.json'), 'utf8'), - ) - expect(manifest.artifacts['com.example:from-regex']).toBeDefined() - expect(manifest.artifacts['com.example:from-json']).toBeUndefined() + expect( + existsSync(path.join(outDir, '.socket-auto-manifest', '.socket.facts.json')), + ).toBe(false) + expect( + existsSync(path.join(outDir, '.socket-auto-manifest', 'maven_install.json')), + ).toBe(true) }) }) diff --git a/src/commands/manifest/bazel/extract_bazel_to_pypi.test.mts b/src/commands/manifest/bazel/extract_bazel_to_pypi.test.mts index 652d4eb40..df3d4a1b0 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_pypi.test.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_pypi.test.mts @@ -29,13 +29,13 @@ vi.mock('./bazel-pypi-discovery.mts', () => ({ workspaceMode: 'bzlmod', }, ]), + parseVisibleRepoCandidates: vi.fn(() => []), })) const { probe } = vi.hoisted(() => ({ - probe: async () => ({ code: 0, stdout: '@pypi//requests:pkg\n' }), + probe: async () => ({ code: 0, stdout: '@pypi//requests:pkg\n', stderr: '' }), })) vi.mock('./bazel-query-runner.mts', () => ({ buildPypiProbeFor: vi.fn(() => probe), - buildProbeFor: vi.fn(() => probe), runBazelModShowVisibleRepos: vi.fn(async () => ({ code: 0, stderr: '', From 23e2f96680ab499331317ab3505b6209be30ebd9 Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 21:25:52 +0200 Subject: [PATCH 09/10] 
refactor(manifest/bazel): walker takes injected prune policy; reuse IGNORED_DIRS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `findWorkspaceRoots` no longer hardcodes the directory-prune set — callers pass `ignoreDirNames: ReadonlySet<string>` and `ignoreDirPrefixes: readonly string[]` via options. Neither defaults to anything; absent means no pruning. This keeps the walker decoupled from any particular ignore policy and avoids duplicating the codebase-wide `IGNORED_DIRS` list. `src/utils/glob.mts` exports `IGNORED_DIRS` so the orchestrator can compose it with Bazel-specific extras. The orchestrator's composed set: `IGNORED_DIRS` plus `.hg`, `.idea`, `.pnpm-store`, `.socket-auto-manifest`, `.svn`, `.vscode`; prefixes `bazel-` and `dist`. Also tighten `MAX_WALK_DEPTH` from 16 → 8. Deepest workspace marker observed across the surveyed OSS corpus is 9 (bazel-self test fixtures); deepest in realistic application code is 7 (checkmk's thirdparty layout). The cap gives one level of headroom over the realistic max while still guarding against pathological symlink loops that slipped past any prefix prune the caller supplied. Walker test rewritten against the new injected API: covers the no-prune-by-default case (`node_modules/MODULE.bazel` surfaces unless the caller ignores `node_modules`), injected name and prefix prunes, and the bazel-* symlink case under the prefix injection. 
--- .../manifest/bazel/bazel-workspace-walk.mts | 67 +++++++++-------- .../bazel/bazel-workspace-walk.test.mts | 75 ++++++++++++------- .../manifest/bazel/extract_bazel_to_maven.mts | 24 +++++- src/utils/glob.mts | 2 +- 4 files changed, 111 insertions(+), 57 deletions(-) diff --git a/src/commands/manifest/bazel/bazel-workspace-walk.mts b/src/commands/manifest/bazel/bazel-workspace-walk.mts index 2e1f66ed8..8f7d95654 100644 --- a/src/commands/manifest/bazel/bazel-workspace-walk.mts +++ b/src/commands/manifest/bazel/bazel-workspace-walk.mts @@ -6,11 +6,13 @@ * `examples//MODULE.bazel`); the per-workspace algorithm in the * orchestrator runs once per discovered root. * - * Pruning matches the now-deleted `bazel-lockfile-discovery.mts`: skip - * directories that obviously aren't Bazel workspaces (`.git`, `node_modules`, - * `.socket-auto-manifest`, etc.) and Bazel's `bazel-*` convenience symlinks - * that point into (tens of GiB of generated state). Also - * prunes `dist*` build-output directories. + * The walker is dependency-injected with the directory-prune policy: + * callers pass the set of basenames and basename prefixes the walk must + * refuse to descend into. This module intentionally hardcodes none of + * the "common" prunes (`.git`, `node_modules`, …) — Bazel callers compose + * the codebase-wide `IGNORED_DIRS` list (`src/utils/glob.mts`) with the + * Bazel-specific bits (`bazel-*` output_base symlinks, + * `.socket-auto-manifest`, build-output `dist*`). */ import { readdirSync } from 'node:fs' @@ -21,27 +23,13 @@ import { logger } from '@socketsecurity/registry/lib/logger' // Hard ceiling on number of workspace roots we will surface. Real monorepos // have well under 50; this cap is a guard against pathological inputs. const MAX_WORKSPACE_ROOTS = 256 -// Hard ceiling on directory walk depth. Real workspaces nest <8 deep; the -// cap protects against pathological symlink loops that slipped past the -// `bazel-*` prefix prune. 
-const MAX_WALK_DEPTH = 16 -// Directory basenames the walk refuses to descend into. None of these -// contain Bazel workspaces, and node_modules / .git can be enormous. -const PRUNE_DIR_NAMES = new Set([ - '.git', - '.hg', - '.idea', - '.pnpm-store', - '.socket-auto-manifest', - '.svn', - '.vscode', - 'node_modules', -]) -// Directory basename prefixes the walk refuses to descend into. Bazel's -// `bazel-out`, `bazel-bin`, `bazel-testlogs`, and `bazel-` -// convenience symlinks all point into the output_base. `dist`-prefixed -// directories are build artefacts, not workspaces. -const PRUNE_DIR_PREFIXES = ['bazel-', 'dist'] +// Hard ceiling on directory walk depth. Deepest workspace marker observed +// across the OSS corpus surveyed is 9 (bazel-self test fixtures); deepest +// in realistic application code is 7 (checkmk's thirdparty layout). Cap +// is set to 8 — one level of headroom over the realistic max, while still +// guarding against pathological symlink loops that slipped past any +// prefix prune. +const MAX_WALK_DEPTH = 8 // Files whose presence promotes a directory to a workspace root. const WORKSPACE_MARKER_FILES = new Set([ 'MODULE.bazel', @@ -49,10 +37,29 @@ const WORKSPACE_MARKER_FILES = new Set([ 'WORKSPACE.bazel', ]) -// Walks the tree rooted at `cwd` and returns absolute paths to every +export type FindWorkspaceRootsOptions = { + cwd: string + // Directory basenames to skip outright (exact match). Pass the union of + // the codebase-wide ignore set (`IGNORED_DIRS` in `src/utils/glob.mts`) + // and any caller-specific additions (e.g. `.socket-auto-manifest`). + ignoreDirNames?: ReadonlySet + // Directory basename prefixes to skip. Bazel callers pass `['bazel-', + // 'dist']` so the walk never descends into Bazel's output_base symlinks + // or build-output directories. 
+ ignoreDirPrefixes?: readonly string[] + verbose?: boolean +} + +const EMPTY_SET: ReadonlySet = new Set() +const EMPTY_ARRAY: readonly string[] = [] + +// Walks the tree rooted at `opts.cwd` and returns absolute paths to every // directory that contains at least one workspace marker file. Output is // sorted for determinism. -export function findWorkspaceRoots(cwd: string, verbose?: boolean): string[] { +export function findWorkspaceRoots(opts: FindWorkspaceRootsOptions): string[] { + const { cwd, verbose } = opts + const ignoreDirNames = opts.ignoreDirNames ?? EMPTY_SET + const ignoreDirPrefixes = opts.ignoreDirPrefixes ?? EMPTY_ARRAY const out: string[] = [] // Tuple stack: [absolute dir, depth from cwd]. const stack: Array<[string, number]> = [[cwd, 0]] @@ -98,11 +105,11 @@ export function findWorkspaceRoots(cwd: string, verbose?: boolean): string[] { continue } const name = entry.name - if (PRUNE_DIR_NAMES.has(name)) { + if (ignoreDirNames.has(name)) { continue } let pruned = false - for (const prefix of PRUNE_DIR_PREFIXES) { + for (const prefix of ignoreDirPrefixes) { if (name.startsWith(prefix)) { pruned = true break diff --git a/src/commands/manifest/bazel/bazel-workspace-walk.test.mts b/src/commands/manifest/bazel/bazel-workspace-walk.test.mts index 99307b1af..9ca249dce 100644 --- a/src/commands/manifest/bazel/bazel-workspace-walk.test.mts +++ b/src/commands/manifest/bazel/bazel-workspace-walk.test.mts @@ -17,6 +17,22 @@ function touch(file: string): void { writeFileSync(file, '') } +// Standard prune set Bazel callers pass: the codebase-wide IGNORED_DIRS +// (.git, node_modules, etc.) plus the walker's own output dir, plus +// `bazel-*` output_base symlinks and `dist*` build outputs. Replicated +// inline here so the test stays decoupled from `src/utils/glob.mts`. 
+const BAZEL_IGNORE_NAMES: ReadonlySet = new Set([ + '.git', + '.hg', + '.idea', + '.pnpm-store', + '.socket-auto-manifest', + '.svn', + '.vscode', + 'node_modules', +]) +const BAZEL_IGNORE_PREFIXES: readonly string[] = ['bazel-', 'dist'] + describe('bazel-workspace-walk', () => { let tmp: string @@ -31,48 +47,54 @@ describe('bazel-workspace-walk', () => { describe('findWorkspaceRoots', () => { it('returns the root when only the root has MODULE.bazel', () => { touch(path.join(tmp, 'MODULE.bazel')) - expect(findWorkspaceRoots(tmp)).toEqual([tmp]) + expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp]) }) it('detects WORKSPACE and WORKSPACE.bazel as root markers', () => { touch(path.join(tmp, 'WORKSPACE')) - expect(findWorkspaceRoots(tmp)).toEqual([tmp]) + expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp]) rmSync(path.join(tmp, 'WORKSPACE')) touch(path.join(tmp, 'WORKSPACE.bazel')) - expect(findWorkspaceRoots(tmp)).toEqual([tmp]) + expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp]) }) it('finds nested workspaces at arbitrary depth', () => { touch(path.join(tmp, 'MODULE.bazel')) touch(path.join(tmp, 'examples', 'dagger', 'MODULE.bazel')) touch(path.join(tmp, 'examples', 'android', 'nested', 'WORKSPACE.bazel')) - const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) - expect(found).toEqual([ - '', - 'examples/android/nested', - 'examples/dagger', - ]) + const found = findWorkspaceRoots({ cwd: tmp }).map(p => + path.relative(tmp, p), + ) + expect(found).toEqual(['', 'examples/android/nested', 'examples/dagger']) }) it('returns [] when there is no workspace root', () => { writeFileSync(path.join(tmp, 'README.md'), '') - expect(findWorkspaceRoots(tmp)).toEqual([]) + expect(findWorkspaceRoots({ cwd: tmp })).toEqual([]) + }) + + it('does NOT prune by default — pruning policy is caller-supplied', () => { + touch(path.join(tmp, 'MODULE.bazel')) + touch(path.join(tmp, 'node_modules', 'MODULE.bazel')) + const found = findWorkspaceRoots({ cwd: tmp 
}).map(p => + path.relative(tmp, p), + ) + expect(found).toEqual(['', 'node_modules']) }) - it('prunes .git / node_modules / .socket-auto-manifest', () => { + it('prunes injected ignoreDirNames', () => { touch(path.join(tmp, 'MODULE.bazel')) - // Sub-MODULE.bazel files inside pruned dirs must not be surfaced. for (const dir of ['node_modules', '.git', '.socket-auto-manifest']) { touch(path.join(tmp, dir, 'sub', 'MODULE.bazel')) } - const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) + const found = findWorkspaceRoots({ + cwd: tmp, + ignoreDirNames: BAZEL_IGNORE_NAMES, + }).map(p => path.relative(tmp, p)) expect(found).toEqual(['']) }) - it('prunes bazel-* convenience symlinks', () => { - // Simulate `bazel-out` pointing at a directory that contains a copy of - // MODULE.bazel. The walk must skip it; otherwise discovery would - // surface generated workspaces from <output_base>. + it('prunes injected ignoreDirPrefixes (bazel-* symlinks)', () => { const fakeOutputBase = mkdtempSync( path.join(os.tmpdir(), 'sock-fake-outbase-'), ) @@ -83,18 +105,24 @@ describe('bazel-workspace-walk', () => { touch(path.join(fakeOutputBase, 'external', 'maven', 'MODULE.bazel')) symlinkSync(fakeOutputBase, path.join(tmp, 'bazel-out')) touch(path.join(tmp, 'MODULE.bazel')) - const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) + const found = findWorkspaceRoots({ + cwd: tmp, + ignoreDirPrefixes: BAZEL_IGNORE_PREFIXES, + }).map(p => path.relative(tmp, p)) expect(found).toEqual(['']) } finally { rmSync(fakeOutputBase, { recursive: true, force: true }) } }) - it('prunes dist* build-output directories', () => { + it('prunes injected dist* prefix', () => { touch(path.join(tmp, 'MODULE.bazel')) touch(path.join(tmp, 'dist', 'MODULE.bazel')) touch(path.join(tmp, 'distribution', 'MODULE.bazel')) - const found = findWorkspaceRoots(tmp).map(p => path.relative(tmp, p)) + const found = findWorkspaceRoots({ + cwd: tmp, + ignoreDirPrefixes: BAZEL_IGNORE_PREFIXES, + }).map(p =>
path.relative(tmp, p)) expect(found).toEqual(['']) }) @@ -102,13 +130,12 @@ describe('bazel-workspace-walk', () => { touch(path.join(tmp, 'z', 'MODULE.bazel')) touch(path.join(tmp, 'a', 'MODULE.bazel')) touch(path.join(tmp, 'm', 'MODULE.bazel')) - const found = findWorkspaceRoots(tmp) + const found = findWorkspaceRoots({ cwd: tmp }) expect(found).toEqual([ path.join(tmp, 'a'), path.join(tmp, 'm'), path.join(tmp, 'z'), ]) - // Absolute. for (const p of found) { expect(path.isAbsolute(p)).toBe(true) } @@ -116,9 +143,7 @@ describe('bazel-workspace-walk', () => { it('handles an unreadable directory by skipping it (no throw)', () => { touch(path.join(tmp, 'MODULE.bazel')) - // Reference a path that does not exist as cwd; the walker must not - // throw — it should return [] (no entries to read). - expect(findWorkspaceRoots(path.join(tmp, 'nope'))).toEqual([]) + expect(findWorkspaceRoots({ cwd: path.join(tmp, 'nope') })).toEqual([]) }) }) }) diff --git a/src/commands/manifest/bazel/extract_bazel_to_maven.mts b/src/commands/manifest/bazel/extract_bazel_to_maven.mts index 42a892519..e9ad2e452 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_maven.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_maven.mts @@ -30,6 +30,7 @@ import { } from './bazel-workspace-detect.mts' import { findWorkspaceRoots } from './bazel-workspace-walk.mts' import { getErrorCause } from '../../../utils/errors.mts' +import { IGNORED_DIRS } from '../../../utils/glob.mts' import type { CqueryRepoResult, @@ -94,6 +95,22 @@ type Sidecar = { const DEFAULT_PER_REPO_TIMEOUT_MS = 60_000 const REAP_TIMEOUT_MS = 10_000 +// Composed prune policy passed to the workspace walker. Reuses the +// codebase-wide `IGNORED_DIRS` and augments it with: the walker's own +// output dir (`.socket-auto-manifest`), VCS/IDE dirs not in the shared +// list (`.hg`, `.svn`, `.idea`, `.vscode`, `.pnpm-store`), Bazel's +// `bazel-*` output_base symlinks, and `dist*` build-output dirs. 
+const WORKSPACE_WALK_IGNORE_NAMES: ReadonlySet = new Set([ + ...IGNORED_DIRS, + '.hg', + '.idea', + '.pnpm-store', + '.socket-auto-manifest', + '.svn', + '.vscode', +]) +const WORKSPACE_WALK_IGNORE_PREFIXES: readonly string[] = ['bazel-', 'dist'] + type CoordPair = { groupArtifact: string; version: string } // Splits "g:a:v" -> { groupArtifact: "g:a", version: "v" }. @@ -389,7 +406,12 @@ export async function extractBazelToMaven( const allArtifacts: ExtractedArtifact[] = [] try { - const workspaceRoots = findWorkspaceRoots(cwd, verbose) + const workspaceRoots = findWorkspaceRoots({ + cwd, + ignoreDirNames: WORKSPACE_WALK_IGNORE_NAMES, + ignoreDirPrefixes: WORKSPACE_WALK_IGNORE_PREFIXES, + verbose, + }) if (!workspaceRoots.length) { logger.warn( `No Bazel workspace found at ${cwd} or beneath (looked for MODULE.bazel / WORKSPACE / WORKSPACE.bazel).`, diff --git a/src/utils/glob.mts b/src/utils/glob.mts index dd89f37ef..2d3561869 100644 --- a/src/utils/glob.mts +++ b/src/utils/glob.mts @@ -22,7 +22,7 @@ const DEFAULT_IGNORE_FOR_GIT_IGNORE = defaultIgnore.filter( p => !p.endsWith('.gitignore'), ) -const IGNORED_DIRS = [ +export const IGNORED_DIRS = [ // Taken from ignore-by-default: // https://github.com/novemberborn/ignore-by-default/blob/v2.1.0/index.js '.git', // Git repository files, see From 14f593d6e3bb0d54cbbb68b49307f4fc13f3361f Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Thu, 28 May 2026 21:38:42 +0200 Subject: [PATCH 10/10] refactor(manifest/bazel): drop manifest-status.json sidecar No consumer reads it today. The orchestrator still tracks per-repo timeouts to decide ExtractBazelResult.ok and to reap+remint the output_user_root, but no longer serialises the per-workspace / per-repo status report to disk. 
--- .../manifest/bazel/bazel-repo-discovery.mts | 11 ++-- .../manifest/bazel/extract_bazel_to_maven.mts | 59 +++---------------- .../bazel/extract_bazel_to_maven.test.mts | 49 +-------------- 3 files changed, 15 insertions(+), 104 deletions(-) diff --git a/src/commands/manifest/bazel/bazel-repo-discovery.mts b/src/commands/manifest/bazel/bazel-repo-discovery.mts index 539c8e6ef..494dcb017 100644 --- a/src/commands/manifest/bazel/bazel-repo-discovery.mts +++ b/src/commands/manifest/bazel/bazel-repo-discovery.mts @@ -49,7 +49,7 @@ const NO_SUCH_PACKAGE_STDERR_RE = /no such package ['"`]?@/ // Pattern emitted when a repo IS visible / defined but yields no targets. // `--keep_going` plus `'no targets found beneath'` is the empty-but-defined // signature. The orchestrator treats `empty` and `not-defined` uniformly -// as skips, but the distinction is preserved in the sidecar status report. +// as skips. const NO_TARGETS_STDERR_RE = /no targets found beneath/i // Anchor for the maven extension's section header in // `bazel mod show_extension` output. Tolerant of the canonical-name form @@ -103,11 +103,10 @@ export function parseShowExtensionOutput(stdout: string): string[] { } // Classify a raw probe result into one of three states. The probe contract -// is whatever the runner (layer 4) emits — typically a lightweight -// `cquery '@<repo>//...' --keep_going --output=label`. Distinguishing -// `empty` from `not-defined` lets the sidecar status report explain to the -// customer why a particular candidate was skipped; the orchestrator itself -// treats both as no-ops. +// is whatever the runner emits — typically a lightweight +// `cquery '@<repo>//...' --keep_going --output=label`. The orchestrator +// treats `empty` and `not-defined` uniformly as no-ops; the distinction +// is preserved for verbose-mode diagnostics.
export function classifyProbeResult(result: ProbeResult): ProbeStatus { // A successful probe with any stdout means the repo exists AND has at // least one target — populated. diff --git a/src/commands/manifest/bazel/extract_bazel_to_maven.mts b/src/commands/manifest/bazel/extract_bazel_to_maven.mts index e9ad2e452..612ee5198 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_maven.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_maven.mts @@ -32,10 +32,7 @@ import { findWorkspaceRoots } from './bazel-workspace-walk.mts' import { getErrorCause } from '../../../utils/errors.mts' import { IGNORED_DIRS } from '../../../utils/glob.mts' -import type { - CqueryRepoResult, - CqueryStatus, -} from './bazel-cquery.mts' +import type { CqueryRepoResult } from './bazel-cquery.mts' import type { ExtractedArtifact } from './bazel-build-parser.mts' import type { BazelQueryOptions } from './bazel-query-runner.mts' import type { WorkspaceMode } from './bazel-workspace-detect.mts' @@ -68,28 +65,6 @@ export type ExtractBazelResult = { manifestPath?: string | undefined noEcosystemFound?: boolean | undefined ok: boolean - // Path to the per-invocation status sidecar describing which repos - // produced artefacts and which timed out / were empty. Useful for the - // server-side to surface partial results to the customer. 
- statusPath?: string | undefined -} - -type SidecarRepoEntry = { - name: string - status: CqueryStatus - artifactCount: number - durationMs: number -} - -type SidecarWorkspaceEntry = { - relPath: string - mode: { bzlmod: boolean; workspace: boolean } - repos: SidecarRepoEntry[] -} - -type Sidecar = { - complete: boolean - workspaces: SidecarWorkspaceEntry[] } const DEFAULT_PER_REPO_TIMEOUT_MS = 60_000 @@ -402,7 +377,8 @@ export async function extractBazelToMaven( ) } - const sidecar: Sidecar = { complete: true, workspaces: [] } + let anyTimeout = false + let anyRepos = false const allArtifacts: ExtractedArtifact[] = [] try { @@ -472,13 +448,8 @@ export async function extractBazelToMaven( candidates.join(', ') || '(none)' }`, ) - const wsEntry: SidecarWorkspaceEntry = { - mode: { bzlmod: mode.bzlmod, workspace: mode.workspace }, - relPath, - repos: [], - } - for (const repoName of candidates) { + anyRepos = true // eslint-disable-next-line no-await-in-loop const result: CqueryRepoResult = await runMetadataCqueryForRepo({ opts: buildQueryOpts(outputUserRoot, workspaceRoot), @@ -487,12 +458,6 @@ export async function extractBazelToMaven( workspaceRelPath: relPath, workspaceRoot, }) - wsEntry.repos.push({ - artifactCount: result.artifacts.length, - durationMs: result.durationMs, - name: repoName, - status: result.status, - }) allArtifacts.push(...result.artifacts) if (result.status === 'ok' || result.status === 'partial') { logger.info( @@ -502,7 +467,7 @@ export async function extractBazelToMaven( logger.warn( `@${repoName}: cquery timed out after ${perRepoTimeoutMs}ms; reaping server`, ) - sidecar.complete = false + anyTimeout = true // eslint-disable-next-line no-await-in-loop await reapBazelServer(bin, outputUserRoot) // eslint-disable-next-line no-await-in-loop @@ -520,7 +485,6 @@ export async function extractBazelToMaven( ) } } - sidecar.workspaces.push(wsEntry) } const deduped = dedupArtifactsByCoord(allArtifacts) @@ -535,21 +499,16 @@ export async function 
extractBazelToMaven( JSON.stringify(normalized, null, 2), 'utf8', ) - const statusPath = path.join(manifestDir, 'manifest-status.json') - await fs.writeFile(statusPath, JSON.stringify(sidecar, null, 2), 'utf8') if (verbose) { logger.log('[VERBOSE] outputs:', { artifactCount: deduped.length, - complete: sidecar.complete, + complete: !anyTimeout, layout, manifestPath, - statusPath, - workspaceCount: sidecar.workspaces.length, }) } - const anyRepos = sidecar.workspaces.some(w => w.repos.length > 0) if (!deduped.length) { if (!anyRepos) { if (verbose) { @@ -562,13 +521,12 @@ export async function extractBazelToMaven( manifestPath, noEcosystemFound: true, ok: false, - statusPath, } } logger.fail( 'Discovered Maven repo(s) but extracted zero artifacts. failureCategory=ecosystem-detected-but-empty', ) - return { artifactCount: 0, manifestPath, ok: false, statusPath } + return { artifactCount: 0, manifestPath, ok: false } } logger.success( `Wrote ${deduped.length} artifact(s) to ${path.relative(cwd, manifestPath)}.`, @@ -576,8 +534,7 @@ export async function extractBazelToMaven( return { artifactCount: deduped.length, manifestPath, - ok: sidecar.complete, - statusPath, + ok: !anyTimeout, } } catch (e) { logger.fail(`Unexpected error in bazel2maven: ${getErrorCause(e)}`) diff --git a/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts b/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts index 9d7f9821f..4e3923d27 100644 --- a/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts +++ b/src/commands/manifest/bazel/extract_bazel_to_maven.test.mts @@ -148,27 +148,6 @@ describe('extractBazelToMaven', () => { ) } - function readSidecar(out: string): { - complete: boolean - workspaces: Array<{ - relPath: string - mode: { bzlmod: boolean; workspace: boolean } - repos: Array<{ - name: string - status: string - artifactCount: number - durationMs: number - }> - }> - } { - return JSON.parse( - readFileSync( - path.join(out, '.socket-auto-manifest', 
'manifest-status.json'), - 'utf8', - ), - ) - } - it('extracts a single Bzlmod workspace end-to-end', async () => { vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( mkResult({ @@ -198,14 +177,6 @@ describe('extractBazelToMaven', () => { 'androidx.annotation:annotation', 'com.google.guava:guava', ]) - const sidecar = readSidecar(tmp) - expect(sidecar.complete).toBe(true) - expect(sidecar.workspaces).toHaveLength(1) - expect(sidecar.workspaces[0]!.repos[0]).toMatchObject({ - artifactCount: 2, - name: 'maven', - status: 'ok', - }) }) it('returns noEcosystemFound when no workspace roots are discovered', async () => { @@ -288,14 +259,9 @@ describe('extractBazelToMaven', () => { 'com.google.dagger:dagger', 'com.google.guava:guava', ]) - const sidecar = readSidecar(tmp) - expect(sidecar.workspaces.map(w => w.relPath)).toEqual([ - '', - 'examples/dagger', - ]) }) - it('marks the sidecar complete:false on per-repo timeout and keeps going', async () => { + it('reports ok:false on per-repo timeout but keeps going', async () => { // Two candidates: first times out, second succeeds. The orchestrator // re-mints --output_user_root after the timeout. 
vi.mocked(runBazelModShowMavenExtension).mockResolvedValue({ @@ -330,12 +296,6 @@ Fetched repositories: }) expect(result.ok).toBe(false) expect(result.artifactCount).toBe(1) - const sidecar = readSidecar(tmp) - expect(sidecar.complete).toBe(false) - expect(sidecar.workspaces[0]!.repos.map(r => r.status)).toEqual([ - 'timeout', - 'ok', - ]) }) it('threads extraMavenRepoNames into the candidate list (WORKSPACE mode)', async () => { @@ -381,7 +341,7 @@ Fetched repositories: expect(runBazelModShowMavenExtension).not.toHaveBeenCalled() }) - it('writes manifest-status.json beside maven_install.json in flat layout', async () => { + it('writes maven_install.json into .socket-auto-manifest in flat layout', async () => { vi.mocked(runMetadataCqueryForRepo).mockResolvedValueOnce( mkResult({ artifacts: [mkArt('com.example:a:1.0', 'a')], @@ -401,11 +361,6 @@ Fetched repositories: expect( existsSync(path.join(tmp, '.socket-auto-manifest', 'maven_install.json')), ).toBe(true) - expect( - existsSync( - path.join(tmp, '.socket-auto-manifest', 'manifest-status.json'), - ), - ).toBe(true) }) })