From d50254729c57feeb34b77f89404e8fcd336cb9ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 25 May 2026 22:01:05 +0200 Subject: [PATCH 01/15] chore: packages db with initial migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- backend/.env.dist.composed | 9 +- backend/.env.dist.local | 9 +- backend/src/osspckgs/Dockerfile.flyway | 17 + backend/src/osspckgs/flyway_migrate.sh | 17 + .../V1779710880__initial_schema.sql | 652 ++++++++++++++++++ scripts/cli | 53 +- scripts/scaffold.yaml | 16 + 7 files changed, 770 insertions(+), 3 deletions(-) create mode 100644 backend/src/osspckgs/Dockerfile.flyway create mode 100755 backend/src/osspckgs/flyway_migrate.sh create mode 100644 backend/src/osspckgs/migrations/V1779710880__initial_schema.sql diff --git a/backend/.env.dist.composed b/backend/.env.dist.composed index f5bea47823..0bf0dd50d7 100644 --- a/backend/.env.dist.composed +++ b/backend/.env.dist.composed @@ -27,4 +27,11 @@ CROWD_OPENSEARCH_NODE=http://open-search:9200 CROWD_TEMPORAL_SERVER_URL=temporal:7233 # Seach sync api -CROWD_SEARCH_SYNC_API_URL=http://search-sync-api:8083 \ No newline at end of file +CROWD_SEARCH_SYNC_API_URL=http://search-sync-api:8083 +# packages DB (osspckgs) +CROWD_PACKAGES_DB_READ_HOST=packages +CROWD_PACKAGES_DB_WRITE_HOST=packages +CROWD_PACKAGES_DB_PORT=5432 +CROWD_PACKAGES_DB_USERNAME=postgres +CROWD_PACKAGES_DB_PASSWORD=example +CROWD_PACKAGES_DB_DATABASE=packages-db diff --git a/backend/.env.dist.local b/backend/.env.dist.local index ace5cb6ec7..5ac8b67df4 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -166,4 +166,11 @@ CROWD_TINYBIRD_BASE_URL=http://localhost:7181/ # Auth0 CROWD_AUTH0_ISSUER_BASE_URLS= -CROWD_AUTH0_AUDIENCE= \ No newline at end of file +CROWD_AUTH0_AUDIENCE= +# packages DB (osspckgs) +CROWD_PACKAGES_DB_READ_HOST=localhost +CROWD_PACKAGES_DB_WRITE_HOST=localhost +CROWD_PACKAGES_DB_PORT=5434 
+CROWD_PACKAGES_DB_USERNAME=postgres +CROWD_PACKAGES_DB_PASSWORD=example +CROWD_PACKAGES_DB_DATABASE=packages-db diff --git a/backend/src/osspckgs/Dockerfile.flyway b/backend/src/osspckgs/Dockerfile.flyway new file mode 100644 index 0000000000..1615fac033 --- /dev/null +++ b/backend/src/osspckgs/Dockerfile.flyway @@ -0,0 +1,17 @@ +FROM flyway/flyway:7.8.1-alpine + +USER root + +# Install envsubst from gettext used for templating. +RUN apk update \ + && apk add --no-cache gettext + +USER flyway + +COPY ./flyway_migrate.sh /migrate.sh + +# Override default `flyway` entrypoint. +ENTRYPOINT ["/migrate.sh"] + +# Copy migrations. +COPY ./migrations /tmp/migrations diff --git a/backend/src/osspckgs/flyway_migrate.sh b/backend/src/osspckgs/flyway_migrate.sh new file mode 100755 index 0000000000..b2a4582979 --- /dev/null +++ b/backend/src/osspckgs/flyway_migrate.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e +echo "Migrating jdbc:postgresql://${PGHOST}:${PGPORT}/${PGDATABASE}" + +flyway \ + -locations="filesystem:/tmp/migrations" \ + -url="jdbc:postgresql://${PGHOST}:${PGPORT}/${PGDATABASE}" \ + -user="$PGUSER" \ + -password="$PGPASSWORD" \ + -connectRetries=60 \ + -outOfOrder=true \ + -mixed=true \ + -placeholderReplacement=false \ + -schemas=public \ + -X \ + migrate diff --git a/backend/src/osspckgs/migrations/V1779710880__initial_schema.sql b/backend/src/osspckgs/migrations/V1779710880__initial_schema.sql new file mode 100644 index 0000000000..2b6d79ea94 --- /dev/null +++ b/backend/src/osspckgs/migrations/V1779710880__initial_schema.sql @@ -0,0 +1,652 @@ +-- ============================================================ +-- DOMAIN 1: UNIVERSE (Tier 3 → Tier 2 ranking input) +-- ============================================================ +CREATE TABLE packages_universe ( + id bigserial PRIMARY KEY, + purl text UNIQUE, + ecosystem text NOT NULL, + namespace text, + name text NOT NULL, + downloads_30d bigint, + dependent_packages_count int, + dependent_repos_count int, 
+ criticality_score numeric(10, 4), + rank_in_ecosystem int, + is_critical bool NOT NULL DEFAULT FALSE, + last_ranked_at timestamptz +); + +CREATE INDEX ON packages_universe (ecosystem, rank_in_ecosystem); + +CREATE INDEX ON packages_universe (is_critical) +WHERE + is_critical; + +-- ============================================================ +-- DOMAIN 2: TIER 2 PACKAGE DATA +-- ============================================================ +CREATE TABLE packages ( + id bigserial PRIMARY KEY, + purl text UNIQUE NOT NULL, + ecosystem text NOT NULL, + namespace text, + name text NOT NULL, + registry_url text, + status text, -- 'active' | 'deprecated' | 'unpublished' | 'yanked' + description text, + homepage text, + declared_repository_url text, + repository_url text, + licenses text[], -- SPDX normalized + licenses_raw text, + keywords text[], + -- npm-specific (NULL for other ecosystems) + dist_tags_latest text, + dist_tags_next text, + dist_tags_beta text, + -- Aggregates (refreshed each ingestion run) + versions_count int, + latest_version text, + first_release_at timestamptz, + latest_release_at timestamptz, + dependent_packages_count int, + dependent_repos_count int, + downloads_last_month bigint, + -- TODO: define semantics before enabling. 
Options: + -- a) fixed_version IS NULL on any advisory range → "no fix released yet" (simple, opt 2) + -- b) latest_version falls inside an affected range → "currently vulnerable" (correct, needs semver comparison per ecosystem) + -- has_critical_vulnerability bool NOT NULL DEFAULT FALSE, + criticality_score numeric(10, 4), + ingestion_source text, + last_synced_at timestamptz NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX ON packages (ecosystem, COALESCE(namespace, ''), name); + +CREATE INDEX ON packages (ecosystem, name); + +CREATE INDEX ON packages USING gin (keywords); + +CREATE INDEX ON packages (downloads_last_month DESC) +WHERE + status = 'active'; + +-- INDEX on has_critical_vulnerability removed — column is commented out above. +-- Uncomment both when semantics are decided. + +CREATE INDEX ON packages (criticality_score DESC) +WHERE + criticality_score IS NOT NULL; + +CREATE TABLE package_name_history ( + id bigserial PRIMARY KEY, + package_id bigint NOT NULL REFERENCES packages (id), + old_name text NOT NULL, + new_name text NOT NULL, + changed_at timestamptz NOT NULL DEFAULT NOW() +); + +CREATE INDEX ON package_name_history (package_id); + +CREATE TABLE package_funding_links ( + id bigserial PRIMARY KEY, + package_id bigint NOT NULL REFERENCES packages (id), + type TEXT, -- 'github' | 'patreon' | 'opencollective' | 'individual' | 'other' + url text NOT NULL, + UNIQUE (package_id, url) +); + +-- ============================================================ +-- VERSIONS — PARTITION BY HASH(package_id) +-- Hot query: WHERE package_id = X (all versions of a package). +-- 32 buckets → ~2.8M rows each at 90M total. 
+-- ============================================================ +CREATE TABLE versions ( + id bigserial, + package_id bigint NOT NULL REFERENCES packages (id), + ecosystem text NOT NULL, + number text NOT NULL, + published_at timestamptz, + is_latest bool NOT NULL DEFAULT FALSE, + is_yanked bool NOT NULL DEFAULT FALSE, + is_prerelease bool NOT NULL DEFAULT FALSE, + license text, -- SPDX where available; can differ per version + download_count bigint, -- per-version where available (npm, crates) + last_synced_at timestamptz NOT NULL DEFAULT NOW(), + PRIMARY KEY (id, package_id), + UNIQUE (package_id, number) +) +PARTITION BY HASH (package_id); + +CREATE TABLE versions_p0 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 0); + +CREATE TABLE versions_p1 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 1); + +CREATE TABLE versions_p2 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 2); + +CREATE TABLE versions_p3 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 3); + +CREATE TABLE versions_p4 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 4); + +CREATE TABLE versions_p5 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 5); + +CREATE TABLE versions_p6 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 6); + +CREATE TABLE versions_p7 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 7); + +CREATE TABLE versions_p8 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 8); + +CREATE TABLE versions_p9 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 9); + +CREATE TABLE versions_p10 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 10); + +CREATE TABLE versions_p11 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 11); + +CREATE TABLE versions_p12 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 12); + +CREATE TABLE versions_p13 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 13); + +CREATE TABLE 
versions_p14 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 14); + +CREATE TABLE versions_p15 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 15); + +CREATE TABLE versions_p16 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 16); + +CREATE TABLE versions_p17 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 17); + +CREATE TABLE versions_p18 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 18); + +CREATE TABLE versions_p19 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 19); + +CREATE TABLE versions_p20 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 20); + +CREATE TABLE versions_p21 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 21); + +CREATE TABLE versions_p22 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 22); + +CREATE TABLE versions_p23 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 23); + +CREATE TABLE versions_p24 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 24); + +CREATE TABLE versions_p25 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 25); + +CREATE TABLE versions_p26 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 26); + +CREATE TABLE versions_p27 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 27); + +CREATE TABLE versions_p28 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 28); + +CREATE TABLE versions_p29 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 29); + +CREATE TABLE versions_p30 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 30); + +CREATE TABLE versions_p31 PARTITION OF versions +FOR VALUES WITH (MODULUS 32, REMAINDER 31); + +CREATE INDEX ON versions (published_at DESC); + +CREATE INDEX ON versions (package_id) +WHERE + is_latest; + +-- ============================================================ +-- PACKAGE DEPENDENCIES — PARTITION BY HASH(depends_on_id) +-- +-- Terminology: +-- downstream = packages 
that depend ON a given package (its consumers) +-- upstream = packages that a given package depends ON (its suppliers) +-- +-- Security alerting hot query: "vulnerability in X — who is at risk?" +-- → WHERE depends_on_id = X (find all downstream consumers of X) +-- → lands in one partition → fast +-- 64 buckets → ~18M rows each at 1.15B total. +-- +-- Upstream query: "what does version X depend on?" +-- → WHERE version_id = X (scatters across all 64 partitions) +-- → slower by design; use the index on (version_id, depends_on_id, dependency_kind) +-- +-- package_id: no standalone FK; satisfies composite FK to +-- partitioned versions(id, package_id). +-- depends_on_id: FK to packages; also second component of +-- composite FK for resolved depends_on_version_id. +-- ============================================================ +CREATE TABLE package_dependencies ( + id bigserial, + package_id bigint NOT NULL, + version_id bigint NOT NULL, + depends_on_id bigint NOT NULL REFERENCES packages (id), + depends_on_version_id bigint, -- resolved version; NULL if unknown + version_constraint text, -- declared constraint e.g. 
'^1.2.3' + dependency_kind text NOT NULL, -- 'direct' | 'dev' | 'peer' + is_optional bool NOT NULL DEFAULT FALSE, + PRIMARY KEY (id, depends_on_id), + UNIQUE (version_id, depends_on_id, dependency_kind), + FOREIGN KEY (version_id, package_id) REFERENCES versions (id, package_id), + FOREIGN KEY (depends_on_version_id, depends_on_id) REFERENCES versions (id, package_id) +) +PARTITION BY HASH (depends_on_id); + +CREATE TABLE package_dependencies_p0 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 0); + +CREATE TABLE package_dependencies_p1 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 1); + +CREATE TABLE package_dependencies_p2 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 2); + +CREATE TABLE package_dependencies_p3 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 3); + +CREATE TABLE package_dependencies_p4 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 4); + +CREATE TABLE package_dependencies_p5 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 5); + +CREATE TABLE package_dependencies_p6 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 6); + +CREATE TABLE package_dependencies_p7 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 7); + +CREATE TABLE package_dependencies_p8 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 8); + +CREATE TABLE package_dependencies_p9 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 9); + +CREATE TABLE package_dependencies_p10 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 10); + +CREATE TABLE package_dependencies_p11 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 11); + +CREATE TABLE package_dependencies_p12 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 12); + +CREATE TABLE package_dependencies_p13 
PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 13); + +CREATE TABLE package_dependencies_p14 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 14); + +CREATE TABLE package_dependencies_p15 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 15); + +CREATE TABLE package_dependencies_p16 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 16); + +CREATE TABLE package_dependencies_p17 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 17); + +CREATE TABLE package_dependencies_p18 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 18); + +CREATE TABLE package_dependencies_p19 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 19); + +CREATE TABLE package_dependencies_p20 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 20); + +CREATE TABLE package_dependencies_p21 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 21); + +CREATE TABLE package_dependencies_p22 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 22); + +CREATE TABLE package_dependencies_p23 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 23); + +CREATE TABLE package_dependencies_p24 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 24); + +CREATE TABLE package_dependencies_p25 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 25); + +CREATE TABLE package_dependencies_p26 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 26); + +CREATE TABLE package_dependencies_p27 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 27); + +CREATE TABLE package_dependencies_p28 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 28); + +CREATE TABLE package_dependencies_p29 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 29); 
+ +CREATE TABLE package_dependencies_p30 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 30); + +CREATE TABLE package_dependencies_p31 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 31); + +CREATE TABLE package_dependencies_p32 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 32); + +CREATE TABLE package_dependencies_p33 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 33); + +CREATE TABLE package_dependencies_p34 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 34); + +CREATE TABLE package_dependencies_p35 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 35); + +CREATE TABLE package_dependencies_p36 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 36); + +CREATE TABLE package_dependencies_p37 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 37); + +CREATE TABLE package_dependencies_p38 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 38); + +CREATE TABLE package_dependencies_p39 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 39); + +CREATE TABLE package_dependencies_p40 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 40); + +CREATE TABLE package_dependencies_p41 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 41); + +CREATE TABLE package_dependencies_p42 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 42); + +CREATE TABLE package_dependencies_p43 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 43); + +CREATE TABLE package_dependencies_p44 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 44); + +CREATE TABLE package_dependencies_p45 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 45); + +CREATE TABLE package_dependencies_p46 PARTITION OF package_dependencies +FOR 
VALUES WITH (MODULUS 64, REMAINDER 46); + +CREATE TABLE package_dependencies_p47 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 47); + +CREATE TABLE package_dependencies_p48 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 48); + +CREATE TABLE package_dependencies_p49 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 49); + +CREATE TABLE package_dependencies_p50 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 50); + +CREATE TABLE package_dependencies_p51 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 51); + +CREATE TABLE package_dependencies_p52 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 52); + +CREATE TABLE package_dependencies_p53 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 53); + +CREATE TABLE package_dependencies_p54 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 54); + +CREATE TABLE package_dependencies_p55 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 55); + +CREATE TABLE package_dependencies_p56 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 56); + +CREATE TABLE package_dependencies_p57 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 57); + +CREATE TABLE package_dependencies_p58 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 58); + +CREATE TABLE package_dependencies_p59 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 59); + +CREATE TABLE package_dependencies_p60 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 60); + +CREATE TABLE package_dependencies_p61 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 61); + +CREATE TABLE package_dependencies_p62 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 62); + +CREATE TABLE 
package_dependencies_p63 PARTITION OF package_dependencies +FOR VALUES WITH (MODULUS 64, REMAINDER 63); + +-- downstream: version-specific lookup within the single partition +CREATE INDEX ON package_dependencies (depends_on_id, depends_on_version_id); + +-- upstream: version_id queries scatter across 64 partitions but use this index +CREATE INDEX ON package_dependencies (version_id); + +-- ============================================================ +-- DOMAIN 3: REPOSITORIES +-- ============================================================ +CREATE TABLE repos ( + id bigserial PRIMARY KEY, + url text UNIQUE NOT NULL, + host text, -- 'github' | 'gitlab' | 'bitbucket' | 'other' + owner TEXT, + name text, + description text, + primary_language text, + topics text[], + stars int, + forks int, + watchers int, + open_issues int, + last_commit_at timestamptz, + archived bool NOT NULL DEFAULT FALSE, + disabled bool NOT NULL DEFAULT FALSE, + is_fork bool NOT NULL DEFAULT FALSE, + created_at timestamptz, + -- Scorecard aggregate; per-check detail in repo_scorecard_checks + scorecard_score numeric(3, 1), + scorecard_last_run_at timestamptz, + last_synced_at timestamptz NOT NULL DEFAULT NOW() +); + +CREATE INDEX ON repos (host, OWNER, name); + +CREATE INDEX ON repos (stars DESC); + +CREATE INDEX ON repos (scorecard_score) +WHERE + scorecard_score IS NOT NULL; + +-- OpenSSF Scorecard per-check detail (~18 named checks) +CREATE TABLE repo_scorecard_checks ( + id bigserial PRIMARY KEY, + repo_id bigint NOT NULL REFERENCES repos (id), + check_name text NOT NULL, -- 'Binary-Artifacts' | 'Branch-Protection' | ... 
+ score numeric(3, 1), + reason text, + UNIQUE (repo_id, check_name) +); + +-- Docker images published by a repo (one-to-many) +CREATE TABLE repo_docker ( + id bigserial PRIMARY KEY, + repo_id bigint REFERENCES repos (id), -- nullable: image may predate repo link + image_name text NOT NULL, + pulls bigint, + stars int, + last_synced_at timestamptz NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX ON repo_docker (image_name); + +CREATE INDEX ON repo_docker (repo_id) +WHERE + repo_id IS NOT NULL; + +-- Package → repo provenance (monorepos publish N packages from one repo) +CREATE TABLE package_repos ( + id bigserial PRIMARY KEY, + package_id bigint NOT NULL REFERENCES packages (id), + repo_id bigint NOT NULL REFERENCES repos (id), + -- source values TBD pending alignment on data provider (e.g. deps.dev) + source text NOT NULL, -- 'declared' | 'deps_dev' | 'heuristic' | 'manual' + confidence numeric(3, 2) NOT NULL CHECK (confidence BETWEEN 0.00 AND 1.00), + verified_at timestamptz NOT NULL DEFAULT NOW(), + UNIQUE (package_id, repo_id) +); + +CREATE INDEX ON package_repos (repo_id); + +-- ============================================================ +-- DOMAIN 4: SECURITY (OSV-shaped) +-- One advisory → many affected packages → many version ranges. +-- Mirrors OSV schema: a single advisory can affect N packages +-- across different ecosystems (e.g. a vuln in a shared C lib). +-- ============================================================ +CREATE TABLE advisories ( + id bigserial PRIMARY KEY, + osv_id text UNIQUE NOT NULL, + aliases text[], -- CVE-XXXX, GHSA-... + severity text, -- 'LOW' | 'MEDIUM' | 'HIGH' | 'CRITICAL' + cvss numeric(3, 1), + -- >= 7.0 intentional: treat HIGH + CRITICAL both as actionable + is_critical bool GENERATED ALWAYS AS (cvss >= 7.0) STORED, + summary text, + details text, + published_at timestamptz, + modified_at timestamptz +); + +-- osv_id index omitted: UNIQUE constraint above already creates one. 
+CREATE INDEX ON advisories (is_critical) +WHERE + is_critical; + +-- Advisory → package mapping. One advisory can affect many packages. +-- package_id is NULL when the package exists in OSV but not yet in our DB. +CREATE TABLE advisory_packages ( + id bigserial PRIMARY KEY, + advisory_id bigint NOT NULL REFERENCES advisories (id), + package_id bigint REFERENCES packages (id), + ecosystem text NOT NULL, + package_name text NOT NULL, + UNIQUE (advisory_id, ecosystem, package_name) +); + +CREATE INDEX ON advisory_packages (ecosystem, package_name); + +CREATE INDEX ON advisory_packages (package_id) +WHERE + package_id IS NOT NULL; + +-- Version ranges affected by an advisory per package. +-- COALESCE prevents silent duplicates when introduced_version is NULL. +CREATE TABLE advisory_affected_ranges ( + id bigserial PRIMARY KEY, + advisory_package_id bigint NOT NULL REFERENCES advisory_packages (id), + introduced_version text, -- NULL = unknown start + fixed_version text, -- NULL = no fix yet + last_affected text -- NULL = no known upper bound +); + +CREATE UNIQUE INDEX ON advisory_affected_ranges (advisory_package_id, COALESCE(introduced_version, '')); + +CREATE INDEX ON advisory_affected_ranges (advisory_package_id); + +-- ============================================================ +-- MAINTAINERS +-- ============================================================ +CREATE TABLE maintainers ( + id bigserial PRIMARY KEY, + ecosystem text NOT NULL, + username text NOT NULL, + display_name text, + url text, + email_hash text, -- SHA-256; never raw email (GDPR) + github_login text, + UNIQUE (ecosystem, username) +); + +CREATE INDEX ON maintainers (github_login) +WHERE + github_login IS NOT NULL; + +CREATE TABLE package_maintainers ( + id bigserial PRIMARY KEY, + package_id bigint NOT NULL REFERENCES packages (id), + maintainer_id bigint NOT NULL REFERENCES maintainers (id), + role TEXT, -- 'author' | 'maintainer' + UNIQUE (package_id, maintainer_id) +); + +-- 
============================================================ +-- DOWNLOADS (time-series, partitioned by month via pg_partman) +-- +-- pg_partman MUST be enabled in OCI config before this migration runs: +-- OCI Console → Database → Configuration → Extensions → enable pg_partman +-- +-- After enabling, run the setup below (once, outside Flyway or in a +-- separate migration) to register pg_partman and create initial partitions: +-- +-- CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman; +-- +-- SELECT partman.create_parent( +-- p_parent_table => 'public.downloads_daily', +-- p_control => 'date', +-- p_interval => '1 month', +-- p_premake => 3 -- pre-creates 3 future monthly partitions +-- ); +-- +-- -- pg_cron job to maintain partitions (also needs pg_cron enabled in OCI): +-- SELECT cron.schedule('partman-maintain', '0 1 * * *', +-- $$CALL partman.run_maintenance_proc()$$); +-- +-- Without this setup, inserts into downloads_daily will fail with +-- "no partition found for row". The table structure below is correct; +-- only the partition management setup is deferred. +-- +-- PK includes date because Postgres requires the partition key to be +-- part of the primary key on range-partitioned tables. +-- ============================================================ +CREATE TABLE downloads_daily ( + id bigserial, + package_id bigint NOT NULL, + date date NOT NULL, + count bigint NOT NULL, + PRIMARY KEY (id, date), + UNIQUE (package_id, date) +) +PARTITION BY RANGE (date); diff --git a/scripts/cli b/scripts/cli index f2830b8274..9f3ce75aac 100755 --- a/scripts/cli +++ b/scripts/cli @@ -318,6 +318,19 @@ function create_product_migration() { yell "Created $UP_MIG_FILE" } +function create_packages_migration() { + MIG_NAME="$(printf '%s\n' "$1" | tr -s ' ' | xargs)" + if [[ -z "$MIG_NAME" ]] || [[ ! 
"$MIG_NAME" =~ ^[A-Za-z0-9_-]+$ ]]; then + error "Migration name must be non-empty and contain only letters, numbers, underscores, or hyphens" + exit 1 + fi + + MIG_VERSION=$(date +%s) + UP_MIG_FILE="${CLI_HOME}/../backend/src/osspckgs/migrations/V${MIG_VERSION}__${MIG_NAME}.sql" + touch "$UP_MIG_FILE" + yell "Created $UP_MIG_FILE" +} + function build_and_publish() { VERSION="$2" @@ -404,6 +417,10 @@ function scaffold() { create_product_migration $2 exit ;; + create-packages-migration) + create_packages_migration $2 + exit + ;; migrate-up) migrate_local exit @@ -689,7 +706,25 @@ function migrate_tinybird_local() { set -eo pipefail } +function wait_for_postgres() { + local service="$1" + local container="$2" + local attempts=10 + local i=0 + say "Waiting for postgres ($service) to be ready..." + until docker exec "$container" pg_isready -U postgres -q 2>/dev/null; do + i=$((i + 1)) + if [[ $i -ge $attempts ]]; then + error "Postgres ($service) not ready after ${attempts} attempts. Aborting." + exit 1 + fi + sleep 2 + done + say "Postgres ($service) is ready." +} + function migrate_postgres_local() { + wait_for_postgres db "${PROJECT_NAME}_db_1" say "Building crowd flyway migration image..." docker build $DOCKER_PLATFORM_FLAGS -t crowd_flyway -f $CLI_HOME/../backend/src/database/Dockerfile.flyway $CLI_HOME/../backend/src/database --load say "Applying PostgreSQL migrations!" @@ -703,6 +738,7 @@ function migrate_postgres_local() { } function migrate_productdb_local() { + wait_for_postgres product "${PROJECT_NAME}_product_1" say "Building product flyway migration image..." docker build $DOCKER_PLATFORM_FLAGS -t product_flyway -f $CLI_HOME/../backend/src/product/Dockerfile.flyway $CLI_HOME/../backend/src/product --load say "Applying product database migrations!" 
# Build the packages-db flyway image and run it once against the local
# `packages` scaffold database, after waiting for that database to accept
# connections. Connection settings mirror the `packages` service in
# scripts/scaffold.yaml.
function migrate_packagesdb_local() {
    wait_for_postgres packages "${PROJECT_NAME}_packages_1"

    # Build context that holds Dockerfile.flyway and the migrations directory.
    local flyway_ctx=$CLI_HOME/../backend/src/osspckgs
    local flyway_image=packages_flyway

    say "Building packages flyway migration image..."
    # $DOCKER_PLATFORM_FLAGS stays unquoted on purpose: it may expand to several flags or to nothing.
    docker build $DOCKER_PLATFORM_FLAGS -t "$flyway_image" -f $flyway_ctx/Dockerfile.flyway $flyway_ctx --load

    say "Applying packages database migrations!"
    local -a pg_env=(
        -e PGHOST=packages
        -e PGPORT=5432
        -e PGUSER=postgres
        -e PGPASSWORD=example
        -e PGDATABASE=packages-db
    )
    docker run --rm --network "${PROJECT_NAME}-bridge" "${pg_env[@]}" "$flyway_image"
}
00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 10:59:01 +0100 Subject: [PATCH 02/15] feat: scaffold packages_worker service with shared config and db helpers Signed-off-by: Mouad BANI --- pnpm-lock.yaml | 48 ++++++++++++++++----- services/apps/packages_worker/package.json | 28 ++++++++++++ services/apps/packages_worker/src/config.ts | 23 ++++++++++ services/apps/packages_worker/src/db.ts | 9 ++++ services/apps/packages_worker/src/types.ts | 2 + services/apps/packages_worker/tsconfig.json | 4 ++ 6 files changed, 104 insertions(+), 10 deletions(-) create mode 100644 services/apps/packages_worker/package.json create mode 100644 services/apps/packages_worker/src/config.ts create mode 100644 services/apps/packages_worker/src/db.ts create mode 100644 services/apps/packages_worker/src/types.ts create mode 100644 services/apps/packages_worker/tsconfig.json diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8b3408a593..96db6643f6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1292,6 +1292,34 @@ importers: specifier: ^3.0.1 version: 3.1.0 + services/apps/packages_worker: + dependencies: + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/database': + specifier: workspace:* + version: link:../../libs/database + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/pcc_sync_worker: dependencies: '@crowd/archetype-standard': @@ -10236,8 +10264,8 @@ snapshots: dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0 - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sso-oidc': 
3.572.0(@aws-sdk/client-sts@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10431,11 +10459,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sso-oidc@3.572.0': + '@aws-sdk/client-sso-oidc@3.572.0(@aws-sdk/client-sts@3.572.0)': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10474,6 +10502,7 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: + - '@aws-sdk/client-sts' - aws-crt '@aws-sdk/client-sso@3.556.0': @@ -10649,11 +10678,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sts@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': + '@aws-sdk/client-sts@3.572.0': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0 + '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10692,7 +10721,6 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: - - '@aws-sdk/client-sso-oidc' - aws-crt '@aws-sdk/client-sts@3.985.0': @@ -10858,7 +10886,7 @@ snapshots: '@aws-sdk/credential-provider-ini@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/credential-provider-env': 3.568.0 
'@aws-sdk/credential-provider-process': 3.572.0 '@aws-sdk/credential-provider-sso': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) @@ -11035,7 +11063,7 @@ snapshots: '@aws-sdk/credential-provider-web-identity@3.568.0(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/types': 3.567.0 '@smithy/property-provider': 2.2.0 '@smithy/types': 2.12.0 @@ -11347,7 +11375,7 @@ snapshots: '@aws-sdk/token-providers@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': dependencies: - '@aws-sdk/client-sso-oidc': 3.572.0 + '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) '@aws-sdk/types': 3.567.0 '@smithy/property-provider': 2.2.0 '@smithy/shared-ini-file-loader': 2.4.0 diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json new file mode 100644 index 0000000000..49b460d8ff --- /dev/null +++ b/services/apps/packages_worker/package.json @@ -0,0 +1,28 @@ +{ + "name": "@crowd/packages-worker", + "private": true, + "scripts": { + "start:packages-worker": "SERVICE=packages-worker tsx src/bin/packages-worker.ts", + "start:github-repos-enricher": "SERVICE=github-repos-enricher tsx src/bin/github-repos-enricher.ts", + "start:debug:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=packages-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", + "start:debug:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", + "dev:packages-worker:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:packages-worker:local", + "dev:github-repos-enricher:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:github-repos-enricher:local", + "lint": "npx eslint --ext .ts src --max-warnings=0", + "format": "npx prettier --write \"src/**/*.ts\"", + "format-check": "npx prettier --check .", + "tsc-check": "tsc --noEmit" + }, + "dependencies": { + "@crowd/common": "workspace:*", + "@crowd/data-access-layer": "workspace:*", + "@crowd/database": "workspace:*", + "@crowd/logging": "workspace:*", + "tsx": "^4.7.1", + "typescript": "^5.6.3" + }, + "devDependencies": { + "@types/node": "^20.8.2", + "nodemon": "^3.0.1" + } +} diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts new file mode 100644 index 0000000000..044bf6bbff --- /dev/null +++ b/services/apps/packages_worker/src/config.ts @@ -0,0 +1,23 @@ +export function getPackagesDbConfig() { + return { + host: process.env.CROWD_PACKAGES_DB_WRITE_HOST, + port: parseInt(process.env.CROWD_PACKAGES_DB_PORT ?? '5432', 10), + database: process.env.CROWD_PACKAGES_DB_DATABASE, + user: process.env.CROWD_PACKAGES_DB_USERNAME, + password: process.env.CROWD_PACKAGES_DB_PASSWORD, + } +} + +export function getEnricherConfig() { + const rawTokens = process.env.GITHUB_TOKENS ?? '' + const tokens = rawTokens.split(',').map((t) => t.trim()).filter(Boolean) + + return { + tokens, + pageSize: parseInt(process.env.PAGE_SIZE ?? '200', 10), + batchSize: parseInt(process.env.BATCH_SIZE ?? '50', 10), + maxRetries: parseInt(process.env.MAX_RETRIES ?? '3', 10), + updateIntervalHours: parseInt(process.env.UPDATE_INTERVAL_HOURS ?? '24', 10), + idleSleepSec: parseInt(process.env.IDLE_SLEEP_SEC ?? 
'60', 10), + } +} diff --git a/services/apps/packages_worker/src/db.ts b/services/apps/packages_worker/src/db.ts new file mode 100644 index 0000000000..c300b8b0a2 --- /dev/null +++ b/services/apps/packages_worker/src/db.ts @@ -0,0 +1,9 @@ +import { getDbConnection } from '@crowd/database' +import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' + +import { getPackagesDbConfig } from './config' + +export async function getPackagesDb() { + const conn = await getDbConnection(getPackagesDbConfig()) + return pgpQx(conn) +} diff --git a/services/apps/packages_worker/src/types.ts b/services/apps/packages_worker/src/types.ts new file mode 100644 index 0000000000..42e5b83174 --- /dev/null +++ b/services/apps/packages_worker/src/types.ts @@ -0,0 +1,2 @@ +// Shared types for the packages-worker domain. +// Individual entry points define their own internal types in their respective directories. diff --git a/services/apps/packages_worker/tsconfig.json b/services/apps/packages_worker/tsconfig.json new file mode 100644 index 0000000000..bf7f183850 --- /dev/null +++ b/services/apps/packages_worker/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../base.tsconfig.json", + "include": ["src/**/*"] +} From 2f393b2dfadd9c0c3e74966ec126bdbd3d87661d Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:00:15 +0100 Subject: [PATCH 03/15] fix: rename GITHUB_TOKENS to ENRICHER_GITHUB_TOKENS Signed-off-by: Mouad BANI --- services/apps/packages_worker/src/config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 044bf6bbff..a837a21168 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -9,7 +9,7 @@ export function getPackagesDbConfig() { } export function getEnricherConfig() { - const rawTokens = process.env.GITHUB_TOKENS ?? '' + const rawTokens = process.env.ENRICHER_GITHUB_TOKENS ?? 
/**
 * Entry point of the packages-worker parent process.
 *
 * Verifies connectivity to packages-db with a trivial query, makes sure the
 * directory for the health probe files exists, and then touches the live and
 * ready probe files every five seconds until shutdown is requested.
 */
const main = async () => {
  log.info('packages-worker starting...')

  const db = await getPackagesDb()
  await db.selectOne('SELECT 1')
  log.info('Connected to packages-db.')

  // Both probe files live in the same tmp directory; create it up front.
  fs.mkdirSync(path.dirname(liveFilePath), { recursive: true })

  // Open in append mode and close again: creates the file when it is missing.
  const touch = async (file: string) => {
    const handle = await fs.promises.open(file, 'a')
    await handle.close()
  }

  const refreshProbes = async () => {
    if (shuttingDown) return
    try {
      await Promise.all([liveFilePath, readyFilePath].map((file) => touch(file)))
    } catch (err) {
      // A failed probe write is logged but never takes the worker down.
      log.warn({ err }, 'Failed to write health probe files')
    }
  }

  setInterval(refreshProbes, 5000)

  log.info('packages-worker started, idle.')
}
+main().catch((err) => { + log.error({ err }, 'packages-worker fatal error') + process.exit(1) +}) From 59d1ec0d2ebefa4d234a5290c5602655cc3d4fdb Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:05:08 +0100 Subject: [PATCH 05/15] feat: add enricher helper files (types, fetchLightRepo, updateEnrichedRepos) Signed-off-by: Mouad BANI --- .../src/enricher/fetchLightRepo.ts | 96 +++++++++++++++++++ .../packages_worker/src/enricher/types.ts | 31 ++++++ .../src/enricher/updateEnrichedRepos.ts | 36 +++++++ 3 files changed, 163 insertions(+) create mode 100644 services/apps/packages_worker/src/enricher/fetchLightRepo.ts create mode 100644 services/apps/packages_worker/src/enricher/types.ts create mode 100644 services/apps/packages_worker/src/enricher/updateEnrichedRepos.ts diff --git a/services/apps/packages_worker/src/enricher/fetchLightRepo.ts b/services/apps/packages_worker/src/enricher/fetchLightRepo.ts new file mode 100644 index 0000000000..ebd1dd87f4 --- /dev/null +++ b/services/apps/packages_worker/src/enricher/fetchLightRepo.ts @@ -0,0 +1,96 @@ +import { FetchError, LightRepoResult } from './types' + +const GRAPHQL_URL = 'https://api.github.com/graphql' + +const REPO_QUERY = ` + query($owner: String!, $name: String!) 
/**
 * Extract the owner and repository name from a GitHub repository URL.
 *
 * The pattern is anchored at the end only: a trailing `.git` and a trailing
 * slash are tolerated, and any prefix before `http(s)://github.com/` is ignored.
 *
 * @param url - Repository URL such as `https://github.com/owner/repo`.
 * @returns The `owner` and `name` path segments.
 * @throws FetchError of kind `MALFORMED` when the URL does not match.
 */
export function parseGithubUrl(url: string): { owner: string; name: string } {
  const parts = url.match(/https?:\/\/github\.com\/([^/]+)\/([^/]+?)(?:\.git)?\/?$/)
  if (parts === null) {
    throw new FetchError('MALFORMED', `Cannot parse GitHub URL: ${url}`)
  }

  const [, owner, name] = parts
  return { owner, name }
}
resetSec * 1000 + 5_000 : Date.now() + 65_000 + + if (response.status === 401) { + throw new FetchError('AUTH', `401 Unauthorized for ${url}`) + } + + if (response.status === 403) { + const body = await response.text() + if (body.toLowerCase().includes('rate limit')) { + throw new FetchError('RATE_LIMIT', `Rate limited on ${url}`, resetMs) + } + throw new FetchError('AUTH', `403 Forbidden for ${url}`) + } + + if (response.status === 404) throw new FetchError('NOT_FOUND', `404 for ${url}`) + if (response.status >= 500) throw new FetchError('TRANSIENT', `${response.status} for ${url}`) + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const json = (await response.json()) as any + + if (json.errors?.length) { + const err = json.errors[0] + if (err.type === 'RATE_LIMITED') throw new FetchError('RATE_LIMIT', `RATE_LIMITED for ${url}`, resetMs) + if (err.type === 'NOT_FOUND') throw new FetchError('NOT_FOUND', `NOT_FOUND for ${url}`) + throw new FetchError('TRANSIENT', `GraphQL error for ${url}: ${err.message ?? err.type}`) + } + + const repo = json.data?.repository + if (!repo) throw new FetchError('NOT_FOUND', `No repository data for ${url}`) + + return { + url, + host: 'github', + owner, + name, + description: repo.description ?? null, + primaryLanguage: repo.primaryLanguage?.name ?? null, + topics: (repo.repositoryTopics?.nodes ?? []).map((n: { topic: { name: string } }) => n.topic.name), + stars: repo.stargazerCount ?? 0, + forks: repo.forkCount ?? 0, + watchers: repo.watchers?.totalCount ?? 0, + openIssues: repo.issues?.totalCount ?? 0, + lastCommitAt: repo.pushedAt ?? null, + archived: repo.isArchived ?? false, + disabled: repo.isDisabled ?? false, + isFork: repo.isFork ?? false, + createdAt: repo.createdAt ?? 
/** Repository metadata collected by the lightweight GitHub GraphQL fetch. */
export interface LightRepoResult {
  url: string
  host: 'github'
  owner: string
  name: string
  description: string | null
  primaryLanguage: string | null
  topics: string[]
  stars: number
  forks: number
  watchers: number
  openIssues: number
  lastCommitAt: string | null
  archived: boolean
  disabled: boolean
  isFork: boolean
  createdAt: string | null
}

/** Failure categories a fetch can be classified into. */
export type FetchErrorKind = 'RATE_LIMIT' | 'TRANSIENT' | 'NOT_FOUND' | 'AUTH' | 'MALFORMED'

/**
 * Error carrying a failure category and, optionally, the epoch-millisecond
 * time at which the caller may try again.
 */
export class FetchError extends Error {
  /** Category of the failure. */
  public readonly kind: FetchErrorKind
  /** Epoch milliseconds after which a retry is expected to succeed, when known. */
  public readonly resetAt?: number

  constructor(kind: FetchErrorKind, message: string, resetAt?: number) {
    super(message)
    this.kind = kind
    this.resetAt = resetAt
    this.name = 'FetchError'
  }
}
last_commit_at = $(lastCommitAt)::timestamptz, + archived = $(archived), + disabled = $(disabled), + is_fork = $(isFork), + created_at = COALESCE(created_at, $(createdAt)::timestamptz), + last_synced_at = NOW() + WHERE url = $(url)`, + r, + ) + } + + log.debug({ count: rows.length }, 'Updated enriched repos') +} From 57066a5afd8d1682ada191fa180248614c7b8097 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:09:49 +0100 Subject: [PATCH 06/15] fix: remove default values from config to surface missing env vars explicitly Signed-off-by: Mouad BANI --- services/apps/packages_worker/src/config.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index a837a21168..5614e06aa0 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -1,7 +1,7 @@ export function getPackagesDbConfig() { return { host: process.env.CROWD_PACKAGES_DB_WRITE_HOST, - port: parseInt(process.env.CROWD_PACKAGES_DB_PORT ?? '5432', 10), + port: parseInt(process.env.CROWD_PACKAGES_DB_PORT, 10), database: process.env.CROWD_PACKAGES_DB_DATABASE, user: process.env.CROWD_PACKAGES_DB_USERNAME, password: process.env.CROWD_PACKAGES_DB_PASSWORD, @@ -14,10 +14,10 @@ export function getEnricherConfig() { return { tokens, - pageSize: parseInt(process.env.PAGE_SIZE ?? '200', 10), - batchSize: parseInt(process.env.BATCH_SIZE ?? '50', 10), - maxRetries: parseInt(process.env.MAX_RETRIES ?? '3', 10), - updateIntervalHours: parseInt(process.env.UPDATE_INTERVAL_HOURS ?? '24', 10), - idleSleepSec: parseInt(process.env.IDLE_SLEEP_SEC ?? 
'60', 10), + pageSize: parseInt(process.env.PAGE_SIZE, 10), + batchSize: parseInt(process.env.BATCH_SIZE, 10), + maxRetries: parseInt(process.env.MAX_RETRIES, 10), + updateIntervalHours: parseInt(process.env.UPDATE_INTERVAL_HOURS, 10), + idleSleepSec: parseInt(process.env.IDLE_SLEEP_SEC, 10), } } From afbdbddb8e301bb23405a04c97c0f1ef4c43d6ef Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:11:33 +0100 Subject: [PATCH 07/15] feat: add enrichment loop and github-repos-enricher entry point Signed-off-by: Mouad BANI --- .../src/bin/github-repos-enricher.ts | 69 +++++ .../src/enricher/runEnrichmentLoop.ts | 183 ++++++++++++ .../apps/script_executor_worker/package.json | 1 + .../bin/sync-light-repos/fetchLightRepo.ts | 96 ++++++ .../src/bin/sync-light-repos/index.ts | 276 ++++++++++++++++++ .../src/bin/sync-light-repos/types.ts | 46 +++ .../bin/sync-light-repos/upsertLightRepos.ts | 49 ++++ 7 files changed, 720 insertions(+) create mode 100644 services/apps/packages_worker/src/bin/github-repos-enricher.ts create mode 100644 services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts diff --git a/services/apps/packages_worker/src/bin/github-repos-enricher.ts b/services/apps/packages_worker/src/bin/github-repos-enricher.ts new file mode 100644 index 0000000000..a40064b724 --- /dev/null +++ b/services/apps/packages_worker/src/bin/github-repos-enricher.ts @@ -0,0 +1,69 @@ +import fs from 'fs' +import path from 'path' + +import { getServiceLogger } from '@crowd/logging' + +import { getEnricherConfig } from '../config' +import { getPackagesDb } from '../db' +import { 
/**
 * Entry point of the github-repos-enricher service.
 *
 * Loads and validates the enricher configuration, checks connectivity to
 * packages-db, keeps the health probe files fresh every five seconds, and runs
 * the enrichment loop until shutdown is requested. Exits with status 1 on
 * invalid configuration and with status 0 after a clean stop.
 */
const main = async () => {
  log.info('github-repos-enricher starting...')

  const config = getEnricherConfig()

  if (config.tokens.length === 0) {
    log.error('ENRICHER_GITHUB_TOKENS is required (comma-separated PATs)')
    process.exit(1)
  }

  // The numeric settings are parsed with parseInt() and no fallback, so an unset
  // or malformed variable arrives here as NaN. NaN does not fail on its own: a NaN
  // retry cap makes the retry loop skip every fetch, and a NaN idle sleep turns
  // the idle wait into a hot loop. Fail fast instead.
  const numericSettings: Array<[string, number, number]> = [
    ['PAGE_SIZE', config.pageSize, 1],
    ['BATCH_SIZE', config.batchSize, 1],
    ['MAX_RETRIES', config.maxRetries, 0],
    ['UPDATE_INTERVAL_HOURS', config.updateIntervalHours, 0],
    ['IDLE_SLEEP_SEC', config.idleSleepSec, 0],
  ]
  const invalid = numericSettings
    .filter(([, value, min]) => !Number.isInteger(value) || value < min)
    .map(([envVar]) => envVar)

  if (invalid.length > 0) {
    log.error({ invalid }, 'Missing or invalid numeric configuration')
    process.exit(1)
  }

  const qx = await getPackagesDb()
  await qx.selectOne('SELECT 1')
  log.info('Connected to packages-db.')

  fs.mkdirSync(path.dirname(liveFilePath), { recursive: true })

  const healthInterval = setInterval(async () => {
    if (shuttingDown) return
    try {
      await Promise.all([
        fs.promises.open(liveFilePath, 'a').then((f) => f.close()),
        fs.promises.open(readyFilePath, 'a').then((f) => f.close()),
      ])
    } catch (err) {
      // A failed probe write is logged but never takes the service down.
      log.warn({ err }, 'Failed to write health probe files')
    }
  }, 5000)

  log.info(
    { tokens: config.tokens.length, pageSize: config.pageSize, batchSize: config.batchSize },
    'Starting enrichment loop',
  )

  // Returns once the shutdown flag has been observed by the loop.
  await runEnrichmentLoop(qx, config, () => shuttingDown)

  clearInterval(healthInterval)
  log.info('github-repos-enricher stopped.')
  process.exit(0)
}
/**
 * Read the next page of GitHub repos that are due for enrichment.
 *
 * A repo is due when it has never been synced or was last synced more than
 * `updateIntervalHours` ago. Pages are keyset-paginated on `id`.
 *
 * @param qx - Query executor bound to packages-db.
 * @param cursor - `id` of the last row of the previous page, or `null` to start over.
 * @param pageSize - Maximum number of rows to return.
 * @param updateIntervalHours - Minimum age, in hours, before a repo is synced again.
 * @returns The raw rows plus their URLs in the same order.
 */
async function fetchPage(
  qx: QueryExecutor,
  cursor: string | null,
  pageSize: number,
  updateIntervalHours: number,
): Promise<{ rows: Array<{ id: string; url: string }>; urls: string[] }> {
  // The interval is built arithmetically instead of interpolating the variable
  // inside a quoted literal: substitution inside quotes only yields valid SQL
  // while the formatter happens to emit a bare number.
  const rows = await qx.select(
    `
      SELECT id, url
      FROM repos
      WHERE host = 'github'
        AND (last_synced_at IS NULL OR last_synced_at < NOW() - $(updateIntervalHours) * INTERVAL '1 hour')
        AND ($(cursor) IS NULL OR id > $(cursor))
      ORDER BY id
      LIMIT $(pageSize)
    `,
    { cursor, pageSize, updateIntervalHours },
  )
  return {
    rows,
    urls: rows.map((r: { url: string }) => r.url),
  }
}
string[] = [] + let skipped = 0 + for (const url of urls) { + try { + parseGithubUrl(url) + validUrls.push(url) + } catch { + skipped++ + } + } + if (skipped > 0) log.warn(`Skipped ${skipped} non-GitHub URLs`) + + const buffer: LightRepoResult[] = [] + const failures: Array<{ url: string; reason: string }> = [] + let failed = 0 + let flushed = 0 + let nextIdx = 0 + + await Promise.all( + tokens.map(async (token, tokenIdx) => { + // Respect any park set during a previous page of this run + const initialPark = (parkedUntil.get(token) ?? 0) - Date.now() + if (initialPark > 0) { + log.warn(`token#${tokenIdx} still parked, waiting ${Math.round(initialPark / 1000)}s`) + await new Promise((r) => setTimeout(r, initialPark)) + } + + while (true) { + const idx = nextIdx++ + if (idx >= validUrls.length) break + const url = validUrls[idx] + + try { + const result = await fetchWithRetries(url, token, config.maxRetries) + if (result) { + buffer.push(result) + if (buffer.length >= config.batchSize) { + const batch = buffer.splice(0) + await updateEnrichedRepos(qx, batch) + flushed += batch.length + } + } else { + failures.push({ url, reason: 'see warn log above' }) + failed++ + } + } catch (err) { + if (err instanceof FetchError && err.kind === 'RATE_LIMIT') { + const resetAt = err.resetAt ?? 
/** Sleep for up to `ms` milliseconds, waking early once shutdown is requested. */
async function sleepUnlessShuttingDown(ms: number, isShuttingDown: () => boolean): Promise<void> {
  const deadline = Date.now() + ms
  while (!isShuttingDown()) {
    const remaining = deadline - Date.now()
    // `!(x > 0)` rather than `x <= 0`, so a NaN duration ends the wait as well.
    if (!(remaining > 0)) return
    // Short slices keep the shutdown flag observed at least once per second.
    await new Promise((resolve) => setTimeout(resolve, Math.min(remaining, 1_000)))
  }
}

/**
 * Continuously enrich GitHub repos until shutdown is requested.
 *
 * Reads due repos page by page, hands each page to `processPage`, and advances
 * the cursor to the last id of the page. When a page comes back empty the loop
 * sleeps for `config.idleSleepSec` seconds and restarts from the beginning.
 *
 * @param qx - Query executor bound to packages-db.
 * @param config - Enricher configuration as returned by `getEnricherConfig`.
 * @param isShuttingDown - Returns true once the process has been asked to stop;
 *   checked before every page and during the idle sleep.
 */
export async function runEnrichmentLoop(
  qx: QueryExecutor,
  config: ReturnType<typeof getEnricherConfig>,
  isShuttingDown: () => boolean,
): Promise<void> {
  // Per-token rate-limit parking shared across pages, passed through to processPage.
  const parkedUntil = new Map<string, number>()
  let cursor: string | null = null
  let pageNum = 0

  while (!isShuttingDown()) {
    pageNum++

    const { rows, urls } = await fetchPage(qx, cursor, config.pageSize, config.updateIntervalHours)

    if (urls.length === 0) {
      log.info('No more repos to process, sleeping')
      // Interruptible, so a termination signal is not ignored for the whole idle period.
      await sleepUnlessShuttingDown(config.idleSleepSec * 1000, isShuttingDown)
      cursor = null
      continue
    }

    const { fetched, failed, flushed } = await processPage(urls, config.tokens, parkedUntil, config, qx)

    log.info(
      `Page ${pageNum}: read=${urls.length} fetched=${fetched} failed=${failed} flushed=${flushed}`,
    )

    if (rows.length > 0) {
      cursor = rows[rows.length - 1].id
    }
  }
}
/**
 * Fetch lightweight metadata for one GitHub repository through the GraphQL API.
 *
 * @param url - GitHub repository URL; parsed with `parseGithubUrl`.
 * @param token - GitHub token sent as a bearer credential.
 * @returns The repository metadata, with absent fields defaulted to `null`, `0`, `false` or `[]`.
 * @throws FetchError with kind
 *   - `MALFORMED` when the URL cannot be parsed (raised by `parseGithubUrl`),
 *   - `TRANSIENT` on network failure, 5xx, an unreadable body or an unclassified GraphQL error,
 *   - `AUTH` on 401 or a 403 that is not a rate limit,
 *   - `RATE_LIMIT` (with `resetAt`) on 429, a rate-limit 403 or a GraphQL `RATE_LIMITED` error,
 *   - `NOT_FOUND` on 404, a GraphQL `NOT_FOUND` error or missing repository data.
 */
export async function fetchLightRepo(url: string, token: string): Promise<LightRepoResult> {
  const { owner, name } = parseGithubUrl(url)

  let response: Response
  try {
    response = await fetch(GRAPHQL_URL, {
      method: 'POST',
      headers: {
        Authorization: `bearer ${token}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({ query: REPO_QUERY, variables: { owner, name } }),
    })
  } catch (err) {
    throw new FetchError('TRANSIENT', `Network error for ${url}: ${(err as Error).message}`)
  }

  // When to retry after a rate limit: an explicit `retry-after` (seconds) wins,
  // then `x-ratelimit-reset` (epoch seconds, plus a 5s safety margin), and
  // otherwise a little over a minute from now.
  const retryAfterSec = parseInt(response.headers.get('retry-after') ?? '', 10)
  const resetSec = parseInt(response.headers.get('x-ratelimit-reset') ?? '0', 10)
  let resetMs: number
  if (retryAfterSec > 0) {
    resetMs = Date.now() + retryAfterSec * 1000
  } else if (resetSec) {
    resetMs = resetSec * 1000 + 5_000
  } else {
    resetMs = Date.now() + 65_000
  }

  if (response.status === 401) {
    throw new FetchError('AUTH', `401 Unauthorized for ${url}`)
  }

  // 429 is always a rate limit; 403 is one only when the body says so.
  if (response.status === 429) {
    throw new FetchError('RATE_LIMIT', `Rate limited on ${url}`, resetMs)
  }

  if (response.status === 403) {
    const body = await response.text()
    if (body.toLowerCase().includes('rate limit')) {
      throw new FetchError('RATE_LIMIT', `Rate limited on ${url}`, resetMs)
    }
    throw new FetchError('AUTH', `403 Forbidden for ${url}`)
  }

  if (response.status === 404) throw new FetchError('NOT_FOUND', `404 for ${url}`)
  if (response.status >= 500) throw new FetchError('TRANSIENT', `${response.status} for ${url}`)

  // A non-JSON body (for example a proxy error page) must surface as a FetchError,
  // otherwise it escapes the caller's retry classification as a raw SyntaxError.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  let json: any
  try {
    json = await response.json()
  } catch (err) {
    throw new FetchError('TRANSIENT', `Invalid JSON response for ${url}: ${(err as Error).message}`)
  }

  if (json.errors?.length) {
    const err = json.errors[0]
    if (err.type === 'RATE_LIMITED') throw new FetchError('RATE_LIMIT', `RATE_LIMITED for ${url}`, resetMs)
    if (err.type === 'NOT_FOUND') throw new FetchError('NOT_FOUND', `NOT_FOUND for ${url}`)
    throw new FetchError('TRANSIENT', `GraphQL error for ${url}: ${err.message ?? err.type}`)
  }

  const repo = json.data?.repository
  if (!repo) throw new FetchError('NOT_FOUND', `No repository data for ${url}`)

  return {
    url,
    host: 'github',
    owner,
    name,
    description: repo.description ?? null,
    primaryLanguage: repo.primaryLanguage?.name ?? null,
    topics: (repo.repositoryTopics?.nodes ?? []).map((n: { topic: { name: string } }) => n.topic.name),
    stars: repo.stargazerCount ?? 0,
    forks: repo.forkCount ?? 0,
    watchers: repo.watchers?.totalCount ?? 0,
    openIssues: repo.issues?.totalCount ?? 0,
    lastCommitAt: repo.pushedAt ?? null,
    archived: repo.isArchived ?? false,
    disabled: repo.isDisabled ?? false,
    isFork: repo.isFork ?? false,
    createdAt: repo.createdAt ?? null,
  }
}
(flag: string) => { + const idx = args.indexOf(flag) + return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : undefined + } + + const pageSize = parseInt(getArg('--page-size') ?? '200', 10) + const batchSize = parseInt(getArg('--batch-size') ?? '50', 10) + const maxRetries = parseInt(getArg('--max-retries') ?? '3', 10) + const startAfter = getArg('--start-after') ?? null + const limitRaw = getArg('--limit') + const limit = limitRaw !== undefined ? parseInt(limitRaw, 10) : null + const dryRun = args.includes('--dry-run') + + if (isNaN(pageSize) || pageSize <= 0) { log.error('--page-size must be a positive integer'); process.exit(1) } + if (isNaN(batchSize) || batchSize <= 0) { log.error('--batch-size must be a positive integer'); process.exit(1) } + if (isNaN(maxRetries) || maxRetries < 0) { log.error('--max-retries must be a non-negative integer'); process.exit(1) } + if (limit !== null && (isNaN(limit) || limit <= 0)) { log.error('--limit must be a positive integer'); process.exit(1) } + + return { pageSize, batchSize, maxRetries, startAfter, limit, dryRun } +} + +// TODO: add LEFT JOIN repos r ON r.url = pr.url and filter +// WHERE (r.last_synced_at IS NULL OR r.last_synced_at < NOW() - INTERVAL '$(updateIntervalHours) hours') +// once the update interval logic is scoped in. +async function fetchPage( + qx: ReturnType, + cursor: string | null, + pageSize: number, +): Promise<{ urls: string[]; nextCursor: string | null }> { + const rows = await qx.select( + ` + SELECT id, url + FROM public.repositories + WHERE url LIKE 'https://github.com/%' + AND "deletedAt" IS NULL + ${cursor ? 'AND id > $(cursor)' : ''} + ORDER BY id + LIMIT $(pageSize) + `, + { cursor, pageSize }, + ) + return { + urls: rows.map((r: { url: string }) => r.url), + nextCursor: rows.length > 0 ? 
(rows[rows.length - 1] as { id: string }).id : null, + } +} + +async function fetchWithRetries( + url: string, + token: string, + maxRetries: number, +): Promise { + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + return await fetchLightRepo(url, token) + } catch (err) { + if (!(err instanceof FetchError)) throw err + + if (['NOT_FOUND', 'AUTH', 'MALFORMED'].includes(err.kind)) { + log.warn({ url, kind: err.kind }, err.message) + return null + } + + if (err.kind === 'RATE_LIMIT') throw err + + if (attempt < maxRetries) { + const backoffMs = 1000 * 2 ** attempt + log.warn({ url, attempt, backoffMs }, `Transient error, retrying: ${err.message}`) + await new Promise((r) => setTimeout(r, backoffMs)) + } else { + log.error({ url }, `Gave up after ${maxRetries} retries: ${err.message}`) + return null + } + } + } + return null +} + +async function processPage( + urls: string[], + tokens: string[], + parkedUntil: Map, + opts: ReturnType, + qx: ReturnType, +): Promise<{ fetched: number; failed: number; flushed: number }> { + const validUrls: string[] = [] + let skipped = 0 + for (const url of urls) { + try { parseGithubUrl(url); validUrls.push(url) } catch { skipped++ } + } + if (skipped > 0) log.warn(`Skipped ${skipped} non-GitHub URLs`) + + const buffer: LightRepoResult[] = [] + const failures: Array<{ url: string; reason: string }> = [] + let failed = 0 + let flushed = 0 + let nextIdx = 0 + + await Promise.all( + tokens.map(async (token, tokenIdx) => { + // Respect any park set during a previous page of this run + const initialPark = (parkedUntil.get(token) ?? 
0) - Date.now() + if (initialPark > 0) { + log.warn(`token#${tokenIdx} still parked, waiting ${Math.round(initialPark / 1000)}s`) + await new Promise((r) => setTimeout(r, initialPark)) + } + + while (true) { + const idx = nextIdx++ + if (idx >= validUrls.length) break + const url = validUrls[idx] + + try { + const result = await fetchWithRetries(url, token, opts.maxRetries) + if (result) { + buffer.push(result) + if (!opts.dryRun && buffer.length >= opts.batchSize) { + const batch = buffer.splice(0) + await upsertLightRepos(qx, batch) + flushed += batch.length + } + } else { + failures.push({ url, reason: 'see warn log above' }) + failed++ + } + } catch (err) { + if (err instanceof FetchError && err.kind === 'RATE_LIMIT') { + const resetAt = err.resetAt ?? Date.now() + 60_000 + const waitMs = Math.max(1_000, resetAt - Date.now()) + parkedUntil.set(token, resetAt) + log.warn( + { tokenIdx, parkedUntil: new Date(resetAt).toISOString() }, + `token#${tokenIdx} rate limited — parking for ${Math.round(waitMs / 1000)}s`, + ) + await new Promise((r) => setTimeout(r, waitMs)) + failures.push({ url, reason: 'rate-limit' }) + failed++ + } else { + log.error({ url, err }, 'Unexpected error') + failures.push({ url, reason: (err as Error).message }) + failed++ + } + } + } + }), + ) + + if (!opts.dryRun && buffer.length > 0) { + await upsertLightRepos(qx, buffer) + flushed += buffer.length + } + + if (failures.length > 0) { + log.warn({ failures }, `${failures.length} repo(s) failed this page`) + } + + return { fetched: validUrls.length - failed, failed, flushed } +} + +async function main() { + const opts = parseArgs() + + const tokens = (process.env.GITHUB_TOKENS ?? '') + .split(',') + .map((t) => t.trim()) + .filter(Boolean) + + if (tokens.length === 0) { + log.error('GITHUB_TOKENS is required (comma-separated PATs)') + process.exit(1) + } + + // TODO: when connecting the real DB, replace with a connection pool and add keepalive / + // reconnect-on-error handling. 
A single long-lived connection will be dropped by the server + // during multi-hour runs (TCP timeout, idle reaper), crashing the script. Completed work + // is safe via last_synced_at, but the run stops and must be manually resumed. + const dbConnection = await getDbConnection(WRITE_DB_CONFIG()) + const qx = pgpQx(dbConnection) + + log.info('='.repeat(60)) + log.info('sync-light-repos') + log.info(`tokens=${tokens.length} page-size=${opts.pageSize} batch-size=${opts.batchSize}`) + log.info(`max-retries=${opts.maxRetries} dry-run=${opts.dryRun} limit=${opts.limit ?? 'none'}`) + log.info(`start-after=${opts.startAfter ?? '(beginning)'}`) + log.info('='.repeat(60)) + + const parkedUntil = new Map() + let cursor = opts.startAfter + let pageNum = 0 + let totalProcessed = 0 + let totalFailed = 0 + let totalFlushed = 0 + + while (true) { + pageNum++ + + const remaining = opts.limit !== null ? opts.limit - totalProcessed : opts.pageSize + if (remaining <= 0) break + + const { urls, nextCursor } = await fetchPage(qx, cursor, Math.min(opts.pageSize, remaining)) + + if (urls.length === 0) { + log.info('No more repos to process') + break + } + + const { fetched, failed, flushed } = await processPage(urls, tokens, parkedUntil, opts, qx) + + totalProcessed += urls.length + totalFailed += failed + totalFlushed += flushed + + log.info( + `Page ${pageNum}: read=${urls.length} fetched=${fetched} failed=${failed}${opts.dryRun ? ' [dry-run]' : ` flushed=${flushed}`}`, + ) + + if (nextCursor) { + log.info(`Resume with: --start-after ${nextCursor}`) + cursor = nextCursor + } + + if (urls.length < Math.min(opts.pageSize, remaining)) break + } + + log.info('='.repeat(60)) + log.info(`Summary: pages=${pageNum} processed=${totalProcessed} failed=${totalFailed} flushed=${totalFlushed}`) + log.info('='.repeat(60)) + + process.exit(totalFailed > 0 ? 
1 : 0) +} + +main().catch((err) => { + log.error({ err }, 'Unexpected error') + process.exit(1) +}) diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts new file mode 100644 index 0000000000..f9b5d0fc5b --- /dev/null +++ b/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts @@ -0,0 +1,46 @@ +export interface LightRepoResult { + url: string + host: 'github' + owner: string + name: string + description: string | null + primaryLanguage: string | null + topics: string[] + stars: number + forks: number + watchers: number + openIssues: number + lastCommitAt: string | null + archived: boolean + disabled: boolean + isFork: boolean + createdAt: string | null +} + +export interface ParsedRepoUrl { + owner: string + name: string +} + +export interface Options { + pageSize: number + batchSize: number + maxRetries: number + startAfter: string | null + limit: number | null + dryRun: boolean + source: string +} + +export type FetchErrorKind = 'RATE_LIMIT' | 'TRANSIENT' | 'NOT_FOUND' | 'AUTH' | 'MALFORMED' + +export class FetchError extends Error { + constructor( + public readonly kind: FetchErrorKind, + message: string, + public readonly resetAt?: number, // epoch ms; only for RATE_LIMIT + ) { + super(message) + this.name = 'FetchError' + } +} diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts new file mode 100644 index 0000000000..f13af677fe --- /dev/null +++ b/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts @@ -0,0 +1,49 @@ +import { getServiceChildLogger } from '@crowd/logging' + +// import { formatQuery, QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' + +import { LightRepoResult } from './types' + +const 
log = getServiceChildLogger('sync-light-repos:upsert') + +export async function upsertLightRepos(_qx: QueryExecutor, rows: LightRepoResult[]): Promise { + if (rows.length === 0) return + + log.info({ count: rows.length, rows: JSON.stringify(rows, null, 2) }, 'upsert results') + + // const values = rows + // .map((r) => + // formatQuery( + // `($(url), $(host), $(owner), $(name), $(description), $(primaryLanguage), $(topics)::text[], + // $(stars), $(forks), $(watchers), $(openIssues), $(lastCommitAt)::timestamptz, + // $(archived), $(disabled), $(isFork), $(createdAt)::timestamptz)`, + // r, + // ), + // ) + // .join(',\n') + + // await _qx.result(` + // INSERT INTO repos ( + // url, host, owner, name, description, primary_language, topics, + // stars, forks, watchers, open_issues, last_commit_at, + // archived, disabled, is_fork, created_at, last_synced_at + // ) VALUES ${values} + // ON CONFLICT (url) DO UPDATE SET + // host = EXCLUDED.host, + // owner = EXCLUDED.owner, + // name = EXCLUDED.name, + // description = EXCLUDED.description, + // primary_language = EXCLUDED.primary_language, + // topics = EXCLUDED.topics, + // stars = EXCLUDED.stars, + // forks = EXCLUDED.forks, + // watchers = EXCLUDED.watchers, + // open_issues = EXCLUDED.open_issues, + // last_commit_at = EXCLUDED.last_commit_at, + // archived = EXCLUDED.archived, + // disabled = EXCLUDED.disabled, + // is_fork = EXCLUDED.is_fork, + // last_synced_at = NOW() + // `) +} From 4b6c2875c8f1ee480c4c28c6bb320e4ca3da59a5 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:13:31 +0100 Subject: [PATCH 08/15] feat: add packages-worker and github-repos-enricher deployment artifacts Signed-off-by: Mouad BANI --- .../docker/Dockerfile.packages-worker | 22 +++++++ scripts/services/github-repos-enricher.yaml | 64 +++++++++++++++++++ scripts/services/packages-worker.yaml | 64 +++++++++++++++++++ 3 files changed, 150 insertions(+) create mode 100644 scripts/services/docker/Dockerfile.packages-worker 
create mode 100644 scripts/services/github-repos-enricher.yaml create mode 100644 scripts/services/packages-worker.yaml diff --git a/scripts/services/docker/Dockerfile.packages-worker b/scripts/services/docker/Dockerfile.packages-worker new file mode 100644 index 0000000000..491ebd7665 --- /dev/null +++ b/scripts/services/docker/Dockerfile.packages-worker @@ -0,0 +1,22 @@ +FROM node:20-alpine as builder + +RUN apk add --no-cache python3 make g++ + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate + +COPY ./pnpm-workspace.yaml ./pnpm-lock.yaml ./ +RUN pnpm fetch + +COPY ./services ./services +RUN pnpm i --frozen-lockfile + +FROM node:20-alpine as runner + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate + +COPY --from=builder /usr/crowd/app/node_modules ./node_modules +COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json +COPY --from=builder /usr/crowd/app/services/libs ./services/libs +COPY --from=builder /usr/crowd/app/services/apps/packages_worker/ ./services/apps/packages_worker diff --git a/scripts/services/github-repos-enricher.yaml b/scripts/services/github-repos-enricher.yaml new file mode 100644 index 0000000000..b43dc6205e --- /dev/null +++ b/scripts/services/github-repos-enricher.yaml @@ -0,0 +1,64 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: github-repos-enricher + SHELL: /bin/sh + +services: + github-repos-enricher: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run start:github-repos-enricher' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + 
restart: always + networks: + - crowd-bridge + + github-repos-enricher-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run dev:github-repos-enricher:local' + working_dir: /usr/crowd/app/services/apps/packages_worker + # user: '${USER_ID}:${GROUP_ID}' + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: github-repos-enricher + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true diff --git a/scripts/services/packages-worker.yaml 
b/scripts/services/packages-worker.yaml new file mode 100644 index 0000000000..0ac6658cbd --- /dev/null +++ b/scripts/services/packages-worker.yaml @@ -0,0 +1,64 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: packages-worker + SHELL: /bin/sh + +services: + packages-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run start:packages-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + packages-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run dev:packages-worker:local' + working_dir: /usr/crowd/app/services/apps/packages_worker + # user: '${USER_ID}:${GROUP_ID}' + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: packages-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - 
../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true From 866f584870dc20bb44281a0485d2061f803add6f Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:15:22 +0100 Subject: [PATCH 09/15] chore: remove sync-light-repos script from script_executor_worker (moved to packages_worker) Signed-off-by: Mouad BANI --- .../apps/script_executor_worker/package.json | 1 - .../bin/sync-light-repos/fetchLightRepo.ts | 96 ------ .../src/bin/sync-light-repos/index.ts | 276 ------------------ .../src/bin/sync-light-repos/types.ts | 46 --- .../bin/sync-light-repos/upsertLightRepos.ts | 49 ---- 5 files changed, 468 deletions(-) delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts diff --git a/services/apps/script_executor_worker/package.json b/services/apps/script_executor_worker/package.json index 482a390495..c94236ec00 100644 --- a/services/apps/script_executor_worker/package.json +++ b/services/apps/script_executor_worker/package.json @@ -12,7 +12,6 @@ "recalculate-enrichment-affiliations": "npx tsx 
src/bin/recalculate-enrichment-affiliations.ts", "recalculate-all-affiliations": "npx tsx src/bin/recalculate-all-affiliations.ts", "add-lf-projects-to-collection": "npx tsx src/bin/add-lf-projects-to-collection.ts", - "sync-light-repos": "npx tsx src/bin/sync-light-repos/index.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts deleted file mode 100644 index ebd1dd87f4..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts +++ /dev/null @@ -1,96 +0,0 @@ -import { FetchError, LightRepoResult } from './types' - -const GRAPHQL_URL = 'https://api.github.com/graphql' - -const REPO_QUERY = ` - query($owner: String!, $name: String!) { - repository(owner: $owner, name: $name) { - description - primaryLanguage { name } - repositoryTopics(first: 25) { nodes { topic { name } } } - stargazerCount - forkCount - watchers { totalCount } - issues(states: OPEN) { totalCount } - pushedAt - isArchived - isDisabled - isFork - createdAt - } - } -` - -export function parseGithubUrl(url: string): { owner: string; name: string } { - const match = url.match(/https?:\/\/github\.com\/([^/]+)\/([^/]+?)(?:\.git)?\/?$/) - if (!match) throw new FetchError('MALFORMED', `Cannot parse GitHub URL: ${url}`) - return { owner: match[1], name: match[2] } -} - -export async function fetchLightRepo(url: string, token: string): Promise { - const { owner, name } = parseGithubUrl(url) - - let response: Response - try { - response = await fetch(GRAPHQL_URL, { - method: 'POST', - headers: { - Authorization: `bearer ${token}`, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ query: REPO_QUERY, variables: { owner, name } }), - }) - } catch (err) { - throw new FetchError('TRANSIENT', 
`Network error for ${url}: ${(err as Error).message}`) - } - - const resetSec = parseInt(response.headers.get('x-ratelimit-reset') ?? '0', 10) - const resetMs = resetSec ? resetSec * 1000 + 5_000 : Date.now() + 65_000 - - if (response.status === 401) { - throw new FetchError('AUTH', `401 Unauthorized for ${url}`) - } - - if (response.status === 403) { - const body = await response.text() - if (body.toLowerCase().includes('rate limit')) { - throw new FetchError('RATE_LIMIT', `Rate limited on ${url}`, resetMs) - } - throw new FetchError('AUTH', `403 Forbidden for ${url}`) - } - - if (response.status === 404) throw new FetchError('NOT_FOUND', `404 for ${url}`) - if (response.status >= 500) throw new FetchError('TRANSIENT', `${response.status} for ${url}`) - - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const json = (await response.json()) as any - - if (json.errors?.length) { - const err = json.errors[0] - if (err.type === 'RATE_LIMITED') throw new FetchError('RATE_LIMIT', `RATE_LIMITED for ${url}`, resetMs) - if (err.type === 'NOT_FOUND') throw new FetchError('NOT_FOUND', `NOT_FOUND for ${url}`) - throw new FetchError('TRANSIENT', `GraphQL error for ${url}: ${err.message ?? err.type}`) - } - - const repo = json.data?.repository - if (!repo) throw new FetchError('NOT_FOUND', `No repository data for ${url}`) - - return { - url, - host: 'github', - owner, - name, - description: repo.description ?? null, - primaryLanguage: repo.primaryLanguage?.name ?? null, - topics: (repo.repositoryTopics?.nodes ?? []).map((n: { topic: { name: string } }) => n.topic.name), - stars: repo.stargazerCount ?? 0, - forks: repo.forkCount ?? 0, - watchers: repo.watchers?.totalCount ?? 0, - openIssues: repo.issues?.totalCount ?? 0, - lastCommitAt: repo.pushedAt ?? null, - archived: repo.isArchived ?? false, - disabled: repo.isDisabled ?? false, - isFork: repo.isFork ?? false, - createdAt: repo.createdAt ?? 
null, - } -} diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts deleted file mode 100644 index cb0f2198b3..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts +++ /dev/null @@ -1,276 +0,0 @@ -/** - * sync-light-repos - * - * Fetches GitHub repo metadata via GraphQL and upserts into the `repos` table. - * Runs one async worker per token — each worker claims URLs by index so no two - * requests ever share a token concurrently. - * - * Success tracking: a successful fetch updates repos.last_synced_at to NOW(). - * Failed repos keep a stale/null last_synced_at and are picked up on the next run. - * TODO: fetchPage will later filter by last_synced_at < NOW() - update_interval - * so this script becomes a continuous sync with no extra failure tracking needed. - * - * Usage: - * pnpm run sync-light-repos -- [options] - * - * Options: - * --page-size Repos fetched from source per cursor page (default: 200) - * --batch-size Upsert batch size (default: 50) - * --max-retries Per-repo transient retry cap (default: 3) - * --start-after Resume from cursor id (printed after each page) - * --limit Stop after N repos total (for testing) - * --dry-run Fetch but skip DB writes - * - * Environment: - * GITHUB_TOKENS Comma-separated GitHub PATs (required) - * CROWD_DB_WRITE_HOST/PORT/USERNAME/PASSWORD/DATABASE - * SERVICE - */ - -import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/data-access-layer/src/database' -import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' -import { getServiceChildLogger } from '@crowd/logging' - -import { fetchLightRepo, parseGithubUrl } from './fetchLightRepo' -import { FetchError, LightRepoResult } from './types' -import { upsertLightRepos } from './upsertLightRepos' - -const log = getServiceChildLogger('sync-light-repos') - -function parseArgs() { - const args = process.argv.slice(2) - const getArg 
= (flag: string) => { - const idx = args.indexOf(flag) - return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : undefined - } - - const pageSize = parseInt(getArg('--page-size') ?? '200', 10) - const batchSize = parseInt(getArg('--batch-size') ?? '50', 10) - const maxRetries = parseInt(getArg('--max-retries') ?? '3', 10) - const startAfter = getArg('--start-after') ?? null - const limitRaw = getArg('--limit') - const limit = limitRaw !== undefined ? parseInt(limitRaw, 10) : null - const dryRun = args.includes('--dry-run') - - if (isNaN(pageSize) || pageSize <= 0) { log.error('--page-size must be a positive integer'); process.exit(1) } - if (isNaN(batchSize) || batchSize <= 0) { log.error('--batch-size must be a positive integer'); process.exit(1) } - if (isNaN(maxRetries) || maxRetries < 0) { log.error('--max-retries must be a non-negative integer'); process.exit(1) } - if (limit !== null && (isNaN(limit) || limit <= 0)) { log.error('--limit must be a positive integer'); process.exit(1) } - - return { pageSize, batchSize, maxRetries, startAfter, limit, dryRun } -} - -// TODO: add LEFT JOIN repos r ON r.url = pr.url and filter -// WHERE (r.last_synced_at IS NULL OR r.last_synced_at < NOW() - INTERVAL '$(updateIntervalHours) hours') -// once the update interval logic is scoped in. -async function fetchPage( - qx: ReturnType, - cursor: string | null, - pageSize: number, -): Promise<{ urls: string[]; nextCursor: string | null }> { - const rows = await qx.select( - ` - SELECT id, url - FROM public.repositories - WHERE url LIKE 'https://github.com/%' - AND "deletedAt" IS NULL - ${cursor ? 'AND id > $(cursor)' : ''} - ORDER BY id - LIMIT $(pageSize) - `, - { cursor, pageSize }, - ) - return { - urls: rows.map((r: { url: string }) => r.url), - nextCursor: rows.length > 0 ? 
(rows[rows.length - 1] as { id: string }).id : null, - } -} - -async function fetchWithRetries( - url: string, - token: string, - maxRetries: number, -): Promise { - for (let attempt = 0; attempt <= maxRetries; attempt++) { - try { - return await fetchLightRepo(url, token) - } catch (err) { - if (!(err instanceof FetchError)) throw err - - if (['NOT_FOUND', 'AUTH', 'MALFORMED'].includes(err.kind)) { - log.warn({ url, kind: err.kind }, err.message) - return null - } - - if (err.kind === 'RATE_LIMIT') throw err - - if (attempt < maxRetries) { - const backoffMs = 1000 * 2 ** attempt - log.warn({ url, attempt, backoffMs }, `Transient error, retrying: ${err.message}`) - await new Promise((r) => setTimeout(r, backoffMs)) - } else { - log.error({ url }, `Gave up after ${maxRetries} retries: ${err.message}`) - return null - } - } - } - return null -} - -async function processPage( - urls: string[], - tokens: string[], - parkedUntil: Map, - opts: ReturnType, - qx: ReturnType, -): Promise<{ fetched: number; failed: number; flushed: number }> { - const validUrls: string[] = [] - let skipped = 0 - for (const url of urls) { - try { parseGithubUrl(url); validUrls.push(url) } catch { skipped++ } - } - if (skipped > 0) log.warn(`Skipped ${skipped} non-GitHub URLs`) - - const buffer: LightRepoResult[] = [] - const failures: Array<{ url: string; reason: string }> = [] - let failed = 0 - let flushed = 0 - let nextIdx = 0 - - await Promise.all( - tokens.map(async (token, tokenIdx) => { - // Respect any park set during a previous page of this run - const initialPark = (parkedUntil.get(token) ?? 
0) - Date.now() - if (initialPark > 0) { - log.warn(`token#${tokenIdx} still parked, waiting ${Math.round(initialPark / 1000)}s`) - await new Promise((r) => setTimeout(r, initialPark)) - } - - while (true) { - const idx = nextIdx++ - if (idx >= validUrls.length) break - const url = validUrls[idx] - - try { - const result = await fetchWithRetries(url, token, opts.maxRetries) - if (result) { - buffer.push(result) - if (!opts.dryRun && buffer.length >= opts.batchSize) { - const batch = buffer.splice(0) - await upsertLightRepos(qx, batch) - flushed += batch.length - } - } else { - failures.push({ url, reason: 'see warn log above' }) - failed++ - } - } catch (err) { - if (err instanceof FetchError && err.kind === 'RATE_LIMIT') { - const resetAt = err.resetAt ?? Date.now() + 60_000 - const waitMs = Math.max(1_000, resetAt - Date.now()) - parkedUntil.set(token, resetAt) - log.warn( - { tokenIdx, parkedUntil: new Date(resetAt).toISOString() }, - `token#${tokenIdx} rate limited — parking for ${Math.round(waitMs / 1000)}s`, - ) - await new Promise((r) => setTimeout(r, waitMs)) - failures.push({ url, reason: 'rate-limit' }) - failed++ - } else { - log.error({ url, err }, 'Unexpected error') - failures.push({ url, reason: (err as Error).message }) - failed++ - } - } - } - }), - ) - - if (!opts.dryRun && buffer.length > 0) { - await upsertLightRepos(qx, buffer) - flushed += buffer.length - } - - if (failures.length > 0) { - log.warn({ failures }, `${failures.length} repo(s) failed this page`) - } - - return { fetched: validUrls.length - failed, failed, flushed } -} - -async function main() { - const opts = parseArgs() - - const tokens = (process.env.GITHUB_TOKENS ?? '') - .split(',') - .map((t) => t.trim()) - .filter(Boolean) - - if (tokens.length === 0) { - log.error('GITHUB_TOKENS is required (comma-separated PATs)') - process.exit(1) - } - - // TODO: when connecting the real DB, replace with a connection pool and add keepalive / - // reconnect-on-error handling. 
A single long-lived connection will be dropped by the server - // during multi-hour runs (TCP timeout, idle reaper), crashing the script. Completed work - // is safe via last_synced_at, but the run stops and must be manually resumed. - const dbConnection = await getDbConnection(WRITE_DB_CONFIG()) - const qx = pgpQx(dbConnection) - - log.info('='.repeat(60)) - log.info('sync-light-repos') - log.info(`tokens=${tokens.length} page-size=${opts.pageSize} batch-size=${opts.batchSize}`) - log.info(`max-retries=${opts.maxRetries} dry-run=${opts.dryRun} limit=${opts.limit ?? 'none'}`) - log.info(`start-after=${opts.startAfter ?? '(beginning)'}`) - log.info('='.repeat(60)) - - const parkedUntil = new Map() - let cursor = opts.startAfter - let pageNum = 0 - let totalProcessed = 0 - let totalFailed = 0 - let totalFlushed = 0 - - while (true) { - pageNum++ - - const remaining = opts.limit !== null ? opts.limit - totalProcessed : opts.pageSize - if (remaining <= 0) break - - const { urls, nextCursor } = await fetchPage(qx, cursor, Math.min(opts.pageSize, remaining)) - - if (urls.length === 0) { - log.info('No more repos to process') - break - } - - const { fetched, failed, flushed } = await processPage(urls, tokens, parkedUntil, opts, qx) - - totalProcessed += urls.length - totalFailed += failed - totalFlushed += flushed - - log.info( - `Page ${pageNum}: read=${urls.length} fetched=${fetched} failed=${failed}${opts.dryRun ? ' [dry-run]' : ` flushed=${flushed}`}`, - ) - - if (nextCursor) { - log.info(`Resume with: --start-after ${nextCursor}`) - cursor = nextCursor - } - - if (urls.length < Math.min(opts.pageSize, remaining)) break - } - - log.info('='.repeat(60)) - log.info(`Summary: pages=${pageNum} processed=${totalProcessed} failed=${totalFailed} flushed=${totalFlushed}`) - log.info('='.repeat(60)) - - process.exit(totalFailed > 0 ? 
1 : 0) -} - -main().catch((err) => { - log.error({ err }, 'Unexpected error') - process.exit(1) -}) diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts deleted file mode 100644 index f9b5d0fc5b..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts +++ /dev/null @@ -1,46 +0,0 @@ -export interface LightRepoResult { - url: string - host: 'github' - owner: string - name: string - description: string | null - primaryLanguage: string | null - topics: string[] - stars: number - forks: number - watchers: number - openIssues: number - lastCommitAt: string | null - archived: boolean - disabled: boolean - isFork: boolean - createdAt: string | null -} - -export interface ParsedRepoUrl { - owner: string - name: string -} - -export interface Options { - pageSize: number - batchSize: number - maxRetries: number - startAfter: string | null - limit: number | null - dryRun: boolean - source: string -} - -export type FetchErrorKind = 'RATE_LIMIT' | 'TRANSIENT' | 'NOT_FOUND' | 'AUTH' | 'MALFORMED' - -export class FetchError extends Error { - constructor( - public readonly kind: FetchErrorKind, - message: string, - public readonly resetAt?: number, // epoch ms; only for RATE_LIMIT - ) { - super(message) - this.name = 'FetchError' - } -} diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts deleted file mode 100644 index f13af677fe..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { getServiceChildLogger } from '@crowd/logging' - -// import { formatQuery, QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' -import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' - -import { LightRepoResult } from './types' 
- -const log = getServiceChildLogger('sync-light-repos:upsert') - -export async function upsertLightRepos(_qx: QueryExecutor, rows: LightRepoResult[]): Promise { - if (rows.length === 0) return - - log.info({ count: rows.length, rows: JSON.stringify(rows, null, 2) }, 'upsert results') - - // const values = rows - // .map((r) => - // formatQuery( - // `($(url), $(host), $(owner), $(name), $(description), $(primaryLanguage), $(topics)::text[], - // $(stars), $(forks), $(watchers), $(openIssues), $(lastCommitAt)::timestamptz, - // $(archived), $(disabled), $(isFork), $(createdAt)::timestamptz)`, - // r, - // ), - // ) - // .join(',\n') - - // await _qx.result(` - // INSERT INTO repos ( - // url, host, owner, name, description, primary_language, topics, - // stars, forks, watchers, open_issues, last_commit_at, - // archived, disabled, is_fork, created_at, last_synced_at - // ) VALUES ${values} - // ON CONFLICT (url) DO UPDATE SET - // host = EXCLUDED.host, - // owner = EXCLUDED.owner, - // name = EXCLUDED.name, - // description = EXCLUDED.description, - // primary_language = EXCLUDED.primary_language, - // topics = EXCLUDED.topics, - // stars = EXCLUDED.stars, - // forks = EXCLUDED.forks, - // watchers = EXCLUDED.watchers, - // open_issues = EXCLUDED.open_issues, - // last_commit_at = EXCLUDED.last_commit_at, - // archived = EXCLUDED.archived, - // disabled = EXCLUDED.disabled, - // is_fork = EXCLUDED.is_fork, - // last_synced_at = NOW() - // `) -} From af3a4e575314105f55f958dd33bb99d7fdfb389c Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:45:59 +0100 Subject: [PATCH 10/15] fix: github repos enricher Signed-off-by: Mouad BANI --- scripts/services/github-repos-enricher.yaml | 8 +++++- scripts/services/packages-worker.yaml | 3 ++- services/apps/packages_worker/package.json | 8 +++--- services/apps/packages_worker/src/config.ts | 30 ++++++++++++++------- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git 
a/scripts/services/github-repos-enricher.yaml b/scripts/services/github-repos-enricher.yaml index b43dc6205e..e0f4726837 100644 --- a/scripts/services/github-repos-enricher.yaml +++ b/scripts/services/github-repos-enricher.yaml @@ -5,6 +5,12 @@ x-env-args: &env-args NODE_ENV: docker SERVICE: github-repos-enricher SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' + PAGE_SIZE: '200' + BATCH_SIZE: '50' + MAX_RETRIES: '3' + UPDATE_INTERVAL_HOURS: '24' + IDLE_SLEEP_SEC: '60' services: github-repos-enricher: @@ -28,7 +34,7 @@ services: build: context: ../../ dockerfile: ./scripts/services/docker/Dockerfile.packages-worker - command: 'pnpm run dev:github-repos-enricher:local' + command: 'pnpm run dev:github-repos-enricher' working_dir: /usr/crowd/app/services/apps/packages_worker # user: '${USER_ID}:${GROUP_ID}' env_file: diff --git a/scripts/services/packages-worker.yaml b/scripts/services/packages-worker.yaml index 0ac6658cbd..cb2ecdbf0f 100644 --- a/scripts/services/packages-worker.yaml +++ b/scripts/services/packages-worker.yaml @@ -5,6 +5,7 @@ x-env-args: &env-args NODE_ENV: docker SERVICE: packages-worker SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' services: packages-worker: @@ -28,7 +29,7 @@ services: build: context: ../../ dockerfile: ./scripts/services/docker/Dockerfile.packages-worker - command: 'pnpm run dev:packages-worker:local' + command: 'pnpm run dev:packages-worker' working_dir: /usr/crowd/app/services/apps/packages_worker # user: '${USER_ID}:${GROUP_ID}' env_file: diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 49b460d8ff..8ee8dceeae 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -4,10 +4,10 @@ "scripts": { "start:packages-worker": "SERVICE=packages-worker tsx src/bin/packages-worker.ts", "start:github-repos-enricher": "SERVICE=github-repos-enricher tsx src/bin/github-repos-enricher.ts", - "start:debug:packages-worker:local": 
"set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=packages-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", - "start:debug:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", - "dev:packages-worker:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:packages-worker:local", - "dev:github-repos-enricher:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:github-repos-enricher:local", + "dev:packages-worker": "SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", + "dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", + "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", + "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 5614e06aa0..d0ac98b2c9 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -1,10 +1,20 @@ +function requireEnv(name: string): string { + const val = process.env[name] + if (!val) throw new Error(`Missing required environment variable: ${name}`) + return val +} + +function requireEnvInt(name: string): number { + return parseInt(requireEnv(name), 10) +} + export function getPackagesDbConfig() { return { - host: process.env.CROWD_PACKAGES_DB_WRITE_HOST, - port: parseInt(process.env.CROWD_PACKAGES_DB_PORT, 10), - database: process.env.CROWD_PACKAGES_DB_DATABASE, - user: process.env.CROWD_PACKAGES_DB_USERNAME, - password: process.env.CROWD_PACKAGES_DB_PASSWORD, + host: requireEnv('CROWD_PACKAGES_DB_WRITE_HOST'), + port: requireEnvInt('CROWD_PACKAGES_DB_PORT'), + database: requireEnv('CROWD_PACKAGES_DB_DATABASE'), + user: requireEnv('CROWD_PACKAGES_DB_USERNAME'), + password: requireEnv('CROWD_PACKAGES_DB_PASSWORD'), } } @@ -14,10 +24,10 @@ export function getEnricherConfig() { return { tokens, - pageSize: parseInt(process.env.PAGE_SIZE, 10), - batchSize: parseInt(process.env.BATCH_SIZE, 10), - maxRetries: parseInt(process.env.MAX_RETRIES, 10), - updateIntervalHours: parseInt(process.env.UPDATE_INTERVAL_HOURS, 10), - idleSleepSec: parseInt(process.env.IDLE_SLEEP_SEC, 10), + pageSize: requireEnvInt('PAGE_SIZE'), + batchSize: requireEnvInt('BATCH_SIZE'), + maxRetries: requireEnvInt('MAX_RETRIES'), + updateIntervalHours: requireEnvInt('UPDATE_INTERVAL_HOURS'), 
+ idleSleepSec: requireEnvInt('IDLE_SLEEP_SEC'), } } From a9650663a7c27a771007b551de04d4ef12924c16 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 12:14:00 +0100 Subject: [PATCH 11/15] chore: cleanup enricher config Signed-off-by: Mouad BANI --- backend/.env.dist.local | 6 ++++++ scripts/services/github-repos-enricher.yaml | 8 +++---- .../src/bin/github-repos-enricher.ts | 21 ------------------- services/apps/packages_worker/src/config.ts | 8 +++---- .../src/enricher/runEnrichmentLoop.ts | 19 +++++++++-------- 5 files changed, 22 insertions(+), 40 deletions(-) diff --git a/backend/.env.dist.local b/backend/.env.dist.local index 5ac8b67df4..d9126fe135 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -174,3 +174,9 @@ CROWD_PACKAGES_DB_PORT=5434 CROWD_PACKAGES_DB_USERNAME=postgres CROWD_PACKAGES_DB_PASSWORD=example CROWD_PACKAGES_DB_DATABASE=packages-db + +# github-repos-enricher +ENRICHER_GITHUB_TOKENS= +ENRICHER_BATCH_SIZE=100 +ENRICHER_REPO_UPDATE_INTERVAL_HOURS=24 +ENRICHER_IDLE_SLEEP_SEC=60 diff --git a/scripts/services/github-repos-enricher.yaml b/scripts/services/github-repos-enricher.yaml index e0f4726837..adf0a7a522 100644 --- a/scripts/services/github-repos-enricher.yaml +++ b/scripts/services/github-repos-enricher.yaml @@ -6,11 +6,9 @@ x-env-args: &env-args SERVICE: github-repos-enricher SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' - PAGE_SIZE: '200' - BATCH_SIZE: '50' - MAX_RETRIES: '3' - UPDATE_INTERVAL_HOURS: '24' - IDLE_SLEEP_SEC: '60' + ENRICHER_BATCH_SIZE: '100' + ENRICHER_REPO_UPDATE_INTERVAL_HOURS: '24' + ENRICHER_IDLE_SLEEP_SEC: '60' services: github-repos-enricher: diff --git a/services/apps/packages_worker/src/bin/github-repos-enricher.ts b/services/apps/packages_worker/src/bin/github-repos-enricher.ts index a40064b724..8de28a7aaa 100644 --- a/services/apps/packages_worker/src/bin/github-repos-enricher.ts +++ b/services/apps/packages_worker/src/bin/github-repos-enricher.ts @@ -1,6 +1,3 @@ -import fs from 
'fs' -import path from 'path' - import { getServiceLogger } from '@crowd/logging' import { getEnricherConfig } from '../config' @@ -9,9 +6,6 @@ import { runEnrichmentLoop } from '../enricher/runEnrichmentLoop' const log = getServiceLogger() -const liveFilePath = path.join(__dirname, '../tmp/github-repos-enricher-live.tmp') -const readyFilePath = path.join(__dirname, '../tmp/github-repos-enricher-ready.tmp') - let shuttingDown = false const shutdown = async () => { @@ -37,20 +31,6 @@ const main = async () => { await qx.selectOne('SELECT 1') log.info('Connected to packages-db.') - fs.mkdirSync(path.dirname(liveFilePath), { recursive: true }) - - const healthInterval = setInterval(async () => { - if (shuttingDown) return - try { - await Promise.all([ - fs.promises.open(liveFilePath, 'a').then((f) => f.close()), - fs.promises.open(readyFilePath, 'a').then((f) => f.close()), - ]) - } catch (err) { - log.warn({ err }, 'Failed to write health probe files') - } - }, 5000) - log.info( { tokens: config.tokens.length, pageSize: config.pageSize, batchSize: config.batchSize }, 'Starting enrichment loop', @@ -58,7 +38,6 @@ const main = async () => { await runEnrichmentLoop(qx, config, () => shuttingDown) - clearInterval(healthInterval) log.info('github-repos-enricher stopped.') process.exit(0) } diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index d0ac98b2c9..4faf7444bf 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -24,10 +24,8 @@ export function getEnricherConfig() { return { tokens, - pageSize: requireEnvInt('PAGE_SIZE'), - batchSize: requireEnvInt('BATCH_SIZE'), - maxRetries: requireEnvInt('MAX_RETRIES'), - updateIntervalHours: requireEnvInt('UPDATE_INTERVAL_HOURS'), - idleSleepSec: requireEnvInt('IDLE_SLEEP_SEC'), + batchSize: requireEnvInt('ENRICHER_BATCH_SIZE'), + updateIntervalHours: requireEnvInt('ENRICHER_REPO_UPDATE_INTERVAL_HOURS'), + idleSleepSec: 
requireEnvInt('ENRICHER_IDLE_SLEEP_SEC'), } } diff --git a/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts b/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts index a406342f82..b80d84b0bd 100644 --- a/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts @@ -8,12 +8,13 @@ import { updateEnrichedRepos } from './updateEnrichedRepos' const log = getServiceChildLogger('github-repos-enricher') +const MAX_RETRIES = 3 + async function fetchWithRetries( url: string, token: string, - maxRetries: number, ): Promise { - for (let attempt = 0; attempt <= maxRetries; attempt++) { + for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { try { return await fetchLightRepo(url, token) } catch (err) { @@ -26,12 +27,12 @@ async function fetchWithRetries( if (err.kind === 'RATE_LIMIT') throw err - if (attempt < maxRetries) { + if (attempt < MAX_RETRIES) { const backoffMs = 1000 * 2 ** attempt log.warn({ url, attempt, backoffMs }, `Transient error, retrying: ${err.message}`) await new Promise((r) => setTimeout(r, backoffMs)) } else { - log.error({ url }, `Gave up after ${maxRetries} retries: ${err.message}`) + log.error({ url }, `Gave up after ${MAX_RETRIES} retries: ${err.message}`) return null } } @@ -42,7 +43,7 @@ async function fetchWithRetries( async function fetchPage( qx: QueryExecutor, cursor: string | null, - pageSize: number, + batchSize: number, updateIntervalHours: number, ): Promise<{ rows: Array<{ id: string; url: string }>; urls: string[] }> { const rows = await qx.select( @@ -53,9 +54,9 @@ async function fetchPage( AND (last_synced_at IS NULL OR last_synced_at < NOW() - INTERVAL '$(updateIntervalHours) hours') AND ($(cursor) IS NULL OR id > $(cursor)) ORDER BY id - LIMIT $(pageSize) + LIMIT $(batchSize) `, - { cursor, pageSize, updateIntervalHours }, + { cursor, batchSize, updateIntervalHours }, ) return { rows, @@ -103,7 +104,7 @@ async function processPage( 
const url = validUrls[idx] try { - const result = await fetchWithRetries(url, token, config.maxRetries) + const result = await fetchWithRetries(url, token) if (result) { buffer.push(result) if (buffer.length >= config.batchSize) { @@ -161,7 +162,7 @@ export async function runEnrichmentLoop( while (!isShuttingDown()) { pageNum++ - const { rows, urls } = await fetchPage(qx, cursor, config.pageSize, config.updateIntervalHours) + const { rows, urls } = await fetchPage(qx, cursor, config.batchSize, config.updateIntervalHours) if (urls.length === 0) { log.info('No more repos to process, sleeping') From 5ed2d3ca20c843c7fdf3e2eca0e3a546b893ee57 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 26 May 2026 13:10:23 +0100 Subject: [PATCH 12/15] chore: dev env Signed-off-by: Joana Maia --- .claude/rules/skill-guidance.md | 12 ++ .../packages-worker-add-entrypoint/SKILL.md | 129 ++++++++++++++++++ .claude/skills/packages-worker-setup/SKILL.md | 99 ++++++++++++++ docs/adr/0001-packages-database.md | 45 ++++++ docs/adr/0002-packages-worker-architecture.md | 55 ++++++++ docs/adr/README.md | 3 +- .../src/tmp/packages-worker-live.tmp | 0 .../src/tmp/packages-worker-ready.tmp | 0 8 files changed, 342 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/packages-worker-add-entrypoint/SKILL.md create mode 100644 .claude/skills/packages-worker-setup/SKILL.md create mode 100644 docs/adr/0001-packages-database.md create mode 100644 docs/adr/0002-packages-worker-architecture.md create mode 100644 services/apps/packages_worker/src/tmp/packages-worker-live.tmp create mode 100644 services/apps/packages_worker/src/tmp/packages-worker-ready.tmp diff --git a/.claude/rules/skill-guidance.md b/.claude/rules/skill-guidance.md index 509cc031f3..854366fda9 100644 --- a/.claude/rules/skill-guidance.md +++ b/.claude/rules/skill-guidance.md @@ -16,6 +16,8 @@ This project has guided skills for common workflows. 
**Proactively suggest the r | `/review-pr` | Review a PR, audit code changes, check PR quality, validate a PR against standards | | `/adr` | Record an architecture decision, choose between frameworks/libraries/patterns, query past decisions | | `/scaffold-snowflake-connector` | Add a new Snowflake-connector data source or integration | +| `/packages-worker-setup` | First-time setup of packages-db and github-repos-enricher for a new engineer | +| `/packages-worker-add-entrypoint` | Scaffold a new sibling worker inside packages_worker (npm, OSV, scorecard, etc.) | ## Trigger Phrases @@ -45,3 +47,13 @@ This project has guided skills for common workflows. **Proactively suggest the r **`/scaffold-snowflake-connector`** — match any of these intents: - "Add a new Snowflake connector", "New integration for [platform]" - "Scaffold a new data source", anything about adding a platform to `snowflake_connectors` + +**`/packages-worker-setup`** — match any of these intents: +- "Set up packages worker", "how do I run the enricher", "first time on this branch" +- "Get packages-db running", "packages-db won't start", "ENRICHER_GITHUB_TOKENS" +- Any first-time setup question specific to `packages_worker` or `packages-db` + +**`/packages-worker-add-entrypoint`** — match any of these intents: +- "Add a new packages worker", "scaffold a sibling worker", "new entry point in packages_worker" +- "Add npm ingestion", "add OSV worker", "add scorecard runner" +- Any request to create a new `src/bin/*.ts` worker inside `packages_worker` diff --git a/.claude/skills/packages-worker-add-entrypoint/SKILL.md b/.claude/skills/packages-worker-add-entrypoint/SKILL.md new file mode 100644 index 0000000000..a7ce04b297 --- /dev/null +++ b/.claude/skills/packages-worker-add-entrypoint/SKILL.md @@ -0,0 +1,129 @@ +--- +name: packages-worker-add-entrypoint +description: > + Scaffold a new sub-worker inside packages_worker (npm, deps.dev, osv, scorecard, + etc.) 
following the single-service multi-entry-point structure. Use when: "add a + new packages worker", "scaffold a sub-worker in packages_worker", "new worker for + packages-db", "add npm worker", "add OSV worker", "add deps.dev worker". +allowed-tools: Read, Write, Edit, Bash, AskUserQuestion, Glob +--- + +# packages-worker — Add a New Sub-worker + +You are adding a new data-ingestion worker to `services/apps/packages_worker/`. +The structure follows the same pattern as `backend/` (where `api.ts` and +`job-generator.ts` share one Dockerfile): one npm package, one Docker image, +each worker in its own `src/{worker}/` directory with its own entry point. + +``` +services/apps/packages_worker/ + src/ + bin/ + packages-worker.ts ← parent stub + github-repos-enricher.ts ← existing worker + <worker-name>.ts ← entry point you will create + github/ ← existing worker logic + <worker-dir>/ ← directory you will create + index.ts ← main logic for this worker + types.ts + config.ts ← shared — add your config getter here + db.ts ← shared — do not modify +``` + +## Step 1 — Gather requirements + +Ask the engineer for: + +1. **Worker name** (kebab-case) — e.g. `npm-sync`, `osv-sync`, `scorecard-runner`. Used as the entry point filename (`src/bin/<worker-name>.ts`) and docker-compose service name. +2. **Worker directory name** (short, lowercase) — e.g. `npm`, `osv`, `scorecard`. Becomes `src/<worker-dir>/`. +3. **What it does** — what data it fetches/writes, what table(s) in packages-db it reads from and writes to. +4. **External API or data source** (if any) — URL, auth method, rate-limit characteristics. +5. **Required env vars** beyond the shared DB vars — e.g. `NPM_API_URL`, `OSV_API_KEY`. + +Do not proceed until you have answers to 1–3. 
+ +## Step 2 — Read existing files first + +```bash +cat services/apps/packages_worker/src/bin/github-repos-enricher.ts +cat services/apps/packages_worker/src/config.ts +cat services/apps/packages_worker/package.json +cat scripts/services/github-repos-enricher.yaml +``` + +These are the canonical references. Do not deviate from the patterns you see there. + +## Step 3 — Scaffold the files + +### 3a. Worker directory — `services/apps/packages_worker/src/<worker-dir>/` + +Create the directory with at minimum: + +**`types.ts`** — types specific to this worker (input/output shapes, error kinds if calling an external API). + +**`index.ts`** — the main logic function(s) this worker runs. What goes here depends entirely on what the worker does — do not force a loop shape if it does not fit. Discuss with the engineer what the execution model should be (continuous loop, one-shot batch, event-driven, etc.) and implement accordingly. + +Add any additional files the worker needs (e.g. an API client, a DB query helper). All DB access uses inline pg-promise SQL via `qx.select` / `qx.result` / `qx.none` — do not add files to `services/libs/data-access-layer`. + +### 3b. 
Entry point — `services/apps/packages_worker/src/bin/<worker-name>.ts` + +Follow the structure of `github-repos-enricher.ts`: +- Import `getServiceLogger` from `@crowd/logging` +- Import your worker's config getter from `../config` and `getPackagesDb` from `../db` +- Import your worker's main function from `../<worker-dir>/index` +- Set `liveFilePath` / `readyFilePath` to `../tmp/<worker-name>-live.tmp` / `../tmp/<worker-name>-ready.tmp` +- Handle SIGINT / SIGTERM with a `shuttingDown` flag +- In `main()`: call config getter → validate any required tokens/keys → `await getPackagesDb()` → `await qx.selectOne('SELECT 1')` → `fs.mkdirSync` for the tmp dir → `setInterval` writing probe files every 5000ms → call your worker's main function → `clearInterval` → `process.exit(0)` +- Fatal handler: `main().catch(err => { log.error({ err }, '<worker-name> fatal error'); process.exit(1) })` + +### 3c. Config additions — `services/apps/packages_worker/src/config.ts` + +Read the file first, then add a `get<Worker>Config()` function: +- Use `requireEnv(name)` for string vars, `requireEnvInt(name)` for integers +- No defaults, no `?? undefined` — the process must refuse to start on missing config + +### 3d. Docker-compose service — `scripts/services/<worker-name>.yaml` + +Copy `scripts/services/github-repos-enricher.yaml` and adapt: +- Service names: `<worker-name>` (prod) and `<worker-name>-dev` (dev) +- `command` (prod): `pnpm run start:<worker-name>` +- `command` (dev): `pnpm run dev:<worker-name>` +- `env_file`: keep the same four files (`backend/.env.dist.local`, `backend/.env.dist.composed`, `backend/.env.override.local`, `backend/.env.override.composed`) +- `environment`: set any tuning var defaults inline (avoids requiring them in `.env.override.local` for local dev) +- `volumes` (dev only): bind-mount `./services/apps/packages_worker/src` plus every `services/libs/*/src` directory (copy the full list from the enricher yaml for hot reload) + +### 3e. 
package.json scripts — `services/apps/packages_worker/package.json` + +Read the file first, then add: +```json +"start:<worker-name>": "tsx src/bin/<worker-name>.ts", +"dev:<worker-name>": "tsx watch src/bin/<worker-name>.ts" +``` + +### 3f. Env var files — `backend/.env.dist.local` and `backend/.env.dist.composed` + +Append new required vars with empty-string defaults (or sensible local values for non-secrets): +``` +NEW_WORKER_API_KEY= +``` + +## Step 4 — TypeScript check + +```bash +cd services/apps/packages_worker && pnpm tsc --noEmit +``` + +Fix any errors before proceeding. + +## Checklist before committing + +- [ ] `src/<worker-dir>/` directory created with `types.ts` and `index.ts` +- [ ] `src/bin/<worker-name>.ts` — probe files, SIGINT/SIGTERM handler, fail-fast config check, `SELECT 1` on startup +- [ ] `config.ts` — new `get<Worker>Config()` using `requireEnv`/`requireEnvInt`, no defaults +- [ ] `scripts/services/<worker-name>.yaml` — prod + dev services with bind mounts +- [ ] `package.json` — `start:<worker-name>` and `dev:<worker-name>` scripts added +- [ ] `backend/.env.dist.local` and `.env.dist.composed` — new vars documented +- [ ] No new files in `services/libs/data-access-layer` (packages-db uses inline SQL) +- [ ] `pnpm tsc --noEmit` passes + +Use `/preflight` before opening a PR and `/commit` to sign off. diff --git a/.claude/skills/packages-worker-setup/SKILL.md b/.claude/skills/packages-worker-setup/SKILL.md new file mode 100644 index 0000000000..af84328e33 --- /dev/null +++ b/.claude/skills/packages-worker-setup/SKILL.md @@ -0,0 +1,99 @@ +--- +name: packages-worker-setup +description: > + Get packages_worker running locally — first time or resuming after a break. + Spins up packages-db if not running, applies any pending migrations, and starts + the worker. All steps are safe to re-run. + Use when: "set up packages worker", "start packages worker", "resume packages worker", + "get packages-db running", "packages-db stopped", "restart the worker". +allowed-tools: Read, Bash, Edit, AskUserQuestion +--- + +# packages-worker + +Get `packages_worker` running locally. 
All steps are idempotent — safe to run +whether this is your first time or you're resuming after a break. + +## Prerequisites check + +```bash +git branch --show-current # should be feat/track-packages +docker info --format '{{.ServerVersion}}' +pnpm --version +``` + +If the branch is wrong: `git checkout feat/track-packages && pnpm i`. + +## Step 1 — Start packages-db + +No-op if already running. + +```bash +docker compose -f scripts/scaffold.yaml up -d packages +until docker compose -f scripts/scaffold.yaml exec packages pg_isready -U postgres; do sleep 1; done +echo "packages-db is ready" +``` + +## Step 2 — Apply pending migrations + +Flyway skips already-applied migrations, so this is safe to re-run. + +```bash +arch=$(uname -m) +[ "$arch" = "arm64" ] && PLATFORM="--platform=linux/arm64/v8" || PLATFORM="--platform=linux/amd64" +docker build $PLATFORM -t packages_flyway \ + -f backend/src/osspckgs/Dockerfile.flyway backend/src/osspckgs --load + +docker run --rm --network crowd-bridge \ + -e PGHOST=packages \ + -e PGPORT=5432 \ + -e PGUSER=postgres \ + -e PGPASSWORD=example \ + -e PGDATABASE=packages-db \ + packages_flyway +``` + +To create a new migration: + +```bash +./scripts/cli scaffold create-packages-migration +``` + +## Step 3 — Start the worker + +```bash +DEV=1 ./scripts/cli service packages-worker up +``` + +Dev mode uses hot reload — edits to `services/apps/packages_worker/src/` and +`services/libs/*/src/` are picked up immediately without restarting. 
+ +## Day-to-day commands + +```bash +# Follow logs +./scripts/cli service packages-worker logs + +# Stop +./scripts/cli service packages-worker down + +# Restart +./scripts/cli service packages-worker restart + +# Check status +./scripts/cli service packages-worker status +``` + +## Going further + +- Add a new sub-worker (npm-sync, osv-sync, etc.): `/packages-worker-add-entrypoint` +- Record an architecture decision: `/adr` +- Before opening a PR: `/preflight` +- Commit with DCO sign-off: `/commit` + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---|---|---| +| `Connection refused` on packages-db | Docker not running | `docker compose -f scripts/scaffold.yaml up -d packages` | +| `permission denied: scripts/cli` | CLI not executable | `chmod +x scripts/cli` | diff --git a/docs/adr/0001-packages-database.md b/docs/adr/0001-packages-database.md new file mode 100644 index 0000000000..37e37399dd --- /dev/null +++ b/docs/adr/0001-packages-database.md @@ -0,0 +1,45 @@ +# ADR-0001: Separate physical database for the packages domain + +**Date**: 2026-05-25 +**Status**: accepted +**Deciders**: CDP/Insights team + +## Context + +The packages domain (tracking open-source packages, dependency graphs, repositories, security advisories, and maintainers) is being built as a new capability inside the CDP platform. The main CDP database (`crowd-web`) and the existing `product-db` already exist as separate physical Postgres instances, each owned by a distinct domain. The packages schema has no foreign-key relationships into either existing database, and requires partitioned tables sized for 90M+ versions and 1.15B+ dependency rows — a scale profile that would create resource contention if mixed with CDP's community-activity tables. 
+ +## Decision + +We store all packages-domain data in a dedicated physical Postgres instance (`packages-db`, port 5434) with its own Flyway migration path (`backend/src/osspckgs/migrations/`), following the same Dockerfile and migration-script pattern used by `product-db`. Schema and connection code live entirely within the `packages_worker` service. + +## Alternatives Considered + +### Alternative 1: Add tables to the main CDP db (`crowd-web`) + +- **Pros**: No new database to manage; existing Flyway setup and pg-promise helpers already target it. +- **Cons**: Packages tables have a completely different shape and scale from community-activity tables. Resource contention at scale is a real risk, and schema coupling makes independent evolution harder. +- **Why not**: The packages schema has zero FK dependencies on CDP tables. Co-locating independent domains in one database couples their lifecycle, backup strategy, and performance headroom for no benefit. + +## Consequences + +### Positive + +- Clear domain boundary: packages-db has no FK relationships outside its own schema. +- Independent scaling, backup, and maintenance. +- Follows existing precedent; `product-db` demonstrates the pattern works. +- Schema decisions (partitioning, GDPR) are isolated to one database and one migration path. + +### Negative + +- A third Postgres instance to operate, monitor, and back up. +- Read/write host split is prepared in env vars (`CROWD_PACKAGES_DB_READ_HOST` / `CROWD_PACKAGES_DB_WRITE_HOST`) but only the write host is wired in `config.ts` today — read routing is deferred. + +--- + +**Partitioning rationale (captured here to avoid re-litigating per-table):** + +| Table | Strategy | Buckets | Hot query shape | +|---|---|---|---| +| `versions` | HASH(`package_id`) | 32 | Lookup by package — lands in one partition; ~2.8M rows each at 90M total | +| `package_dependencies` | HASH(`depends_on_id`) | 64 | "Who depends on vulnerable package X?" 
— lands in one partition; ~18M rows each at 1.15B total | +| `downloads_daily` | RANGE(`date`) via `pg_partman` | automatic | Time-series; pruning old partitions is straightforward | diff --git a/docs/adr/0002-packages-worker-architecture.md b/docs/adr/0002-packages-worker-architecture.md new file mode 100644 index 0000000000..cf4c81af4f --- /dev/null +++ b/docs/adr/0002-packages-worker-architecture.md @@ -0,0 +1,55 @@ +# ADR-0002: Single-service, multi-entry-point architecture for packages_worker + +**Date**: 2026-05-25 +**Status**: accepted +**Deciders**: CDP/Insights team + +## Context + +The packages domain requires several independent data-ingestion workers (GitHub repository enrichment, npm package sync, deps.dev dependency data, OSV advisories, OpenSSF Scorecard, and others). Each worker has distinct external API dependencies, rate-limit profiles, and scheduling needs. The platform already demonstrates this pattern in `backend/`, where `api.ts` and `job-generator.ts` are two separate processes built from the same Dockerfile and npm package. + +## Decision + +All packages_worker sub-workers live in a single npm package (`services/apps/packages_worker`) and are built from one Dockerfile (`scripts/services/docker/Dockerfile.packages-worker`). Each sub-worker is a self-contained directory under `services/apps/packages_worker/src/{worker}/` with its own logic, types, and database access. Each is launched as a separate container using a different entry point command, sharing the same image. Config helpers (`requireEnv`, `requireEnvInt`) and the packages-db connection are shared across all entry points. 
+ +``` +services/apps/packages_worker/ + src/ + bin/ + packages-worker.ts ← parent / health-check stub + github-repos-enricher.ts + github/ ← github-repos-enricher logic + npm/ ← npm worker (future) + deps-dev/ ← deps.dev worker (future) + osv/ ← OSV worker (future) + config.ts ← shared requireEnv / requireEnvInt + db.ts ← shared packages-db connection +``` + +## Alternatives Considered + +### Alternative 1: One npm package per worker (matching the pattern of other workers in `services/apps/`) + +- **Pros**: Full isolation; each worker has its own `package.json`, Dockerfile, and deploy lifecycle. +- **Cons**: Most packages-domain workers share the same DB connection, config shape, and type definitions. Duplicating these across N packages creates maintenance overhead that grows with each new data source. +- **Why not**: The packages domain has a clear shared foundation (one DB, one config pattern, one set of domain types). A monorepo sub-package per worker is the right split when workers diverge significantly in dependencies or deploy cadence — that is not the case here. + +### Alternative 2: One monolithic process running all workers + +- **Pros**: Simpler deployment — one container. +- **Cons**: Workers have different rate-limit profiles and external API dependencies. A failure or resource spike in one worker affects all others. Independent scaling is impossible. +- **Why not**: Workers must be deployed and scaled independently. A single-process monolith would require internal concurrency management that replicates what separate processes give for free. + +## Consequences + +### Positive + +- One Dockerfile and one build to maintain regardless of how many sub-workers are added. +- Shared `config.ts` enforces fail-fast env-var validation (`requireEnv`/`requireEnvInt`) across all workers — no silent `undefined`/`NaN` tuning values. +- Each worker can be deployed, scaled, and restarted independently. 
+- Adding a new data source means adding `src/{worker}/` and a new compose service entry — no new npm package, no new Dockerfile. + +### Negative + +- All workers in the service share the same npm dependency tree. A dependency needed by only one worker adds to the image size for all. +- A breaking change to shared code (`config.ts`, `db.ts`) affects all entry points simultaneously. diff --git a/docs/adr/README.md b/docs/adr/README.md index 6e2c3ae031..8f8df1d85f 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -8,7 +8,8 @@ Use the `/adr` skill in Claude Code to record new ADRs or query past decisions. | ADR | Title | Status | Date | | --- | ----- | ------ | ---- | -| _none yet_ | | | | +| [ADR-0001](./0001-packages-database.md) | Separate physical database for the packages domain | accepted | 2026-05-25 | +| [ADR-0002](./0002-packages-worker-architecture.md) | Single-service, multi-entry-point architecture for packages_worker | accepted | 2026-05-25 | ## Why ADRs? diff --git a/services/apps/packages_worker/src/tmp/packages-worker-live.tmp b/services/apps/packages_worker/src/tmp/packages-worker-live.tmp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/services/apps/packages_worker/src/tmp/packages-worker-ready.tmp b/services/apps/packages_worker/src/tmp/packages-worker-ready.tmp new file mode 100644 index 0000000000..e69de29bb2 From c1c404c5be8236e6fae849ba16439db2644e6df9 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 13:47:22 +0100 Subject: [PATCH 13/15] fix: repos.last_synced_at Signed-off-by: Mouad BANI --- .../V1779799200__repos_default_created_and_last_synced.sql | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 backend/src/osspckgs/migrations/V1779799200__repos_default_created_and_last_synced.sql diff --git a/backend/src/osspckgs/migrations/V1779799200__repos_default_created_and_last_synced.sql b/backend/src/osspckgs/migrations/V1779799200__repos_default_created_and_last_synced.sql new file mode 100644
index 0000000000..c7b9bc17c7 --- /dev/null +++ b/backend/src/osspckgs/migrations/V1779799200__repos_default_created_and_last_synced.sql @@ -0,0 +1,6 @@ +ALTER TABLE repos + ALTER COLUMN created_at SET DEFAULT NOW(); + +ALTER TABLE repos + ALTER COLUMN last_synced_at DROP NOT NULL, + ALTER COLUMN last_synced_at DROP DEFAULT; From b49de68679e6897efc91c386c23e8243ecbd9bba Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 26 May 2026 17:59:07 +0200 Subject: [PATCH 14/15] feat: pom extractor Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 82 ++++- scripts/services/pom-fetcher.yaml | 69 ++++ services/apps/packages_worker/package.json | 5 + .../packages_worker/src/bin/pom-fetcher.ts | 42 +++ services/apps/packages_worker/src/config.ts | 9 + .../src/pom-fetcher/extract.ts | 295 ++++++++++++++++++ .../src/pom-fetcher/metadata.ts | 50 +++ .../src/pom-fetcher/runPomEnrichmentLoop.ts | 221 +++++++++++++ services/libs/data-access-layer/src/index.ts | 1 + .../data-access-layer/src/osspckgs/index.ts | 3 + .../src/osspckgs/maintainers.ts | 55 ++++ .../src/osspckgs/packages.ts | 95 ++++++ .../data-access-layer/src/osspckgs/types.ts | 44 +++ 13 files changed, 963 insertions(+), 8 deletions(-) create mode 100644 scripts/services/pom-fetcher.yaml create mode 100644 services/apps/packages_worker/src/bin/pom-fetcher.ts create mode 100644 services/apps/packages_worker/src/pom-fetcher/extract.ts create mode 100644 services/apps/packages_worker/src/pom-fetcher/metadata.ts create mode 100644 services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts create mode 100644 services/libs/data-access-layer/src/osspckgs/index.ts create mode 100644 services/libs/data-access-layer/src/osspckgs/maintainers.ts create mode 100644 services/libs/data-access-layer/src/osspckgs/packages.ts create mode 100644 services/libs/data-access-layer/src/osspckgs/types.ts diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 96db6643f6..58e98925d3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml 
@@ -1306,6 +1306,12 @@ importers: '@crowd/logging': specifier: workspace:* version: link:../../libs/logging + axios: + specifier: ^1.6.7 + version: 1.13.5 + fast-xml-parser: + specifier: ^4.4.0 + version: 4.5.6 tsx: specifier: ^4.7.1 version: 4.7.3 @@ -1366,6 +1372,58 @@ importers: specifier: ^3.0.1 version: 3.1.0 + services/apps/pom_fetcher_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/database': + specifier: workspace:* + version: link:../../libs/database + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + axios: + specifier: ^1.6.7 + version: 1.13.5 + fast-xml-parser: + specifier: ^4.4.0 + version: 4.5.6 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/profiles_worker: dependencies: '@crowd/archetype-standard': @@ -6570,6 +6628,10 @@ packages: resolution: {integrity: sha512-B9/wizE4WngqQftFPmdaMYlXoJlJOYxGQOanC77fq9k8+Z0v5dDSVh+3glErdIROP//s/jgb7ZuxKfB8nVyo0g==} hasBin: true + fast-xml-parser@4.5.6: + resolution: {integrity: sha512-Yd4vkROfJf8AuJrDIVMVmYfULKmIJszVsMv7Vo71aocsKgFxpdlpSHXSaInvyYfgw2PRuObQSW2GFpVMUjxu9A==} + hasBin: true + fast-xml-parser@5.3.4: resolution: {integrity: 
sha512-EFd6afGmXlCx8H8WTZHhAoDaWaGyuIBoZJ2mknrNxug+aZKjkp0a0dlars9Izl+jF+7Gu1/5f/2h68cQpe0IiA==} hasBin: true @@ -6858,11 +6920,11 @@ packages: glob@6.0.4: resolution: {integrity: sha512-MKZeRNyYZAVVVG1oZeLaWie1uweH40m9AZwIwxyPbTSX4hHrVYSzLg0Ro5Z5R7XKkIX+Cc6oD1rqeDJnwsB8/A==} - deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me + deprecated: Glob versions prior to v9 are no longer supported glob@7.2.3: resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==} - deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me + deprecated: Glob versions prior to v9 are no longer supported global-directory@4.0.1: resolution: {integrity: sha512-wHTUcDUoZ1H5/0iVqEudYW4/kAlN5cZ3j/bXn0Dpbizl9iaUVeWSHqiOjsgk6OW2bkLclbBjzewBz6weQ1zA2Q==} @@ -12786,7 +12848,7 @@ snapshots: '@sendgrid/client@8.1.3': dependencies: '@sendgrid/helpers': 8.0.0 - axios: 1.13.1 + axios: 1.13.5 transitivePeerDependencies: - debug @@ -12817,7 +12879,7 @@ snapshots: '@slack/types': 2.11.0 '@types/is-stream': 1.1.0 '@types/node': 20.12.7 - axios: 1.11.0 + axios: 1.13.5 eventemitter3: 3.1.2 form-data: 2.5.1 is-electron: 2.2.2 @@ -14395,7 +14457,7 @@ snapshots: axios@0.21.4: dependencies: - follow-redirects: 1.15.6 + follow-redirects: 1.15.11 transitivePeerDependencies: - debug @@ -14416,8 +14478,8 @@ snapshots: axios@1.12.0: dependencies: - follow-redirects: 1.15.6 - form-data: 4.0.4 + follow-redirects: 1.15.11 + form-data: 4.0.5 proxy-from-env: 1.1.0 transitivePeerDependencies: - debug @@ -15990,6 +16052,10 @@ snapshots: dependencies: strnum: 1.0.5 + 
fast-xml-parser@4.5.6: + dependencies: + strnum: 1.0.5 + fast-xml-parser@5.3.4: dependencies: strnum: 2.1.2 @@ -18036,7 +18102,7 @@ snapshots: peopledatalabs@6.1.5: dependencies: - axios: 1.11.0 + axios: 1.13.5 copy-anything: 3.0.5 transitivePeerDependencies: - debug diff --git a/scripts/services/pom-fetcher.yaml b/scripts/services/pom-fetcher.yaml new file mode 100644 index 0000000000..a7aa8a9c37 --- /dev/null +++ b/scripts/services/pom-fetcher.yaml @@ -0,0 +1,69 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: pom-fetcher + SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' + POM_FETCHER_BATCH_SIZE: '200' + POM_FETCHER_CONCURRENCY: '10' + POM_FETCHER_STALE_DAYS: '7' + POM_FETCHER_IDLE_SLEEP_SEC: '3600' + +services: + pom-fetcher: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run start:pom-fetcher' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + pom-fetcher-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run dev:pom-fetcher' + working_dir: /usr/crowd/app/services/apps/packages_worker + # user: '${USER_ID}:${GROUP_ID}' + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: pom-fetcher + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - 
../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 8ee8dceeae..5979ba1981 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -4,10 +4,13 @@ "scripts": { "start:packages-worker": "SERVICE=packages-worker tsx src/bin/packages-worker.ts", "start:github-repos-enricher": "SERVICE=github-repos-enricher tsx src/bin/github-repos-enricher.ts", + "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", "dev:packages-worker": "SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", + 
"dev:pom-fetcher": "SERVICE=pom-fetcher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", + "dev:pom-fetcher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pom-fetcher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", @@ -18,6 +21,8 @@ "@crowd/data-access-layer": "workspace:*", "@crowd/database": "workspace:*", "@crowd/logging": "workspace:*", + "axios": "^1.6.7", + "fast-xml-parser": "^4.4.0", "tsx": "^4.7.1", "typescript": "^5.6.3" }, diff --git a/services/apps/packages_worker/src/bin/pom-fetcher.ts b/services/apps/packages_worker/src/bin/pom-fetcher.ts new file mode 100644 index 0000000000..d8130bb327 --- /dev/null +++ b/services/apps/packages_worker/src/bin/pom-fetcher.ts @@ -0,0 +1,42 @@ +import { getServiceLogger } from '@crowd/logging' + +import { getPomFetcherConfig } from '../config' +import { getPackagesDb } from '../db' +import { runPomEnrichmentLoop } from '../pom-fetcher/runPomEnrichmentLoop' + +const log = getServiceLogger() + +let shuttingDown = false + +const shutdown = async () => { + if (shuttingDown) return + shuttingDown = 
true + log.info('Shutting down pom-fetcher...') +} + +process.on('SIGINT', shutdown) +process.on('SIGTERM', shutdown) + +const main = async () => { + log.info('pom-fetcher starting...') + + const config = getPomFetcherConfig() + log.info( + { batchSize: config.batchSize, concurrency: config.concurrency, staleDays: config.staleDays }, + 'Config loaded', + ) + + const qx = await getPackagesDb() + await qx.selectOne('SELECT 1') + log.info('Connected to packages-db.') + + await runPomEnrichmentLoop(qx, config, () => shuttingDown) + + log.info('pom-fetcher stopped.') + process.exit(0) +} + +main().catch((err) => { + log.error({ err }, 'pom-fetcher fatal error') + process.exit(1) +}) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 4faf7444bf..77deb64d96 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -29,3 +29,12 @@ export function getEnricherConfig() { idleSleepSec: requireEnvInt('ENRICHER_IDLE_SLEEP_SEC'), } } + +export function getPomFetcherConfig() { + return { + batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '200', 10), + concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '10', 10), + staleDays: parseInt(process.env.POM_FETCHER_STALE_DAYS ?? '7', 10), + idleSleepSec: parseInt(process.env.POM_FETCHER_IDLE_SLEEP_SEC ?? '3600', 10), + } +} diff --git a/services/apps/packages_worker/src/pom-fetcher/extract.ts b/services/apps/packages_worker/src/pom-fetcher/extract.ts new file mode 100644 index 0000000000..ea79257726 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/extract.ts @@ -0,0 +1,295 @@ +/** + * Core POM extraction logic — fetches POMs over HTTP (axios) and parses them; makes no DB calls. + * Callers are responsible for concurrency, retries, and persistence.
+ */ + +import axios from 'axios' +import { XMLParser } from 'fast-xml-parser' + +// ─── Types ──────────────────────────────────────────────────────────────────── + +export interface PomMaintainer { + username: string | null + displayName: string | null + /** Raw email from POM — hash with SHA-256 before storing (GDPR) */ + email: string | null + url: string | null + role: 'author' | 'maintainer' +} + +export interface PomExtractionResult { + groupId: string + artifactId: string + version: string + purl: string + description: string | null + licenses: string[] + licensesRaw: string | null + scmUrl: string | null + homepageUrl: string | null + developers: PomMaintainer[] + contributors: PomMaintainer[] + parentHops: number + error: string | null +} + +// ─── Internal POM types ─────────────────────────────────────────────────────── + +interface PomData { + description?: unknown + url?: unknown + licenses?: { license?: unknown } + scm?: { url?: unknown; connection?: unknown } + developers?: { developer?: unknown } + contributors?: { contributor?: unknown } + parent?: { groupId?: unknown; artifactId?: unknown; version?: unknown } +} + +interface PomPerson { + id?: unknown + name?: unknown + email?: unknown + url?: unknown +} + +// ─── Config ─────────────────────────────────────────────────────────────────── + +const MAVEN_REPO = 'https://repo1.maven.org/maven2' +const MAX_PARENT_HOPS = 5 +const REQUEST_TIMEOUT_MS = 15_000 + +const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '@_', + parseTagValue: false, // keep all values as strings — prevents version "65" becoming number + parseAttributeValue: false, +}) + +// ─── POM fetch ──────────────────────────────────────────────────────────────── + +export function buildPomUrl(groupId: string, artifactId: string, version: string): string { + const groupPath = groupId.replace(/\./g, '/') + return `${MAVEN_REPO}/${groupPath}/${artifactId}/${version}/${artifactId}-${version}.pom` +} + +export 
async function fetchPom( + groupId: string, + artifactId: string, + version: string, + log?: (msg: string) => void, +): Promise { + const url = buildPomUrl(groupId, artifactId, version) + try { + const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + const parsed = parser.parse(res.data) + return (parsed?.project as PomData) ?? null + } catch (err) { + if (axios.isAxiosError(err)) { + const status = err.response?.status + if (status === 404) { + log?.(`POM not found (404): ${url}`) + return null + } + log?.(`HTTP ${status ?? 'unknown'} fetching POM: ${url}`) + return null + } + throw err + } +} + +// ─── Inheritance resolution ─────────────────────────────────────────────────── + +interface ResolvedFields { + description: string | null + licenses: string[] + licensesRaw: string | null + scmUrl: string | null + homepageUrl: string | null + developers: PomMaintainer[] + contributors: PomMaintainer[] + hops: number +} + +async function resolveWithInheritance( + groupId: string, + artifactId: string, + version: string, + log: (msg: string) => void, + depth = 0, +): Promise { + if (depth > MAX_PARENT_HOPS) { + log(`Max parent hops (${MAX_PARENT_HOPS}) reached`) + return emptyFields(depth) + } + + const pom = await fetchPom(groupId, artifactId, version, log) + if (!pom) return emptyFields(depth) + + const licenses = extractLicenses(pom) + const scmUrl = extractStr(pom.scm?.url ?? 
pom.scm?.connection) + const developers = extractPersons(pom.developers?.developer, 'author') + const contributors = extractPersons(pom.contributors?.contributor, 'maintainer') + + const missingLicense = licenses.length === 0 + const missingScm = !scmUrl + const parent = extractParent(pom) + + if (parent && (missingLicense || missingScm)) { + log(`[hop ${depth + 1}] ${parent.groupId}:${parent.artifactId}:${parent.version}`) + const parentFields = await resolveWithInheritance( + parent.groupId, + parent.artifactId, + parent.version, + log, + depth + 1, + ) + return { + description: extractStr(pom.description) ?? parentFields.description, + licenses: licenses.length > 0 ? licenses : parentFields.licenses, + licensesRaw: licenses.length > 0 ? licenses.join(', ') : parentFields.licensesRaw, + scmUrl: scmUrl ?? parentFields.scmUrl, + homepageUrl: extractStr(pom.url) ?? parentFields.homepageUrl, + developers: developers.length > 0 ? developers : parentFields.developers, + contributors: contributors.length > 0 ? contributors : parentFields.contributors, + hops: parentFields.hops, + } + } + + return { + description: extractStr(pom.description), + licenses, + licensesRaw: licenses.length > 0 ? licenses.join(', ') : null, + scmUrl, + homepageUrl: extractStr(pom.url), + developers, + contributors, + hops: depth, + } +} + +// ─── Public entry point ─────────────────────────────────────────────────────── + +/** + * Fetches and resolves POM metadata for the given Maven artifact. + * Always returns a result object; errors are captured in `result.error`. 
+ */ +export async function extractArtifact( + groupId: string, + artifactId: string, + version: string, + log: (msg: string) => void = () => undefined, +): Promise { + const purl = `pkg:maven/${groupId}/${artifactId}@${version}` + + const rootPom = await fetchPom(groupId, artifactId, version, log) + if (!rootPom) { + const pomUrl = buildPomUrl(groupId, artifactId, version) + return { + groupId, + artifactId, + version, + purl, + description: null, + licenses: [], + licensesRaw: null, + scmUrl: null, + homepageUrl: null, + developers: [], + contributors: [], + parentHops: 0, + error: `POM not found: ${pomUrl}`, + } + } + + try { + const resolved = await resolveWithInheritance(groupId, artifactId, version, log) + return { + groupId, + artifactId, + version, + purl, + description: resolved.description, + licenses: resolved.licenses, + licensesRaw: resolved.licensesRaw, + scmUrl: resolved.scmUrl, + homepageUrl: resolved.homepageUrl, + developers: resolved.developers, + contributors: resolved.contributors, + parentHops: resolved.hops, + error: null, + } + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + log(`Error resolving POM: ${message}`) + return { + groupId, + artifactId, + version, + purl, + description: null, + licenses: [], + licensesRaw: null, + scmUrl: null, + homepageUrl: null, + developers: [], + contributors: [], + parentHops: 0, + error: message, + } + } +} + +// ─── Private helpers ────────────────────────────────────────────────────────── + +function extractStr(value: unknown): string | null { + if (typeof value === 'string' && value.trim()) return value.trim() + return null +} + +function extractLicenses(pom: PomData): string[] { + const raw = pom.licenses?.license + if (!raw) return [] + const list = Array.isArray(raw) ? 
raw : [raw] + return (list as Array<{ name?: unknown }>) + .map((l) => extractStr(l?.name)) + .filter((n): n is string => n !== null) +} + +function extractPersons(raw: unknown, role: 'author' | 'maintainer'): PomMaintainer[] { + if (!raw) return [] + const list = Array.isArray(raw) ? raw : [raw] + return (list as PomPerson[]) + .filter((p) => p.id || p.name || p.email) + .map((p) => ({ + username: extractStr(p.id), + displayName: extractStr(p.name), + email: extractStr(p.email), + url: extractStr(p.url), + role, + })) +} + +function extractParent( + pom: PomData, +): { groupId: string; artifactId: string; version: string } | null { + const p = pom.parent + if (!p) return null + const groupId = extractStr(p.groupId) + const artifactId = extractStr(p.artifactId) + const version = extractStr(p.version) + if (!groupId || !artifactId || !version) return null + return { groupId, artifactId, version } +} + +function emptyFields(hops: number): ResolvedFields { + return { + description: null, + licenses: [], + licensesRaw: null, + scmUrl: null, + homepageUrl: null, + developers: [], + contributors: [], + hops, + } +} diff --git a/services/apps/packages_worker/src/pom-fetcher/metadata.ts b/services/apps/packages_worker/src/pom-fetcher/metadata.ts new file mode 100644 index 0000000000..9192bc05a1 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/metadata.ts @@ -0,0 +1,50 @@ +/** + * Resolves the latest release version of a Maven artifact using the + * maven-metadata.xml endpoint on Maven Central. + * + * URL format: + * https://repo1.maven.org/maven2/{groupPath}/{artifactId}/maven-metadata.xml + * + * Returns null when the artifact is not found (404) or the metadata is + * malformed. 
+ */ + +import axios from 'axios' +import { XMLParser } from 'fast-xml-parser' + +const MAVEN_REPO = 'https://repo1.maven.org/maven2' +const REQUEST_TIMEOUT_MS = 10_000 + +const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '@_', + parseTagValue: false, + parseAttributeValue: false, +}) + +export async function resolveLatestVersion( + groupId: string, + artifactId: string, +): Promise { + const groupPath = groupId.replace(/\./g, '/') + const url = `${MAVEN_REPO}/${groupPath}/${artifactId}/maven-metadata.xml` + + try { + const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + const parsed = parser.parse(res.data) + + // Prefer `release` over `latest` — release excludes snapshots/alphas + const versioning = parsed?.metadata?.versioning + const release = typeof versioning?.release === 'string' ? versioning.release.trim() : null + const latest = typeof versioning?.latest === 'string' ? versioning.latest.trim() : null + + return release || latest || null + } catch (err) { + if (axios.isAxiosError(err)) { + // Not found is expected for packages that don't exist on Maven Central + if (err.response?.status === 404) return null + } + // Rethrow unexpected errors so callers can decide whether to retry + throw err + } +} diff --git a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts new file mode 100644 index 0000000000..b31a591309 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts @@ -0,0 +1,221 @@ +import crypto from 'crypto' + +import { + listMavenPackagesToEnrich, + upsertMaintainer, + upsertPackage, + upsertPackageMaintainer, +} from '@crowd/data-access-layer' +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceChildLogger } from '@crowd/logging' + +import { getPomFetcherConfig } from '../config' +import { extractArtifact } from './extract' +import {
import crypto from 'crypto'

import {
  listMavenPackagesToEnrich,
  upsertMaintainer,
  upsertPackage,
  upsertPackageMaintainer,
} from '@crowd/data-access-layer'
import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor'
import { getServiceChildLogger } from '@crowd/logging'

import { getPomFetcherConfig } from '../config'

import { extractArtifact } from './extract'
import { resolveLatestVersion } from './metadata'

const log = getServiceChildLogger('pom-fetcher')

/** How often the idle sleep re-checks the shutdown flag. */
const SHUTDOWN_POLL_MS = 1_000

// ─── Types ────────────────────────────────────────────────────────────────────

interface BatchResult {
  processed: number
  skipped: number
  errors: number
  /**
   * Rows of this batch that are known to have dropped out of the
   * "to enrich" query result (see processBatch). Always <= the batch size.
   */
  drained: number
}

// ─── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Sleeps for up to `totalMs`, waking every SHUTDOWN_POLL_MS to check
 * `isShuttingDown`, and returns early as soon as it reports true.
 */
async function sleepUnlessShuttingDown(
  totalMs: number,
  isShuttingDown: () => boolean,
): Promise<void> {
  const deadline = Date.now() + totalMs
  while (!isShuttingDown()) {
    const remaining = deadline - Date.now()
    if (remaining <= 0) return
    await new Promise((r) => setTimeout(r, Math.min(remaining, SHUTDOWN_POLL_MS)))
  }
}

// ─── Batch processing ─────────────────────────────────────────────────────────

/**
 * Fetches one page of Maven packages needing enrichment and, for each one,
 * resolves the latest version, extracts the POM and upserts the package and
 * its maintainers.
 *
 * Every listed package increments exactly one of processed / skipped / errors,
 * so their sum equals the number of rows fetched; an all-zero result means the
 * page was empty.
 */
async function processBatch(
  qx: QueryExecutor,
  offset: number,
  config: ReturnType<typeof getPomFetcherConfig>,
): Promise<BatchResult> {
  const packages = await listMavenPackagesToEnrich(qx, {
    limit: config.batchSize,
    offset,
    staleDays: config.staleDays,
  })

  if (packages.length === 0) {
    return { processed: 0, skipped: 0, errors: 0, drained: 0 }
  }

  log.info({ offset, count: packages.length }, 'Processing POM batch...')

  // A concurrency of 0 would make the loop below spin forever and NaN would
  // slice empty groups, so fall back to sequential processing.
  const concurrency =
    Number.isFinite(config.concurrency) && config.concurrency >= 1
      ? Math.floor(config.concurrency)
      : 1

  // upsertPackage stamps last_synced_at = NOW(); with a positive staleness
  // window a row whose purl matches the universe purl then stops matching the
  // listing query. With staleDays <= 0 the row is immediately stale again, so
  // nothing can be assumed to have drained.
  const upsertDrainsRow = config.staleDays > 0

  let processed = 0
  let skipped = 0
  let errors = 0
  let drained = 0

  // Process in small concurrent groups to be polite to Maven Central
  for (let i = 0; i < packages.length; i += concurrency) {
    const group = packages.slice(i, i + concurrency)

    await Promise.all(
      group.map(async (pkg) => {
        const groupId = pkg.namespace
        const artifactId = pkg.name

        if (!groupId) {
          log.warn({ purl: pkg.purl }, 'Skipping package with null namespace (groupId)')
          skipped++
          return
        }

        try {
          log.info({ groupId, artifactId }, 'Fetching POM...')

          // Step 1: resolve latest version from maven-metadata.xml
          const version = await resolveLatestVersion(groupId, artifactId)
          if (!version) {
            log.warn({ groupId, artifactId }, 'Could not resolve latest version, skipping')
            skipped++
            return
          }
          log.info({ groupId, artifactId, version }, 'Version resolved, extracting POM...')

          // Step 2: fetch + resolve POM (follows parent chain)
          const result = await extractArtifact(groupId, artifactId, version, (msg) => {
            log.debug({ groupId, artifactId, version }, msg)
          })

          if (result.error) {
            log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction error')
            errors++
            return
          }
          log.info(
            {
              groupId,
              artifactId,
              version,
              licenses: result.licenses,
              scmUrl: result.scmUrl,
              developers: result.developers.length,
              contributors: result.contributors.length,
              parentHops: result.parentHops,
            },
            'POM extracted, upserting...',
          )

          // Step 3: upsert into `packages`
          // purl at package level has no version (package-level identifier)
          const packagePurl = `pkg:maven/${groupId}/${artifactId}`
          const packageId = await upsertPackage(qx, {
            purl: packagePurl,
            ecosystem: 'maven',
            namespace: groupId,
            name: artifactId,
            description: result.description,
            homepage: result.homepageUrl,
            declaredRepositoryUrl: result.scmUrl,
            licenses: result.licenses.length > 0 ? result.licenses : null,
            licensesRaw: result.licensesRaw,
            latestVersion: version,
            ingestionSource: 'pom_fetcher',
          })

          // Counted as soon as the package row is written, even if a maintainer
          // upsert below fails: the listing query joins on purl, so only an
          // exact purl match guarantees the row has left the result set.
          if (upsertDrainsRow && packagePurl === pkg.purl) {
            drained++
          }

          // Step 4: upsert maintainers (developers + contributors)
          const allPeople = [
            ...result.developers.map((d) => ({ ...d, role: 'author' as const })),
            ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })),
          ]

          for (const person of allPeople) {
            // NOTE(review): falling back to the raw email stores it in plain
            // text as the username while email_hash is hashed — confirm intended.
            const username = person.username ?? person.email ?? person.displayName
            if (!username) continue

            const emailHash = person.email
              ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex')
              : null

            const maintainerId = await upsertMaintainer(qx, {
              ecosystem: 'maven',
              username,
              displayName: person.displayName,
              url: person.url,
              emailHash,
            })

            await upsertPackageMaintainer(qx, {
              packageId,
              maintainerId,
              role: person.role,
            })
          }

          processed++
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err)
          log.error({ groupId, artifactId, error: message }, 'Unexpected error processing package')
          errors++
        }
      }),
    )
  }

  return { processed, skipped, errors, drained }
}

// ─── Main loop ────────────────────────────────────────────────────────────────

/**
 * Loops until `isShuttingDown()` returns true: pages through all Maven
 * packages that need POM enrichment, sleeps when the pass is complete, then
 * restarts from offset 0.
 *
 * The offset only advances past rows that are still part of the listing
 * result (skipped, failed, or upserted under a different purl). Rows that
 * drained out of the result shift the remaining rows forward, so counting
 * them would skip the same number of unprocessed packages.
 *
 * The caller is responsible for creating the DB connection and passing
 * `isShuttingDown` so the loop exits cleanly on SIGTERM/SIGINT; the idle
 * sleep polls it as well, so shutdown is not delayed by a full sleep period.
 */
export async function runPomEnrichmentLoop(
  qx: QueryExecutor,
  config: ReturnType<typeof getPomFetcherConfig>,
  isShuttingDown: () => boolean,
): Promise<void> {
  let offset = 0
  let totalProcessed = 0
  let totalSkipped = 0
  let totalErrors = 0
  let passNumber = 0
  let passStartedAt = Date.now()
  // Tracked explicitly: offset can legitimately stay at 0 between batches.
  let passInProgress = false

  while (!isShuttingDown()) {
    if (!passInProgress) {
      passInProgress = true
      passNumber++
      passStartedAt = Date.now()
      log.info({ pass: passNumber }, 'Starting pass')
    }

    const result = await processBatch(qx, offset, config)
    const batchCount = result.processed + result.skipped + result.errors

    if (batchCount === 0) {
      // Nothing left in this pass — log summary and sleep
      const durationMs = Date.now() - passStartedAt
      log.info(
        {
          totalProcessed,
          totalSkipped,
          totalErrors,
          durationMs,
          durationSec: Math.round(durationMs / 1000),
        },
        `Pass complete. Sleeping ${config.idleSleepSec}s before next pass.`,
      )
      await sleepUnlessShuttingDown(config.idleSleepSec * 1000, isShuttingDown)
      offset = 0
      totalProcessed = 0
      totalSkipped = 0
      totalErrors = 0
      passInProgress = false
      continue
    }

    totalProcessed += result.processed
    totalSkipped += result.skipped
    totalErrors += result.errors
    offset += batchCount - result.drained

    log.info(
      { offset, processed: result.processed, skipped: result.skipped, errors: result.errors },
      'Batch complete',
    )
  }
}
import { QueryExecutor } from '../queryExecutor'

import { IDbMaintainerUpsert, IDbPackageMaintainerUpsert } from './types'

/**
 * Inserts a maintainer row, or updates the existing row with the same
 * (ecosystem, username).
 *
 * On conflict, display_name, url and email_hash are only replaced by non-NULL
 * incoming values (COALESCE keeps the stored value when the new one is NULL).
 *
 * @param qx   Query executor used to run the statement.
 * @param item Maintainer fields, bound as named parameters.
 * @returns The id of the inserted or updated row.
 */
export async function upsertMaintainer(
  qx: QueryExecutor,
  item: IDbMaintainerUpsert,
): Promise<number> {
  const row = await qx.selectOne(
    `
    INSERT INTO maintainers (
      ecosystem,
      username,
      display_name,
      url,
      email_hash
    ) VALUES (
      $(ecosystem),
      $(username),
      $(displayName),
      $(url),
      $(emailHash)
    )
    ON CONFLICT (ecosystem, username) DO UPDATE SET
      display_name = COALESCE(EXCLUDED.display_name, maintainers.display_name),
      url = COALESCE(EXCLUDED.url, maintainers.url),
      email_hash = COALESCE(EXCLUDED.email_hash, maintainers.email_hash)
    RETURNING id
    `,
    item,
  )
  // NOTE(review): if maintainers.id is a bigint column the driver may return
  // it as a string; the cast below does not convert — confirm the column type.
  return row.id as number
}

/**
 * Links a maintainer to a package with the given role.
 *
 * Does nothing when the (package_id, maintainer_id) pair already exists, so
 * the role of an existing link is never changed by this call.
 *
 * @param qx   Query executor used to run the statement.
 * @param item Package id, maintainer id and role, bound as named parameters.
 */
export async function upsertPackageMaintainer(
  qx: QueryExecutor,
  item: IDbPackageMaintainerUpsert,
): Promise<void> {
  await qx.result(
    `
    INSERT INTO package_maintainers (package_id, maintainer_id, role)
    VALUES ($(packageId), $(maintainerId), $(role))
    ON CONFLICT (package_id, maintainer_id) DO NOTHING
    `,
    item,
  )
}
import { QueryExecutor } from '../queryExecutor'

import { IDbPackageUniverse, IDbPackageUpsert } from './types'

// ─── packages_universe ────────────────────────────────────────────────────────

/**
 * Returns a page of Maven packages from packages_universe that either have no
 * corresponding entry in `packages` yet, or whose `packages.last_synced_at` is
 * older than the given cutoff (defaults to 7 days).
 *
 * Rows are matched to `packages` by exact purl equality, and rows with a NULL
 * namespace are excluded.
 *
 * Ordered by rank_in_ecosystem ASC (most critical first), unranked last, with
 * id as the tie-breaker so paging is deterministic.
 *
 * Because the result depends on `packages`, rows drop out of it as they are
 * upserted; callers paging with `offset` while upserting must account for that.
 *
 * @param qx      Query executor used to run the statement.
 * @param options limit/offset for paging; staleDays is the re-sync cutoff in days.
 * @returns id, purl, namespace and name of each matching universe row.
 */
export async function listMavenPackagesToEnrich(
  qx: QueryExecutor,
  options: { limit: number; offset: number; staleDays?: number },
): Promise<Pick<IDbPackageUniverse, 'id' | 'purl' | 'namespace' | 'name'>[]> {
  const { limit, offset, staleDays = 7 } = options

  return qx.select(
    `
    SELECT
      pu.id,
      pu.purl,
      pu.namespace,
      pu.name
    FROM packages_universe pu
    LEFT JOIN packages p ON p.purl = pu.purl
    WHERE
      pu.ecosystem = 'maven'
      AND pu.namespace IS NOT NULL
      AND (
        p.id IS NULL
        OR p.last_synced_at < NOW() - ($(staleDays) || ' days')::interval
      )
    ORDER BY
      pu.rank_in_ecosystem ASC NULLS LAST,
      pu.id ASC
    LIMIT $(limit) OFFSET $(offset)
    `,
    { limit, offset, staleDays },
  )
}

// ─── packages upsert ──────────────────────────────────────────────────────────

/**
 * Inserts a row in `packages`, or updates the existing row with the same purl.
 *
 * On conflict, description, homepage, declared_repository_url, licenses,
 * licenses_raw and ingestion_source are overwritten with the incoming values
 * even when those are NULL; only latest_version keeps its stored value when
 * the incoming one is NULL. ecosystem, namespace and name are not updated.
 * last_synced_at is set to NOW() on both insert and update.
 *
 * @param qx   Query executor used to run the statement.
 * @param item Package fields, bound as named parameters.
 * @returns The id of the upserted row.
 */
export async function upsertPackage(qx: QueryExecutor, item: IDbPackageUpsert): Promise<number> {
  const row = await qx.selectOne(
    `
    INSERT INTO packages (
      purl,
      ecosystem,
      namespace,
      name,
      description,
      homepage,
      declared_repository_url,
      licenses,
      licenses_raw,
      latest_version,
      ingestion_source,
      last_synced_at
    ) VALUES (
      $(purl),
      $(ecosystem),
      $(namespace),
      $(name),
      $(description),
      $(homepage),
      $(declaredRepositoryUrl),
      $(licenses)::text[],
      $(licensesRaw),
      $(latestVersion),
      $(ingestionSource),
      NOW()
    )
    ON CONFLICT (purl) DO UPDATE SET
      description = EXCLUDED.description,
      homepage = EXCLUDED.homepage,
      declared_repository_url = EXCLUDED.declared_repository_url,
      licenses = EXCLUDED.licenses,
      licenses_raw = EXCLUDED.licenses_raw,
      latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version),
      ingestion_source = EXCLUDED.ingestion_source,
      last_synced_at = NOW()
    RETURNING id
    `,
    item,
  )
  // NOTE(review): if packages.id is a bigint column the driver may return it
  // as a string; the cast below does not convert — confirm the column type.
  return row.id as number
}
0000000000..7553ff00a6 --- /dev/null +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -0,0 +1,44 @@ +// ─── packages_universe ──────────────────────────────────────────────────────── + +export interface IDbPackageUniverse { + id: number + purl: string | null + ecosystem: string + namespace: string | null + name: string + rankInEcosystem: number | null +} + +// ─── packages ───────────────────────────────────────────────────────────────── + +export type IDbPackageUpsert = { + purl: string + ecosystem: string + namespace: string | null + name: string + description: string | null + homepage: string | null + declaredRepositoryUrl: string | null + licenses: string[] | null + licensesRaw: string | null + latestVersion: string | null + ingestionSource: string +} + +// ─── maintainers ────────────────────────────────────────────────────────────── + +export type IDbMaintainerUpsert = { + ecosystem: string + username: string + displayName: string | null + url: string | null + emailHash: string | null +} + +// ─── package_maintainers ────────────────────────────────────────────────────── + +export type IDbPackageMaintainerUpsert = { + packageId: number + maintainerId: number + role: 'author' | 'maintainer' | null +} From b0812f935c909dbe9d69f78b6658e5f5514b1d8b Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 27 May 2026 10:19:25 +0200 Subject: [PATCH 15/15] feat: pom extractor Signed-off-by: Umberto Sgueglia --- scripts/cli | 2 +- scripts/services/packages-worker.yaml | 4 ++ scripts/services/pom-fetcher.yaml | 5 +- services/apps/packages_worker/package.json | 4 +- services/apps/packages_worker/src/config.ts | 4 +- .../src/pom-fetcher/runPomEnrichmentLoop.ts | 60 +++++++++++-------- 6 files changed, 46 insertions(+), 33 deletions(-) diff --git a/scripts/cli b/scripts/cli index 9f3ce75aac..6e863b5c44 100755 --- a/scripts/cli +++ b/scripts/cli @@ -1060,7 +1060,7 @@ while test $# -gt 0; do exit ;; clean-start-dev) - # IGNORED_SERVICES=("python-worker" 
"job-generator" "discord-ws" "webhook-api" "profiles-worker" "organizations-enrichment-worker" "merge-suggestions-worker" "members-enrichment-worker" "exports-worker" "entity-merging-worker") + IGNORED_SERVICES=("python-worker" "job-generator" "discord-ws" "webhook-api" "profiles-worker" "organizations-enrichment-worker" "merge-suggestions-worker" "members-enrichment-worker" "exports-worker" "entity-merging-worker") CLEAN_START=1 DEV=1 start diff --git a/scripts/services/packages-worker.yaml b/scripts/services/packages-worker.yaml index cb2ecdbf0f..58399b63fc 100644 --- a/scripts/services/packages-worker.yaml +++ b/scripts/services/packages-worker.yaml @@ -6,6 +6,10 @@ x-env-args: &env-args SERVICE: packages-worker SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' + POM_FETCHER_BATCH_SIZE: '50' + POM_FETCHER_CONCURRENCY: '3' + POM_FETCHER_STALE_DAYS: '7' + POM_FETCHER_IDLE_SLEEP_SEC: '3600' services: packages-worker: diff --git a/scripts/services/pom-fetcher.yaml b/scripts/services/pom-fetcher.yaml index a7aa8a9c37..4210778a6d 100644 --- a/scripts/services/pom-fetcher.yaml +++ b/scripts/services/pom-fetcher.yaml @@ -6,8 +6,9 @@ x-env-args: &env-args SERVICE: pom-fetcher SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' - POM_FETCHER_BATCH_SIZE: '200' - POM_FETCHER_CONCURRENCY: '10' + LOG_LEVEL: 'info' + POM_FETCHER_BATCH_SIZE: '50' + POM_FETCHER_CONCURRENCY: '3' POM_FETCHER_STALE_DAYS: '7' POM_FETCHER_IDLE_SLEEP_SEC: '3600' diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 5979ba1981..b511bbe520 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -7,10 +7,10 @@ "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", "dev:packages-worker": "SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:github-repos-enricher": 
"SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", - "dev:pom-fetcher": "SERVICE=pom-fetcher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", + "dev:pom-fetcher": "SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", - "dev:pom-fetcher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pom-fetcher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", + "dev:pom-fetcher:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 77deb64d96..93adc67781 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -32,8 +32,8 @@ export function getEnricherConfig() { export function getPomFetcherConfig() { return { - batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '200', 10), - concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '10', 10), + batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '50', 10), + concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '3', 10), staleDays: parseInt(process.env.POM_FETCHER_STALE_DAYS ?? '7', 10), idleSleepSec: parseInt(process.env.POM_FETCHER_IDLE_SLEEP_SEC ?? 
'3600', 10), } diff --git a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts index b31a591309..a652377e4f 100644 --- a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts @@ -27,12 +27,11 @@ interface BatchResult { async function processBatch( qx: QueryExecutor, - offset: number, config: ReturnType, ): Promise { const packages = await listMavenPackagesToEnrich(qx, { limit: config.batchSize, - offset, + offset: 0, staleDays: config.staleDays, }) @@ -40,11 +39,12 @@ async function processBatch( return { processed: 0, skipped: 0, errors: 0 } } - log.info({ offset, count: packages.length }, 'Processing POM batch...') + log.info({ count: packages.length }, 'Processing POM batch...') let processed = 0 let skipped = 0 let errors = 0 + const PROGRESS_EVERY = 25 // Process in small concurrent groups to be polite to Maven Central for (let i = 0; i < packages.length; i += config.concurrency) { @@ -62,16 +62,30 @@ async function processBatch( } try { - log.info({ groupId, artifactId }, 'Fetching POM...') - // Step 1: resolve latest version from maven-metadata.xml const version = await resolveLatestVersion(groupId, artifactId) if (!version) { log.warn({ groupId, artifactId }, 'Could not resolve latest version, skipping') + // Upsert a minimal record so last_synced_at is set — prevents this package + // from re-appearing in every batch within the same pass. + // ingestionSource 'pom_fetcher_no_version' marks that it was tried but had no + // resolvable version on Maven Central (404 on maven-metadata.xml). 
+ await upsertPackage(qx, { + purl: `pkg:maven/${groupId}/${artifactId}`, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + declaredRepositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: null, + ingestionSource: 'pom_fetcher_no_version', + }) skipped++ return } - log.info({ groupId, artifactId, version }, 'Version resolved, extracting POM...') // Step 2: fetch + resolve POM (follows parent chain) const result = await extractArtifact(groupId, artifactId, version, (msg) => { @@ -83,19 +97,6 @@ async function processBatch( errors++ return } - log.info( - { - groupId, - artifactId, - version, - licenses: result.licenses, - scmUrl: result.scmUrl, - developers: result.developers.length, - contributors: result.contributors.length, - parentHops: result.parentHops, - }, - 'POM extracted, upserting...', - ) // Step 3: upsert into `packages` // purl at package level has no version (package-level identifier) @@ -151,6 +152,17 @@ async function processBatch( } }), ) + + // done = packages processed so far (based on loop index, always accurate) + const done = i + group.length + const prevDone = i + const crossedBoundary = Math.floor(done / PROGRESS_EVERY) > Math.floor(prevDone / PROGRESS_EVERY) + if (crossedBoundary || done === packages.length) { + log.info( + { done, total: packages.length, processed, skipped, errors }, + `Progress: ${done}/${packages.length}`, + ) + } } return { processed, skipped, errors } @@ -170,7 +182,6 @@ export async function runPomEnrichmentLoop( config: ReturnType, isShuttingDown: () => boolean, ): Promise { - let offset = 0 let totalProcessed = 0 let totalSkipped = 0 let totalErrors = 0 @@ -178,13 +189,13 @@ export async function runPomEnrichmentLoop( let passStartedAt = Date.now() while (!isShuttingDown()) { - if (offset === 0) { + if (totalProcessed + totalSkipped + totalErrors === 0) { passNumber++ passStartedAt = Date.now() log.info({ pass: passNumber }, 'Starting pass') 
} - const result = await processBatch(qx, offset, config) + const result = await processBatch(qx, config) if (result.processed + result.skipped + result.errors === 0) { // Nothing left in this pass — log summary and sleep @@ -200,21 +211,18 @@ export async function runPomEnrichmentLoop( `Pass complete. Sleeping ${config.idleSleepSec}s before next pass.`, ) await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) - offset = 0 totalProcessed = 0 totalSkipped = 0 totalErrors = 0 - passStartedAt = Date.now() continue } totalProcessed += result.processed totalSkipped += result.skipped totalErrors += result.errors - offset += config.batchSize log.info( - { offset, processed: result.processed, skipped: result.skipped, errors: result.errors }, + { processed: result.processed, skipped: result.skipped, errors: result.errors, totalProcessed, totalSkipped, totalErrors }, 'Batch complete', ) }