diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 2f4bfc7..f0ff0a3 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -17,7 +17,7 @@ OLRC Website (uscode.house.gov) | v +-----------+ - | Pipeline | (orchestration) + | Workflow | (orchestration) +-----------+ | v @@ -37,7 +37,6 @@ OLRC Website (uscode.house.gov) | `@civic-source/fetcher` | `packages/fetcher` | Downloads release point listings and ZIP archives from the OLRC. Includes SHA-256 hash-based caching (`HashStore`) to skip unchanged content, exponential backoff retry, and a structured logger. | | `@civic-source/transformer` | `packages/transformer` | Parses USLM XML using `fast-xml-parser` in `preserveOrder` mode and generates per-section Markdown files with YAML frontmatter. Handles namespace-aware element traversal. | | `@civic-source/annotator` | `packages/annotator` | Queries CourtListener's full-text search API to find cases citing a given statute section. Maps results to the `PrecedentAnnotation` schema. Rate-limited. | -| `@civic-source/pipeline` | `packages/pipeline` | Orchestrates the end-to-end flow: fetch release points, transform each title's XML to Markdown, write files, and optionally annotate with case law. Per-title failures do not block other titles. | | `@civic-source/web` | `apps/web` | Astro v5 static site that renders statute Markdown with Tailwind CSS styling, Pagefind search, and Svelte interactive components. | ## Dual-Repo Strategy diff --git a/README.md b/README.md index fb7a67d..4efe3f0 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,6 @@ Automated pipeline that fetches U.S. Code XML releases from the Office of the La | `@civic-source/fetcher` | OLRC release point fetcher with retry and idempotency | | `@civic-source/transformer` | USLM XML to Markdown converter with status detection | | `@civic-source/annotator` | CourtListener precedent annotation generator | -| `@civic-source/pipeline` | Orchestration pipeline for bulk conversion | -| `@civic-source/observability` | Pipeline metrics collector and reporting | | `@civic-source/shared` | Shared utilities (logger, retry, token bucket) | | `@civic-source/web` | Astro 6 static site with Svelte components | @@ -44,12 +42,12 @@ Automated pipeline that fetches U.S. Code XML releases from the Office of the La ```bash pnpm install pnpm build -pnpm test # 267 tests across 8 packages +pnpm test # run all workspace tests pnpm lint pnpm typecheck ``` -Requires Node.js 22.x LTS and pnpm 9.x. +Requires Node.js 24.x LTS and pnpm 11.x. ## Architecture diff --git a/packages/observability/package.json b/packages/observability/package.json deleted file mode 100644 index c4de222..0000000 --- a/packages/observability/package.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "name": "@civic-source/observability", - "version": "0.0.1", - "private": true, - "type": "module", - "main": "./dist/index.js", - "types": "./dist/index.d.ts", - "exports": { - ".": { - "import": "./dist/index.js", - "types": "./dist/index.d.ts" - } - }, - "scripts": { - "build": "tsc --project tsconfig.build.json", - "typecheck": "tsc --noEmit", - "lint": "eslint src/", - "test": "vitest run" - }, - "dependencies": { - "@civic-source/shared": "workspace:*", - "@civic-source/types": "workspace:*" - }, - "devDependencies": { - "@types/node": "^22.19.15", - "typescript": "^6.0.0", - "vitest": "^4.1.6" - } -} diff --git a/packages/observability/src/__tests__/observability.test.ts b/packages/observability/src/__tests__/observability.test.ts deleted file mode 100644 index 73ce603..0000000 --- a/packages/observability/src/__tests__/observability.test.ts +++ /dev/null @@ -1,111 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import { createMetricsCollector, createTimer } from '../index.js'; - -describe('createMetricsCollector', () => { - it('initializes with zero counts and a startedAt timestamp', () => { - const collector = createMetricsCollector(); - const snapshot = collector.toJSON(); - expect(snapshot.startedAt).toBeTruthy(); - expect(snapshot.titlesProcessed).toBe(0); - expect(snapshot.titlesSkipped).toBe(0); - expect(snapshot.titlesFailed).toBe(0); - expect(snapshot.sectionsGenerated).toBe(0); - expect(snapshot.annotationsGenerated).toBe(0); - expect(snapshot.xmlDownloadSizeBytes).toBe(0); - expect(snapshot.peakMemoryMB).toBe(0); - expect(snapshot.runnerType).toBe('self-hosted'); - }); - - it('records additive count values', () => { - const collector = createMetricsCollector(); - collector.record({ titlesProcessed: 3, sectionsGenerated: 50 }); - collector.record({ titlesProcessed: 2, sectionsGenerated: 30 }); - const snapshot = collector.toJSON(); - expect(snapshot.titlesProcessed).toBe(5); - expect(snapshot.sectionsGenerated).toBe(80); - }); - - it('records peakMemoryMB as a max (not additive)', () => { - const collector = createMetricsCollector(); - collector.record({ peakMemoryMB: 100 }); - collector.record({ peakMemoryMB: 50 }); - collector.record({ peakMemoryMB: 200 }); - expect(collector.toJSON().peakMemoryMB).toBe(200); - }); - - it('accepts a custom runnerType', () => { - const collector = createMetricsCollector('github-hosted'); - expect(collector.toJSON().runnerType).toBe('github-hosted'); - }); - - it('complete() sets completedAt and durationMs', () => { - const collector = createMetricsCollector(); - collector.record({ titlesProcessed: 1 }); - const final = collector.complete(); - expect(final.completedAt).toBeTruthy(); - expect(typeof final.durationMs).toBe('number'); - expect(final.durationMs).toBeGreaterThanOrEqual(0); - }); - - it('toJSON() returns a snapshot (not a reference to internal state)', () => { - const collector = createMetricsCollector(); - const snap1 = collector.toJSON(); - collector.record({ titlesProcessed: 10 }); - const snap2 = collector.toJSON(); - expect(snap1.titlesProcessed).toBe(0); - expect(snap2.titlesProcessed).toBe(10); - }); - - it('toMarkdown() returns a GitHub Actions-compatible summary table', () => { - const collector = createMetricsCollector(); - collector.record({ titlesProcessed: 5, titlesFailed: 1, sectionsGenerated: 120 }); - collector.complete(); - const md = collector.toMarkdown(); - expect(md).toContain('## Pipeline Run Summary'); - expect(md).toContain('| Metric | Value |'); - expect(md).toContain('| Titles processed | 5 |'); - expect(md).toContain('| Titles failed | 1 |'); - expect(md).toContain('| Sections generated | 120 |'); - expect(md).toContain('Completed'); - }); - - it('toMarkdown() shows "in progress" when not yet completed', () => { - const collector = createMetricsCollector(); - const md = collector.toMarkdown(); - expect(md).toContain('in progress'); - expect(md).not.toContain('Completed'); - }); -}); - -describe('createTimer', () => { - it('returns a timer with the given label', () => { - const timer = createTimer('test-op'); - const result = timer.stop(); - expect(result.label).toBe('test-op'); - }); - - it('elapsed() returns a non-negative millisecond value', () => { - const timer = createTimer('elapsed-test'); - const ms = timer.elapsed(); - expect(typeof ms).toBe('number'); - expect(ms).toBeGreaterThanOrEqual(0); - }); - - it('stop() returns durationMs as a non-negative number', () => { - const timer = createTimer('stop-test'); - const result = timer.stop(); - expect(result.durationMs).toBeGreaterThanOrEqual(0); - }); - - it('elapsed() can be called multiple times and values increase', async () => { - const timer = createTimer('multi-elapsed'); - const first = timer.elapsed(); - // Small busy-wait to ensure time passes - const start = performance.now(); - while (performance.now() - start < 5) { - // spin - } - const second = timer.elapsed(); - expect(second).toBeGreaterThanOrEqual(first); - }); -}); diff --git a/packages/observability/src/index.ts b/packages/observability/src/index.ts deleted file mode 100644 index f36a914..0000000 --- a/packages/observability/src/index.ts +++ /dev/null @@ -1,4 +0,0 @@ -export { createMetricsCollector } from './metrics.js'; -export type { PipelineMetrics, MetricsCollector } from './metrics.js'; -export { createTimer } from './timer.js'; -export type { Timer, TimerResult } from './timer.js'; diff --git a/packages/observability/src/metrics.ts b/packages/observability/src/metrics.ts deleted file mode 100644 index 6d10e79..0000000 --- a/packages/observability/src/metrics.ts +++ /dev/null @@ -1,129 +0,0 @@ -/** Pipeline metrics collector for observability and GitHub Actions job summaries. */ - -import { TIMEZONE } from '@civic-source/shared'; - -/** Recorded metrics for a pipeline run. */ -export interface PipelineMetrics { - startedAt: string; - completedAt?: string; - durationMs?: number; - titlesProcessed: number; - titlesSkipped: number; - titlesFailed: number; - sectionsGenerated: number; - annotationsGenerated: number; - xmlDownloadSizeBytes: number; - peakMemoryMB: number; - runnerType: string; -} - -/** Partial metric values that can be recorded incrementally. */ -type RecordableMetrics = Partial< - Pick< - PipelineMetrics, - | 'titlesProcessed' - | 'titlesSkipped' - | 'titlesFailed' - | 'sectionsGenerated' - | 'annotationsGenerated' - | 'xmlDownloadSizeBytes' - | 'peakMemoryMB' - | 'runnerType' - > ->; - -/** Metrics collector with record/complete/export API. */ -export interface MetricsCollector { - /** Record incremental metric values (additive for counts, max for peakMemoryMB). */ - record(values: RecordableMetrics): void; - /** Mark the run as complete, capturing completedAt and durationMs. */ - complete(): PipelineMetrics; - /** Return the current metrics snapshot as a plain object. */ - toJSON(): PipelineMetrics; - /** Render a GitHub Actions-compatible Markdown summary table. */ - toMarkdown(): string; -} - -function nowET(): string { - return new Date().toLocaleString('sv-SE', { timeZone: TIMEZONE }).replace(' ', 'T'); -} - -/** Create a new pipeline metrics collector. */ -export function createMetricsCollector(runnerType = 'self-hosted'): MetricsCollector { - const startedAt = nowET(); - const startMs = performance.now(); - - const state: PipelineMetrics = { - startedAt, - titlesProcessed: 0, - titlesSkipped: 0, - titlesFailed: 0, - sectionsGenerated: 0, - annotationsGenerated: 0, - xmlDownloadSizeBytes: 0, - peakMemoryMB: 0, - runnerType, - }; - - function record(values: RecordableMetrics): void { - if (values.titlesProcessed !== undefined) state.titlesProcessed += values.titlesProcessed; - if (values.titlesSkipped !== undefined) state.titlesSkipped += values.titlesSkipped; - if (values.titlesFailed !== undefined) state.titlesFailed += values.titlesFailed; - if (values.sectionsGenerated !== undefined) state.sectionsGenerated += values.sectionsGenerated; - if (values.annotationsGenerated !== undefined) state.annotationsGenerated += values.annotationsGenerated; - if (values.xmlDownloadSizeBytes !== undefined) state.xmlDownloadSizeBytes += values.xmlDownloadSizeBytes; - if (values.peakMemoryMB !== undefined) { - state.peakMemoryMB = Math.max(state.peakMemoryMB, values.peakMemoryMB); - } - if (values.runnerType !== undefined) state.runnerType = values.runnerType; - } - - function complete(): PipelineMetrics { - state.completedAt = nowET(); - state.durationMs = Math.round(performance.now() - startMs); - return { ...state }; - } - - function toJSON(): PipelineMetrics { - return { ...state }; - } - - function toMarkdown(): string { - const durationLabel = state.durationMs !== undefined - ? `${(state.durationMs / 1000).toFixed(1)}s` - : 'in progress'; - - const lines = [ - '## Pipeline Run Summary', - '', - '| Metric | Value |', - '|--------|-------|', - `| Started | ${state.startedAt} ET |`, - `| Duration | ${durationLabel} |`, - `| Titles processed | ${state.titlesProcessed} |`, - `| Titles skipped | ${state.titlesSkipped} |`, - `| Titles failed | ${state.titlesFailed} |`, - `| Sections generated | ${state.sectionsGenerated} |`, - `| Annotations generated | ${state.annotationsGenerated} |`, - `| XML download size | ${formatBytes(state.xmlDownloadSizeBytes)} |`, - `| Peak memory | ${state.peakMemoryMB} MB |`, - `| Runner | ${state.runnerType} |`, - ]; - - if (state.completedAt) { - lines.splice(5, 0, `| Completed | ${state.completedAt} ET |`); - } - - return lines.join('\n'); - } - - return { record, complete, toJSON, toMarkdown }; -} - -function formatBytes(bytes: number): string { - if (bytes === 0) return '0 B'; - const units = ['B', 'KB', 'MB', 'GB']; - const i = Math.min(Math.floor(Math.log(bytes) / Math.log(1024)), units.length - 1); - const value = bytes / Math.pow(1024, i); - return `${value.toFixed(i === 0 ? 0 : 1)} ${units[i]}`; -} diff --git a/packages/observability/src/timer.ts b/packages/observability/src/timer.ts deleted file mode 100644 index c936257..0000000 --- a/packages/observability/src/timer.ts +++ /dev/null @@ -1,31 +0,0 @@ -/** Lightweight performance timer using process.hrtime.bigint(). */ - -/** Result returned when a timer is stopped. */ -export interface TimerResult { - label: string; - durationMs: number; -} - -/** Active timer with elapsed() and stop() methods. */ -export interface Timer { - /** Get elapsed milliseconds without stopping the timer. */ - elapsed(): number; - /** Stop the timer and return the final result. */ - stop(): TimerResult; -} - -/** Create a named performance timer. */ -export function createTimer(label: string): Timer { - const start = process.hrtime.bigint(); - - function elapsed(): number { - const now = process.hrtime.bigint(); - return Number(now - start) / 1_000_000; - } - - function stop(): TimerResult { - return { label, durationMs: elapsed() }; - } - - return { elapsed, stop }; -} diff --git a/packages/observability/tsconfig.build.json b/packages/observability/tsconfig.build.json deleted file mode 100644 index 60ed234..0000000 --- a/packages/observability/tsconfig.build.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - "noEmit": false, - "declaration": true, - "declarationMap": true, - "sourceMap": true, - "rootDir": "src", - "outDir": "dist" - }, - "include": ["src"], - "exclude": ["src/__tests__"] -} diff --git a/packages/observability/tsconfig.json b/packages/observability/tsconfig.json deleted file mode 100644 index ebea6a0..0000000 --- a/packages/observability/tsconfig.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - "noEmit": true, - "rootDir": "src", - "outDir": "dist" - }, - "include": ["src"] -} diff --git a/packages/pipeline/package.json b/packages/pipeline/package.json deleted file mode 100644 index 76c1481..0000000 --- a/packages/pipeline/package.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "name": "@civic-source/pipeline", - "version": "0.0.1", - "private": true, - "type": "module", - "main": "./dist/index.js", - "types": "./dist/index.d.ts", - "exports": { - ".": { - "import": "./dist/index.js", - "types": "./dist/index.d.ts" - } - }, - "scripts": { - "build": "tsc --project tsconfig.build.json", - "typecheck": "tsc --noEmit", - "lint": "eslint src/", - "test": "vitest run" - }, - "dependencies": { - "@civic-source/types": "workspace:*", - "@civic-source/fetcher": "workspace:*", - "@civic-source/transformer": "workspace:*", - "@civic-source/annotator": "workspace:*" - }, - "devDependencies": { - "@types/node": "^25.5.0", - "typescript": "^6.0.0", - "vitest": "^4.1.6" - } -} diff --git a/packages/pipeline/src/__tests__/e2e-pipeline.test.ts b/packages/pipeline/src/__tests__/e2e-pipeline.test.ts deleted file mode 100644 index c020c7b..0000000 --- a/packages/pipeline/src/__tests__/e2e-pipeline.test.ts +++ /dev/null @@ -1,169 +0,0 @@ -import { describe, it, expect } from 'vitest'; -import { mkdtemp, rm, readdir, readFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; -import { join } from 'node:path'; - -/** - * End-to-end integration test that validates the pipeline against real OLRC data. - * - * This test downloads a real USLM XML file (Title 1 — General Provisions, - * the smallest title with ~8 sections), transforms it, and validates the output. - * - * Only runs when RUN_E2E=true environment variable is set. - * Network failures cause the test to skip, not fail. - */ - -const RUN_E2E = process.env['RUN_E2E'] === 'true'; - -/** Known OLRC Title 1 XML ZIP URLs to try in order (newest first) */ -const TITLE_1_URLS = [ - 'https://uscode.house.gov/download/releasepoints/us/pl/119/73/xml_usc01@119-73.zip', - 'https://uscode.house.gov/download/releasepoints/us/pl/118/100/xml_usc01@118-100.zip', - 'https://uscode.house.gov/download/releasepoints/us/pl/118/78/xml_usc01@118-78.zip', -]; - -/** Attempt to download a ZIP, returning the Buffer or null on failure */ -async function tryDownload(urls: string[]): Promise<{ buffer: Buffer; url: string } | null> { - for (const url of urls) { - try { - const controller = new AbortController(); - const timeout = setTimeout(() => controller.abort(), 30_000); - const response = await fetch(url, { signal: controller.signal }); - clearTimeout(timeout); - - if (!response.ok) continue; - - const buffer = Buffer.from(await response.arrayBuffer()); - // Validate ZIP signature (PK header) - if (buffer.length >= 4 && buffer[0] === 0x50 && buffer[1] === 0x4b) { - return { buffer, url }; - } - } catch { - // Network error, rate limit, timeout — try next URL - continue; - } - } - return null; -} - -/** Extract XML from a ZIP buffer using the system `unzip` command */ -async function extractXmlFromZip(zipBuffer: Buffer): Promise { - const { writeFile, readdir, readFile, rm } = await import('node:fs/promises'); - const { mkdtemp } = await import('node:fs/promises'); - const { tmpdir } = await import('node:os'); - const { join } = await import('node:path'); - const { execFile } = await import('node:child_process'); - const { promisify } = await import('node:util'); - const execFileAsync = promisify(execFile); - - const tmpDir = await mkdtemp(join(tmpdir(), 'zip-extract-')); - try { - const zipPath = join(tmpDir, 'download.zip'); - await writeFile(zipPath, zipBuffer); - await execFileAsync('unzip', ['-o', '-q', zipPath, '-d', tmpDir], { timeout: 15_000 }); - - const entries = await readdir(tmpDir); - const xmlFile = entries.find((f) => f.endsWith('.xml')); - if (!xmlFile) return null; - - return await readFile(join(tmpDir, xmlFile), 'utf-8'); - } catch { - return null; - } finally { - await rm(tmpDir, { recursive: true, force: true }).catch(() => undefined); - } -} - -describe.skipIf(!RUN_E2E)('E2E: Real OLRC Title 1 Pipeline', () => { - let outputDir: string; - - it('downloads, transforms, and validates Title 1 XML', async () => { - // Step 1: Download - const download = await tryDownload(TITLE_1_URLS); - if (download === null) { - console.log('Skipping E2E: could not download Title 1 ZIP from OLRC (network or rate limit)'); - return; - } - - console.log(`Downloaded Title 1 from: ${download.url} (${download.buffer.length} bytes)`); - - // Step 2: Extract XML from ZIP - const xml = await extractXmlFromZip(download.buffer); - expect(xml).not.toBeNull(); - if (xml === null) return; - - expect(xml.length).toBeGreaterThan(100); - // OLRC uses (USLM 1.0) or (USLM 2.0) - const hasRootElement = xml.includes(' String(e).endsWith('.md')); - expect(mdFiles.length).toBe(files.length); - - // Read one file back and verify content roundtrip - const firstFile = files[0]; - if (firstFile) { - const content = await readFile(join(outputDir, firstFile.path), 'utf-8'); - expect(content).toBe(firstFile.content); - } - - console.log(`E2E passed: ${files.length} sections written to ${outputDir}`); - - // Cleanup - await rm(outputDir, { recursive: true, force: true }); - }, 60_000); // 60s timeout for network download -}); diff --git a/packages/pipeline/src/__tests__/orchestrate.test.ts b/packages/pipeline/src/__tests__/orchestrate.test.ts deleted file mode 100644 index 1f9fab3..0000000 --- a/packages/pipeline/src/__tests__/orchestrate.test.ts +++ /dev/null @@ -1,300 +0,0 @@ -import { describe, it, expect, vi, beforeEach } from 'vitest'; -import type { Result, ReleasePoint } from '@civic-source/types'; -import type { MarkdownFile } from '@civic-source/transformer'; - -// --- Mocks must be declared before the module under test is imported --- - -const mockListReleasePoints = vi.fn<() => Promise>>(); -const mockFetchXml = vi.fn<() => Promise>>(); -const mockTransformToFiles = vi.fn<() => Result>(); -const mockAnnotateSection = vi.fn<() => Promise>>(); - -vi.mock('@civic-source/fetcher', () => ({ - OlrcFetcher: vi.fn(function () { - return { - listReleasePoints: mockListReleasePoints, - fetchXml: mockFetchXml, - }; - }), - HashStore: vi.fn(function () { - return {}; - }), - createLogger: vi.fn(() => ({ - debug: vi.fn(), - info: vi.fn(), - warn: vi.fn(), - error: vi.fn(), - startTimer: vi.fn(() => vi.fn()), - logMemory: vi.fn(), - })), -})); - -vi.mock('@civic-source/transformer', () => ({ - XmlToMarkdownAdapter: vi.fn(function () { - return { transformToFiles: mockTransformToFiles }; - }), -})); - -vi.mock('@civic-source/annotator', () => ({ - Annotator: vi.fn(function () { - return { annotateSection: mockAnnotateSection }; - }), - annotationToYaml: vi.fn(() => 'targetSection: "1 U.S.C. 1"\n'), -})); - -// Mock fs to avoid real disk I/O -vi.mock('node:fs/promises', () => ({ - writeFile: vi.fn(async () => undefined), - mkdir: vi.fn(async () => undefined), -})); - -import { orchestrate } from '../orchestrate.js'; - -// --- Helpers --- - -function makeReleasePoint(overrides?: Partial): ReleasePoint { - return { - title: '1', - publicLaw: 'PL 118-100', - dateET: '2025-01-01T00:00:00.000Z', - uslmUrl: 'https://uscode.house.gov/download/releasepoints/us/pl/118/1/xml_usc01@118-100.zip', - sha256Hash: 'a'.repeat(64), - ...overrides, - }; -} - -function makeMarkdownFile(overrides?: Partial): MarkdownFile { - return { - path: 'statutes/title-01/chapter-0/section-1.md', - content: '---\ntitle: Section 1\n---\n\nBody text here.', - ...overrides, - }; -} - -// --- Tests --- - -describe('orchestrate', () => { - beforeEach(() => { - mockListReleasePoints.mockReset(); - mockFetchXml.mockReset(); - mockTransformToFiles.mockReset(); - mockAnnotateSection.mockReset(); - }); - - it('returns correct metrics on successful orchestration', async () => { - const rp = makeReleasePoint(); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp] }); - mockFetchXml.mockResolvedValue({ ok: true, value: 'data' }); - mockTransformToFiles.mockReturnValue({ - ok: true, - value: [makeMarkdownFile(), makeMarkdownFile({ path: 'statutes/title-01/chapter-0/section-2.md' })], - }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - skipAnnotation: true, - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - expect(result.value.titlesProcessed).toBe(1); - expect(result.value.totalSectionsTransformed).toBe(2); - expect(result.value.publicLaw).toBe('PL 118-100'); - expect(result.value.durationMs).toBeGreaterThanOrEqual(0); - }); - - it('isolates single title failure without blocking others', async () => { - const rp1 = makeReleasePoint({ title: '1' }); - const rp2 = makeReleasePoint({ title: '2' }); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp1, rp2] }); - - // Title 1 fails at XML fetch, Title 2 succeeds - mockFetchXml - .mockResolvedValueOnce({ ok: false, error: new Error('Network timeout') }) - .mockResolvedValueOnce({ ok: true, value: 'title2' }); - - mockTransformToFiles.mockReturnValue({ - ok: true, - value: [makeMarkdownFile({ path: 'statutes/title-02/chapter-0/section-1.md' })], - }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - skipAnnotation: true, - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - // Title 1 should be recorded with errors, title 2 should succeed - expect(result.value.titlesProcessed).toBe(2); - const title1 = result.value.titleResults.find((t) => t.title === '1'); - const title2 = result.value.titleResults.find((t) => t.title === '2'); - expect(title1?.errors).toHaveLength(1); - expect(title1?.errors[0]).toContain('Network timeout'); - expect(title2?.sectionsTransformed).toBe(1); - expect(title2?.errors).toHaveLength(0); - }); - - it('returns early with empty result when no release points exist', async () => { - mockListReleasePoints.mockResolvedValue({ ok: true, value: [] }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - expect(result.value.titlesProcessed).toBe(0); - expect(result.value.totalSectionsTransformed).toBe(0); - expect(result.value.publicLaw).toBe('None'); - }); - - it('propagates release point list failure', async () => { - mockListReleasePoints.mockResolvedValue({ - ok: false, - error: new Error('OLRC page unavailable'), - }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - }); - - expect(result.ok).toBe(false); - if (result.ok) return; - expect(result.error.message).toContain('OLRC page unavailable'); - }); - - it('handles empty transform output gracefully', async () => { - const rp = makeReleasePoint(); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp] }); - mockFetchXml.mockResolvedValue({ ok: true, value: 'empty' }); - mockTransformToFiles.mockReturnValue({ ok: true, value: [] }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - skipAnnotation: true, - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - expect(result.value.titlesProcessed).toBe(1); - expect(result.value.totalSectionsTransformed).toBe(0); - expect(result.value.titleResults[0]?.errors).toHaveLength(0); - }); - - it('skips annotation when skipAnnotation is true', async () => { - const rp = makeReleasePoint(); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp] }); - mockFetchXml.mockResolvedValue({ ok: true, value: '' }); - mockTransformToFiles.mockReturnValue({ - ok: true, - value: [makeMarkdownFile()], - }); - - await orchestrate({ - outputDir: '/tmp/test-output', - skipAnnotation: true, - }); - - expect(mockAnnotateSection).not.toHaveBeenCalled(); - }); - - it('skips unchanged titles (empty XML response)', async () => { - const rp = makeReleasePoint(); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp] }); - // Empty string means content unchanged (hash match) - mockFetchXml.mockResolvedValue({ ok: true, value: '' }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - skipAnnotation: true, - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - expect(result.value.skippedTitles).toBe(1); - expect(result.value.titlesProcessed).toBe(0); - expect(mockTransformToFiles).not.toHaveBeenCalled(); - }); - - it('handles transform failure for a title', async () => { - const rp = makeReleasePoint(); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp] }); - mockFetchXml.mockResolvedValue({ ok: true, value: '' }); - mockTransformToFiles.mockReturnValue({ - ok: false, - error: new Error('No title element found in document'), - }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - skipAnnotation: true, - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - expect(result.value.titlesProcessed).toBe(1); - const titleResult = result.value.titleResults[0]; - expect(titleResult?.sectionsTransformed).toBe(0); - expect(titleResult?.errors[0]).toContain('No title element'); - }); - - it('filters to requested titles only', async () => { - const rp1 = makeReleasePoint({ title: '1' }); - const rp2 = makeReleasePoint({ title: '2' }); - const rp3 = makeReleasePoint({ title: '3' }); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp1, rp2, rp3] }); - mockFetchXml.mockResolvedValue({ ok: true, value: '' }); - mockTransformToFiles.mockReturnValue({ - ok: true, - value: [makeMarkdownFile()], - }); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - titles: ['1', '3'], - skipAnnotation: true, - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - // Only titles 1 and 3 should be processed - expect(mockFetchXml).toHaveBeenCalledTimes(2); - }); -}); - -describe('isSafePath (via writeMarkdownFile behavior)', () => { - beforeEach(() => { - mockListReleasePoints.mockReset(); - mockFetchXml.mockReset(); - mockTransformToFiles.mockReset(); - }); - - it('rejects path traversal in file paths', async () => { - const rp = makeReleasePoint(); - mockListReleasePoints.mockResolvedValue({ ok: true, value: [rp] }); - mockFetchXml.mockResolvedValue({ ok: true, value: '' }); - mockTransformToFiles.mockReturnValue({ - ok: true, - value: [ - makeMarkdownFile({ path: '../../../etc/passwd' }), - makeMarkdownFile({ path: 'statutes/title-01/chapter-0/section-1.md' }), - ], - }); - - const { writeFile } = await import('node:fs/promises'); - - const result = await orchestrate({ - outputDir: '/tmp/test-output', - skipAnnotation: true, - }); - - expect(result.ok).toBe(true); - if (!result.ok) return; - // The traversal path should be skipped, only the safe path should be written - // writeFile is called once for the safe path only - const writeCalls = vi.mocked(writeFile).mock.calls; - const writtenPaths = writeCalls.map((call) => String(call[0])); - expect(writtenPaths.every((p) => !p.includes('etc/passwd'))).toBe(true); - }); -}); diff --git a/packages/pipeline/src/index.ts b/packages/pipeline/src/index.ts deleted file mode 100644 index da51153..0000000 --- a/packages/pipeline/src/index.ts +++ /dev/null @@ -1,2 +0,0 @@ -export { orchestrate } from './orchestrate.js'; -export type { PipelineResult, TitleResult, OrchestrateOptions } from './orchestrate.js'; diff --git a/packages/pipeline/src/orchestrate.ts b/packages/pipeline/src/orchestrate.ts deleted file mode 100644 index d022140..0000000 --- a/packages/pipeline/src/orchestrate.ts +++ /dev/null @@ -1,262 +0,0 @@ -import { writeFile, mkdir } from 'node:fs/promises'; -import { join, dirname, resolve } from 'node:path'; -import { type ReleasePoint, type Result, ok } from '@civic-source/types'; -import { OlrcFetcher, HashStore, createLogger } from '@civic-source/fetcher'; -import { XmlToMarkdownAdapter } from '@civic-source/transformer'; -import { Annotator, annotationToYaml } from '@civic-source/annotator'; -import type { MarkdownFile } from '@civic-source/transformer'; - -const log = createLogger('pipeline'); - -/** Summary of a single title's pipeline run */ -export interface TitleResult { - title: string; - sectionsTransformed: number; - sectionsAnnotated: number; - errors: string[]; -} - -/** Summary of the full pipeline run */ -export interface PipelineResult { - publicLaw: string; - titlesProcessed: number; - totalSectionsTransformed: number; - totalSectionsAnnotated: number; - titleResults: TitleResult[]; - skippedTitles: number; - durationMs: number; -} - -export interface OrchestrateOptions { - /** Directory to write transformed statute Markdown files */ - outputDir: string; - /** Optional: only process these title numbers */ - titles?: string[]; - /** Skip annotation step (useful for testing) */ - skipAnnotation?: boolean; -} - -/** - * Run the full pipeline: fetch release points, transform XML to Markdown, - * and annotate sections with precedent cases. - * - * Returns a structured result with per-title metrics. Per-title failures - * do not block other titles from processing. - */ -export async function orchestrate( - options: OrchestrateOptions -): Promise> { - const start = performance.now(); - const timer = log.startTimer('Full pipeline'); - - const fetcher = new OlrcFetcher({ hashStore: new HashStore() }); - - // Step 1: Fetch release points - log.info('Fetching release points'); - const releaseResult = await fetcher.listReleasePoints(); - if (!releaseResult.ok) { - timer(); - return releaseResult; - } - - let releasePoints = releaseResult.value; - if (releasePoints.length === 0) { - timer(); - return ok({ - publicLaw: 'None', - titlesProcessed: 0, - totalSectionsTransformed: 0, - totalSectionsAnnotated: 0, - titleResults: [], - skippedTitles: 0, - durationMs: Math.round(performance.now() - start), - }); - } - - // Filter to requested titles if specified - if (options.titles !== undefined && options.titles.length > 0) { - releasePoints = releasePoints.filter((rp) => - options.titles!.includes(rp.title) - ); - } - - const publicLaw = releasePoints[0]?.publicLaw ?? 'Unknown'; - const titleResults: TitleResult[] = []; - let skippedTitles = 0; - - // Step 2: Process each title independently - for (const releasePoint of releasePoints) { - const titleResult = await processTitle( - releasePoint, - fetcher, - options - ); - - if (titleResult === null) { - skippedTitles++; - continue; - } - - titleResults.push(titleResult); - } - - timer(); - - const result: PipelineResult = { - publicLaw, - titlesProcessed: titleResults.length, - totalSectionsTransformed: titleResults.reduce( - (sum, t) => sum + t.sectionsTransformed, - 0 - ), - totalSectionsAnnotated: titleResults.reduce( - (sum, t) => sum + t.sectionsAnnotated, - 0 - ), - titleResults, - skippedTitles, - durationMs: Math.round(performance.now() - start), - }; - - log.info('Pipeline complete', { - titlesProcessed: result.titlesProcessed, - totalSections: result.totalSectionsTransformed, - skipped: result.skippedTitles, - durationMs: result.durationMs, - }); - - return ok(result); -} - -/** - * Process a single title: fetch XML, transform to Markdown, optionally annotate. - * Returns null if the content is unchanged (hash match). - */ -async function processTitle( - releasePoint: ReleasePoint, - fetcher: OlrcFetcher, - options: OrchestrateOptions -): Promise { - const { title } = releasePoint; - const errors: string[] = []; - log.info('Processing title', { title }); - - // Fetch XML (returns empty string if unchanged) - const xmlResult = await fetcher.fetchXml(releasePoint); - if (!xmlResult.ok) { - log.error('Failed to fetch XML', { title, error: xmlResult.error.message }); - return { title, sectionsTransformed: 0, sectionsAnnotated: 0, errors: [xmlResult.error.message] }; - } - - if (xmlResult.value === '') { - log.info('Title unchanged, skipping', { title }); - return null; - } - - // Transform XML to Markdown files - const transformer = new XmlToMarkdownAdapter(releasePoint.publicLaw); - const transformResult = transformer.transformToFiles(xmlResult.value); - if (!transformResult.ok) { - log.error('Transform failed', { title, error: transformResult.error.message }); - return { title, sectionsTransformed: 0, sectionsAnnotated: 0, errors: [transformResult.error.message] }; - } - - const files = transformResult.value; - - // Write transformed files to output directory - for (const file of files) { - await writeMarkdownFile(options.outputDir, file); - } - - log.info('Transformed title', { title, sections: files.length }); - - // Annotate sections (if not skipped) - let annotatedCount = 0; - if (options.skipAnnotation !== true) { - annotatedCount = await annotateSections(files, options.outputDir, errors); - } - - return { - title, - sectionsTransformed: files.length, - sectionsAnnotated: annotatedCount, - errors, - }; -} - -/** - * Check that a resolved path stays within the base directory. - * Returns true if the path is safe, false if traversal is detected. - */ -function isSafePath(baseDir: string, filePath: string): boolean { - const resolvedBase = resolve(baseDir); - const resolvedFull = resolve(join(baseDir, filePath)); - return resolvedFull.startsWith(resolvedBase); -} - -/** Write a single Markdown file to disk, creating directories as needed */ -async function writeMarkdownFile( - outputDir: string, - file: MarkdownFile -): Promise { - if (!isSafePath(outputDir, file.path)) { - log.warn('Path traversal detected, skipping file', { path: file.path }); - return; - } - const fullPath = join(outputDir, file.path); - await mkdir(dirname(fullPath), { recursive: true }); - await writeFile(fullPath, file.content, 'utf-8'); -} - -/** Annotate transformed sections, writing annotation YAML to annotations/ directory */ -async function annotateSections( - files: MarkdownFile[], - outputDir: string, - errors: string[] -): Promise { - const annotator = new Annotator(); - let count = 0; - - for (const file of files) { - // Extract section reference from file path (e.g., "18 U.S.C. 111") - const sectionRef = sectionRefFromPath(file.path); - if (sectionRef === null) continue; - - const result = await annotator.annotateSection(sectionRef); - if (!result.ok) { - log.warn('Annotation failed for section', { - section: sectionRef, - error: result.error.message, - }); - errors.push(`annotation:${sectionRef}: ${result.error.message}`); - continue; - } - - // Write annotation YAML to annotations/ directory - const annotationPath = result.value.path; - if (!isSafePath(outputDir, annotationPath)) { - log.warn('Path traversal detected in annotation path, skipping', { path: annotationPath }); - continue; - } - const fullPath = join(outputDir, annotationPath); - await mkdir(dirname(fullPath), { recursive: true }); - await writeFile(fullPath, annotationToYaml(result.value.annotation), 'utf-8'); - count++; - } - - return count; -} - -/** - * Derive a statute citation from a file path. - * Path format: statutes/title-{n}/chapter-{n}/section-{n}.md - * Returns e.g. "18 U.S.C. 111" or null if path doesn't match. - */ -function sectionRefFromPath(filePath: string): string | null { - const match = /statutes\/title-(\d+[a-zA-Z]?)\/.*\/section-(\d+[a-zA-Z-]*)\.md$/.exec( - filePath - ); - if (!match) return null; - const [, titleNum, sectionNum] = match; - return `${titleNum} U.S.C. ${sectionNum}`; -} diff --git a/packages/pipeline/tsconfig.build.json b/packages/pipeline/tsconfig.build.json deleted file mode 100644 index 9fb6f30..0000000 --- a/packages/pipeline/tsconfig.build.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - "noEmit": false, - "declaration": true, - "declarationMap": true, - "sourceMap": true, - "rootDir": "src", - "outDir": "dist" - }, - "include": ["src"] -} diff --git a/packages/pipeline/tsconfig.json b/packages/pipeline/tsconfig.json deleted file mode 100644 index ebea6a0..0000000 --- a/packages/pipeline/tsconfig.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - "noEmit": true, - "rootDir": "src", - "outDir": "dist" - }, - "include": ["src"] -} diff --git a/packages/pipeline/vitest.config.ts b/packages/pipeline/vitest.config.ts deleted file mode 100644 index 697ae29..0000000 --- a/packages/pipeline/vitest.config.ts +++ /dev/null @@ -1,7 +0,0 @@ -import { defineConfig } from 'vitest/config'; - -export default defineConfig({ - test: { - exclude: ['dist/**', 'node_modules/**'], - }, -}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5c4279d..503832a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -141,50 +141,6 @@ importers: specifier: ^4.1.6 version: 4.1.6(@types/node@22.19.15)(vite@7.3.2(@types/node@22.19.15)(jiti@2.6.1)(lightningcss@1.32.0)(yaml@2.9.0)) - packages/observability: - dependencies: - '@civic-source/shared': - specifier: workspace:* - version: link:../shared - '@civic-source/types': - specifier: workspace:* - version: link:../types - devDependencies: - '@types/node': - specifier: ^22.19.15 - version: 22.19.15 - typescript: - specifier: ^6.0.0 - version: 6.0.3 - vitest: - specifier: ^4.1.6 - version: 4.1.6(@types/node@22.19.15)(vite@7.3.2(@types/node@22.19.15)(jiti@2.6.1)(lightningcss@1.32.0)(yaml@2.9.0)) - - packages/pipeline: - dependencies: - '@civic-source/annotator': - specifier: workspace:* - version: link:../annotator - '@civic-source/fetcher': - specifier: workspace:* - version: link:../fetcher - '@civic-source/transformer': - specifier: workspace:* - version: link:../transformer - '@civic-source/types': - specifier: workspace:* - version: link:../types - devDependencies: - '@types/node': - specifier: ^25.5.0 - version: 25.5.0 - typescript: - specifier: ^6.0.0 - version: 6.0.3 - vitest: - specifier: ^4.1.6 - version: 4.1.6(@types/node@25.5.0)(vite@7.3.2(@types/node@25.5.0)(jiti@2.6.1)(lightningcss@1.32.0)(yaml@2.9.0)) - packages/shared: devDependencies: '@types/node':