From e5cf8c4320bf98c59094dcac5a60331a6e2a0563 Mon Sep 17 00:00:00 2001 From: William Zujkowski Date: Mon, 22 Jun 2026 23:52:13 -0400 Subject: [PATCH] fix(fetcher): harden against SSRF and oversized downloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two defense-in-depth fixes from the security review (#201): SSRF: parseReleasePoints' link regex accepted an optional absolute-URL prefix `(?:https?://[^"/]+)?`, so an href to any host on a compromised/MitM'd OLRC page could become a uslmUrl that fetchXml then fetched verbatim. Drop the absolute-URL alternative — the parser now matches only OLRC server-relative paths, so every uslmUrl is anchored to https://uscode.house.gov. Anti-ReDoS anchoring preserved. Response size cap: fetchXml read the body with no limit, risking runner OOM on a hostile/oversized response. Add MAX_DOWNLOAD_BYTES (300 MiB — generous vs the ~100-150 MB all-titles archive) and a Content-Length pre-check (exceedsContentLengthLimit) plus a post-read length backstop, applied to the archive download and the release-point page reads. New tests: foreign-host href never yields an off-host uslmUrl; a Content-Length over the cap yields an error Result. fetcher: 127 tests pass; monorepo builds. Closes #201 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../fetcher/src/__tests__/fetcher.test.ts | 31 ++++++++ packages/fetcher/src/constants.ts | 10 +++ packages/fetcher/src/fetcher.ts | 73 +++++++++++++++++-- 3 files changed, 108 insertions(+), 6 deletions(-) diff --git a/packages/fetcher/src/__tests__/fetcher.test.ts b/packages/fetcher/src/__tests__/fetcher.test.ts index 86d62d8..0f5af88 100644 --- a/packages/fetcher/src/__tests__/fetcher.test.ts +++ b/packages/fetcher/src/__tests__/fetcher.test.ts @@ -165,6 +165,21 @@ describe('parseReleasePoints', () => { expect(points[0]?.publicLaw).toBe(''); }); + it('does not produce an off-host uslmUrl for an absolute foreign-host href (SSRF)', () => { + // An attacker-controlled page that points a release-point link at a + // foreign host must never yield a uslmUrl that fetchXml would fetch. + const html = ` +

Public Law 118-200 (11/15/2024)

+ Evil + `; + const points = parseReleasePoints(html); + // The link is either skipped or rewritten to the OLRC host — never evil.example. + for (const p of points) { + expect(p.uslmUrl).not.toContain('evil.example'); + expect(p.uslmUrl.startsWith('https://uscode.house.gov/')).toBe(true); + } + }); + it('does not catastrophically backtrack on malformed input (CodeQL js/polynomial-redos)', () => { // Long string of repeated non-quote chars that look like an unclosed href. // The old [^"]* prefix would let the engine try every cut point. @@ -376,6 +391,22 @@ describe('OlrcFetcher', () => { } }); + it('fetchXml rejects a response whose Content-Length exceeds the size cap', async () => { + // Advertise a body far larger than MAX_DOWNLOAD_BYTES (300 MiB). + const oversized = String(314572800 + 1); + vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce( + new Response('PK', { status: 200, headers: { 'content-length': oversized } }) + ); + + const rp: ReleasePoint = { title: '42', publicLaw: 'PL 118-200', dateET: '2024-01-01T00:00:00Z', uslmUrl: 'https://example.com/t42.zip', sha256Hash: '0'.repeat(64) }; + const fetcher = new OlrcFetcher({ logger }); + const result = await fetcher.fetchXml(rp); + expect(result.ok).toBe(false); + if (!result.ok) { + expect(result.error.message).toContain('size limit'); + } + }); + it('fetchXml skips download when hash is unchanged', async () => { // Create a fake ZIP (starts with PK signature) const zipContent = Buffer.from([0x50, 0x4b, 0x03, 0x04, ...Buffer.from('fake-zip-data')]); diff --git a/packages/fetcher/src/constants.ts b/packages/fetcher/src/constants.ts index f5a7d8b..3309b6e 100644 --- a/packages/fetcher/src/constants.ts +++ b/packages/fetcher/src/constants.ts @@ -22,6 +22,16 @@ export function allTitlesXmlUrl(congress: string, law: string): string { return `${OLRC_RELEASE_POINTS_URL}us/pl/${congress}/${law}/xml_uscAll@${congress}-${law}.zip`; } +/** + * Maximum number of bytes to accept from a single download. + * + * Guards CI runners against OOM from a maliciously or accidentally huge + * response. The bound is deliberately generous: the all-titles USC archive + * can be ~100-150 MB, so 300 MiB leaves ample headroom for legitimate + * downloads while still capping runaway responses. + */ +export const MAX_DOWNLOAD_BYTES = 314572800; // 300 MiB + /** Path for hash storage relative to working directory */ export const HASH_STORE_DIR = '.openlaw-git'; export const HASH_STORE_FILE = 'hashes.json'; diff --git a/packages/fetcher/src/fetcher.ts b/packages/fetcher/src/fetcher.ts index ba354c3..009d2ea 100644 --- a/packages/fetcher/src/fetcher.ts +++ b/packages/fetcher/src/fetcher.ts @@ -1,7 +1,12 @@ import { createHash } from 'node:crypto'; import { type IUsCodeFetcher, type ReleasePoint, type Result, ok, err } from '@civic-source/types'; import { type Logger, createLogger, fetchWithRetry as sharedFetchWithRetry } from '@civic-source/shared'; -import { OLRC_DOWNLOAD_PAGE, OLRC_PRIOR_RELEASE_POINTS_PAGE } from './constants.js'; +import { + OLRC_BASE_URL, + OLRC_DOWNLOAD_PAGE, + OLRC_PRIOR_RELEASE_POINTS_PAGE, + MAX_DOWNLOAD_BYTES, +} from './constants.js'; import { HashStore } from './hash-store.js'; import { FetcherMetrics } from './metrics.js'; @@ -10,6 +15,23 @@ export function sha256(data: Buffer): string { return createHash('sha256').update(data).digest('hex'); } +/** + * Inspect a response's Content-Length header and reject up-front if it + * advertises a body larger than MAX_DOWNLOAD_BYTES. This is the primary guard + * against runner OOM: it avoids ever reading an oversized body into memory. + * + * Returns true when the response is over the cap (caller should error). + * A missing or unparseable Content-Length returns false — the post-read + * length check provides defense-in-depth for those cases. + */ +export function exceedsContentLengthLimit(response: Response): boolean { + const header = response.headers.get('content-length'); + if (header === null) return false; + const declared = Number(header); + if (!Number.isFinite(declared)) return false; + return declared > MAX_DOWNLOAD_BYTES; +} + /** * Fetch with exponential backoff retry. * Delegates to the shared fetchWithRetry utility. @@ -118,9 +140,16 @@ export function parseReleasePoints(html: string): ReleasePoint[] { const currentRelease = parseCurrentRelease(html); // Match links like: /download/releasepoints/us/pl/118/42/xml_usc42@118-200.zip + // + // SSRF hardening: only accept OLRC server-relative hrefs (the form the OLRC + // pages actually emit). We deliberately do NOT accept an absolute-URL prefix + // here: allowing one would let an href to an arbitrary host become a + // uslmUrl that fetchXml then fetches verbatim. Because every match is a bare + // path, fullUrl below is always anchored to the OLRC host. + // // Anchor segments to non-slash/non-quote chars so we don't get polynomial // backtracking on malformed input (CodeQL js/polynomial-redos). - const linkPattern = /href="((?:https?:\/\/[^"/]+)?\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g; + const linkPattern = /href="(\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g; let match: RegExpExecArray | null; // Extract unique title numbers from XML download links @@ -138,9 +167,10 @@ export function parseReleasePoints(html: string): ReleasePoint[] { if (!title || seen.has(title)) continue; seen.add(title); - const fullUrl = path.startsWith('http') - ? path - : `https://uscode.house.gov${path}`; + // path is always an OLRC server-relative path (the regex no longer + // accepts an absolute-URL prefix), so this is always anchored to the + // OLRC host — a foreign host can never become a uslmUrl. + const fullUrl = `${OLRC_BASE_URL}${path}`; results.push({ title, @@ -180,6 +210,11 @@ export class OlrcFetcher implements IUsCodeFetcher { return result; } + if (exceedsContentLengthLimit(result.value)) { + timer(); + return err(new Error('Download exceeds size limit')); + } + const html = await result.value.text(); let points = parseReleasePoints(html); this.metrics.recordDiscovered(points.length); @@ -209,13 +244,18 @@ export class OlrcFetcher implements IUsCodeFetcher { return priorResult; } + if (exceedsContentLengthLimit(priorResult.value)) { + timer(); + return err(new Error('Download exceeds size limit')); + } + const priorHtml = await priorResult.value.text(); const historicalPoints = parsePriorReleasePoints(priorHtml); this.metrics.recordDiscovered(historicalPoints.length); // Also fetch current release point to include it const currentResult = await fetchWithRetry(OLRC_DOWNLOAD_PAGE, this.logger); - if (currentResult.ok) { + if (currentResult.ok && !exceedsContentLengthLimit(currentResult.value)) { const currentHtml = await currentResult.value.text(); const current = parseCurrentRelease(currentHtml); if (current) { @@ -257,7 +297,28 @@ export class OlrcFetcher implements IUsCodeFetcher { return result; } + // Size cap (primary guard): reject before reading the body if the server + // advertises a Content-Length over the limit, avoiding runner OOM. + if (exceedsContentLengthLimit(result.value)) { + const durationMs = performance.now() - startMs; + this.metrics.recordDuration(durationMs); + this.metrics.recordError('network'); + timer(); + return err(new Error('Download exceeds size limit')); + } + const buffer = Buffer.from(await result.value.arrayBuffer()); + + // Defense-in-depth: catch oversized bodies that lacked a (truthful) + // Content-Length header. + if (buffer.length > MAX_DOWNLOAD_BYTES) { + const durationMs = performance.now() - startMs; + this.metrics.recordDuration(durationMs); + this.metrics.recordError('network'); + timer(); + return err(new Error('Download exceeds size limit')); + } + const hash = sha256(buffer); const hashKey = `xml:${releasePoint.title}:${releasePoint.uslmUrl}`;