diff --git a/packages/fetcher/src/__tests__/fetcher.test.ts b/packages/fetcher/src/__tests__/fetcher.test.ts
index 86d62d8..0f5af88 100644
--- a/packages/fetcher/src/__tests__/fetcher.test.ts
+++ b/packages/fetcher/src/__tests__/fetcher.test.ts
@@ -165,6 +165,21 @@ describe('parseReleasePoints', () => {
expect(points[0]?.publicLaw).toBe('');
});
+ it('does not produce an off-host uslmUrl for an absolute foreign-host href (SSRF)', () => {
+ // An attacker-controlled page that points a release-point link at a
+ // foreign host must never yield a uslmUrl that fetchXml would fetch.
+ const html = `
+
Public Law 118-200 (11/15/2024)
+ Evil
+ `;
+ const points = parseReleasePoints(html);
+ // The link is either skipped or rewritten to the OLRC host — never evil.example.
+ for (const p of points) {
+ expect(p.uslmUrl).not.toContain('evil.example');
+ expect(p.uslmUrl.startsWith('https://uscode.house.gov/')).toBe(true);
+ }
+ });
+
it('does not catastrophically backtrack on malformed input (CodeQL js/polynomial-redos)', () => {
// Long string of repeated non-quote chars that look like an unclosed href.
// The old [^"]* prefix would let the engine try every cut point.
@@ -376,6 +391,22 @@ describe('OlrcFetcher', () => {
}
});
+ it('fetchXml rejects a response whose Content-Length exceeds the size cap', async () => {
+ // Advertise a body far larger than MAX_DOWNLOAD_BYTES (300 MiB).
+ const oversized = String(314572800 + 1);
+ vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(
+ new Response('PK', { status: 200, headers: { 'content-length': oversized } })
+ );
+
+ const rp: ReleasePoint = { title: '42', publicLaw: 'PL 118-200', dateET: '2024-01-01T00:00:00Z', uslmUrl: 'https://example.com/t42.zip', sha256Hash: '0'.repeat(64) };
+ const fetcher = new OlrcFetcher({ logger });
+ const result = await fetcher.fetchXml(rp);
+ expect(result.ok).toBe(false);
+ if (!result.ok) {
+ expect(result.error.message).toContain('size limit');
+ }
+ });
+
it('fetchXml skips download when hash is unchanged', async () => {
// Create a fake ZIP (starts with PK signature)
const zipContent = Buffer.from([0x50, 0x4b, 0x03, 0x04, ...Buffer.from('fake-zip-data')]);
diff --git a/packages/fetcher/src/constants.ts b/packages/fetcher/src/constants.ts
index f5a7d8b..3309b6e 100644
--- a/packages/fetcher/src/constants.ts
+++ b/packages/fetcher/src/constants.ts
@@ -22,6 +22,16 @@ export function allTitlesXmlUrl(congress: string, law: string): string {
return `${OLRC_RELEASE_POINTS_URL}us/pl/${congress}/${law}/xml_uscAll@${congress}-${law}.zip`;
}
+/**
+ * Maximum number of bytes to accept from a single download.
+ *
+ * Guards CI runners against OOM from a maliciously or accidentally huge
+ * response. The bound is deliberately generous: the all-titles USC archive
+ * can be ~100-150 MB, so 300 MiB leaves ample headroom for legitimate
+ * downloads while still capping runaway responses.
+ */
+export const MAX_DOWNLOAD_BYTES = 314572800; // 300 MiB
+
/** Path for hash storage relative to working directory */
export const HASH_STORE_DIR = '.openlaw-git';
export const HASH_STORE_FILE = 'hashes.json';
diff --git a/packages/fetcher/src/fetcher.ts b/packages/fetcher/src/fetcher.ts
index ba354c3..009d2ea 100644
--- a/packages/fetcher/src/fetcher.ts
+++ b/packages/fetcher/src/fetcher.ts
@@ -1,7 +1,12 @@
import { createHash } from 'node:crypto';
import { type IUsCodeFetcher, type ReleasePoint, type Result, ok, err } from '@civic-source/types';
import { type Logger, createLogger, fetchWithRetry as sharedFetchWithRetry } from '@civic-source/shared';
-import { OLRC_DOWNLOAD_PAGE, OLRC_PRIOR_RELEASE_POINTS_PAGE } from './constants.js';
+import {
+ OLRC_BASE_URL,
+ OLRC_DOWNLOAD_PAGE,
+ OLRC_PRIOR_RELEASE_POINTS_PAGE,
+ MAX_DOWNLOAD_BYTES,
+} from './constants.js';
import { HashStore } from './hash-store.js';
import { FetcherMetrics } from './metrics.js';
@@ -10,6 +15,23 @@ export function sha256(data: Buffer): string {
return createHash('sha256').update(data).digest('hex');
}
+/**
+ * Inspect a response's Content-Length header and reject up-front if it
+ * advertises a body larger than MAX_DOWNLOAD_BYTES. This is the primary guard
+ * against runner OOM: it avoids ever reading an oversized body into memory.
+ *
+ * Returns true when the response is over the cap (caller should error).
+ * A missing or unparseable Content-Length returns false — the post-read
+ * length check provides defense-in-depth for those cases.
+ */
+export function exceedsContentLengthLimit(response: Response): boolean {
+ const header = response.headers.get('content-length');
+ if (header === null) return false;
+ const declared = Number(header);
+ if (!Number.isFinite(declared)) return false;
+ return declared > MAX_DOWNLOAD_BYTES;
+}
+
/**
* Fetch with exponential backoff retry.
* Delegates to the shared fetchWithRetry utility.
@@ -118,9 +140,16 @@ export function parseReleasePoints(html: string): ReleasePoint[] {
const currentRelease = parseCurrentRelease(html);
// Match links like: /download/releasepoints/us/pl/118/42/xml_usc42@118-200.zip
+ //
+ // SSRF hardening: only accept OLRC server-relative hrefs (the form the OLRC
+ // pages actually emit). We deliberately do NOT accept an absolute-URL prefix
+ // here: allowing one would let an href to an arbitrary host become a
+ // uslmUrl that fetchXml then fetches verbatim. Because every match is a bare
+ // path, fullUrl below is always anchored to the OLRC host.
+ //
// Anchor segments to non-slash/non-quote chars so we don't get polynomial
// backtracking on malformed input (CodeQL js/polynomial-redos).
- const linkPattern = /href="((?:https?:\/\/[^"/]+)?\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g;
+ const linkPattern = /href="(\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g;
let match: RegExpExecArray | null;
// Extract unique title numbers from XML download links
@@ -138,9 +167,10 @@ export function parseReleasePoints(html: string): ReleasePoint[] {
if (!title || seen.has(title)) continue;
seen.add(title);
- const fullUrl = path.startsWith('http')
- ? path
- : `https://uscode.house.gov${path}`;
+ // path is always an OLRC server-relative path (the regex no longer
+ // accepts an absolute-URL prefix), so this is always anchored to the
+ // OLRC host — a foreign host can never become a uslmUrl.
+ const fullUrl = `${OLRC_BASE_URL}${path}`;
results.push({
title,
@@ -180,6 +210,11 @@ export class OlrcFetcher implements IUsCodeFetcher {
return result;
}
+ if (exceedsContentLengthLimit(result.value)) {
+ timer();
+ return err(new Error('Download exceeds size limit'));
+ }
+
const html = await result.value.text();
let points = parseReleasePoints(html);
this.metrics.recordDiscovered(points.length);
@@ -209,13 +244,18 @@ export class OlrcFetcher implements IUsCodeFetcher {
return priorResult;
}
+ if (exceedsContentLengthLimit(priorResult.value)) {
+ timer();
+ return err(new Error('Download exceeds size limit'));
+ }
+
const priorHtml = await priorResult.value.text();
const historicalPoints = parsePriorReleasePoints(priorHtml);
this.metrics.recordDiscovered(historicalPoints.length);
// Also fetch current release point to include it
const currentResult = await fetchWithRetry(OLRC_DOWNLOAD_PAGE, this.logger);
- if (currentResult.ok) {
+ if (currentResult.ok && !exceedsContentLengthLimit(currentResult.value)) {
const currentHtml = await currentResult.value.text();
const current = parseCurrentRelease(currentHtml);
if (current) {
@@ -257,7 +297,28 @@ export class OlrcFetcher implements IUsCodeFetcher {
return result;
}
+ // Size cap (primary guard): reject before reading the body if the server
+ // advertises a Content-Length over the limit, avoiding runner OOM.
+ if (exceedsContentLengthLimit(result.value)) {
+ const durationMs = performance.now() - startMs;
+ this.metrics.recordDuration(durationMs);
+ this.metrics.recordError('network');
+ timer();
+ return err(new Error('Download exceeds size limit'));
+ }
+
const buffer = Buffer.from(await result.value.arrayBuffer());
+
+ // Defense-in-depth: catch oversized bodies that lacked a (truthful)
+ // Content-Length header.
+ if (buffer.length > MAX_DOWNLOAD_BYTES) {
+ const durationMs = performance.now() - startMs;
+ this.metrics.recordDuration(durationMs);
+ this.metrics.recordError('network');
+ timer();
+ return err(new Error('Download exceeds size limit'));
+ }
+
const hash = sha256(buffer);
const hashKey = `xml:${releasePoint.title}:${releasePoint.uslmUrl}`;