Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions packages/fetcher/src/__tests__/fetcher.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,21 @@ describe('parseReleasePoints', () => {
expect(points[0]?.publicLaw).toBe('');
});

it('does not produce an off-host uslmUrl for an absolute foreign-host href (SSRF)', () => {
// An attacker-controlled page that points a release-point link at a
// foreign host must never yield a uslmUrl that fetchXml would fetch.
const html = `
<h2>Public Law 118-200 (11/15/2024)</h2>
<a href="https://evil.example/download/releasepoints/us/pl/1/1/xml_usc18@1-1.zip">Evil</a>
`;
const points = parseReleasePoints(html);
// The link is either skipped or rewritten to the OLRC host — never evil.example.
for (const p of points) {
expect(p.uslmUrl).not.toContain('evil.example');
expect(p.uslmUrl.startsWith('https://uscode.house.gov/')).toBe(true);
}
});

it('does not catastrophically backtrack on malformed input (CodeQL js/polynomial-redos)', () => {
// Long string of repeated non-quote chars that look like an unclosed href.
// The old [^"]* prefix would let the engine try every cut point.
Expand Down Expand Up @@ -376,6 +391,22 @@ describe('OlrcFetcher', () => {
}
});

it('fetchXml rejects a response whose Content-Length exceeds the size cap', async () => {
// Advertise a body far larger than MAX_DOWNLOAD_BYTES (300 MiB).
const oversized = String(314572800 + 1);
vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(
new Response('PK', { status: 200, headers: { 'content-length': oversized } })
);

const rp: ReleasePoint = { title: '42', publicLaw: 'PL 118-200', dateET: '2024-01-01T00:00:00Z', uslmUrl: 'https://example.com/t42.zip', sha256Hash: '0'.repeat(64) };
const fetcher = new OlrcFetcher({ logger });
const result = await fetcher.fetchXml(rp);
expect(result.ok).toBe(false);
if (!result.ok) {
expect(result.error.message).toContain('size limit');
}
});

it('fetchXml skips download when hash is unchanged', async () => {
// Create a fake ZIP (starts with PK signature)
const zipContent = Buffer.from([0x50, 0x4b, 0x03, 0x04, ...Buffer.from('fake-zip-data')]);
Expand Down
10 changes: 10 additions & 0 deletions packages/fetcher/src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@ export function allTitlesXmlUrl(congress: string, law: string): string {
return `${OLRC_RELEASE_POINTS_URL}us/pl/${congress}/${law}/xml_uscAll@${congress}-${law}.zip`;
}

/**
* Maximum number of bytes to accept from a single download.
*
* Guards CI runners against OOM from a maliciously or accidentally huge
* response. The bound is deliberately generous: the all-titles USC archive
* can be ~100-150 MB, so 300 MiB leaves ample headroom for legitimate
* downloads while still capping runaway responses.
*/
export const MAX_DOWNLOAD_BYTES = 314572800; // 300 MiB

/** Path for hash storage relative to working directory */
export const HASH_STORE_DIR = '.openlaw-git';
export const HASH_STORE_FILE = 'hashes.json';
73 changes: 67 additions & 6 deletions packages/fetcher/src/fetcher.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { createHash } from 'node:crypto';
import { type IUsCodeFetcher, type ReleasePoint, type Result, ok, err } from '@civic-source/types';
import { type Logger, createLogger, fetchWithRetry as sharedFetchWithRetry } from '@civic-source/shared';
import { OLRC_DOWNLOAD_PAGE, OLRC_PRIOR_RELEASE_POINTS_PAGE } from './constants.js';
import {
OLRC_BASE_URL,
OLRC_DOWNLOAD_PAGE,
OLRC_PRIOR_RELEASE_POINTS_PAGE,
MAX_DOWNLOAD_BYTES,
} from './constants.js';
import { HashStore } from './hash-store.js';
import { FetcherMetrics } from './metrics.js';

Expand All @@ -10,6 +15,23 @@ export function sha256(data: Buffer): string {
return createHash('sha256').update(data).digest('hex');
}

/**
* Inspect a response's Content-Length header and reject up-front if it
* advertises a body larger than MAX_DOWNLOAD_BYTES. This is the primary guard
* against runner OOM: it avoids ever reading an oversized body into memory.
*
* Returns true when the response is over the cap (caller should error).
* A missing or unparseable Content-Length returns false — the post-read
* length check provides defense-in-depth for those cases.
*/
export function exceedsContentLengthLimit(response: Response): boolean {
const header = response.headers.get('content-length');
if (header === null) return false;
const declared = Number(header);
if (!Number.isFinite(declared)) return false;
return declared > MAX_DOWNLOAD_BYTES;
}

/**
* Fetch with exponential backoff retry.
* Delegates to the shared fetchWithRetry utility.
Expand Down Expand Up @@ -118,9 +140,16 @@ export function parseReleasePoints(html: string): ReleasePoint[] {
const currentRelease = parseCurrentRelease(html);

// Match links like: /download/releasepoints/us/pl/118/42/xml_usc42@118-200.zip
//
// SSRF hardening: only accept OLRC server-relative hrefs (the form the OLRC
// pages actually emit). We deliberately do NOT accept an absolute-URL prefix
// here: allowing one would let an href to an arbitrary host become a
// uslmUrl that fetchXml then fetches verbatim. Because every match is a bare
// path, fullUrl below is always anchored to the OLRC host.
//
// Anchor segments to non-slash/non-quote chars so we don't get polynomial
// backtracking on malformed input (CodeQL js/polynomial-redos).
const linkPattern = /href="((?:https?:\/\/[^"/]+)?\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g;
const linkPattern = /href="(\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g;
let match: RegExpExecArray | null;

// Extract unique title numbers from XML download links
Expand All @@ -138,9 +167,10 @@ export function parseReleasePoints(html: string): ReleasePoint[] {
if (!title || seen.has(title)) continue;
seen.add(title);

const fullUrl = path.startsWith('http')
? path
: `https://uscode.house.gov${path}`;
// path is always an OLRC server-relative path (the regex no longer
// accepts an absolute-URL prefix), so this is always anchored to the
// OLRC host — a foreign host can never become a uslmUrl.
const fullUrl = `${OLRC_BASE_URL}${path}`;

results.push({
title,
Expand Down Expand Up @@ -180,6 +210,11 @@ export class OlrcFetcher implements IUsCodeFetcher {
return result;
}

if (exceedsContentLengthLimit(result.value)) {
timer();
return err(new Error('Download exceeds size limit'));
}

const html = await result.value.text();
let points = parseReleasePoints(html);
this.metrics.recordDiscovered(points.length);
Expand Down Expand Up @@ -209,13 +244,18 @@ export class OlrcFetcher implements IUsCodeFetcher {
return priorResult;
}

if (exceedsContentLengthLimit(priorResult.value)) {
timer();
return err(new Error('Download exceeds size limit'));
}

const priorHtml = await priorResult.value.text();
const historicalPoints = parsePriorReleasePoints(priorHtml);
this.metrics.recordDiscovered(historicalPoints.length);

// Also fetch current release point to include it
const currentResult = await fetchWithRetry(OLRC_DOWNLOAD_PAGE, this.logger);
if (currentResult.ok) {
if (currentResult.ok && !exceedsContentLengthLimit(currentResult.value)) {
const currentHtml = await currentResult.value.text();
const current = parseCurrentRelease(currentHtml);
if (current) {
Expand Down Expand Up @@ -257,7 +297,28 @@ export class OlrcFetcher implements IUsCodeFetcher {
return result;
}

// Size cap (primary guard): reject before reading the body if the server
// advertises a Content-Length over the limit, avoiding runner OOM.
if (exceedsContentLengthLimit(result.value)) {
const durationMs = performance.now() - startMs;
this.metrics.recordDuration(durationMs);
this.metrics.recordError('network');
timer();
return err(new Error('Download exceeds size limit'));
}

const buffer = Buffer.from(await result.value.arrayBuffer());

// Defense-in-depth: catch oversized bodies that lacked a (truthful)
// Content-Length header.
if (buffer.length > MAX_DOWNLOAD_BYTES) {
const durationMs = performance.now() - startMs;
this.metrics.recordDuration(durationMs);
this.metrics.recordError('network');
timer();
return err(new Error('Download exceeds size limit'));
}

const hash = sha256(buffer);
const hashKey = `xml:${releasePoint.title}:${releasePoint.uslmUrl}`;

Expand Down
Loading