From e5cf8c4320bf98c59094dcac5a60331a6e2a0563 Mon Sep 17 00:00:00 2001
From: William Zujkowski <williamzujkowski@gmail.com>
Date: Mon, 22 Jun 2026 23:52:13 -0400
Subject: [PATCH] fix(fetcher): harden against SSRF and oversized downloads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two defense-in-depth fixes from the security review (#201):

SSRF: parseReleasePoints' link regex accepted an optional absolute-URL
prefix `(?:https?://[^"/]+)?`, so an href to any host on a
compromised/MitM'd OLRC page could become a uslmUrl that fetchXml then
fetched verbatim. Drop the absolute-URL alternative — the parser now
matches only OLRC server-relative paths, so every uslmUrl is anchored to
https://uscode.house.gov. Anti-ReDoS anchoring preserved.

Response size cap: fetchXml read the body with no limit, risking runner
OOM on a hostile/oversized response. Add MAX_DOWNLOAD_BYTES (300 MiB —
generous vs the ~100-150 MB all-titles archive) and a Content-Length
pre-check (exceedsContentLengthLimit) plus a post-read length backstop,
applied to the archive download and the release-point page reads.

New tests: foreign-host href never yields an off-host uslmUrl; a
Content-Length over the cap yields an error Result. fetcher: 127 tests
pass; monorepo builds.

Closes #201

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../fetcher/src/__tests__/fetcher.test.ts     | 31 ++++++++
 packages/fetcher/src/constants.ts             | 10 +++
 packages/fetcher/src/fetcher.ts               | 73 +++++++++++++++++--
 3 files changed, 108 insertions(+), 6 deletions(-)
diff --git a/packages/fetcher/src/__tests__/fetcher.test.ts b/packages/fetcher/src/__tests__/fetcher.test.ts
index 86d62d8..0f5af88 100644
--- a/packages/fetcher/src/__tests__/fetcher.test.ts
+++ b/packages/fetcher/src/__tests__/fetcher.test.ts
@@ -165,6 +165,21 @@ describe('parseReleasePoints', () => {
     expect(points[0]?.publicLaw).toBe('');
   });
 
+  it('does not produce an off-host uslmUrl for an absolute foreign-host href (SSRF)', () => {
+    // An attacker-controlled page that points a release-point link at a
+    // foreign host must never yield a uslmUrl that fetchXml would fetch.
+    const html = `
+      <h2>Public Law 118-200 (11/15/2024)</h2>
+      <a href="https://evil.example/download/releasepoints/us/pl/1/1/xml_usc18@1-1.zip">Evil</a>
+    `;
+    const points = parseReleasePoints(html);
+    // The link is either skipped or rewritten to the OLRC host — never evil.example.
+    for (const p of points) {
+      expect(p.uslmUrl).not.toContain('evil.example');
+      expect(p.uslmUrl.startsWith('https://uscode.house.gov/')).toBe(true);
+    }
+  });
+
   it('does not catastrophically backtrack on malformed input (CodeQL js/polynomial-redos)', () => {
     // Long string of repeated non-quote chars that look like an unclosed href.
     // The old [^"]* prefix would let the engine try every cut point.
@@ -376,6 +391,22 @@ describe('OlrcFetcher', () => {
     }
   });
 
+  it('fetchXml rejects a response whose Content-Length exceeds the size cap', async () => {
+    // Advertise a body far larger than MAX_DOWNLOAD_BYTES (300 MiB).
+    const oversized = String(314572800 + 1);
+    vi.spyOn(globalThis, 'fetch').mockResolvedValueOnce(
+      new Response('PK', { status: 200, headers: { 'content-length': oversized } })
+    );
+
+    const rp: ReleasePoint = { title: '42', publicLaw: 'PL 118-200', dateET: '2024-01-01T00:00:00Z', uslmUrl: 'https://example.com/t42.zip', sha256Hash: '0'.repeat(64) };
+    const fetcher = new OlrcFetcher({ logger });
+    const result = await fetcher.fetchXml(rp);
+    expect(result.ok).toBe(false);
+    if (!result.ok) {
+      expect(result.error.message).toContain('size limit');
+    }
+  });
+
   it('fetchXml skips download when hash is unchanged', async () => {
     // Create a fake ZIP (starts with PK signature)
     const zipContent = Buffer.from([0x50, 0x4b, 0x03, 0x04, ...Buffer.from('fake-zip-data')]);
diff --git a/packages/fetcher/src/constants.ts b/packages/fetcher/src/constants.ts
index f5a7d8b..3309b6e 100644
--- a/packages/fetcher/src/constants.ts
+++ b/packages/fetcher/src/constants.ts
@@ -22,6 +22,16 @@ export function allTitlesXmlUrl(congress: string, law: string): string {
   return `${OLRC_RELEASE_POINTS_URL}us/pl/${congress}/${law}/xml_uscAll@${congress}-${law}.zip`;
 }
 
+/**
+ * Maximum number of bytes to accept from a single download.
+ *
+ * Guards CI runners against OOM from a maliciously or accidentally huge
+ * response. The bound is deliberately generous: the all-titles USC archive
+ * can be ~100-150 MB, so 300 MiB leaves ample headroom for legitimate
+ * downloads while still capping runaway responses.
+ */
+export const MAX_DOWNLOAD_BYTES = 314572800; // 300 MiB
+
 /** Path for hash storage relative to working directory */
 export const HASH_STORE_DIR = '.openlaw-git';
 export const HASH_STORE_FILE = 'hashes.json';
diff --git a/packages/fetcher/src/fetcher.ts b/packages/fetcher/src/fetcher.ts
index ba354c3..009d2ea 100644
--- a/packages/fetcher/src/fetcher.ts
+++ b/packages/fetcher/src/fetcher.ts
@@ -1,7 +1,12 @@
 import { createHash } from 'node:crypto';
 import { type IUsCodeFetcher, type ReleasePoint, type Result, ok, err } from '@civic-source/types';
 import { type Logger, createLogger, fetchWithRetry as sharedFetchWithRetry } from '@civic-source/shared';
-import { OLRC_DOWNLOAD_PAGE, OLRC_PRIOR_RELEASE_POINTS_PAGE } from './constants.js';
+import {
+  OLRC_BASE_URL,
+  OLRC_DOWNLOAD_PAGE,
+  OLRC_PRIOR_RELEASE_POINTS_PAGE,
+  MAX_DOWNLOAD_BYTES,
+} from './constants.js';
 import { HashStore } from './hash-store.js';
 import { FetcherMetrics } from './metrics.js';
 
@@ -10,6 +15,23 @@ export function sha256(data: Buffer): string {
   return createHash('sha256').update(data).digest('hex');
 }
 
+/**
+ * Inspect a response's Content-Length header and reject up-front if it
+ * advertises a body larger than MAX_DOWNLOAD_BYTES. This is the primary guard
+ * against runner OOM: it avoids ever reading an oversized body into memory.
+ *
+ * Returns true when the response is over the cap (caller should error).
+ * A missing or unparseable Content-Length returns false — the post-read
+ * length check provides defense-in-depth for those cases.
+ */
+export function exceedsContentLengthLimit(response: Response): boolean {
+  const header = response.headers.get('content-length');
+  if (header === null) return false;
+  const declared = Number(header);
+  if (!Number.isFinite(declared)) return false;
+  return declared > MAX_DOWNLOAD_BYTES;
+}
+
 /**
  * Fetch with exponential backoff retry.
  * Delegates to the shared fetchWithRetry utility.
@@ -118,9 +140,16 @@ export function parseReleasePoints(html: string): ReleasePoint[] {
   const currentRelease = parseCurrentRelease(html);
 
   // Match links like: /download/releasepoints/us/pl/118/42/xml_usc42@118-200.zip
+  //
+  // SSRF hardening: only accept OLRC server-relative hrefs (the form the OLRC
+  // pages actually emit). We deliberately do NOT accept an absolute-URL prefix
+  // here: allowing one would let an href to an arbitrary host become a
+  // uslmUrl that fetchXml then fetches verbatim. Because every match is a bare
+  // path, fullUrl below is always anchored to the OLRC host.
+  //
   // Anchor segments to non-slash/non-quote chars so we don't get polynomial
   // backtracking on malformed input (CodeQL js/polynomial-redos).
-  const linkPattern = /href="((?:https?:\/\/[^"/]+)?\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g;
+  const linkPattern = /href="(\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g;
   let match: RegExpExecArray | null;
 
   // Extract unique title numbers from XML download links
@@ -138,9 +167,10 @@ export function parseReleasePoints(html: string): ReleasePoint[] {
     if (!title || seen.has(title)) continue;
     seen.add(title);
 
-    const fullUrl = path.startsWith('http')
-      ? path
-      : `https://uscode.house.gov${path}`;
+    // path is always an OLRC server-relative path (the regex no longer
+    // accepts an absolute-URL prefix), so this is always anchored to the
+    // OLRC host — a foreign host can never become a uslmUrl.
+    const fullUrl = `${OLRC_BASE_URL}${path}`;
 
     results.push({
       title,
@@ -180,6 +210,11 @@ export class OlrcFetcher implements IUsCodeFetcher {
       return result;
     }
 
+    if (exceedsContentLengthLimit(result.value)) {
+      timer();
+      return err(new Error('Download exceeds size limit'));
+    }
+
     const html = await result.value.text();
     let points = parseReleasePoints(html);
     this.metrics.recordDiscovered(points.length);
@@ -209,13 +244,18 @@ export class OlrcFetcher implements IUsCodeFetcher {
       return priorResult;
     }
 
+    if (exceedsContentLengthLimit(priorResult.value)) {
+      timer();
+      return err(new Error('Download exceeds size limit'));
+    }
+
     const priorHtml = await priorResult.value.text();
     const historicalPoints = parsePriorReleasePoints(priorHtml);
     this.metrics.recordDiscovered(historicalPoints.length);
 
     // Also fetch current release point to include it
     const currentResult = await fetchWithRetry(OLRC_DOWNLOAD_PAGE, this.logger);
-    if (currentResult.ok) {
+    if (currentResult.ok && !exceedsContentLengthLimit(currentResult.value)) {
       const currentHtml = await currentResult.value.text();
       const current = parseCurrentRelease(currentHtml);
       if (current) {
@@ -257,7 +297,28 @@ export class OlrcFetcher implements IUsCodeFetcher {
       return result;
     }
 
+    // Size cap (primary guard): reject before reading the body if the server
+    // advertises a Content-Length over the limit, avoiding runner OOM.
+    if (exceedsContentLengthLimit(result.value)) {
+      const durationMs = performance.now() - startMs;
+      this.metrics.recordDuration(durationMs);
+      this.metrics.recordError('network');
+      timer();
+      return err(new Error('Download exceeds size limit'));
+    }
+
     const buffer = Buffer.from(await result.value.arrayBuffer());
+
+    // Defense-in-depth: catch oversized bodies that lacked a (truthful)
+    // Content-Length header.
+    if (buffer.length > MAX_DOWNLOAD_BYTES) {
+      const durationMs = performance.now() - startMs;
+      this.metrics.recordDuration(durationMs);
+      this.metrics.recordError('network');
+      timer();
+      return err(new Error('Download exceeds size limit'));
+    }
+
     const hash = sha256(buffer);
     const hashKey = `xml:${releasePoint.title}:${releasePoint.uslmUrl}`;