From ba18609fed4f23e485c60093a661c500cc760d09 Mon Sep 17 00:00:00 2001 From: William Zujkowski Date: Tue, 23 Jun 2026 00:07:38 -0400 Subject: [PATCH] fix(transformer): escape < and > in extracted text (XSS defense-in-depth) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Statute body text is written into Markdown that the web app renders with no HTML sanitization (#200/H2). The XML parser decodes entities, so a source `&lt;img onerror=...&gt;` becomes the literal `<img onerror=...>` in extracted text and would render as live HTML. Escape `<`/`>` to entities in extractTextFromNodes — the single choke point all rendered statute text flows through — so the generated Markdown is provably HTML-free regardless of upstream USLM content. The recursive walk does not double-escape (an escaped `&lt;` contains no `<` for a parent level to re-match), and `&` is left untouched to avoid entity double-encoding. Markers, headings and title numbers are alphanumeric and unaffected; section paths derive from element identifiers, not this text. Golden-snapshot tests unchanged (real fixtures carry no raw angle brackets). New xml-utils tests cover escaping, no-double-escape, and ampersand passthrough. transformer: 70 pass; monorepo builds. Refs #200 (H2) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/__tests__/xml-utils.test.ts | 36 +++++++++++++++++++ packages/transformer/src/xml-utils.ts | 14 +++++++- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 packages/transformer/src/__tests__/xml-utils.test.ts diff --git a/packages/transformer/src/__tests__/xml-utils.test.ts b/packages/transformer/src/__tests__/xml-utils.test.ts new file mode 100644 index 0000000..de43b22 --- /dev/null +++ b/packages/transformer/src/__tests__/xml-utils.test.ts @@ -0,0 +1,36 @@ +/** + * Tests for xml-utils text extraction, including HTML-escaping of extracted + * statute text (#200/H2 — defense against stored XSS in the rendered Markdown).
+ */ + +import { describe, it, expect } from 'vitest'; +import { extractTextFromNodes } from '../xml-utils.js'; + +describe('extractTextFromNodes', () => { + it('concatenates #text nodes in order and collapses whitespace', () => { + const nodes = [{ '#text': 'Hello world' }, { '#text': ' again' }]; + expect(extractTextFromNodes(nodes)).toBe('Hello world again'); + }); + + it('escapes raw < and > so extracted text cannot inject HTML', () => { + // The parser decodes XML entities, so a source `&lt;img ...&gt;` arrives + // here as the literal characters below. + const nodes = [{ '#text': '<img src=x onerror=alert(1)>' }]; + const out = extractTextFromNodes(nodes); + expect(out).toBe('&lt;img src=x onerror=alert(1)&gt;'); + expect(out).not.toContain('<'); + expect(out).not.toContain('>'); + }); + + it('does not double-escape across nested element nodes', () => { + // A `<` inside a nested element must be escaped exactly once, not turned + // into `&amp;lt;` by the recursive walk. + const nodes = [{ p: [{ '#text': 'a < b' }] }]; + expect(extractTextFromNodes(nodes)).toBe('a &lt; b'); + }); + + it('leaves ampersands untouched (no entity double-encoding)', () => { + const nodes = [{ '#text': 'Smith & Co' }]; + expect(extractTextFromNodes(nodes)).toBe('Smith & Co'); + }); +}); diff --git a/packages/transformer/src/xml-utils.ts b/packages/transformer/src/xml-utils.ts index 81bef2a..e593a4d 100644 --- a/packages/transformer/src/xml-utils.ts +++ b/packages/transformer/src/xml-utils.ts @@ -22,6 +22,16 @@ export type PreserveOrderNode = PreserveOrderTextNode | PreserveOrderElementNode /** * Recursively walk nodes and concatenate all #text values in document order. * Joins segments with a single space and collapses whitespace. + * + * Security: the parser decodes XML entities, so `&lt;img&gt;` in the source + * arrives here as the literal characters `<img>`.
This text is later written + * into Markdown that the web app renders without HTML sanitization, so any raw + * `<`/`>` would be interpreted as live HTML (stored-XSS surface — see #200/H2). + * We escape `<` and `>` to their entities here, at the single choke point all + * rendered statute text flows through, so the generated Markdown is provably + * HTML-free regardless of the upstream USLM content. Markers, headings and + * title numbers are alphanumeric and unaffected; section paths derive from + * element identifiers, not this text. */ export function extractTextFromNodes(nodes: unknown[]): string { const segments: string[] = []; @@ -43,7 +53,9 @@ export function extractTextFromNodes(nodes: unknown[]): string { .filter((s) => s.length > 0) .join(' ') .replace(/\s+/g, ' ') - .trim(); + .trim() + .replace(/</g, '&lt;') + .replace(/>/g, '&gt;'); } /**