From ba18609fed4f23e485c60093a661c500cc760d09 Mon Sep 17 00:00:00 2001
From: William Zujkowski <williamzujkowski@gmail.com>
Date: Tue, 23 Jun 2026 00:07:38 -0400
Subject: [PATCH] fix(transformer): escape < and > in extracted text (XSS
 defense-in-depth)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Statute body text is written into Markdown that the web app renders with
no HTML sanitization (#200/H2). The XML parser decodes entities, so a
source `&lt;img onerror=...&gt;` becomes the literal `<img onerror=...>`
in extracted text and would render as live HTML.

Escape `<`/`>` to entities in extractTextFromNodes — the single choke
point all rendered statute text flows through — so extracted text can
never introduce raw HTML tags, regardless of upstream USLM content. The
recursive walk does not double-escape (an escaped `&lt;` contains no `<`
for a parent level to re-match), and `&` is left untouched to avoid
entity double-encoding. Markers, headings and title numbers are
alphanumeric and unaffected; section paths derive from element
identifiers, not this text.

Golden-snapshot tests unchanged (real fixtures carry no raw angle
brackets). New xml-utils tests cover escaping, no-double-escape, and
ampersand passthrough. transformer: 70 pass; monorepo builds.

Refs #200 (H2)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../src/__tests__/xml-utils.test.ts           | 36 +++++++++++++++++++
 packages/transformer/src/xml-utils.ts         | 14 +++++++-
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 packages/transformer/src/__tests__/xml-utils.test.ts
diff --git a/packages/transformer/src/__tests__/xml-utils.test.ts b/packages/transformer/src/__tests__/xml-utils.test.ts
new file mode 100644
index 0000000..de43b22
--- /dev/null
+++ b/packages/transformer/src/__tests__/xml-utils.test.ts
@@ -0,0 +1,36 @@
+/**
+ * Tests for xml-utils text extraction, including HTML-escaping of extracted
+ * statute text (#200/H2 — defense against stored XSS in the rendered Markdown).
+ */
+
+import { describe, it, expect } from 'vitest';
+import { extractTextFromNodes } from '../xml-utils.js';
+
+describe('extractTextFromNodes', () => {
+  it('concatenates #text nodes in order and collapses whitespace', () => {
+    const nodes = [{ '#text': 'Hello   world' }, { '#text': ' again' }];
+    expect(extractTextFromNodes(nodes)).toBe('Hello world again');
+  });
+
+  it('escapes raw < and > so extracted text cannot inject HTML', () => {
+    // The XML parser decodes entities, so a source `&lt;img ...&gt;` reaches
+    // this function as the literal markup below and must be re-escaped.
+    const nodes = [{ '#text': '<img src=x onerror=alert(1)>' }];
+    const out = extractTextFromNodes(nodes);
+    expect(out).toBe('&lt;img src=x onerror=alert(1)&gt;');
+    expect(out).not.toContain('<'); // no raw tag opener may survive
+    expect(out).not.toContain('>'); // nor a raw tag closer
+  });
+
+  it('does not double-escape across nested element nodes', () => {
+    // Text under a nested element must come out escaped exactly once: the
+    // recursive walk must never produce `&amp;lt;` for a single source `<`.
+    const nodes = [{ p: [{ '#text': 'a < b' }] }];
+    expect(extractTextFromNodes(nodes)).toBe('a &lt; b');
+  });
+
+  it('leaves ampersands untouched (no entity double-encoding)', () => {
+    const nodes = [{ '#text': 'Smith & Co' }];
+    expect(extractTextFromNodes(nodes)).toBe('Smith & Co'); // `&` is not rewritten to `&amp;`
+  });
+});
diff --git a/packages/transformer/src/xml-utils.ts b/packages/transformer/src/xml-utils.ts
index 81bef2a..e593a4d 100644
--- a/packages/transformer/src/xml-utils.ts
+++ b/packages/transformer/src/xml-utils.ts
@@ -22,6 +22,16 @@ export type PreserveOrderNode = PreserveOrderTextNode | PreserveOrderElementNode
 /**
  * Recursively walk nodes and concatenate all #text values in document order.
  * Joins segments with a single space and collapses whitespace.
+ *
+ * Security: the parser decodes XML entities, so `&lt;img&gt;` in the source
+ * arrives here as the literal characters `<img>`. This text is later written
+ * into Markdown that the web app renders without HTML sanitization, so any raw
+ * `<`/`>` would be interpreted as live HTML (stored-XSS surface — see #200/H2).
+ * We escape `<` and `>` to their entities here, at the single choke point all
+ * rendered statute text flows through, so extracted text cannot introduce raw
+ * HTML tags regardless of the upstream USLM content. Markers, headings and
+ * title numbers are alphanumeric and unaffected; section paths derive from
+ * element identifiers, not this text.
  */
 export function extractTextFromNodes(nodes: unknown[]): string {
   const segments: string[] = [];
@@ -43,7 +53,9 @@ export function extractTextFromNodes(nodes: unknown[]): string {
     .filter((s) => s.length > 0)
     .join(' ')
     .replace(/\s+/g, ' ')
-    .trim();
+    .trim()
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;');
 }
 
 /**