civic-source · williamzujkowski · Jun 23, 2026 · Jun 23, 2026
@@ -0,0 +1,36 @@
+/**
+ * Tests for xml-utils text extraction, including HTML-escaping of extracted
+ * statute text (#200/H2 — defense against stored XSS in the rendered Markdown).
+ */
+
+import { describe, it, expect } from 'vitest';
+import { extractTextFromNodes } from '../xml-utils.js';
+
+describe('extractTextFromNodes', () => {
+  it('concatenates #text nodes in order and collapses whitespace', () => {
+    const nodes = [{ '#text': 'Hello   world' }, { '#text': ' again' }];
+    expect(extractTextFromNodes(nodes)).toBe('Hello world again');
+  });
+
+  it('escapes raw < and > so extracted text cannot inject HTML', () => {
+    // The parser decodes XML entities before extraction, so a source
+    // `&lt;img ...&gt;` arrives here as the literal characters below.
+    const nodes = [{ '#text': '<img src=x onerror=alert(1)>' }];
+    const out = extractTextFromNodes(nodes);
+    expect(out).toBe('&lt;img src=x onerror=alert(1)&gt;');
+    expect(out).not.toContain('<'); // implied by toBe above; states the invariant explicitly
+    expect(out).not.toContain('>');
+  });
+
+  it('does not double-escape across nested element nodes', () => {
+    // A `<` inside a nested element must be escaped exactly once; the output
+    // must not contain `&amp;lt;` after the recursive walk.
+    const nodes = [{ p: [{ '#text': 'a < b' }] }]; // text node wrapped in a `p` element node
+    expect(extractTextFromNodes(nodes)).toBe('a &lt; b');
+  });
+
+  it('leaves ampersands untouched (no entity double-encoding)', () => {
+    const nodes = [{ '#text': 'Smith & Co' }]; // only `<` and `>` are rewritten; `&` passes through
+    expect(extractTextFromNodes(nodes)).toBe('Smith & Co');
+  });
+});
@@ -22,6 +22,16 @@ export type PreserveOrderNode = PreserveOrderTextNode | PreserveOrderElementNode
 /**
  * Recursively walk nodes and concatenate all #text values in document order.
  * Joins segments with a single space and collapses whitespace.
+ *
+ * Security: the parser decodes XML entities, so `&lt;img&gt;` in the source
+ * arrives here as the literal characters `<img>`. This text is later written
+ * into Markdown that the web app renders without HTML sanitization, so any raw
+ * `<`/`>` would be interpreted as live HTML (stored-XSS surface — see #200/H2).
+ * We escape `<` and `>` to their entities here, at the single choke point all
+ * rendered statute text flows through, so no raw angle bracket survives into
+ * the Markdown whatever the upstream USLM content; `&` is left unescaped.
+ * Markers, headings and title numbers are alphanumeric and unaffected;
+ * section paths derive from element identifiers, not this text.
  */
 export function extractTextFromNodes(nodes: unknown[]): string {
   const segments: string[] = [];
@@ -43,7 +53,9 @@ export function extractTextFromNodes(nodes: unknown[]): string {
     .filter((s) => s.length > 0)
     .join(' ')
     .replace(/\s+/g, ' ')
-    .trim();
+    .trim()
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;');
 }
 
 /**