diff --git a/packages/transformer/src/__tests__/xml-utils.test.ts b/packages/transformer/src/__tests__/xml-utils.test.ts
new file mode 100644
index 0000000..de43b22
--- /dev/null
+++ b/packages/transformer/src/__tests__/xml-utils.test.ts
@@ -0,0 +1,36 @@
+/**
+ * Tests for xml-utils text extraction, including HTML-escaping of extracted
+ * statute text (#200/H2 — defense against stored XSS in the rendered Markdown).
+ */
+
+import { describe, it, expect } from 'vitest';
+import { extractTextFromNodes } from '../xml-utils.js';
+
+describe('extractTextFromNodes', () => {
+  it('concatenates #text nodes in order and collapses whitespace', () => {
+    const nodes = [{ '#text': 'Hello world' }, { '#text': ' again' }];
+    expect(extractTextFromNodes(nodes)).toBe('Hello world again');
+  });
+
+  it('escapes raw < and > so extracted text cannot inject HTML', () => {
+    // The parser decodes XML entities, so a source `&lt;img ...&gt;` arrives
+    // here as the literal characters below.
+    const nodes = [{ '#text': '<img src=x onerror=alert(1)>' }];
+    const out = extractTextFromNodes(nodes);
+    expect(out).toBe('&lt;img src=x onerror=alert(1)&gt;');
+    expect(out).not.toContain('<');
+    expect(out).not.toContain('>');
+  });
+
+  it('does not double-escape across nested element nodes', () => {
+    // A `<` inside a nested element must be escaped exactly once, not turned
+    // into `&amp;lt;` by the recursive walk.
+    const nodes = [{ p: [{ '#text': 'a < b' }] }];
+    expect(extractTextFromNodes(nodes)).toBe('a &lt; b');
+  });
+
+  it('leaves ampersands untouched (no entity double-encoding)', () => {
+    const nodes = [{ '#text': 'Smith & Co' }];
+    expect(extractTextFromNodes(nodes)).toBe('Smith & Co');
+  });
+});
diff --git a/packages/transformer/src/xml-utils.ts b/packages/transformer/src/xml-utils.ts
index 81bef2a..e593a4d 100644
--- a/packages/transformer/src/xml-utils.ts
+++ b/packages/transformer/src/xml-utils.ts
@@ -22,6 +22,16 @@ export type PreserveOrderNode = PreserveOrderTextNode | PreserveOrderElementNode
/**
* Recursively walk nodes and concatenate all #text values in document order.
* Joins segments with a single space and collapses whitespace.
+ *
+ * Security: the parser decodes XML entities, so `&lt;img&gt;` in the source
+ * arrives here as the literal characters `<img>`. This text is later written
+ * into Markdown that the web app renders without HTML sanitization, so any raw
+ * `<`/`>` would be interpreted as live HTML (stored-XSS surface — see #200/H2).
+ * We escape `<` and `>` to their entities here, at the single choke point all
+ * rendered statute text flows through, so the generated Markdown is provably
+ * HTML-free regardless of the upstream USLM content. Markers, headings and
+ * title numbers are alphanumeric and unaffected; section paths derive from
+ * element identifiers, not this text.
*/
export function extractTextFromNodes(nodes: unknown[]): string {
const segments: string[] = [];
@@ -43,7 +53,9 @@ export function extractTextFromNodes(nodes: unknown[]): string {
.filter((s) => s.length > 0)
.join(' ')
.replace(/\s+/g, ' ')
- .trim();
+ .trim()
+ .replace(/</g, '&lt;')
+ .replace(/>/g, '&gt;');
}
/**