Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions packages/transformer/src/__tests__/xml-utils.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/**
* Tests for xml-utils text extraction, including HTML-escaping of extracted
* statute text (#200/H2 — defense against stored XSS in the rendered Markdown).
*/

import { describe, it, expect } from 'vitest';
import { extractTextFromNodes } from '../xml-utils.js';

describe('extractTextFromNodes', () => {
  /** Builds a single text node in the shape the extractor walks (`#text` key). */
  const textNode = (value: string) => ({ '#text': value });

  it('concatenates #text nodes in order and collapses whitespace', () => {
    const input = [textNode('Hello world'), textNode(' again')];

    const result = extractTextFromNodes(input);

    expect(result).toBe('Hello world again');
  });

  it('escapes raw < and > so extracted text cannot inject HTML', () => {
    // XML entities are already decoded by the time text reaches the extractor,
    // so a source `&lt;img ...&gt;` shows up as the raw markup used here.
    const payload = '<img src=x onerror=alert(1)>';

    const out = extractTextFromNodes([textNode(payload)]);

    expect(out).toBe('&lt;img src=x onerror=alert(1)&gt;');
    // Belt and braces: no angle bracket of either kind may survive.
    expect(out).not.toContain('<');
    expect(out).not.toContain('>');
  });

  it('does not double-escape across nested element nodes', () => {
    // The `<` sits one level down inside a `p` element; the recursive walk
    // must escape it exactly once rather than producing `&amp;lt;`.
    const nested = [{ p: [textNode('a < b')] }];

    const result = extractTextFromNodes(nested);

    expect(result).toBe('a &lt; b');
  });

  it('leaves ampersands untouched (no entity double-encoding)', () => {
    const input = [textNode('Smith & Co')];

    const result = extractTextFromNodes(input);

    expect(result).toBe('Smith & Co');
  });
});
14 changes: 13 additions & 1 deletion packages/transformer/src/xml-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@ export type PreserveOrderNode = PreserveOrderTextNode | PreserveOrderElementNode
/**
* Recursively walk nodes and concatenate all #text values in document order.
* Joins segments with a single space and collapses whitespace.
*
* Security: the parser decodes XML entities, so `&lt;img&gt;` in the source
* arrives here as the literal characters `<img>`. This text is later written
* into Markdown that the web app renders without HTML sanitization, so any raw
* `<`/`>` would be interpreted as live HTML (stored-XSS surface — see #200/H2).
* We escape `<` and `>` to their entities here, at the single choke point all
* rendered statute text flows through, so the generated Markdown cannot contain
* raw HTML tags regardless of the upstream USLM content. `&` is deliberately
* left as-is so text such as `Smith & Co` is not double-encoded. Markers,
* headings and title numbers are alphanumeric and unaffected; section paths
* derive from element identifiers, not this text.
*/
export function extractTextFromNodes(nodes: unknown[]): string {
const segments: string[] = [];
Expand All @@ -43,7 +53,9 @@ export function extractTextFromNodes(nodes: unknown[]): string {
.filter((s) => s.length > 0)
.join(' ')
.replace(/\s+/g, ' ')
.trim();
.trim()
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;');
}

/**
Expand Down
Loading