Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions packages/annotator/src/__tests__/citation-utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,49 @@ describe('deduplicateCases', () => {
const result = deduplicateCases(cases);
expect(result).toHaveLength(1);
});

it('retains distinct cases that both have empty citations', () => {
  // Two unrelated opinions, neither of which carries a structured citation.
  const alpha = makeCase({
    caseName: 'Alpha v. United States',
    citation: '',
    date: '2024-01-15',
    sourceUrl: 'https://www.courtlistener.com/opinion/111/',
  });
  const beta = makeCase({
    caseName: 'Beta v. United States',
    citation: '',
    date: '2023-06-30',
    sourceUrl: 'https://www.courtlistener.com/opinion/222/',
  });

  const deduped = deduplicateCases([alpha, beta]);

  // Both entries must survive, in their original order.
  expect(deduped).toHaveLength(2);
  const survivingNames = deduped.map((entry) => entry.caseName);
  expect(survivingNames).toEqual(['Alpha v. United States', 'Beta v. United States']);
});

it('still collapses truly identical uncited cases', () => {
  // Factory producing a fresh, field-for-field identical uncited case on each call.
  const buildUncited = () =>
    makeCase({
      caseName: 'Same',
      citation: '',
      date: '2024-01-15',
      sourceUrl: 'https://www.courtlistener.com/opinion/999/',
    });

  const deduped = deduplicateCases([buildUncited(), buildUncited()]);

  // Identical name, date and URL: only the first occurrence is kept.
  expect(deduped).toHaveLength(1);
});

it('collapses cases with the same non-empty citation while retaining distinct uncited ones', () => {
  // Two spellings of one citation, expected to collapse to the first occurrence.
  const citedFirst = makeCase({ caseName: 'CitedFirst', citation: '18 USC 111' });
  const citedSecond = makeCase({ caseName: 'CitedSecond', citation: '18 U.S.C. 111' });
  // Two uncited cases that differ by name and source URL, expected to both survive.
  const uncitedA = makeCase({
    caseName: 'UncitedA',
    citation: '',
    sourceUrl: 'https://www.courtlistener.com/opinion/aaa/',
  });
  const uncitedB = makeCase({
    caseName: 'UncitedB',
    citation: '',
    sourceUrl: 'https://www.courtlistener.com/opinion/bbb/',
  });

  const deduped = deduplicateCases([citedFirst, citedSecond, uncitedA, uncitedB]);

  // 1 collapsed cited case + 2 distinct uncited cases = 3 entries.
  expect(deduped).toHaveLength(3);
  const survivingNames = deduped.map((entry) => entry.caseName);
  expect(survivingNames).toEqual(['CitedFirst', 'UncitedA', 'UncitedB']);
});
});
17 changes: 16 additions & 1 deletion packages/annotator/src/citation-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,26 @@ export function normalizeCitation(citation: string): string {
.toLowerCase();
}

/**
 * Build the deduplication key for a case.
 *
 * - When the normalized citation is non-empty, the key is `cite:` followed by
 *   that normalized citation, so differently formatted spellings of the same
 *   citation collapse to one key.
 * - When the normalized citation is empty (common for CourtListener results
 *   that lack a structured citation), the key is `composite:` followed by the
 *   (caseName, date, sourceUrl) triple, so distinct uncited cases are NOT
 *   mistakenly collapsed into a single entry.
 *
 * The triple is serialized with `JSON.stringify` instead of being joined with
 * a `|` delimiter. A plain join is ambiguous when a field itself contains the
 * delimiter: ("A|B", "C") and ("A", "B|C") would produce the same key and two
 * different cases would be treated as duplicates.
 *
 * @param c - The case to compute a key for.
 * @returns An opaque key; two cases are duplicates exactly when their keys
 *   are equal.
 */
function dedupeKey(c: CaseAnnotation): string {
  const normalized = normalizeCitation(c.citation);
  if (normalized !== '') return `cite:${normalized}`;
  // JSON array encoding quotes and escapes each field, keeping field
  // boundaries unambiguous regardless of the characters in the values.
  return `composite:${JSON.stringify([c.caseName, c.date, c.sourceUrl])}`;
}

/** Deduplicate cases by normalized citation, preserving first occurrence */
export function deduplicateCases(cases: CaseAnnotation[]): CaseAnnotation[] {
const seen = new Set<string>();
return cases.filter((c) => {
const key = normalizeCitation(c.citation);
const key = dedupeKey(c);
if (seen.has(key)) return false;
seen.add(key);
return true;
Expand Down
35 changes: 35 additions & 0 deletions packages/transformer/src/__tests__/transformer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,41 @@ describe('XmlToMarkdownAdapter', () => {
expect(s111?.path).toBe('statutes/title-10/chapter-2/section-111.md');
});

it('extracts appendix title number "18a" from identifier (no collision with title 18)', () => {
  // Minimal USLM document whose title identifier carries the appendix suffix "a".
  const xml = `
<lawDoc>
<title identifier="/us/usc/t18a">
<num>Title 18 Appendix</num>
<chapter identifier="/us/usc/t18a/ch1">
<num>Chapter 1</num>
<section identifier="/us/usc/t18a/s1">
<num>1</num>
<heading>Appendix Rule</heading>
<content>Some appendix content.</content>
</section>
</chapter>
</title>
</lawDoc>`;

  // Step 1: the parser reports the suffixed title number.
  const parseResult = parseUslmXml(xml);
  expect(parseResult.ok).toBe(true);
  if (!parseResult.ok) return;
  expect(parseResult.value.titleNumber).toBe('18a');

  // Step 2: the adapter writes the section under the suffixed title directory.
  const adapter = new XmlToMarkdownAdapter('PL 119-1');
  const transformed = adapter.transformToFiles(xml);
  expect(transformed.ok).toBe(true);
  if (!transformed.ok) return;

  const sectionFile = transformed.value.find((file) => file.path.includes('section-1'));
  expect(sectionFile).toBeDefined();

  // Appendix sections live under title-18a, distinct from main title-18.
  const sectionPath = sectionFile?.path;
  expect(sectionPath).toBe('statutes/title-18a/chapter-1/section-1.md');
  expect(sectionPath).not.toContain('title-18/');

  // usc_title frontmatter remains numeric (18) despite the "18a" path.
  const sectionBody = sectionFile?.content;
  expect(sectionBody).toContain('usc_title: 18');
  expect(sectionBody).not.toContain('usc_title: 18a');
});

it('preserves inline element text in mixed content (cross-references)', () => {
const xml = `
<lawDoc>
Expand Down
10 changes: 7 additions & 3 deletions packages/transformer/src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,19 @@ function findTitleNumber(root: unknown[]): string | undefined {
const titleAttrs = firstTitle.attrs;
const identifier = titleAttrs['@_identifier'];
if (identifier) {
const match = /\/t(\d+)$/.exec(identifier);
// Capture an optional appendix suffix letter so e.g. "/us/usc/t18a"
// yields "18a" and does NOT collide with main Title 18 ("18").
const match = /\/t(\d+[a-zA-Z]?)$/.exec(identifier);
if (match?.[1]) return match[1];
}

// Fallback: look for num element
// Fallback: look for num element (e.g. "Title 18 Appendix")
const firstNum = findElements(firstTitle.children, USLM_ELEMENTS.num)[0];
if (firstNum) {
const text = extractTextFromNodes(firstNum.children);
const numMatch = /\d+/.exec(text);
// Match a number with an optional appendix letter (e.g. "18a"), then fall
// back to a bare number.
const numMatch = /\d+[a-zA-Z]?/.exec(text) ?? /\d+/.exec(text);
return numMatch?.[0];
}

Expand Down
Loading