diff --git a/packages/annotator/src/__tests__/citation-utils.test.ts b/packages/annotator/src/__tests__/citation-utils.test.ts index f16f469..b741ace 100644 --- a/packages/annotator/src/__tests__/citation-utils.test.ts +++ b/packages/annotator/src/__tests__/citation-utils.test.ts @@ -100,4 +100,49 @@ describe('deduplicateCases', () => { const result = deduplicateCases(cases); expect(result).toHaveLength(1); }); + + it('retains distinct cases that both have empty citations', () => { + const cases = [ + makeCase({ + caseName: 'Alpha v. United States', + citation: '', + date: '2024-01-15', + sourceUrl: 'https://www.courtlistener.com/opinion/111/', + }), + makeCase({ + caseName: 'Beta v. United States', + citation: '', + date: '2023-06-30', + sourceUrl: 'https://www.courtlistener.com/opinion/222/', + }), + ]; + const result = deduplicateCases(cases); + expect(result).toHaveLength(2); + expect(result.map((c) => c.caseName)).toEqual([ + 'Alpha v. United States', + 'Beta v. United States', + ]); + }); + + it('still collapses truly identical uncited cases', () => { + const cases = [ + makeCase({ caseName: 'Same', citation: '', date: '2024-01-15', sourceUrl: 'https://www.courtlistener.com/opinion/999/' }), + makeCase({ caseName: 'Same', citation: '', date: '2024-01-15', sourceUrl: 'https://www.courtlistener.com/opinion/999/' }), + ]; + const result = deduplicateCases(cases); + expect(result).toHaveLength(1); + }); + + it('collapses cases with the same non-empty citation while retaining distinct uncited ones', () => { + const cases = [ + makeCase({ caseName: 'CitedFirst', citation: '18 USC 111' }), + makeCase({ caseName: 'CitedSecond', citation: '18 U.S.C. 111' }), + makeCase({ caseName: 'UncitedA', citation: '', sourceUrl: 'https://www.courtlistener.com/opinion/aaa/' }), + makeCase({ caseName: 'UncitedB', citation: '', sourceUrl: 'https://www.courtlistener.com/opinion/bbb/' }), + ]; + const result = deduplicateCases(cases); + // One collapsed cited case + two distinct uncited cases = 3 + expect(result).toHaveLength(3); + expect(result.map((c) => c.caseName)).toEqual(['CitedFirst', 'UncitedA', 'UncitedB']); + }); }); diff --git a/packages/annotator/src/citation-utils.ts b/packages/annotator/src/citation-utils.ts index c621f66..17815c9 100644 --- a/packages/annotator/src/citation-utils.ts +++ b/packages/annotator/src/citation-utils.ts @@ -13,11 +13,26 @@ export function normalizeCitation(citation: string): string { .toLowerCase(); } +/** + * Build a deduplication key for a case. + * + * Cases with a non-empty citation are keyed on the normalized citation so that + * genuine duplicate citations collapse. Cases with an empty citation (common + * for CourtListener results that lack a structured citation) fall back to a + * composite key of caseName + date + sourceUrl so that distinct uncited cases + * are NOT mistakenly collapsed into a single entry. + */ +function dedupeKey(c: CaseAnnotation): string { + const normalized = normalizeCitation(c.citation); + if (normalized !== '') return `cite:${normalized}`; + return `composite:${c.caseName}|${c.date}|${c.sourceUrl}`; +} + /** Deduplicate cases by normalized citation, preserving first occurrence */ export function deduplicateCases(cases: CaseAnnotation[]): CaseAnnotation[] { const seen = new Set(); return cases.filter((c) => { - const key = normalizeCitation(c.citation); + const key = dedupeKey(c); if (seen.has(key)) return false; seen.add(key); return true; diff --git a/packages/transformer/src/__tests__/transformer.test.ts b/packages/transformer/src/__tests__/transformer.test.ts index 739f806..a3920e4 100644 --- a/packages/transformer/src/__tests__/transformer.test.ts +++ b/packages/transformer/src/__tests__/transformer.test.ts @@ -381,6 +381,41 @@ describe('XmlToMarkdownAdapter', () => { expect(s111?.path).toBe('statutes/title-10/chapter-2/section-111.md'); }); + it('extracts appendix title number "18a" from identifier (no collision with title 18)', () => { + const xml = ` + + + <num>Title 18 Appendix</num> + <chapter identifier="/us/usc/t18a/ch1"> + <num>Chapter 1</num> + <section identifier="/us/usc/t18a/s1"> + <num>1</num> + <heading>Appendix Rule</heading> + <content>Some appendix content.</content> + </section> + </chapter> + +`; + const parsed = parseUslmXml(xml); + expect(parsed.ok).toBe(true); + if (!parsed.ok) return; + expect(parsed.value.titleNumber).toBe('18a'); + + const adapter = new XmlToMarkdownAdapter('PL 119-1'); + const result = adapter.transformToFiles(xml); + expect(result.ok).toBe(true); + if (!result.ok) return; + + const section = result.value.find((f) => f.path.includes('section-1')); + expect(section).toBeDefined(); + // Appendix sections live under title-18a, distinct from main title-18 + expect(section?.path).toBe('statutes/title-18a/chapter-1/section-1.md'); + expect(section?.path).not.toContain('title-18/'); + // usc_title frontmatter remains numeric (18) despite the "18a" path + expect(section?.content).toContain('usc_title: 18'); + expect(section?.content).not.toContain('usc_title: 18a'); + }); + it('preserves inline element text in mixed content (cross-references)', () => { const xml = ` diff --git a/packages/transformer/src/parser.ts b/packages/transformer/src/parser.ts index 82d2684..64dc5dc 100644 --- a/packages/transformer/src/parser.ts +++ b/packages/transformer/src/parser.ts @@ -74,15 +74,19 @@ function findTitleNumber(root: unknown[]): string | undefined { const titleAttrs = firstTitle.attrs; const identifier = titleAttrs['@_identifier']; if (identifier) { - const match = /\/t(\d+)$/.exec(identifier); + // Capture an optional appendix suffix letter so e.g. "/us/usc/t18a" + // yields "18a" and does NOT collide with main Title 18 ("18"). + const match = /\/t(\d+[a-zA-Z]?)$/.exec(identifier); if (match?.[1]) return match[1]; } - // Fallback: look for num element + // Fallback: look for num element (e.g. "Title 18 Appendix") const firstNum = findElements(firstTitle.children, USLM_ELEMENTS.num)[0]; if (firstNum) { const text = extractTextFromNodes(firstNum.children); - const numMatch = /\d+/.exec(text); + // Match a number with an optional appendix letter (e.g. "18a"), then fall + // back to a bare number. + const numMatch = /\d+[a-zA-Z]?/.exec(text) ?? /\d+/.exec(text); return numMatch?.[0]; }