From 09cda7d904b09c761294805956f204c69f1cdb54 Mon Sep 17 00:00:00 2001 From: William Zujkowski Date: Mon, 22 Jun 2026 23:43:30 -0400 Subject: [PATCH] fix(ci): decode+unzip OLRC archive before transforming in sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OlrcFetcher.fetchXml() returns base64-encoded ZIP bytes (the archive containing the USLM .xml), but the sync workflow's "Transform statutes" step passed that value straight into transformer.transformToFiles(), which expects raw XML. Every title failed to parse, so the weekly sync transformed 0 sections and never committed anything. Add a tested, pure-Node `extractXmlFromZip(zip: Buffer)` to the fetcher package (lifted from the proven logic in scripts/fetch-title.ts; uses inflateRawSync for stored/deflate entries) and wire the workflow to decode base64 → unzip → transform, failing the title cleanly if the archive has no .xml entry. New unit tests cover stored (method 0), deflate (method 8 round-trip), and no-.xml-entry cases. fetcher: 126 tests pass; full monorepo builds. 
Closes #199 Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/sync-law.yml | 11 ++++-- packages/fetcher/src/__tests__/zip.test.ts | 40 +++++++++++++++++++++ packages/fetcher/src/index.ts | 1 + packages/fetcher/src/zip.ts | 42 ++++++++++++++++++++++ 4 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 packages/fetcher/src/__tests__/zip.test.ts create mode 100644 packages/fetcher/src/zip.ts diff --git a/.github/workflows/sync-law.yml b/.github/workflows/sync-law.yml index 6305708..42e1864 100644 --- a/.github/workflows/sync-law.yml +++ b/.github/workflows/sync-law.yml @@ -88,7 +88,7 @@ jobs: run: | mkdir -p workspace/statutes node --input-type=module <<'SCRIPT' - import { OlrcFetcher, HashStore } from '@civic-source/fetcher'; + import { OlrcFetcher, HashStore, extractXmlFromZip } from '@civic-source/fetcher'; import { XmlToMarkdownAdapter } from '@civic-source/transformer'; import { writeFile, mkdir } from 'node:fs/promises'; import { join, dirname } from 'node:path'; @@ -114,8 +114,15 @@ jobs: continue; } + const xml = extractXmlFromZip(Buffer.from(xmlResult.value, 'base64')); + if (xml === null) { + console.error(`Title ${rp.title}: no XML in archive`); + failedTitles++; + continue; + } + const transformer = new XmlToMarkdownAdapter(rp.publicLaw); - const transformResult = transformer.transformToFiles(xmlResult.value); + const transformResult = transformer.transformToFiles(xml); if (!transformResult.ok) { console.error(`Title ${rp.title}: transform failed — ${transformResult.error.message}`); failedTitles++; diff --git a/packages/fetcher/src/__tests__/zip.test.ts b/packages/fetcher/src/__tests__/zip.test.ts new file mode 100644 index 0000000..0594756 --- /dev/null +++ b/packages/fetcher/src/__tests__/zip.test.ts @@ -0,0 +1,40 @@ +import { deflateRawSync } from 'node:zlib'; +import { describe, expect, it } from 'vitest'; + +import { extractXmlFromZip } from '../zip.js'; + +/** + * Construct a minimal single-entry ZIP buffer (local 
file header + data). + * Byte offsets match the parser in zip.ts: sig@0, method@8, compressedSize@18, + * fileNameLen@26, extraLen@28, filename@30, data@30+fileNameLen. + */ +function makeZip(fileName: string, compressionMethod: number, data: Buffer): Buffer { + const fileNameBytes = Buffer.from(fileName, 'utf-8'); + const header = Buffer.alloc(30); + header.writeUInt32LE(0x04034b50, 0); // local file header signature + header.writeUInt16LE(compressionMethod, 8); + header.writeUInt32LE(data.length, 18); // compressed size + header.writeUInt32LE(data.length, 22); // uncompressed size (unused by parser) + header.writeUInt16LE(fileNameBytes.length, 26); + header.writeUInt16LE(0, 28); // extra field length + return Buffer.concat([header, fileNameBytes, data]); +} + +describe('extractXmlFromZip', () => { + it('returns the contents of a stored (method 0) .xml entry', () => { + const xml = 'hello'; + const zip = makeZip('doc.xml', 0, Buffer.from(xml, 'utf-8')); + expect(extractXmlFromZip(zip)).toBe(xml); + }); + + it('inflates a deflate (method 8) .xml entry', () => { + const xml = `${'a'.repeat(500)}
some longer content for round-tripping
`; + const zip = makeZip('doc.xml', 8, deflateRawSync(Buffer.from(xml, 'utf-8'))); + expect(extractXmlFromZip(zip)).toBe(xml); + }); + + it('returns null when no .xml entry is present', () => { + const zip = makeZip('readme.txt', 0, Buffer.from('not xml', 'utf-8')); + expect(extractXmlFromZip(zip)).toBeNull(); + }); +}); diff --git a/packages/fetcher/src/index.ts b/packages/fetcher/src/index.ts index f501758..0a50895 100644 --- a/packages/fetcher/src/index.ts +++ b/packages/fetcher/src/index.ts @@ -20,6 +20,7 @@ export { parseCurrentRelease, type CurrentReleaseInfo, } from './fetcher.js'; +export { extractXmlFromZip } from './zip.js'; export { HashStore } from './hash-store.js'; export { FetcherMetrics, type FetcherMetricsSnapshot, type DownloadErrorType } from './metrics.js'; export { createLogger, type Logger, type LogLevel } from '@civic-source/shared'; diff --git a/packages/fetcher/src/zip.ts b/packages/fetcher/src/zip.ts new file mode 100644 index 0000000..b6fe933 --- /dev/null +++ b/packages/fetcher/src/zip.ts @@ -0,0 +1,42 @@ +import { inflateRawSync } from 'node:zlib'; + +/** + * Extract the first `.xml` entry from a ZIP archive buffer. + * + * Walks ZIP local file headers (signature 0x04034b50), reading the + * compression method, compressed size, and filename for each entry. For the + * first entry whose filename ends with `.xml` it returns the decoded UTF-8 + * contents: stored entries (method 0) are read directly, deflated entries + * (method 8) are inflated synchronously via `inflateRawSync`. + * + * The caller is responsible for decoding any base64 before passing the buffer. + * + * @param zip Raw ZIP archive bytes. + * @returns The XML string, or `null` if the archive contains no `.xml` entry. 
/**
 * Extract the first readable `.xml` entry from a ZIP archive buffer.
 *
 * Entry metadata is taken from the central directory when the archive has a
 * usable one, because that is the only place the compressed size is reliable:
 * writers that stream entries set general-purpose flag bit 3 and leave the
 * local-header size fields at zero, deferring them to a trailing data
 * descriptor. Archives without a usable central directory (for example a bare
 * sequence of local file headers) are handled by walking the local headers
 * from offset 0 and trusting the sizes stored there.
 *
 * For the first entry whose filename ends with `.xml` and whose compression
 * method is supported, the decoded UTF-8 contents are returned: stored
 * entries (method 0) are read directly, deflated entries (method 8) are
 * inflated synchronously via `inflateRawSync`. `.xml` entries using any other
 * method, or whose data range runs past the end of the buffer, are skipped.
 *
 * The caller is responsible for decoding any base64 before passing the buffer.
 *
 * @param zip Raw ZIP archive bytes.
 * @returns The XML string, or `null` if the archive contains no `.xml` entry
 *   that can be read in full.
 * @throws Propagates whatever `inflateRawSync` throws when a deflated entry's
 *   data is not a valid raw-deflate stream.
 */
export function extractXmlFromZip(zip: Buffer): string | null {
  const entries = readCentralDirectory(zip) ?? walkLocalHeaders(zip);

  for (const entry of entries) {
    if (!entry.fileName.endsWith('.xml')) continue;

    // Refuse to hand back a silently truncated document: Buffer#toString and
    // Buffer#subarray both clamp out-of-range ends instead of failing.
    const dataEnd = entry.dataStart + entry.compressedSize;
    if (dataEnd > zip.length) continue;

    const payload = zip.subarray(entry.dataStart, dataEnd);
    if (entry.compressionMethod === 0) {
      return payload.toString('utf-8');
    }
    if (entry.compressionMethod === 8) {
      return inflateRawSync(payload).toString('utf-8');
    }
    // Any other compression method is unsupported; keep looking.
  }
  return null;
}

/** Signature of a ZIP local file header. */
const ZIP_LOCAL_HEADER_SIG = 0x04034b50;
/** Signature of a ZIP central directory file header. */
const ZIP_CENTRAL_HEADER_SIG = 0x02014b50;
/** Signature of the ZIP end-of-central-directory record. */
const ZIP_END_OF_DIRECTORY_SIG = 0x06054b50;
/** Size in bytes of the fixed part of a local file header. */
const ZIP_LOCAL_HEADER_LEN = 30;
/** Size in bytes of the fixed part of a central directory file header. */
const ZIP_CENTRAL_HEADER_LEN = 46;
/** Size in bytes of the end-of-central-directory record without its comment. */
const ZIP_END_OF_DIRECTORY_LEN = 22;
/** Largest value the 16-bit archive comment length field can hold. */
const ZIP_MAX_COMMENT_LEN = 0xffff;

/** The fields of one archive entry that are needed to read its data. */
interface ZipEntryInfo {
  fileName: string;
  compressionMethod: number;
  compressedSize: number;
  /** Absolute offset of the first byte of the entry's (compressed) data. */
  dataStart: number;
}

/**
 * Locate the end-of-central-directory record by scanning backwards from the
 * end of the buffer over the range a trailing archive comment could occupy.
 *
 * @returns The record's offset, or -1 when no signature is found.
 */
function findEndOfCentralDirectory(zip: Buffer): number {
  const latest = zip.length - ZIP_END_OF_DIRECTORY_LEN;
  const earliest = Math.max(0, latest - ZIP_MAX_COMMENT_LEN);
  for (let pos = latest; pos >= earliest; pos--) {
    if (zip.readUInt32LE(pos) === ZIP_END_OF_DIRECTORY_SIG) return pos;
  }
  return -1;
}

/**
 * List the archive's entries from its central directory.
 *
 * @returns The entries in directory order, or `null` when the buffer has no
 *   end-of-central-directory record or the directory is inconsistent with the
 *   buffer (which also covers ZIP64 placeholder values), so that the caller
 *   can fall back to walking local headers.
 */
function readCentralDirectory(zip: Buffer): ZipEntryInfo[] | null {
  const endRecord = findEndOfCentralDirectory(zip);
  if (endRecord === -1) return null;

  const entryCount = zip.readUInt16LE(endRecord + 10);
  const directorySize = zip.readUInt32LE(endRecord + 12);
  const directoryOffset = zip.readUInt32LE(endRecord + 16);
  if (directoryOffset + directorySize > endRecord) return null;

  const entries: ZipEntryInfo[] = [];
  let cursor = directoryOffset;
  for (let i = 0; i < entryCount; i++) {
    if (cursor + ZIP_CENTRAL_HEADER_LEN > endRecord) return null;
    if (zip.readUInt32LE(cursor) !== ZIP_CENTRAL_HEADER_SIG) return null;

    const compressionMethod = zip.readUInt16LE(cursor + 10);
    const compressedSize = zip.readUInt32LE(cursor + 20);
    const fileNameLen = zip.readUInt16LE(cursor + 28);
    const extraLen = zip.readUInt16LE(cursor + 30);
    const commentLen = zip.readUInt16LE(cursor + 32);
    const localHeaderOffset = zip.readUInt32LE(cursor + 42);

    const nameStart = cursor + ZIP_CENTRAL_HEADER_LEN;
    const nameEnd = nameStart + fileNameLen;
    if (nameEnd > endRecord) return null;

    // The local header carries its own name/extra lengths, which decide where
    // the data begins; its extra field need not match the central one.
    if (localHeaderOffset + ZIP_LOCAL_HEADER_LEN > zip.length) return null;
    if (zip.readUInt32LE(localHeaderOffset) !== ZIP_LOCAL_HEADER_SIG) return null;
    const dataStart =
      localHeaderOffset +
      ZIP_LOCAL_HEADER_LEN +
      zip.readUInt16LE(localHeaderOffset + 26) +
      zip.readUInt16LE(localHeaderOffset + 28);

    entries.push({
      fileName: zip.toString('utf-8', nameStart, nameEnd),
      compressionMethod,
      compressedSize,
      dataStart,
    });
    cursor = nameEnd + extraLen + commentLen;
  }
  return entries;
}

/**
 * List entries by walking consecutive local file headers from offset 0,
 * stopping at the first position that does not hold a complete header with
 * the local file header signature. Sizes are taken from the local headers.
 */
function walkLocalHeaders(zip: Buffer): ZipEntryInfo[] {
  const entries: ZipEntryInfo[] = [];
  let offset = 0;
  while (
    offset + ZIP_LOCAL_HEADER_LEN <= zip.length &&
    zip.readUInt32LE(offset) === ZIP_LOCAL_HEADER_SIG
  ) {
    const compressionMethod = zip.readUInt16LE(offset + 8);
    const compressedSize = zip.readUInt32LE(offset + 18);
    const fileNameLen = zip.readUInt16LE(offset + 26);
    const extraLen = zip.readUInt16LE(offset + 28);

    const nameStart = offset + ZIP_LOCAL_HEADER_LEN;
    const dataStart = nameStart + fileNameLen + extraLen;

    entries.push({
      fileName: zip.toString('utf-8', nameStart, nameStart + fileNameLen),
      compressionMethod,
      compressedSize,
      dataStart,
    });
    offset = dataStart + compressedSize;
  }
  return entries;
}