diff --git a/.github/workflows/sync-law.yml b/.github/workflows/sync-law.yml index 6305708..42e1864 100644 --- a/.github/workflows/sync-law.yml +++ b/.github/workflows/sync-law.yml @@ -88,7 +88,7 @@ jobs: run: | mkdir -p workspace/statutes node --input-type=module <<'SCRIPT' - import { OlrcFetcher, HashStore } from '@civic-source/fetcher'; + import { OlrcFetcher, HashStore, extractXmlFromZip } from '@civic-source/fetcher'; import { XmlToMarkdownAdapter } from '@civic-source/transformer'; import { writeFile, mkdir } from 'node:fs/promises'; import { join, dirname } from 'node:path'; @@ -114,8 +114,15 @@ jobs: continue; } + const xml = extractXmlFromZip(Buffer.from(xmlResult.value, 'base64')); + if (xml === null) { + console.error(`Title ${rp.title}: no XML in archive`); + failedTitles++; + continue; + } + const transformer = new XmlToMarkdownAdapter(rp.publicLaw); - const transformResult = transformer.transformToFiles(xmlResult.value); + const transformResult = transformer.transformToFiles(xml); if (!transformResult.ok) { console.error(`Title ${rp.title}: transform failed — ${transformResult.error.message}`); failedTitles++; diff --git a/packages/fetcher/src/__tests__/zip.test.ts b/packages/fetcher/src/__tests__/zip.test.ts new file mode 100644 index 0000000..0594756 --- /dev/null +++ b/packages/fetcher/src/__tests__/zip.test.ts @@ -0,0 +1,40 @@ +import { deflateRawSync } from 'node:zlib'; +import { describe, expect, it } from 'vitest'; + +import { extractXmlFromZip } from '../zip.js'; + +/** + * Construct a minimal single-entry ZIP buffer (local file header + data). + * Byte offsets match the parser in zip.ts: sig@0, method@8, compressedSize@18, + * fileNameLen@26, extraLen@28, filename@30, data@30+fileNameLen. 
+ */ +function makeZip(fileName: string, compressionMethod: number, data: Buffer): Buffer { + const fileNameBytes = Buffer.from(fileName, 'utf-8'); + const header = Buffer.alloc(30); + header.writeUInt32LE(0x04034b50, 0); // local file header signature + header.writeUInt16LE(compressionMethod, 8); + header.writeUInt32LE(data.length, 18); // compressed size + header.writeUInt32LE(data.length, 22); // uncompressed size (unused by parser) + header.writeUInt16LE(fileNameBytes.length, 26); + header.writeUInt16LE(0, 28); // extra field length + return Buffer.concat([header, fileNameBytes, data]); +} + +describe('extractXmlFromZip', () => { + it('returns the contents of a stored (method 0) .xml entry', () => { + const xml = 'hello'; + const zip = makeZip('doc.xml', 0, Buffer.from(xml, 'utf-8')); + expect(extractXmlFromZip(zip)).toBe(xml); + }); + + it('inflates a deflate (method 8) .xml entry', () => { + const xml = `${'a'.repeat(500)}
some longer content for round-tripping
`; + const zip = makeZip('doc.xml', 8, deflateRawSync(Buffer.from(xml, 'utf-8'))); + expect(extractXmlFromZip(zip)).toBe(xml); + }); + + it('returns null when no .xml entry is present', () => { + const zip = makeZip('readme.txt', 0, Buffer.from('not xml', 'utf-8')); + expect(extractXmlFromZip(zip)).toBeNull(); + }); +}); diff --git a/packages/fetcher/src/index.ts b/packages/fetcher/src/index.ts index f501758..0a50895 100644 --- a/packages/fetcher/src/index.ts +++ b/packages/fetcher/src/index.ts @@ -20,6 +20,7 @@ export { parseCurrentRelease, type CurrentReleaseInfo, } from './fetcher.js'; +export { extractXmlFromZip } from './zip.js'; export { HashStore } from './hash-store.js'; export { FetcherMetrics, type FetcherMetricsSnapshot, type DownloadErrorType } from './metrics.js'; export { createLogger, type Logger, type LogLevel } from '@civic-source/shared'; diff --git a/packages/fetcher/src/zip.ts b/packages/fetcher/src/zip.ts new file mode 100644 index 0000000..b6fe933 --- /dev/null +++ b/packages/fetcher/src/zip.ts @@ -0,0 +1,42 @@ +import { inflateRawSync } from 'node:zlib'; + +/** + * Extract the first `.xml` entry from a ZIP archive buffer. + * + * Walks ZIP local file headers (signature 0x04034b50), reading the + * compression method, compressed size, and filename for each entry. For the + * first entry whose filename ends with `.xml` it returns the decoded UTF-8 + * contents: stored entries (method 0) are read directly, deflated entries + * (method 8) are inflated synchronously via `inflateRawSync`. + * + * The caller is responsible for decoding any base64 before passing the buffer. + * + * @param zip Raw ZIP archive bytes. + * @returns The XML string, or `null` if the archive contains no `.xml` entry. 
/**
 * Extract the first readable `.xml` entry from a ZIP archive buffer.
 *
 * Walks ZIP local file headers (signature 0x04034b50) starting at offset 0,
 * reading the general-purpose flags, compression method, compressed size and
 * filename of each entry. For the first entry whose filename ends with `.xml`
 * and whose method is supported, the decoded UTF-8 contents are returned:
 * stored entries (method 0) are read directly, deflated entries (method 8)
 * are inflated synchronously via `inflateRawSync`.
 *
 * Robustness rules:
 * - An entry whose declared data would run past the end of the buffer is
 *   treated as unreadable (`null`) rather than returned truncated.
 * - A deflated entry written in streaming mode (flag bit 3 set, size recorded
 *   as 0 in the local header) is inflated from its data start to the end of
 *   the buffer; the deflate stream is self-terminating, so the trailing data
 *   descriptor and later records are ignored by the inflater.
 * - The walk stops at the first record that is not a local file header
 *   (normally the central directory) or whose header is truncated.
 *
 * The central directory is never consulted, so a streamed entry that has to be
 * skipped (not `.xml`) ends the walk because its size is unknown.
 *
 * The caller is responsible for decoding any base64 before passing the buffer.
 *
 * @param zip Raw ZIP archive bytes.
 * @returns The XML string, or `null` if no `.xml` entry could be read.
 * @throws Any error raised by `inflateRawSync` when a deflated entry's
 *   compressed data is corrupt.
 */
export function extractXmlFromZip(zip: Buffer): string | null {
  const LOCAL_HEADER_SIGNATURE = 0x04034b50;
  const LOCAL_HEADER_SIZE = 30; // fixed-length part, before filename/extra
  const FLAG_DATA_DESCRIPTOR = 0x0008; // general-purpose bit 3
  const METHOD_STORED = 0;
  const METHOD_DEFLATE = 8;

  let offset = 0;
  // Every iteration advances by at least LOCAL_HEADER_SIZE, so this terminates.
  while (offset + LOCAL_HEADER_SIZE <= zip.length) {
    if (zip.readUInt32LE(offset) !== LOCAL_HEADER_SIGNATURE) break;

    const flags = zip.readUInt16LE(offset + 6);
    const compressionMethod = zip.readUInt16LE(offset + 8);
    const compressedSize = zip.readUInt32LE(offset + 18);
    const fileNameLen = zip.readUInt16LE(offset + 26);
    const extraLen = zip.readUInt16LE(offset + 28);

    const nameStart = offset + LOCAL_HEADER_SIZE;
    const dataStart = nameStart + fileNameLen + extraLen;
    // Filename/extra field cut off by the end of the buffer: nothing usable.
    if (dataStart > zip.length) break;

    const fileName = zip.toString('utf-8', nameStart, nameStart + fileNameLen);
    const dataEnd = dataStart + compressedSize;
    // Streaming writers set bit 3 and leave the size as 0 in the local header;
    // the real size lives in a data descriptor after the compressed bytes.
    const sizeDeferred = (flags & FLAG_DATA_DESCRIPTOR) !== 0 && compressedSize === 0;

    if (fileName.endsWith('.xml')) {
      if (compressionMethod === METHOD_DEFLATE && sizeDeferred) {
        // Raw deflate carries its own end marker; bytes after it are ignored.
        return inflateRawSync(zip.subarray(dataStart)).toString('utf-8');
      }
      // Declared size exceeds what the buffer holds (truncated download,
      // ZIP64 placeholder, ...): refuse to return partial content.
      if (dataEnd > zip.length) return null;

      if (compressionMethod === METHOD_STORED) {
        return zip.toString('utf-8', dataStart, dataEnd);
      }
      if (compressionMethod === METHOD_DEFLATE) {
        return inflateRawSync(zip.subarray(dataStart, dataEnd)).toString('utf-8');
      }
      // Unsupported compression method: fall through and keep looking.
    }

    offset = dataEnd;
  }
  return null;
}