Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions .github/workflows/sync-law.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ jobs:
run: |
mkdir -p workspace/statutes
node --input-type=module <<'SCRIPT'
import { OlrcFetcher, HashStore } from '@civic-source/fetcher';
import { OlrcFetcher, HashStore, extractXmlFromZip } from '@civic-source/fetcher';
import { XmlToMarkdownAdapter } from '@civic-source/transformer';
import { writeFile, mkdir } from 'node:fs/promises';
import { join, dirname } from 'node:path';
Expand All @@ -114,8 +114,15 @@ jobs:
continue;
}

const xml = extractXmlFromZip(Buffer.from(xmlResult.value, 'base64'));
if (xml === null) {
console.error(`Title ${rp.title}: no XML in archive`);
failedTitles++;
continue;
}

const transformer = new XmlToMarkdownAdapter(rp.publicLaw);
const transformResult = transformer.transformToFiles(xmlResult.value);
const transformResult = transformer.transformToFiles(xml);
if (!transformResult.ok) {
console.error(`Title ${rp.title}: transform failed — ${transformResult.error.message}`);
failedTitles++;
Expand Down
40 changes: 40 additions & 0 deletions packages/fetcher/src/__tests__/zip.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import { deflateRawSync } from 'node:zlib';
import { describe, expect, it } from 'vitest';

import { extractXmlFromZip } from '../zip.js';

/**
 * Build the smallest archive the parser in zip.ts can read: one local file
 * header immediately followed by the entry name and the entry payload. No
 * central directory or end-of-archive record is written.
 *
 * Field positions inside the 30-byte header mirror what zip.ts reads:
 * signature@0, method@8, compressedSize@18, fileNameLen@26, extraLen@28;
 * the name begins at byte 30 and the payload right after the name.
 *
 * @param fileName Entry name, encoded as UTF-8.
 * @param compressionMethod ZIP method id written verbatim (0 = stored, 8 = deflate).
 * @param data Entry payload exactly as it should appear in the archive
 *   (i.e. already compressed when `compressionMethod` is 8).
 * @returns A buffer holding header + name + payload.
 */
function makeZip(fileName: string, compressionMethod: number, data: Buffer): Buffer {
  const headerLen = 30;
  const nameLen = Buffer.byteLength(fileName, 'utf-8');
  const payloadStart = headerLen + nameLen;

  // Zero-filled allocation: every header field not written below stays 0.
  const archive = Buffer.alloc(payloadStart + data.length);

  archive.writeUInt32LE(0x04034b50, 0); // local file header signature
  archive.writeUInt16LE(compressionMethod, 8);
  archive.writeUInt32LE(data.length, 18); // compressed size
  archive.writeUInt32LE(data.length, 22); // uncompressed size (unused by parser)
  archive.writeUInt16LE(nameLen, 26);
  archive.writeUInt16LE(0, 28); // extra field length

  archive.write(fileName, headerLen, 'utf-8');
  data.copy(archive, payloadStart);
  return archive;
}

// Exercises extractXmlFromZip against single-entry archives built by makeZip:
// one stored entry, one deflated entry, and one archive with no .xml entry.
describe('extractXmlFromZip', () => {
  it('returns the contents of a stored (method 0) .xml entry', () => {
    const expected = '<uscDoc>hello</uscDoc>';
    const payload = Buffer.from(expected, 'utf-8');
    const archive = makeZip('doc.xml', 0, payload);

    expect(extractXmlFromZip(archive)).toBe(expected);
  });

  it('inflates a deflate (method 8) .xml entry', () => {
    // Long repetitive filler so the deflated payload differs from the input.
    const filler = 'a'.repeat(500);
    const expected = `<uscDoc>${filler}<section>some longer content for round-tripping</section></uscDoc>`;
    const compressed = deflateRawSync(Buffer.from(expected, 'utf-8'));
    const archive = makeZip('doc.xml', 8, compressed);

    expect(extractXmlFromZip(archive)).toBe(expected);
  });

  it('returns null when no .xml entry is present', () => {
    const archive = makeZip('readme.txt', 0, Buffer.from('not xml', 'utf-8'));
    const result = extractXmlFromZip(archive);

    expect(result).toBeNull();
  });
});
1 change: 1 addition & 0 deletions packages/fetcher/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export {
parseCurrentRelease,
type CurrentReleaseInfo,
} from './fetcher.js';
export { extractXmlFromZip } from './zip.js';
export { HashStore } from './hash-store.js';
export { FetcherMetrics, type FetcherMetricsSnapshot, type DownloadErrorType } from './metrics.js';
export { createLogger, type Logger, type LogLevel } from '@civic-source/shared';
42 changes: 42 additions & 0 deletions packages/fetcher/src/zip.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import { inflateRawSync } from 'node:zlib';

/** Signature that opens every ZIP local file header ("PK\x03\x04"). */
const LOCAL_FILE_HEADER_SIG = 0x04034b50;
/** Signature that opens every central directory file header ("PK\x01\x02"). */
const CENTRAL_FILE_HEADER_SIG = 0x02014b50;
/** Signature of the end-of-central-directory (EOCD) record ("PK\x05\x06"). */
const END_OF_CENTRAL_DIR_SIG = 0x06054b50;
/** Size in bytes of the fixed part of a local file header. */
const LOCAL_FILE_HEADER_LEN = 30;
/** Size in bytes of the fixed part of a central directory file header. */
const CENTRAL_FILE_HEADER_LEN = 46;
/** Size in bytes of the fixed part of the EOCD record. */
const END_OF_CENTRAL_DIR_LEN = 22;
/** Longest archive comment that may follow the EOCD record (16-bit length field). */
const MAX_ARCHIVE_COMMENT_LEN = 0xffff;

/**
 * Decode one entry's payload to a UTF-8 string.
 * Returns `null` for any compression method other than stored (0) or deflate (8).
 */
function decodeZipEntry(
  zip: Buffer,
  compressionMethod: number,
  dataStart: number,
  compressedSize: number,
): string | null {
  // subarray clamps to the buffer, so a truncated archive yields a short slice
  // instead of an out-of-range read.
  const payload = zip.subarray(dataStart, dataStart + compressedSize);
  if (compressionMethod === 0) return payload.toString('utf-8');
  if (compressionMethod === 8) return inflateRawSync(payload).toString('utf-8');
  return null;
}

/** Locate the EOCD record by scanning backwards from the end; returns -1 if absent. */
function findEndOfCentralDirectory(zip: Buffer): number {
  const lowest = Math.max(0, zip.length - END_OF_CENTRAL_DIR_LEN - MAX_ARCHIVE_COMMENT_LEN);
  for (let pos = zip.length - END_OF_CENTRAL_DIR_LEN; pos >= lowest; pos--) {
    if (zip.readUInt32LE(pos) === END_OF_CENTRAL_DIR_SIG) return pos;
  }
  return -1;
}

/**
 * Resolve the first decodable `.xml` entry through the central directory.
 * Returns `null` when there is no usable central directory or no such entry,
 * so the caller can fall back to walking local headers.
 */
function extractXmlViaCentralDirectory(zip: Buffer): string | null {
  const eocd = findEndOfCentralDirectory(zip);
  if (eocd < 0) return null;

  const entryCount = zip.readUInt16LE(eocd + 10);
  let pos = zip.readUInt32LE(eocd + 16);

  for (let i = 0; i < entryCount; i++) {
    // A bad signature or out-of-range offset means the EOCD match was spurious
    // or the directory is damaged; give up on this strategy.
    if (pos + CENTRAL_FILE_HEADER_LEN > zip.length) return null;
    if (zip.readUInt32LE(pos) !== CENTRAL_FILE_HEADER_SIG) return null;

    const compressionMethod = zip.readUInt16LE(pos + 10);
    const compressedSize = zip.readUInt32LE(pos + 20);
    const fileNameLen = zip.readUInt16LE(pos + 28);
    const extraLen = zip.readUInt16LE(pos + 30);
    const commentLen = zip.readUInt16LE(pos + 32);
    const localHeaderOffset = zip.readUInt32LE(pos + 42);
    const nameStart = pos + CENTRAL_FILE_HEADER_LEN;
    const fileName = zip.toString('utf-8', nameStart, nameStart + fileNameLen);

    if (fileName.endsWith('.xml') && (compressionMethod === 0 || compressionMethod === 8)) {
      if (localHeaderOffset + LOCAL_FILE_HEADER_LEN > zip.length) return null;
      if (zip.readUInt32LE(localHeaderOffset) !== LOCAL_FILE_HEADER_SIG) return null;

      // The local header's name/extra lengths decide where the payload starts;
      // its extra field may differ in length from the central directory's copy.
      const localNameLen = zip.readUInt16LE(localHeaderOffset + 26);
      const localExtraLen = zip.readUInt16LE(localHeaderOffset + 28);
      const dataStart = localHeaderOffset + LOCAL_FILE_HEADER_LEN + localNameLen + localExtraLen;
      return decodeZipEntry(zip, compressionMethod, dataStart, compressedSize);
    }

    pos = nameStart + fileNameLen + extraLen + commentLen;
  }
  return null;
}

/**
 * Resolve the first decodable `.xml` entry by walking consecutive local file
 * headers from offset 0, trusting the sizes recorded in each header.
 */
function extractXmlViaLocalHeaders(zip: Buffer): string | null {
  let offset = 0;
  while (offset + LOCAL_FILE_HEADER_LEN <= zip.length) {
    if (zip.readUInt32LE(offset) !== LOCAL_FILE_HEADER_SIG) break;

    const compressionMethod = zip.readUInt16LE(offset + 8);
    const compressedSize = zip.readUInt32LE(offset + 18);
    const fileNameLen = zip.readUInt16LE(offset + 26);
    const extraLen = zip.readUInt16LE(offset + 28);
    const nameStart = offset + LOCAL_FILE_HEADER_LEN;
    const fileName = zip.toString('utf-8', nameStart, nameStart + fileNameLen);
    const dataStart = nameStart + fileNameLen + extraLen;

    if (fileName.endsWith('.xml')) {
      const decoded = decodeZipEntry(zip, compressionMethod, dataStart, compressedSize);
      if (decoded !== null) return decoded;
    }

    offset = dataStart + compressedSize;
  }
  return null;
}

/**
 * Extract the first `.xml` entry from a ZIP archive buffer.
 *
 * The entry is looked up through the central directory (found via the
 * end-of-central-directory record) because that is the only place guaranteed
 * to hold the real compressed size: archives written in streaming mode
 * (general-purpose flag bit 3) leave the size fields of the local file header
 * at zero and append them in a data descriptor after the payload. When no
 * usable central directory yields an entry, the function falls back to
 * walking local file headers (signature 0x04034b50) from the start of the
 * buffer and trusting the sizes stored there.
 *
 * In both strategies the first entry whose name ends with `.xml` and whose
 * compression method is supported wins: stored entries (method 0) are read
 * directly, deflated entries (method 8) are inflated synchronously via
 * `inflateRawSync`. Entries using any other method are skipped.
 *
 * Limitations: ZIP64 size/offset sentinels are not interpreted.
 *
 * The caller is responsible for decoding any base64 before passing the buffer.
 *
 * @param zip Raw ZIP archive bytes.
 * @returns The XML string, or `null` if the archive contains no decodable `.xml` entry.
 * @throws Propagates the error raised by `inflateRawSync` when a deflated
 *   entry's data is corrupt or truncated.
 */
export function extractXmlFromZip(zip: Buffer): string | null {
  return extractXmlViaCentralDirectory(zip) ?? extractXmlViaLocalHeaders(zip);
}
Loading