Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion apps/api/scripts/import-laddr/importer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,13 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise<ImportRe

log(`[import] fetching blog from ${opts.sourceHost}`);
const blogPosts: BlogPost[] = [];
// `?include=*` is the only way to get the body content — laddr stores
// it as a typed `items` array on `AbstractContent`, not as a flat Body
// field. translateBlogPost assembles markdown from those items.
for await (const row of fetchAllPages<RawBlogPost>(
'/blog',
RawBlogPostSchema,
{},
{ include: '*' },
fetchOpts,
)) {
const bp = translateBlogPost(row, ctx);
Expand Down
29 changes: 27 additions & 2 deletions apps/api/scripts/import-laddr/json-fetcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -150,26 +150,51 @@ export const RawProjectBuzzSchema = z
.passthrough();
export type RawProjectBuzz = z.infer<typeof RawProjectBuzzSchema>;

/**
 * One item in a blog post's body. Laddr's `Emergence\CMS\AbstractContent`
 * stores body as an ordered list of typed items rather than a single
 * markdown string. Three item classes appear in production: Markdown
 * (raw markdown), Media (image reference), Embed (raw HTML — iframes etc.).
 *
 * Surfaced only when the request asks `?include=*`.
 *
 * Unknown fields pass through untouched (`.passthrough()`), so new laddr
 * fields never fail validation.
 */
export const RawBlogPostItemSchema = z
  .object({
    ID: z.number().int().positive(),
    Class: z.string(),
    // Nullable as well as optional, matching the optional scalars on
    // RawBlogPostSchema: a JSON `null` here should not reject the whole
    // post. Body assembly falls back to 0 via `Order ?? 0`.
    Order: z.number().int().nullable().optional(),
    // Markdown items: Data is a string. Media items: Data is an object
    // ({ MediaID, Caption }). Embed items: Data is a string (raw HTML).
    // Left as `unknown` so the consumer narrows per item class.
    Data: z.unknown().optional(),
  })
  .passthrough();
export type RawBlogPostItem = z.infer<typeof RawBlogPostItemSchema>;

/**
* Blog post — laddr's `BlogPost` class. The field set is best-effort
* against laddr's `BlogRequestHandler` template output; unknown fields
* pass through.
*
* ID, Class, Handle (slug), Title, Summary,
* AuthorID, Published (epoch), Modified (epoch), Created (epoch)
*
* Body is *not* a top-level field in laddr's JSON. The body content
* lives in `items` (only surfaced when the request uses `?include=*`)
* as an ordered list of typed content blocks.
*/
export const RawBlogPostSchema = z
.object({
ID: z.number().int().positive(),
Class: z.string(),
Handle: z.string().nullable().optional(),
Title: z.string().nullable().optional(),
// NOTE(review): the header comment says laddr's JSON has no top-level
// Body, yet this field is still accepted here — confirm whether it is
// stale and can be dropped once nothing reads `row.Body`.
Body: z.string().nullable().optional(),
Summary: z.string().nullable().optional(),
AuthorID: z.number().int().nullable().optional(),
Published: z.number().int().nullable().optional(),
Created: z.number().int().nullable().optional(),
Modified: z.number().int().nullable().optional(),
/** Present when the request asks `?include=*`. */
items: z.array(RawBlogPostItemSchema).optional(),
})
.passthrough();
export type RawBlogPost = z.infer<typeof RawBlogPostSchema>;
Expand Down
78 changes: 72 additions & 6 deletions apps/api/scripts/import-laddr/translators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import type {

import type {
RawBlogPost,
RawBlogPostItem,
RawMembership,
RawPerson,
RawProject,
Expand Down Expand Up @@ -652,15 +653,80 @@ export function translateBuzz(
};
}

/**
 * Source host used for legacy media URLs in blog bodies. Items of class
 * `Emergence\CMS\Item\Media` reference a numeric `MediaID` resolved
 * against laddr's `/thumbnail/<id>/<dimensions>` endpoint; we render
 * those as `![Caption](https://<host>/thumbnail/<id>/1920x1920)` so the
 * markdown body stays viewable on its own. Eventually those images
 * should migrate into the data repo as attachments, but that's a
 * separate concern from this importer pass.
 */
const LADDR_MEDIA_HOST = 'codeforphilly.org';
const LADDR_MEDIA_DIMENSIONS = '1920x1920';

/**
 * Make a caption safe to use as markdown image alt text: collapse all
 * whitespace runs (including newlines) to single spaces, trim, and
 * backslash-escape `\`, `[` and `]` so the caption cannot terminate or
 * nest the `![...]` construct.
 */
function escapeMarkdownAltText(caption: string): string {
  return caption
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/[\\[\]]/g, '\\$&');
}

/**
 * Assemble a blog post's markdown body from laddr's typed `items` array.
 * Items are sorted by `Order` (defensive — laddr's JSON tends to come
 * pre-sorted, but the contract isn't documented); a missing `Order`
 * sorts as 0 and ties keep their input order.
 *
 * Three item classes appear in production:
 * - `Emergence\CMS\Item\Markdown` — `Data` is the raw markdown string;
 *   append verbatim.
 * - `Emergence\CMS\Item\Media` — `Data` is `{ MediaID, Caption }`;
 *   render as a markdown image with the laddr media URL. The caption is
 *   escaped so brackets or newlines can't break the image syntax.
 * - `Emergence\CMS\Item\Embed` — `Data` is raw HTML (iframes, divs);
 *   append as a raw HTML block (legal in CommonMark).
 *
 * Empty or whitespace-only Markdown/Embed items and items with no `Data`
 * are skipped silently. Items of a known class whose `Data` has the
 * wrong shape, and items of an unknown class, are skipped with a warning
 * so content is never lost without a trace.
 *
 * @param items - The post's `items` array; `undefined` or empty yields `''`.
 * @param warnings - Sink that receives one message per skipped item.
 * @param legacyId - The laddr post ID, used only to label warnings.
 * @returns The rendered blocks joined by blank lines.
 */
function assembleBlogBody(
  items: readonly RawBlogPostItem[] | undefined,
  warnings: Warnings,
  legacyId: number,
): string {
  if (!items || items.length === 0) return '';

  const warnMalformed = (item: RawBlogPostItem, expected: string): void => {
    warnings.push(
      `[blog-posts] legacyId=${legacyId} item=${item.ID} ${item.Class} has malformed Data (expected ${expected}); skipped`,
    );
  };

  // Copy before sorting so the caller's array is never mutated.
  const sorted = [...items].sort((a, b) => (a.Order ?? 0) - (b.Order ?? 0));
  const blocks: string[] = [];

  for (const item of sorted) {
    const data: unknown = item.Data;

    if (item.Class.endsWith('Item\\Markdown')) {
      if (typeof data === 'string') {
        // Skip blank items: they would only add stray blank-line runs.
        if (data.trim().length > 0) blocks.push(data);
      } else if (data != null) {
        warnMalformed(item, 'a markdown string');
      }
    } else if (item.Class.endsWith('Item\\Media')) {
      const media =
        typeof data === 'object' && data !== null
          ? (data as { MediaID?: unknown; Caption?: unknown })
          : undefined;
      const mediaId = media?.MediaID;
      const caption = media?.Caption;
      // Only a positive integer yields a resolvable thumbnail URL.
      if (typeof mediaId === 'number' && Number.isInteger(mediaId) && mediaId > 0) {
        const altText = typeof caption === 'string' ? escapeMarkdownAltText(caption) : '';
        const url = `https://${LADDR_MEDIA_HOST}/thumbnail/${mediaId}/${LADDR_MEDIA_DIMENSIONS}`;
        blocks.push(`![${altText}](${url})`);
      } else if (data != null) {
        warnMalformed(item, '{ MediaID: positive integer, Caption?: string }');
      }
    } else if (item.Class.endsWith('Item\\Embed')) {
      if (typeof data === 'string') {
        if (data.trim().length > 0) blocks.push(data);
      } else if (data != null) {
        warnMalformed(item, 'an HTML string');
      }
    } else {
      warnings.push(
        `[blog-posts] legacyId=${legacyId} item=${item.ID} unknown Class ${JSON.stringify(item.Class)}; skipped`,
      );
    }
  }

  // Markdown blocks separate cleanly with a blank line. markdownlint
  // (run on gitsheets serialize) will normalize any drift.
  return blocks.join('\n\n');
}

/**
* Translate a laddr `BlogPost` row into a v1 `BlogPost` record.
*
* Slug source priority: `Handle` (laddr's URL-safe identifier) →
* slugified `Title` → `legacy-<ID>`. Bodies are assembled from the
* row's `items` array (see assembleBlogBody); the gitsheets markdown
* format will normalize them via markdownlint on serialize. `AuthorID`
* resolves via the people-by-legacy map; an unresolved author is
* recorded as a warning but doesn't block the post (the runtime treats
* `authorId === null` as anonymous).
*/
export function translateBlogPost(
row: RawBlogPost,
Expand Down Expand Up @@ -702,7 +768,7 @@ export function translateBlogPost(
? epochToIsoOr(row.Modified, createdAt)
: undefined;

const body = nonEmptyStr(row.Body) ?? '';
const body = assembleBlogBody(row.items, ctx.warnings, legacyId);
const summary = nonEmptyStr(row.Summary);
// The schema caps summary at 500 chars; truncate longer laddr summaries
// rather than failing validation on import.
Expand Down
132 changes: 122 additions & 10 deletions apps/api/tests/import-laddr.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -389,12 +389,19 @@ describe('translateBlogPost', () => {
Class: 'BlogPost',
Handle: 'civic-tech-roundup-2026',
Title: 'Civic Tech Roundup, May 2026',
Body: '# Heading\n\nA blog body.',
Summary: 'A short blurb.',
AuthorID: 12,
Published: 1746028800, // 2025-04-30
Created: 1746028800,
Modified: 1746028800,
items: [
{
ID: 100,
Class: 'Emergence\\CMS\\Item\\Markdown',
Order: 1,
Data: '# Heading\n\nA blog body.',
},
],
};
const bp = translateBlogPost(row, c);
expect(bp).not.toBeNull();
Expand All @@ -409,13 +416,110 @@ describe('translateBlogPost', () => {
expect(bp!.editedAt).toBeUndefined();
});

it('falls back through Title → legacy-<id> when Handle is missing', () => {
it('assembles a body from interleaved Markdown / Media / Embed items', () => {
  // One item per known class, listed in their Order sequence.
  const photo = {
    ID: 200,
    Class: 'Emergence\\CMS\\Item\\Media',
    Order: 1,
    Data: { MediaID: 3349, Caption: 'A photo' },
  };
  const intro = {
    ID: 201,
    Class: 'Emergence\\CMS\\Item\\Markdown',
    Order: 2,
    Data: 'Some intro markdown.',
  };
  const video = {
    ID: 202,
    Class: 'Emergence\\CMS\\Item\\Embed',
    Order: 3,
    Data: '<iframe src="https://www.youtube.com/embed/abc"></iframe>',
  };
  const context = ctx();
  const post: RawBlogPost = {
    ID: 7,
    Class: 'BlogPost',
    Handle: 'multi-item',
    Title: 'Multi-item Post',
    Published: 1746028800,
    items: [photo, intro, video],
  };

  const translated = translateBlogPost(post, context);

  expect(translated).not.toBeNull();
  // Each item renders to one block; blocks are separated by a blank line.
  const expectedBlocks = [
    '![A photo](https://codeforphilly.org/thumbnail/3349/1920x1920)',
    'Some intro markdown.',
    '<iframe src="https://www.youtube.com/embed/abc"></iframe>',
  ];
  const expectedBody = expectedBlocks.join('\n\n');
  expect(translated!.body).toBe(expectedBody);
});

it('sorts items by Order before assembling', () => {
  // Items are deliberately supplied out of sequence (Order 2 before 1).
  const later = { ID: 300, Class: 'Emergence\\CMS\\Item\\Markdown', Order: 2, Data: 'second' };
  const earlier = { ID: 301, Class: 'Emergence\\CMS\\Item\\Markdown', Order: 1, Data: 'first' };
  const context = ctx();
  const post: RawBlogPost = {
    ID: 8,
    Class: 'BlogPost',
    Handle: 'unordered',
    Title: 'Unordered',
    Published: 1746028800,
    items: [later, earlier],
  };

  const translated = translateBlogPost(post, context);

  expect(translated!.body).toBe('first\n\nsecond');
});

it('returns an empty body when items is absent', () => {
  const context = ctx();
  // No `items` key at all.
  const post: RawBlogPost = {
    ID: 9,
    Class: 'BlogPost',
    Handle: 'bodiless',
    Title: 'Bodiless',
    Published: 1746028800,
  };

  const translated = translateBlogPost(post, context);

  expect(translated!.body).toBe('');
});

it('warns on unknown Item class but keeps the post', () => {
  const unrecognised = {
    ID: 400,
    Class: 'Emergence\\CMS\\Item\\NewType',
    Order: 1,
    Data: 'whatever',
  };
  const recognised = {
    ID: 401,
    Class: 'Emergence\\CMS\\Item\\Markdown',
    Order: 2,
    Data: 'still here',
  };
  const context = ctx();
  const post: RawBlogPost = {
    ID: 10,
    Class: 'BlogPost',
    Handle: 'unknown-item',
    Title: 'Unknown Item',
    Published: 1746028800,
    items: [unrecognised, recognised],
  };

  const translated = translateBlogPost(post, context);

  // Only the Markdown item contributes to the body...
  expect(translated!.body).toBe('still here');
  // ...and the skipped item is reported by its item ID.
  const mentionsSkippedItem = context.warnings.items.some((message) =>
    message.includes('item=400'),
  );
  expect(mentionsSkippedItem).toBe(true);
});

it('falls back through Title → legacy-<id> when Handle is missing', () => {
const c = ctx();
const row: RawBlogPost = {
ID: 11,
Class: 'BlogPost',
Title: 'A Hello Post',
Body: 'body',
Published: 1746028800,
};
const bp = translateBlogPost(row, c);
Expand All @@ -426,18 +530,17 @@ describe('translateBlogPost', () => {
it('warns and posts anonymously when AuthorID does not resolve', () => {
  const c = ctx();
  // A single ID (12) with a matching `legacyId=12` expectation below.
  // Body is omitted, like the other item-less fixtures in this suite.
  const row: RawBlogPost = {
    ID: 12,
    Class: 'BlogPost',
    Handle: 'orphan',
    Title: 'Orphan',
    // presumably 999 is absent from the ctx() people map — verify against ctx()
    AuthorID: 999,
    Published: 1746028800,
  };
  const bp = translateBlogPost(row, c);
  // The post is still produced, without an author, and a warning names it.
  expect(bp).not.toBeNull();
  expect(bp!.authorId).toBeUndefined();
  expect(c.warnings.items.some((w) => w.includes('legacyId=12'))).toBe(true);
});

it('sets editedAt when Modified is >60s after Published', () => {
Expand All @@ -447,7 +550,6 @@ describe('translateBlogPost', () => {
Class: 'BlogPost',
Handle: 'edited',
Title: 'Edited',
Body: 'edited body',
Published: 1746028800,
Modified: 1746028800 + 3600, // +1 hour
};
Expand All @@ -463,7 +565,6 @@ describe('translateBlogPost', () => {
Class: 'BlogPost',
Handle: 'long-summary',
Title: 'Long Summary',
Body: 'body',
Summary: overlong,
Published: 1746028800,
};
Expand Down Expand Up @@ -653,7 +754,11 @@ function mockRoutes(): MockRoutes {
],
],
[
'/blog?format=json&limit=200&offset=0',
// Importer fetches /blog with `?include=*` so it can read the
// structured body items (laddr doesn't expose Body via the flat
// JSON fields). `*` is a sub-delim per RFC 3986 and stays
// unencoded through URLSearchParams.
'/blog?format=json&include=*&limit=200&offset=0',
[
envelope(
[
Expand All @@ -662,12 +767,19 @@ function mockRoutes(): MockRoutes {
Class: 'BlogPost',
Handle: 'hello-philly',
Title: 'Hello Philly',
Body: '# Hello\n\nFirst blog post.',
Summary: 'A short hello.',
AuthorID: 10,
Published: 1377126953,
Created: 1377126953,
Modified: 1377126953,
items: [
{
ID: 1000,
Class: 'Emergence\\CMS\\Item\\Markdown',
Order: 1,
Data: '# Hello\n\nFirst blog post.',
},
],
},
],
1,
Expand Down