diff --git a/apps/api/scripts/import-laddr/importer.ts b/apps/api/scripts/import-laddr/importer.ts index 57a30af..1e72026 100644 --- a/apps/api/scripts/import-laddr/importer.ts +++ b/apps/api/scripts/import-laddr/importer.ts @@ -75,6 +75,7 @@ import { type RawTag, } from './json-fetcher.js'; import { + mediaPlaceholderUrl, newExistingIds, newIdMaps, translateBlogPost, @@ -85,10 +86,12 @@ import { translateTag, translateTagAssignment, translateUpdate, + type BlogMediaAsset, type ExistingIds, type TranslateCtx, type Warnings, } from './translators.js'; +import { BlobObject } from 'hologit'; // --------------------------------------------------------------------------- // Public types @@ -182,6 +185,9 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise>['repo'] | null = null; let existingIds: ExistingIds; if (opts.dryRun) { @@ -191,6 +197,7 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise = []; for await (const row of fetchAllPages( '/blog', RawBlogPostSchema, { include: '*' }, fetchOpts, )) { - const bp = translateBlogPost(row, ctx); - if (bp === null) { + const t = translateBlogPost(row, ctx); + if (t === null) { counts['blog-posts']!.skipped++; continue; } - const parsedBp = parseOrSkip( + const parsedRecord = parseOrSkip( 'blog-posts', - () => BlogPostSchema.parse(bp), + () => BlogPostSchema.parse(t.record), counts, warnings, ); - if (parsedBp) { - blogPosts.push(parsedBp); + if (parsedRecord) { + blogTranslations.push({ record: parsedRecord, mediaAssets: t.mediaAssets }); counts['blog-posts']!.imported++; } } @@ -429,6 +440,26 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise`) to the + // final `/api/attachments/blog-posts//` URL. + // + // Failed fetches don't block the import — the markdown link will 404 + // at serve time, but the post itself still imports with the rest of + // its body intact. 
+ // ------------------------------------------------------------------------- + const mediaArtifactsBySlug = await fetchAndMaterializeBlogMedia( + blogTranslations, + fetchOpts, + log, + warnings, + ); + // ------------------------------------------------------------------------- // 3. One atomic gitsheets transaction: // - clear() each importer-owned sheet (deletes capture for free) @@ -505,10 +536,31 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise 0) { + const blobs: Record = {}; + for (const a of artifacts) { + // BlobObject.write hashes the buffer into the git object DB. + // Same `as unknown as string` cast as the avatar route — the + // declared signature is too narrow; the underlying + // git-client `$putBlob` accepts Buffer at runtime. + blobs[a.filename] = await BlobObject.write( + hologit, + a.bytes as unknown as string, + ); + } + await tx['blog-posts'].setAttachments(record, blobs); + } + await tx['blog-posts'].upsert(record); } log(`[import] clear + upsert tag-assignments (${tagAssignments.length})`); @@ -730,3 +782,197 @@ async function collectExistingIds( log(`[import] pre-pass: preserved ${count} record UUIDs from previous snapshot`); return ids; } + +// --------------------------------------------------------------------------- +// Blog media pre-fetch + materialization +// --------------------------------------------------------------------------- + +/** + * One materialized attachment ready to be written into the gitsheets tree. + */ +interface BlogMediaArtifact { + /** Filename (with extension) — relative to `blog-posts//`. */ + readonly filename: string; + /** Original bytes. */ + readonly bytes: Buffer; +} + +/** Content-Type → file extension. Unknown types → null (skipped). 
*/ +const EXT_BY_MIME: Record = { + 'image/jpeg': 'jpg', + 'image/png': 'png', + 'image/gif': 'gif', + 'image/webp': 'webp', + 'image/avif': 'avif', + 'image/svg+xml': 'svg', +}; + +function extFromContentType(contentType: string | null): string | null { + if (!contentType) return null; + // `image/jpeg; charset=…` → `image/jpeg`. + const base = contentType.split(';')[0]?.trim().toLowerCase() ?? ''; + return EXT_BY_MIME[base] ?? null; +} + +/** + * Fetch one media asset's bytes + content-type. Returns null on any + * non-2xx or unexpected error — the import shouldn't abort because one + * image disappeared upstream. + */ +async function fetchMediaBytes( + url: string, + fetchImpl: typeof fetch, + userAgent: string, +): Promise<{ bytes: Buffer; contentType: string | null } | null> { + try { + const res = await fetchImpl(url, { + headers: { 'User-Agent': userAgent }, + }); + if (!res.ok) return null; + const ab = await res.arrayBuffer(); + return { + bytes: Buffer.from(ab), + contentType: res.headers.get('content-type'), + }; + } catch { + return null; + } +} + +/** + * Fetch every distinct media asset referenced across all blog posts, + * derive the final filename per asset, then rewrite each post's body to + * replace `cfp-media:` placeholders with the final + * `/api/attachments/blog-posts//` URL. + * + * Returns the map of artifacts keyed by post slug, ready for the + * transact callback to wire into gitsheets via setAttachments. + * + * Same MediaID can appear in multiple posts (rare but possible). Each + * post gets its own copy of the asset under its own subdir — the + * git object DB dedupes the bytes by content hash, so the repo size + * cost is the metadata overhead per reference, not the bytes. + * + * Concurrency: 4 parallel fetches at a time (a politeness compromise + * — fewer would slow imports; more would hammer laddr). The JSON + * fetcher's per-page `delayMs` doesn't apply here since these are + * binary endpoints, not paged JSON. 
+ */
+async function fetchAndMaterializeBlogMedia(
+  blogTranslations: Array<{ record: BlogPost; mediaAssets: readonly BlogMediaAsset[] }>,
+  fetchOpts: FetchOptions,
+  log: (msg: string) => void,
+  warnings: Warnings,
+): Promise<Map<string, BlogMediaArtifact[]>> {
+  const fetchImpl = fetchOpts.fetchImpl ?? fetch;
+  const userAgent = fetchOpts.userAgent ?? 'cfp-importer/dev';
+
+  // Flatten so we can drive parallel fetches across all posts.
+  const flat: Array<{
+    ownerSlug: string;
+    asset: BlogMediaAsset;
+  }> = [];
+  for (const { record, mediaAssets } of blogTranslations) {
+    for (const asset of mediaAssets) {
+      flat.push({ ownerSlug: record.slug, asset });
+    }
+  }
+
+  log(`[import] fetching ${flat.length} blog media assets`);
+
+  /** What the fetch loop produces per asset. */
+  interface FetchedAsset {
+    readonly ownerSlug: string;
+    readonly asset: BlogMediaAsset;
+    readonly bytes: Buffer | null;
+    readonly ext: string | null;
+  }
+
+  const results: FetchedAsset[] = [];
+  const CONCURRENCY = 4;
+  let cursor = 0;
+  const workers: Promise<void>[] = [];
+  for (let w = 0; w < CONCURRENCY; w++) {
+    workers.push(
+      (async () => {
+        while (true) {
+          const idx = cursor++;
+          if (idx >= flat.length) return;
+          const entry = flat[idx]!;
+          const fetched = await fetchMediaBytes(entry.asset.sourceUrl, fetchImpl, userAgent);
+          if (fetched === null) {
+            warnings.push(
+              `[blog-posts] media fetch failed: ${entry.asset.sourceUrl} (referenced by /${entry.ownerSlug})`,
+            );
+            results.push({
+              ownerSlug: entry.ownerSlug,
+              asset: entry.asset,
+              bytes: null,
+              ext: null,
+            });
+            continue;
+          }
+          const ext = extFromContentType(fetched.contentType);
+          if (ext === null) {
+            warnings.push(
+              `[blog-posts] media ${entry.asset.sourceUrl} returned unsupported Content-Type ${JSON.stringify(fetched.contentType)}; skipped`,
+            );
+            results.push({
+              ownerSlug: entry.ownerSlug,
+              asset: entry.asset,
+              bytes: null,
+              ext: null,
+            });
+            continue;
+          }
+          results.push({
+            ownerSlug: entry.ownerSlug,
+            asset: entry.asset,
+            bytes: fetched.bytes,
+            ext,
+          });
+        }
+      })(),
+    );
+  }
+  await Promise.all(workers);
+
+  // Build the placeholder → final URL substitution table per post, plus
+  // the artifact list keyed by post slug.
+  const artifactsBySlug = new Map<string, BlogMediaArtifact[]>();
+  const substitutionByPost = new Map<string, Map<string, string>>();
+  for (const r of results) {
+    if (r.bytes === null || r.ext === null) continue;
+    const filename = `${r.asset.captionSlug}-${r.asset.mediaId}.${r.ext}`;
+    const finalUrl = `/api/attachments/blog-posts/${r.ownerSlug}/${filename}`;
+
+    let arts = artifactsBySlug.get(r.ownerSlug);
+    if (!arts) {
+      arts = [];
+      artifactsBySlug.set(r.ownerSlug, arts);
+    }
+    arts.push({ filename, bytes: r.bytes });
+
+    let subs = substitutionByPost.get(r.ownerSlug);
+    if (!subs) {
+      subs = new Map();
+      substitutionByPost.set(r.ownerSlug, subs);
+    }
+    subs.set(mediaPlaceholderUrl(r.asset.mediaId), finalUrl);
+  }
+
+  // Walk records and substitute placeholders in their bodies.
+  for (const t of blogTranslations) {
+    const subs = substitutionByPost.get(t.record.slug);
+    if (!subs || subs.size === 0) continue;
+    // Single pass over whole `cfp-media:<id>` tokens (the mediaPlaceholderUrl
+    // format): `\d+` consumes the full ID, so `cfp-media:33` can never rewrite
+    // the prefix of `cfp-media:3349` as sequential split/join did.
+    const body = t.record.body.replace(/cfp-media:\d+/g, (ph) => subs.get(ph) ?? ph);
+    // Mutate the record in place — it's been Zod-validated already and
+    // the schema just requires `body: string`, no need to reparse.
+    (t.record as { body: string }).body = body;
+  }
+
+  return artifactsBySlug;
+}
diff --git a/apps/api/scripts/import-laddr/translators.ts b/apps/api/scripts/import-laddr/translators.ts
index f1c3bee..2592d6e 100644
--- a/apps/api/scripts/import-laddr/translators.ts
+++ b/apps/api/scripts/import-laddr/translators.ts
@@ -655,15 +655,92 @@ export function translateBuzz(
 /**
  * Source host used for legacy media URLs in blog bodies.
Items of class - * `Emergence\CMS\Item\Media` reference a numeric `MediaID` resolved - * against laddr's `/thumbnail//` endpoint; we render - * those as `![Caption](https:///thumbnail//1920x1920)` so the - * markdown body stays viewable on its own. Eventually those images - * should migrate into the data repo as attachments, but that's a - * separate concern from this importer pass. + * `Emergence\CMS\Item\Media` reference a numeric `MediaID`. The importer + * fetches the **original** bytes from `https:///media//original` + * at import time and stores them as gitsheets attachments scoped to the + * owning blog post. Runtime resizing (so a thumbnail card doesn't pull + * the full original) is tracked separately in + * https://github.com/CodeForPhilly/codeforphilly-ng/issues/108. */ const LADDR_MEDIA_HOST = 'codeforphilly.org'; -const LADDR_MEDIA_DIMENSIONS = '1920x1920'; + +/** + * Build the legacy laddr URL to fetch a media item's original bytes. + * Exported for the importer's pre-fetch phase. + */ +export function laddrMediaUrl(mediaId: number): string { + return `https://${LADDR_MEDIA_HOST}/media/${mediaId}/original`; +} + +/** + * Sentinel placeholder format embedded in the markdown body at translate + * time. The importer's pre-fetch phase replaces each placeholder with the + * final `/api/attachments/blog-posts//` URL once it knows + * the file extension (derived from the response Content-Type). + * + * Placeholder form: `cfp-media:`. Used as a URL substring inside + * `![](...)` markdown image syntax and inside Embed HTML `src` attributes. + */ +export function mediaPlaceholderUrl(mediaId: number): string { + return `cfp-media:${mediaId}`; +} + +/** + * Slugify a string for use as a filename component. Simpler than `safeSlug` — + * no collision tracking, no warning side-effects; the call sites disambiguate + * via the MediaID suffix. 
+ */ +function slugifyForFilename(input: string, maxLen = 80): string { + const cleaned = input + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, ''); + if (cleaned.length === 0) return 'image'; + return cleaned.slice(0, maxLen).replace(/-+$/, ''); +} + +export interface BlogMediaAsset { + /** laddr's numeric MediaID. */ + readonly mediaId: number; + /** Caption slug component (without the `-` suffix or extension). */ + readonly captionSlug: string; + /** Owning post's slug — determines the attachment subdirectory. */ + readonly ownerSlug: string; + /** Where to fetch the original bytes from. */ + readonly sourceUrl: string; +} + +/** + * Scan an Embed item's raw HTML for legacy laddr media URLs and rewrite + * them inline to use the placeholder. Returns the rewritten HTML plus a + * list of asset descriptors discovered. Third-party URLs (YouTube iframes, + * external links) are left alone. + * + * URL pattern matched: `https?://codeforphilly.org/(thumbnail|media)//...` + * up to the first whitespace, `"`, `'`, `<`, or `)`. The MediaID is the + * capture group used to dedupe and key the asset; the path tail after the + * ID is discarded since we always fetch via `/media//original`. + */ +function rewriteEmbedHtml( + html: string, + ownerSlug: string, + collected: Map, +): string { + const re = /https?:\/\/codeforphilly\.org\/(?:thumbnail|media)\/(\d+)\/[^"'<\s)]+/g; + return html.replace(re, (_full, idStr: string) => { + const mediaId = Number(idStr); + if (!Number.isFinite(mediaId)) return _full; + if (!collected.has(mediaId)) { + collected.set(mediaId, { + mediaId, + captionSlug: 'image', + ownerSlug, + sourceUrl: laddrMediaUrl(mediaId), + }); + } + return mediaPlaceholderUrl(mediaId); + }); +} /** * Assemble a blog post's markdown body from laddr's typed `items` array. @@ -674,18 +751,28 @@ const LADDR_MEDIA_DIMENSIONS = '1920x1920'; * - `Emergence\CMS\Item\Markdown` — `Data` is the raw markdown string; * append verbatim. 
* - `Emergence\CMS\Item\Media` — `Data` is `{ MediaID, Caption }`; - * render as a markdown image with the laddr media URL. - * - `Emergence\CMS\Item\Embed` — `Data` is raw HTML (iframes, divs); - * append as a raw HTML block (legal in CommonMark). + * render as a markdown image with a placeholder URL that the + * importer's pre-fetch phase will resolve to a final + * `/api/attachments/blog-posts//` URL. + * - `Emergence\CMS\Item\Embed` — `Data` is raw HTML (iframes, divs). + * Append as a raw HTML block (legal in CommonMark); embedded + * references to codeforphilly.org media URLs get rewritten to + * placeholders inline. + * + * Returns the assembled body plus a deduped map of discovered media + * assets — same MediaID referenced twice (e.g., in two embeds) becomes + * one attachment. */ function assembleBlogBody( items: readonly RawBlogPostItem[] | undefined, warnings: Warnings, legacyId: number, -): string { - if (!items || items.length === 0) return ''; + ownerSlug: string, +): { body: string; mediaAssets: BlogMediaAsset[] } { + if (!items || items.length === 0) return { body: '', mediaAssets: [] }; const sorted = [...items].sort((a, b) => (a.Order ?? 0) - (b.Order ?? 0)); const blocks: string[] = []; + const collected = new Map(); for (const item of sorted) { if (item.Class.endsWith('Item\\Markdown')) { if (typeof item.Data === 'string') { @@ -699,13 +786,29 @@ function assembleBlogBody( const captionText = typeof caption === 'string' && caption.trim().length > 0 ? caption.trim() : ''; if (typeof mediaId === 'number') { - const url = `https://${LADDR_MEDIA_HOST}/thumbnail/${mediaId}/${LADDR_MEDIA_DIMENSIONS}`; + // Captioned items keep their caption in the slug component; + // un-captioned items fall back to "image". A subsequent + // reference to the same MediaID (rare) keeps the first + // caption seen. + if (!collected.has(mediaId)) { + collected.set(mediaId, { + mediaId, + captionSlug: + captionText.length > 0 ? 
slugifyForFilename(captionText) : 'image', + ownerSlug, + sourceUrl: laddrMediaUrl(mediaId), + }); + } + const url = mediaPlaceholderUrl(mediaId); blocks.push(`![${captionText}](${url})`); } } } else if (item.Class.endsWith('Item\\Embed')) { if (typeof item.Data === 'string' && item.Data.trim().length > 0) { - blocks.push(item.Data); + // Rewrite legacy media URLs in the HTML to placeholders; third- + // party URLs (YouTube iframes etc.) pass through untouched. + const rewritten = rewriteEmbedHtml(item.Data, ownerSlug, collected); + blocks.push(rewritten); } } else { warnings.push( @@ -713,25 +816,39 @@ function assembleBlogBody( ); } } - // Markdown blocks separate cleanly with a blank line. markdownlint - // (run on gitsheets serialize) will normalize any drift. - return blocks.join('\n\n'); + return { + body: blocks.join('\n\n'), + mediaAssets: [...collected.values()], + }; +} + +export interface BlogPostTranslation { + /** The validated record (still has placeholder URLs in `.body`). */ + readonly record: BlogPost; + /** + * Media assets the importer needs to fetch + commit as attachments. + * Each item's `mediaId` corresponds to a `cfp-media:` + * placeholder occurrence in `record.body`. + */ + readonly mediaAssets: readonly BlogMediaAsset[]; } /** - * Translate a laddr `BlogPost` row into a v1 `BlogPost` record. + * Translate a laddr `BlogPost` row into a v1 `BlogPost` record plus a + * media-asset plan. * * Slug source priority: `Handle` (laddr's URL-safe identifier) → - * slugified `Title` → `legacy-`. Bodies are assembled from the - * row's `items` array (see assembleBlogBody). `AuthorID` resolves via - * the people-by-legacy map; an unresolved author is recorded as a - * warning but doesn't block the post (the runtime treats - * `authorId === null` as anonymous). + * slugified `Title` → `legacy-`. 
Bodies are assembled from the row's + * `items` array via `assembleBlogBody` — media references emit + * placeholder URLs (`cfp-media:`) that the importer resolves + * after fetching the bytes. `AuthorID` resolves via the people-by-legacy + * map; an unresolved author is recorded as a warning but doesn't block + * the post (runtime treats `authorId === null` as anonymous). */ export function translateBlogPost( row: RawBlogPost, ctx: TranslateCtx, -): BlogPost | null { +): BlogPostTranslation | null { const legacyId = row.ID; const handle = nonEmptyStr(row.Handle); @@ -768,14 +885,14 @@ export function translateBlogPost( ? epochToIsoOr(row.Modified, createdAt) : undefined; - const body = assembleBlogBody(row.items, ctx.warnings, legacyId); + const { body, mediaAssets } = assembleBlogBody(row.items, ctx.warnings, legacyId, slug); const summary = nonEmptyStr(row.Summary); // The schema caps summary at 500 chars; truncate longer laddr summaries // rather than failing validation on import. const truncatedSummary = summary === null ? undefined : summary.length > 500 ? 
summary.slice(0, 497) + '…' : summary; - return { + const record: BlogPost = { id, legacyId, slug, @@ -788,6 +905,7 @@ export function translateBlogPost( createdAt, updatedAt, }; + return { record, mediaAssets }; } export interface TagAssignmentResult { diff --git a/apps/api/tests/import-laddr.test.ts b/apps/api/tests/import-laddr.test.ts index f082e3e..eaab2e3 100644 --- a/apps/api/tests/import-laddr.test.ts +++ b/apps/api/tests/import-laddr.test.ts @@ -403,17 +403,69 @@ describe('translateBlogPost', () => { }, ], }; - const bp = translateBlogPost(row, c); - expect(bp).not.toBeNull(); - expect(bp!.slug).toBe('civic-tech-roundup-2026'); - expect(bp!.title).toBe('Civic Tech Roundup, May 2026'); - expect(bp!.body).toBe('# Heading\n\nA blog body.'); - expect(bp!.summary).toBe('A short blurb.'); - expect(bp!.legacyId).toBe(5); - expect(bp!.authorId).toBe('01951a3c-0000-7000-8000-000000000012'); - expect(bp!.postedAt).toBe('2025-04-30T16:00:00.000Z'); + const t = translateBlogPost(row, c); + expect(t).not.toBeNull(); + expect(t!.record.slug).toBe('civic-tech-roundup-2026'); + expect(t!.record.title).toBe('Civic Tech Roundup, May 2026'); + expect(t!.record.body).toBe('# Heading\n\nA blog body.'); + expect(t!.record.summary).toBe('A short blurb.'); + expect(t!.record.legacyId).toBe(5); + expect(t!.record.authorId).toBe('01951a3c-0000-7000-8000-000000000012'); + expect(t!.record.postedAt).toBe('2025-04-30T16:00:00.000Z'); // No edit-window gap → editedAt undefined. - expect(bp!.editedAt).toBeUndefined(); + expect(t!.record.editedAt).toBeUndefined(); + // No media in this body — empty asset plan. 
+ expect(t!.mediaAssets).toEqual([]); + }); + + it('emits placeholder URL + asset plan for a Media item', () => { + const c = ctx(); + const row: RawBlogPost = { + ID: 6, + Class: 'BlogPost', + Handle: 'one-photo', + Title: 'One Photo', + Published: 1746028800, + items: [ + { + ID: 200, + Class: 'Emergence\\CMS\\Item\\Media', + Order: 1, + Data: { MediaID: 3349, Caption: '2023 Launchpad kick-off' }, + }, + ], + }; + const t = translateBlogPost(row, c); + expect(t!.record.body).toBe('![2023 Launchpad kick-off](cfp-media:3349)'); + expect(t!.mediaAssets).toEqual([ + { + mediaId: 3349, + captionSlug: '2023-launchpad-kick-off', + ownerSlug: 'one-photo', + sourceUrl: 'https://codeforphilly.org/media/3349/original', + }, + ]); + }); + + it('falls back to captionSlug "image" when Caption is empty', () => { + const c = ctx(); + const row: RawBlogPost = { + ID: 7, + Class: 'BlogPost', + Handle: 'no-caption', + Title: 'No Caption', + Published: 1746028800, + items: [ + { + ID: 210, + Class: 'Emergence\\CMS\\Item\\Media', + Order: 1, + Data: { MediaID: 5050, Caption: '' }, + }, + ], + }; + const t = translateBlogPost(row, c); + expect(t!.mediaAssets[0]!.captionSlug).toBe('image'); }); it('assembles a body from interleaved Markdown / Media / Embed items', () => { @@ -441,19 +493,26 @@ describe('translateBlogPost', () => { ID: 202, Class: 'Emergence\\CMS\\Item\\Embed', Order: 3, - Data: '', + Data: + '' + + '', }, ], }; - const bp = translateBlogPost(row, c); - expect(bp).not.toBeNull(); - expect(bp!.body).toBe( + const t = translateBlogPost(row, c); + expect(t).not.toBeNull(); + expect(t!.record.body).toBe( [ - '![A photo](https://codeforphilly.org/thumbnail/3349/1920x1920)', + '![A photo](cfp-media:3349)', 'Some intro markdown.', - '', + '' + + '', ].join('\n\n'), ); + // 3349 from the Media item + 1634 from the Embed scan. + expect(t!.mediaAssets.map((a) => a.mediaId).sort()).toEqual([1634, 3349]); + // YouTube iframe URL is third-party — not in the asset plan. 
+ expect(t!.mediaAssets.find((a) => a.sourceUrl.includes('youtube'))).toBeUndefined(); }); it('sorts items by Order before assembling', () => { @@ -469,11 +528,11 @@ describe('translateBlogPost', () => { { ID: 301, Class: 'Emergence\\CMS\\Item\\Markdown', Order: 1, Data: 'first' }, ], }; - const bp = translateBlogPost(row, c); - expect(bp!.body).toBe('first\n\nsecond'); + const t = translateBlogPost(row, c); + expect(t!.record.body).toBe('first\n\nsecond'); }); - it('returns an empty body when items is absent', () => { + it('returns an empty body + no assets when items is absent', () => { const c = ctx(); const row: RawBlogPost = { ID: 9, @@ -482,8 +541,9 @@ describe('translateBlogPost', () => { Title: 'Bodiless', Published: 1746028800, }; - const bp = translateBlogPost(row, c); - expect(bp!.body).toBe(''); + const t = translateBlogPost(row, c); + expect(t!.record.body).toBe(''); + expect(t!.mediaAssets).toEqual([]); }); it('warns on unknown Item class but keeps the post', () => { @@ -509,8 +569,8 @@ describe('translateBlogPost', () => { }, ], }; - const bp = translateBlogPost(row, c); - expect(bp!.body).toBe('still here'); + const t = translateBlogPost(row, c); + expect(t!.record.body).toBe('still here'); expect(c.warnings.items.some((w) => w.includes('item=400'))).toBe(true); }); @@ -522,9 +582,9 @@ describe('translateBlogPost', () => { Title: 'A Hello Post', Published: 1746028800, }; - const bp = translateBlogPost(row, c); - expect(bp).not.toBeNull(); - expect(bp!.slug).toBe('a-hello-post'); + const t = translateBlogPost(row, c); + expect(t).not.toBeNull(); + expect(t!.record.slug).toBe('a-hello-post'); }); it('warns and posts anonymously when AuthorID does not resolve', () => { @@ -537,9 +597,9 @@ describe('translateBlogPost', () => { AuthorID: 999, Published: 1746028800, }; - const bp = translateBlogPost(row, c); - expect(bp).not.toBeNull(); - expect(bp!.authorId).toBeUndefined(); + const t = translateBlogPost(row, c); + expect(t).not.toBeNull(); + 
expect(t!.record.authorId).toBeUndefined(); expect(c.warnings.items.some((w) => w.includes('legacyId=12'))).toBe(true); }); @@ -553,8 +613,8 @@ describe('translateBlogPost', () => { Published: 1746028800, Modified: 1746028800 + 3600, // +1 hour }; - const bp = translateBlogPost(row, c); - expect(bp!.editedAt).toBe('2025-04-30T17:00:00.000Z'); + const t = translateBlogPost(row, c); + expect(t!.record.editedAt).toBe('2025-04-30T17:00:00.000Z'); }); it('truncates an over-long summary to 500 chars with an ellipsis', () => { @@ -568,9 +628,9 @@ describe('translateBlogPost', () => { Summary: overlong, Published: 1746028800, }; - const bp = translateBlogPost(row, c); - expect(bp!.summary?.length).toBe(498); // 497 + ellipsis (one codepoint) - expect(bp!.summary?.endsWith('…')).toBe(true); + const t = translateBlogPost(row, c); + expect(t!.record.summary?.length).toBe(498); // 497 + ellipsis (one codepoint) + expect(t!.record.summary?.endsWith('…')).toBe(true); }); }); diff --git a/plans/blog-media-attachments.md b/plans/blog-media-attachments.md new file mode 100644 index 0000000..c8c359a --- /dev/null +++ b/plans/blog-media-attachments.md @@ -0,0 +1,160 @@ +--- +status: done +depends: [] +specs: + - specs/behaviors/storage.md + - specs/data-model.md +issues: [] +pr: 109 +--- + +# Plan: capture blog post media as gitsheets attachments + +## Scope + +After PR #107 the importer surfaces blog post bodies, but **media references still point at the legacy laddr server** (`https://codeforphilly.org/thumbnail//`). 215 such references across 138 posts. At cutover (laddr decommission) every image breaks. + +Fix: capture each referenced media item's bytes at import time, store as a gitsheets attachment scoped to the owning blog post record, rewrite the body's media URLs to point at the local `/api/attachments/:key` route. + +This is the durable-record path — original bytes land in the data repo and travel with every clone. 
Runtime thumbnail resizing (so a 200×200 card doesn't pull a 2 MB original) is **deferred to [#108](https://github.com/CodeForPhilly/codeforphilly-ng/issues/108)**; this plan ships originals only. + +## Implements + +- [behaviors/storage.md](../specs/behaviors/storage.md) — attachments per record, served via `GET /api/attachments/:key`. +- [data-model.md → BlogPost](../specs/data-model.md#blogpost) — adds an "Attachments" note documenting the convention. + +## Approach + +### 1. Filename derivation + +Better than raw integer media IDs. Format: + +``` +-. +``` + +- Caption non-empty: `slugify(caption).slice(0, 80) + '-' + mediaId + '.' + ext` +- Caption empty: `'image-' + mediaId + '.' + ext` +- Extension from response `Content-Type` (e.g., `image/jpeg` → `.jpg`) + +Examples: + +- `2023-launchpad-kick-off-event-at-city-hall-3349.jpg` +- `image-3127.jpg` + +The MediaID suffix is the **stable disambiguator** — re-imports with a changed caption produce a renamed file (git tracks as add+remove, content-hash unchanged so no actual blob duplication). + +### 2. Source URL + +Fetch from `https:///media//original`, not the thumbnail endpoint. We're capturing the durable record; the SPA + future thumbnail service handle sizing. + +### 3. Two item paths in `translateBlogPost` + +**`Item\Media`** (182 occurrences): + +- Compute filename from caption + mediaId + ext +- Emit `![Caption](/api/attachments/blog-posts//)` +- Add a `MediaAsset` entry to the post's plan + +**`Item\Embed`** (44 occurrences): + +- Scan `Data` HTML for `https?://codeforphilly\.org/(thumbnail|media)/(\d+)/[^"' )]*` +- For each match: filename is `image-.` (embeds don't have captions) +- Rewrite the URL inline in the HTML +- Add a `MediaAsset` entry to the plan +- Third-party URLs (YouTube iframes etc.) are left alone + +### 4. Pre-fetch + transact + +The translator stays sync. After all records translate: + +1. Aggregate every `{ slug, filename, sourceUrl }` into a flat list. +2. 
**Pre-fetch in parallel** (with a configurable concurrency cap — default 4 — and the same politeness delay as JSON page fetches). +3. Inside the existing `store.transact(...)` callback (where blog-posts records are upserted): for each post, call `tx['blog-posts'].setAttachments(record, { '': blobRef })` then upsert as today. + +`BlobObject.write(hologit, bytes)` hashes content into the git object DB — same pattern as the avatar-upload route. Idempotent against content hash (rerunning with the same bytes is a no-op). + +### 5. Content-Type → extension + +Defensive map: + +```ts +const EXT_BY_MIME: Record = { + 'image/jpeg': 'jpg', + 'image/png': 'png', + 'image/gif': 'gif', + 'image/webp': 'webp', + 'image/svg+xml': 'svg', +}; +``` + +Unknown content-type → warn + skip the asset (markdown link will 404, but the post itself imports). Survey of laddr's media shows JPEGs dominate — production data should be 99% covered. + +### 6. Tests + +`apps/api/tests/import-laddr.test.ts`: + +- Translator returns a plan with the right `{ filename, sourceUrl }` entries for a row with mixed Media + Embed items. +- Caption slugification: long caption + special chars → cleaned slug. +- Empty caption falls back to `image-`. +- Embed HTML URL rewrite: codeforphilly.org URLs become `/api/attachments/...`; third-party URLs are untouched. +- Orchestrator: mock fetch covers the binary `/media//original` endpoints; after import, attachments exist on the tree under `blog-posts//`. + +## Validation + +- [x] Every `Item\Media` reference in the imported `blog-posts/*.md` files resolves to `/api/attachments/blog-posts//`. +- [x] No `codeforphilly.org/(thumbnail|media)/...` URLs remain in any blog-post body. +- [x] Attachment bytes land in the data repo (verified post-merge against the live pod). +- [x] Filenames are human-readable when captions are present. +- [x] `npm run type-check && npm run lint && npm test` clean — 340 API + all web + shared tests pass. 
+- [x] Sandbox redeploy → re-import → merge to `published` → SPA renders blog posts with images served from the new pod. + +## Risks / unknowns + +- **Import duration.** ~215 binary fetches at ~150 ms each (serial) is ~30 sec added; with concurrency=4, ~10 sec. Fine. +- **Repo size growth.** ~215 originals × ~250 KB average ≈ 50 MB. Acceptable for a v1 corpus. +- **Embed HTML correctness.** Rewriting `` inside arbitrary HTML via regex is fragile if the URL appears in a weird context (alt text, data-* attributes). Spot-checked production embeds — all references appear in `src="..."` attributes inside `` tags. Acceptable risk; fragile-by-spec but pragmatic. +- **Hot-reload sees the new attachments.** The runtime store reads attachments by their git path; once the new commit lands on `published` and the webhook fires, the next `/api/attachments/...` request resolves against the new tree. No special index work needed. + +## Notes + +Two commits: plan-open, impl + tests. + +Surprises: + +- **Translator return shape carried a real refactor.** Going from + `translateBlogPost(): BlogPost | null` to `(): { record, + mediaAssets } | null` rippled into the orchestrator's call site + + 9 test assertions. The `.record.` prefix everywhere is a bit + verbose; future-me may want a destructured `{ record: bp, ... }` + alias at the top of each test. Worth flagging if a similar + refactor is needed for project-buzz. +- **`?include=*` returns 28 fields per row vs. 17 without.** Mostly + Author/Creator/Modifier expansions (the polymorphic identity refs) + plus the `items` array. The Zod schema just `.passthrough()`es + them, so no shape work. But payload size doubles — 138 posts at + ~30 KB each (was ~15 KB). Still trivial. +- **Filename collisions don't happen.** Each post has its own + subdir. 
Same MediaID across two different posts produces two + attachments (one per owner) — the git object DB dedupes the + bytes by content hash, so the actual repo cost is metadata + overhead per reference, not bytes. +- **Placeholder substitution via `String.split().join()`.** Picked + over regex because the placeholder string `cfp-media:<id>` is a + literal — no regex-escape concern, and `split-join` is O(n). Caveat: a + placeholder that prefixes another (`cfp-media:33` vs. `cfp-media:3349`) also matches inside it. + +## Follow-ups + +- **Runtime thumbnail service** — currently a 200×200 blog index card + pulls a full 2 MB original. *Tracked as* — + [#108](https://github.com/CodeForPhilly/codeforphilly-ng/issues/108). +- **Wire `featuredImageKey` to use the same attachment scheme.** The + schema field exists but the importer doesn't surface it (laddr's + JSON doesn't carry a "featured image" concept per blog post). If + someone wants a hero image on the detail screen, they'd pick the + first `Item\Media` from the body. *None* — let blog content + authors set it explicitly post-cutover via a future CMS surface. +- **Lazy body loading.** When post count grows past ~100 the + full-bodies-in-memory cost becomes worth reconsidering. *Deferred + to plan* — `#45` already tracks this.