From d7db367a2d14543a6dc5ba7f300a1327fc7cab69 Mon Sep 17 00:00:00 2001 From: Chris Alfano Date: Sat, 30 May 2026 14:20:22 -0400 Subject: [PATCH] fix(importer): catch inline markdown image URLs + alt laddr URL shapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After PR #109 landed, a sandbox smoke check found 25 codeforphilly.org media URLs still leaking through to bodies. Two root causes: 1. **Inline markdown image syntax inside Markdown items.** Authors wrote `![alt](https://codeforphilly.org/thumbnail/...)` directly in some posts rather than using the structured Media item path. The Embed-only URL scan never reached them. 2. **Alternate URL shapes** my regex didn't accept: /media/<id> (no trailing slash) /media/open/<id> (legacy "open media" namespace) /sitedata/<id> (older asset namespace) Fix: rename `rewriteEmbedHtml` → `rewriteLaddrMediaUrls`, broaden the regex to accept the four observed URL shapes, and apply it as a final pass over the **assembled body** (after items have been joined). That catches Markdown-item inline image references the per-item code path can't reach. Per-item Embed rewriting still happens — same regex, same dedup map — because some Embeds reference media that doesn't appear elsewhere (YouTube thumbs etc.) and we want to capture them too. Two new tests cover the inline-markdown case and the alt URL shapes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/api/scripts/import-laddr/translators.ts | 57 ++++++++++++++------ apps/api/tests/import-laddr.test.ts | 54 +++++++++++++++++++ 2 files changed, 96 insertions(+), 15 deletions(-) diff --git a/apps/api/scripts/import-laddr/translators.ts b/apps/api/scripts/import-laddr/translators.ts index 2592d6e..6e622ec 100644 --- a/apps/api/scripts/import-laddr/translators.ts +++ b/apps/api/scripts/import-laddr/translators.ts @@ -711,25 +711,44 @@ export interface BlogMediaAsset { } /** - * Scan an Embed item's raw HTML for legacy laddr media URLs and rewrite - * them inline to use the placeholder. Returns the rewritten HTML plus a - * list of asset descriptors discovered. Third-party URLs (YouTube iframes, - * external links) are left alone. + * Match laddr media URLs in any text — markdown source, raw HTML inside + * Embed items, or stray references in author-written prose. URL shapes + * surveyed across production data: * - * URL pattern matched: `https?://codeforphilly.org/(thumbnail|media)/<id>/...` - * up to the first whitespace, `"`, `'`, `<`, or `)`. The MediaID is the - * capture group used to dedupe and key the asset; the path tail after the - * ID is discarded since we always fetch via `/media/<id>/original`. + * https://codeforphilly.org/thumbnail/<id>/<size> + * https://codeforphilly.org/media/<id> (no trailing slash) + * https://codeforphilly.org/media/open/<id> (legacy "open media") + * https://codeforphilly.org/sitedata/<id> (older asset namespace) + * + * The MediaID is the capture group; the path tail is discarded because + * we always fetch via `/media/<id>/original`. The match terminates at + * the first whitespace, `"`, `'`, `<`, or `)` — sufficient because + * URLs never embed those characters and the alternatives (parentheses + * inside URLs) don't occur in laddr's data. 
+ */ +const LADDR_MEDIA_URL_RE = + /https?:\/\/codeforphilly\.org\/(?:thumbnail|media\/open|media|sitedata)\/(\d+)(?:\/[^"'<\s)]*)?/g; + +/** + * Rewrite any laddr media URL in `text` to use the `cfp-media:<id>` + * placeholder. Returns the rewritten text and records each + * discovered MediaID in `collected`. Third-party URLs (YouTube iframes, + * external links) pass through untouched. + * + * Applied at two layers: + * 1. Per-item, to Embed HTML (where attribute-quoted URLs live). + * 2. As a final defensive pass over the assembled body (catches + * author-written `![alt](https://codeforphilly.org/...)` inside + * Markdown items that no per-item path would have rewritten). */ -function rewriteEmbedHtml( - html: string, +function rewriteLaddrMediaUrls( + text: string, ownerSlug: string, collected: Map, ): string { - const re = /https?:\/\/codeforphilly\.org\/(?:thumbnail|media)\/(\d+)\/[^"'<\s)]+/g; - return html.replace(re, (_full, idStr: string) => { + return text.replace(LADDR_MEDIA_URL_RE, (full, idStr: string) => { const mediaId = Number(idStr); - if (!Number.isFinite(mediaId)) return _full; + if (!Number.isFinite(mediaId)) return full; if (!collected.has(mediaId)) { collected.set(mediaId, { mediaId, @@ -807,7 +826,7 @@ function assembleBlogBody( if (typeof item.Data === 'string' && item.Data.trim().length > 0) { // Rewrite legacy media URLs in the HTML to placeholders; third- // party URLs (YouTube iframes etc.) pass through untouched. - const rewritten = rewriteEmbedHtml(item.Data, ownerSlug, collected); + const rewritten = rewriteLaddrMediaUrls(item.Data, ownerSlug, collected); blocks.push(rewritten); } } else { @@ -816,8 +835,16 @@ function assembleBlogBody( ); } } + // Final defensive pass over the assembled body — catches inline + // `![alt](https://codeforphilly.org/thumbnail/...)` references that + // authors wrote directly inside Markdown items rather than via the + // structured Media item path. 
Without this, those URLs would survive + // unrewritten and break at cutover. + const joined = blocks.join('\n\n'); + const finalBody = rewriteLaddrMediaUrls(joined, ownerSlug, collected); + return { - body: blocks.join('\n\n'), + body: finalBody, mediaAssets: [...collected.values()], }; } diff --git a/apps/api/tests/import-laddr.test.ts b/apps/api/tests/import-laddr.test.ts index eaab2e3..1b35e50 100644 --- a/apps/api/tests/import-laddr.test.ts +++ b/apps/api/tests/import-laddr.test.ts @@ -515,6 +515,60 @@ describe('translateBlogPost', () => { expect(t!.mediaAssets.find((a) => a.sourceUrl.includes('youtube'))).toBeUndefined(); }); + it('rewrites inline markdown image URLs in Markdown items', () => { + // Authors sometimes write `![alt](https://codeforphilly.org/thumbnail//)` + // directly inside a Markdown item instead of using the structured Media + // item path. The final body pass catches those. + const c = ctx(); + const row: RawBlogPost = { + ID: 18, + Class: 'BlogPost', + Handle: 'inline-markdown-image', + Title: 'Inline', + Published: 1746028800, + items: [ + { + ID: 220, + Class: 'Emergence\\CMS\\Item\\Markdown', + Order: 1, + Data: 'See ![hero](https://codeforphilly.org/thumbnail/2953/700x700) for context.', + }, + ], + }; + const t = translateBlogPost(row, c); + expect(t!.record.body).toBe('See ![hero](cfp-media:2953) for context.'); + expect(t!.mediaAssets.map((a) => a.mediaId)).toEqual([2953]); + }); + + it('matches alternate laddr URL shapes (media without slash, open-media, sitedata)', () => { + const c = ctx(); + const row: RawBlogPost = { + ID: 19, + Class: 'BlogPost', + Handle: 'alt-shapes', + Title: 'Alt shapes', + Published: 1746028800, + items: [ + { + ID: 230, + Class: 'Emergence\\CMS\\Item\\Markdown', + Order: 1, + Data: + 'a https://codeforphilly.org/media/679 ' + + 'b https://codeforphilly.org/media/open/1379 ' + + 'c https://codeforphilly.org/sitedata/2200', + }, + ], + }; + const t = translateBlogPost(row, c); + 
expect(t!.record.body).toContain('cfp-media:679'); + expect(t!.record.body).toContain('cfp-media:1379'); + expect(t!.record.body).toContain('cfp-media:2200'); + expect(t!.mediaAssets.map((a) => a.mediaId).sort((x, y) => x - y)).toEqual([ + 679, 1379, 2200, + ]); + }); + it('sorts items by Order before assembling', () => { const c = ctx(); const row: RawBlogPost = {