From d7db367a2d14543a6dc5ba7f300a1327fc7cab69 Mon Sep 17 00:00:00 2001
From: Chris Alfano <chris@jarv.us>
Date: Sat, 30 May 2026 14:20:22 -0400
Subject: [PATCH] fix(importer): catch inline markdown image URLs + alt laddr
 URL shapes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After PR #109 landed, a sandbox smoke check found 25 codeforphilly.org
media URLs still leaking through to bodies. Two root causes:

  1. **Inline markdown image syntax inside Markdown items.** Authors
     wrote `![alt](https://codeforphilly.org/thumbnail/...)` directly
     in some posts rather than using the structured Media item path.
     The Embed-only URL scan never reached them.
  2. **Alternate URL shapes** my regex didn't accept:
       /media/<id>         (no trailing slash)
       /media/open/<id>    (legacy "open media" namespace)
       /sitedata/<id>      (older asset namespace)

Fix: rename `rewriteEmbedHtml` → `rewriteLaddrMediaUrls`, broaden the
regex to accept the four observed URL shapes, and apply it as a final
pass over the **assembled body** (after items have been joined). That
catches Markdown-item inline image references the per-item code path
can't reach.

Per-item Embed rewriting still happens — same regex, same dedup map —
because some Embeds reference media that doesn't appear elsewhere
(YouTube thumbs etc.) and we want to capture them too.

Two new tests cover the inline-markdown case and the alt URL shapes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 apps/api/scripts/import-laddr/translators.ts | 57 ++++++++++++++------
 apps/api/tests/import-laddr.test.ts          | 54 +++++++++++++++++++
 2 files changed, 96 insertions(+), 15 deletions(-)
diff --git a/apps/api/scripts/import-laddr/translators.ts b/apps/api/scripts/import-laddr/translators.ts
index 2592d6e..6e622ec 100644
--- a/apps/api/scripts/import-laddr/translators.ts
+++ b/apps/api/scripts/import-laddr/translators.ts
@@ -711,25 +711,44 @@ export interface BlogMediaAsset {
 }
 
 /**
- * Scan an Embed item's raw HTML for legacy laddr media URLs and rewrite
- * them inline to use the placeholder. Returns the rewritten HTML plus a
- * list of asset descriptors discovered. Third-party URLs (YouTube iframes,
- * external links) are left alone.
+ * Match laddr media URLs in any text — markdown source, raw HTML inside
+ * Embed items, or stray references in author-written prose. URL shapes
+ * surveyed across production data:
  *
- * URL pattern matched: `https?://codeforphilly.org/(thumbnail|media)/<id>/...`
- * up to the first whitespace, `"`, `'`, `<`, or `)`. The MediaID is the
- * capture group used to dedupe and key the asset; the path tail after the
- * ID is discarded since we always fetch via `/media/<id>/original`.
+ *   https://codeforphilly.org/thumbnail/<id>/<dimensions>
+ *   https://codeforphilly.org/media/<id>             (no trailing slash)
+ *   https://codeforphilly.org/media/open/<id>        (legacy "open media")
+ *   https://codeforphilly.org/sitedata/<id>          (older asset namespace)
+ *
+ * The MediaID is the capture group; the path tail is discarded because
+ * we always fetch via `/media/<id>/original`. The match terminates at
+ * the first whitespace, `"`, `'`, `<`, or `)` — sufficient because
+ * those characters delimit URLs in HTML attributes and markdown links,
+ * and URLs containing them (e.g. parentheses) don't occur in laddr's data.
+ */
+const LADDR_MEDIA_URL_RE =
+  /https?:\/\/codeforphilly\.org\/(?:thumbnail|media\/open|media|sitedata)\/(\d+)(?:\/[^"'<\s)]*)?/g;
+
+/**
+ * Rewrite any laddr media URL in `text` to use the `cfp-media:<id>`
+ * placeholder. Returns the rewritten text and records each MediaID not
+ * already seen in `collected`. Third-party URLs (YouTube iframes,
+ * external links) pass through untouched.
+ *
+ * Applied at two layers:
+ *   1. Per-item, to Embed HTML (where attribute-quoted URLs live).
+ *   2. As a final defensive pass over the assembled body (catches
+ *      author-written `![alt](https://codeforphilly.org/...)` inside
+ *      Markdown items that no per-item path would have rewritten).
  */
-function rewriteEmbedHtml(
-  html: string,
+function rewriteLaddrMediaUrls(
+  text: string,
   ownerSlug: string,
   collected: Map<number, BlogMediaAsset>,
 ): string {
-  const re = /https?:\/\/codeforphilly\.org\/(?:thumbnail|media)\/(\d+)\/[^"'<\s)]+/g;
-  return html.replace(re, (_full, idStr: string) => {
+  return text.replace(LADDR_MEDIA_URL_RE, (full, idStr: string) => {
     const mediaId = Number(idStr);
-    if (!Number.isFinite(mediaId)) return _full;
+    if (!Number.isFinite(mediaId)) return full;
     if (!collected.has(mediaId)) {
       collected.set(mediaId, {
         mediaId,
@@ -807,7 +826,7 @@ function assembleBlogBody(
       if (typeof item.Data === 'string' && item.Data.trim().length > 0) {
         // Rewrite legacy media URLs in the HTML to placeholders; third-
         // party URLs (YouTube iframes etc.) pass through untouched.
-        const rewritten = rewriteEmbedHtml(item.Data, ownerSlug, collected);
+        const rewritten = rewriteLaddrMediaUrls(item.Data, ownerSlug, collected);
         blocks.push(rewritten);
       }
     } else {
@@ -816,8 +835,16 @@ function assembleBlogBody(
       );
     }
   }
+  // Final defensive pass over the assembled body — catches inline
+  // `![alt](https://codeforphilly.org/thumbnail/...)` references that
+  // authors wrote directly inside Markdown items rather than via the
+  // structured Media item path. Without this, those URLs would survive
+  // unrewritten and break at cutover.
+  const joined = blocks.join('\n\n');
+  const finalBody = rewriteLaddrMediaUrls(joined, ownerSlug, collected);
+
   return {
-    body: blocks.join('\n\n'),
+    body: finalBody,
     mediaAssets: [...collected.values()],
   };
 }
diff --git a/apps/api/tests/import-laddr.test.ts b/apps/api/tests/import-laddr.test.ts
index eaab2e3..1b35e50 100644
--- a/apps/api/tests/import-laddr.test.ts
+++ b/apps/api/tests/import-laddr.test.ts
@@ -515,6 +515,60 @@ describe('translateBlogPost', () => {
     expect(t!.mediaAssets.find((a) => a.sourceUrl.includes('youtube'))).toBeUndefined();
   });
 
+  it('rewrites inline markdown image URLs in Markdown items', () => {
+    // Authors sometimes write `![alt](https://codeforphilly.org/thumbnail/<id>/<dim>)`
+    // directly inside a Markdown item instead of using the structured Media
+    // item path. The final body pass catches those.
+    const c = ctx();
+    const row: RawBlogPost = {
+      ID: 18,
+      Class: 'BlogPost',
+      Handle: 'inline-markdown-image',
+      Title: 'Inline',
+      Published: 1746028800,
+      items: [
+        {
+          ID: 220,
+          Class: 'Emergence\\CMS\\Item\\Markdown',
+          Order: 1,
+          Data: 'See ![hero](https://codeforphilly.org/thumbnail/2953/700x700) for context.',
+        },
+      ],
+    };
+    const t = translateBlogPost(row, c);
+    expect(t!.record.body).toBe('See ![hero](cfp-media:2953) for context.');
+    expect(t!.mediaAssets.map((a) => a.mediaId)).toEqual([2953]);
+  });
+
+  it('matches alternate laddr URL shapes (media without slash, open-media, sitedata)', () => {
+    const c = ctx();
+    const row: RawBlogPost = {
+      ID: 19,
+      Class: 'BlogPost',
+      Handle: 'alt-shapes',
+      Title: 'Alt shapes',
+      Published: 1746028800,
+      items: [
+        {
+          ID: 230,
+          Class: 'Emergence\\CMS\\Item\\Markdown',
+          Order: 1,
+          Data:
+            'a https://codeforphilly.org/media/679 ' +
+            'b https://codeforphilly.org/media/open/1379 ' +
+            'c https://codeforphilly.org/sitedata/2200',
+        },
+      ],
+    };
+    const t = translateBlogPost(row, c);
+    expect(t!.record.body).toContain('cfp-media:679');
+    expect(t!.record.body).toContain('cfp-media:1379');
+    expect(t!.record.body).toContain('cfp-media:2200');
+    expect(t!.mediaAssets.map((a) => a.mediaId).sort((x, y) => x - y)).toEqual([
+      679, 1379, 2200,
+    ]);
+  });
+
   it('sorts items by Order before assembling', () => {
     const c = ctx();
     const row: RawBlogPost = {