Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 257 additions & 11 deletions apps/api/scripts/import-laddr/importer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ import {
type RawTag,
} from './json-fetcher.js';
import {
mediaPlaceholderUrl,
newExistingIds,
newIdMaps,
translateBlogPost,
Expand All @@ -85,10 +86,12 @@ import {
translateTag,
translateTagAssignment,
translateUpdate,
type BlogMediaAsset,
type ExistingIds,
type TranslateCtx,
type Warnings,
} from './translators.js';
import { BlobObject } from 'hologit';

// ---------------------------------------------------------------------------
// Public types
Expand Down Expand Up @@ -182,6 +185,9 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise<ImportRe
// against the last even when nothing changed upstream.
// -------------------------------------------------------------------------
let store: PublicStore | null = null;
// Gitsheets Repository — needed to write attachment blobs via
// BlobObject.write into the underlying git object DB.
let publicRepo: Awaited<ReturnType<typeof openPublicStore>>['repo'] | null = null;
let existingIds: ExistingIds;

if (opts.dryRun) {
Expand All @@ -191,6 +197,7 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise<ImportRe
await ensureBranchCheckedOut(repo, branch, initialParent);
const opened = await openPublicStore(repo);
store = opened.store;
publicRepo = opened.repo;
existingIds = await collectExistingIds(store, log);
}

Expand Down Expand Up @@ -387,29 +394,33 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise<ImportRe
}

log(`[import] fetching blog from ${opts.sourceHost}`);
const blogPosts: BlogPost[] = [];
// `?include=*` is the only way to get the body content — laddr stores
// it as a typed `items` array on `AbstractContent`, not as a flat Body
// field. translateBlogPost assembles markdown from those items.
// field. translateBlogPost assembles markdown from those items and
// returns the record alongside a media-asset plan.
const blogTranslations: Array<{
record: BlogPost;
mediaAssets: readonly BlogMediaAsset[];
}> = [];
for await (const row of fetchAllPages<RawBlogPost>(
'/blog',
RawBlogPostSchema,
{ include: '*' },
fetchOpts,
)) {
const bp = translateBlogPost(row, ctx);
if (bp === null) {
const t = translateBlogPost(row, ctx);
if (t === null) {
counts['blog-posts']!.skipped++;
continue;
}
const parsedBp = parseOrSkip(
const parsedRecord = parseOrSkip(
'blog-posts',
() => BlogPostSchema.parse(bp),
() => BlogPostSchema.parse(t.record),
counts,
warnings,
);
if (parsedBp) {
blogPosts.push(parsedBp);
if (parsedRecord) {
blogTranslations.push({ record: parsedRecord, mediaAssets: t.mediaAssets });
counts['blog-posts']!.imported++;
}
}
Expand All @@ -429,6 +440,26 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise<ImportRe
};
}

// -------------------------------------------------------------------------
// 2a. Pre-fetch blog post media assets.
//
// Each translated blog post may reference some number of laddr Media
// items by their numeric MediaID. We fetch the original bytes for each
// referenced MediaID, derive a content-type-aware filename, and rewrite
// the placeholder URLs in each post's body (`cfp-media:<id>`) to the
// final `/api/attachments/blog-posts/<slug>/<filename>` URL.
//
// Failed fetches don't block the import — the markdown link will 404
// at serve time, but the post itself still imports with the rest of
// its body intact.
// -------------------------------------------------------------------------
const mediaArtifactsBySlug = await fetchAndMaterializeBlogMedia(
blogTranslations,
fetchOpts,
log,
warnings,
);

// -------------------------------------------------------------------------
// 3. One atomic gitsheets transaction:
// - clear() each importer-owned sheet (deletes capture for free)
Expand Down Expand Up @@ -505,10 +536,31 @@ export async function importLaddrFromJson(opts: ImportOptions): Promise<ImportRe
await tx['project-buzz'].upsert({ ...record, projectSlug } as ProjectBuzz);
}

log(`[import] clear + upsert blog-posts (${blogPosts.length})`);
log(
`[import] clear + upsert blog-posts (${blogTranslations.length}) + media attachments`,
);
await tx['blog-posts'].clear();
for (const post of blogPosts) {
await tx['blog-posts'].upsert(post);
if (publicRepo === null) {
throw new Error('[import-laddr] internal: publicRepo not opened');
}
const hologit = publicRepo.hologitRepo;
for (const { record } of blogTranslations) {
const artifacts = mediaArtifactsBySlug.get(record.slug) ?? [];
if (artifacts.length > 0) {
const blobs: Record<string, BlobObject> = {};
for (const a of artifacts) {
// BlobObject.write hashes the buffer into the git object DB.
// Same `as unknown as string` cast as the avatar route — the
// declared signature is too narrow; the underlying
// git-client `$putBlob` accepts Buffer at runtime.
blobs[a.filename] = await BlobObject.write(
hologit,
a.bytes as unknown as string,
);
}
await tx['blog-posts'].setAttachments(record, blobs);
}
await tx['blog-posts'].upsert(record);
}

log(`[import] clear + upsert tag-assignments (${tagAssignments.length})`);
Expand Down Expand Up @@ -730,3 +782,197 @@ async function collectExistingIds(
log(`[import] pre-pass: preserved ${count} record UUIDs from previous snapshot`);
return ids;
}

// ---------------------------------------------------------------------------
// Blog media pre-fetch + materialization
// ---------------------------------------------------------------------------

/**
* One materialized attachment ready to be written into the gitsheets tree.
*/
interface BlogMediaArtifact {
/** Filename (with extension) — relative to `blog-posts/<slug>/`. */
readonly filename: string;
/** Original bytes. */
readonly bytes: Buffer;
}

/** Content-Type → file extension. Unknown types → null (skipped). */
const EXT_BY_MIME: Record<string, string> = {
'image/jpeg': 'jpg',
'image/png': 'png',
'image/gif': 'gif',
'image/webp': 'webp',
'image/avif': 'avif',
'image/svg+xml': 'svg',
};

function extFromContentType(contentType: string | null): string | null {
if (!contentType) return null;
// `image/jpeg; charset=…` → `image/jpeg`.
const base = contentType.split(';')[0]?.trim().toLowerCase() ?? '';
return EXT_BY_MIME[base] ?? null;
}

/**
* Fetch one media asset's bytes + content-type. Returns null on any
* non-2xx or unexpected error — the import shouldn't abort because one
* image disappeared upstream.
*/
async function fetchMediaBytes(
url: string,
fetchImpl: typeof fetch,
userAgent: string,
): Promise<{ bytes: Buffer; contentType: string | null } | null> {
try {
const res = await fetchImpl(url, {
headers: { 'User-Agent': userAgent },
});
if (!res.ok) return null;
const ab = await res.arrayBuffer();
return {
bytes: Buffer.from(ab),
contentType: res.headers.get('content-type'),
};
} catch {
return null;
}
}

/**
* Fetch every distinct media asset referenced across all blog posts,
* derive the final filename per asset, then rewrite each post's body to
* replace `cfp-media:<id>` placeholders with the final
* `/api/attachments/blog-posts/<slug>/<filename>` URL.
*
* Returns the map of artifacts keyed by post slug, ready for the
* transact callback to wire into gitsheets via setAttachments.
*
* Same MediaID can appear in multiple posts (rare but possible). Each
* post gets its own copy of the asset under its own subdir — the
* git object DB dedupes the bytes by content hash, so the repo size
* cost is the metadata overhead per reference, not the bytes.
*
* Concurrency: 4 parallel fetches at a time (a politeness compromise
* — fewer would slow imports; more would hammer laddr). The JSON
* fetcher's per-page `delayMs` doesn't apply here since these are
* binary endpoints, not paged JSON.
*/
async function fetchAndMaterializeBlogMedia(
blogTranslations: Array<{ record: BlogPost; mediaAssets: readonly BlogMediaAsset[] }>,
fetchOpts: FetchOptions,
log: (msg: string) => void,
warnings: Warnings,
): Promise<Map<string, BlogMediaArtifact[]>> {
const fetchImpl = fetchOpts.fetchImpl ?? fetch;
const userAgent = fetchOpts.userAgent ?? 'cfp-importer/dev';

// Flatten so we can drive parallel fetches across all posts.
const flat: Array<{
ownerSlug: string;
asset: BlogMediaAsset;
}> = [];
for (const { record, mediaAssets } of blogTranslations) {
for (const asset of mediaAssets) {
flat.push({ ownerSlug: record.slug, asset });
}
}

log(`[import] fetching ${flat.length} blog media assets`);

/** What the fetch loop produces per asset. */
interface FetchedAsset {
readonly ownerSlug: string;
readonly asset: BlogMediaAsset;
readonly bytes: Buffer | null;
readonly ext: string | null;
}

const results: FetchedAsset[] = [];
const CONCURRENCY = 4;
let cursor = 0;
const workers: Promise<void>[] = [];
for (let w = 0; w < CONCURRENCY; w++) {
workers.push(
(async () => {
while (true) {
const idx = cursor++;
if (idx >= flat.length) return;
const entry = flat[idx]!;
const fetched = await fetchMediaBytes(entry.asset.sourceUrl, fetchImpl, userAgent);
if (fetched === null) {
warnings.push(
`[blog-posts] media fetch failed: ${entry.asset.sourceUrl} (referenced by /${entry.ownerSlug})`,
);
results.push({
ownerSlug: entry.ownerSlug,
asset: entry.asset,
bytes: null,
ext: null,
});
continue;
}
const ext = extFromContentType(fetched.contentType);
if (ext === null) {
warnings.push(
`[blog-posts] media ${entry.asset.sourceUrl} returned unsupported Content-Type ${JSON.stringify(fetched.contentType)}; skipped`,
);
results.push({
ownerSlug: entry.ownerSlug,
asset: entry.asset,
bytes: null,
ext: null,
});
continue;
}
results.push({
ownerSlug: entry.ownerSlug,
asset: entry.asset,
bytes: fetched.bytes,
ext,
});
}
})(),
);
}
await Promise.all(workers);

// Build the placeholder → final URL substitution table per post, plus
// the artifact list keyed by post slug.
const artifactsBySlug = new Map<string, BlogMediaArtifact[]>();
const substitutionByPost = new Map<string, Map<string, string>>();
for (const r of results) {
if (r.bytes === null || r.ext === null) continue;
const filename = `${r.asset.captionSlug}-${r.asset.mediaId}.${r.ext}`;
const finalUrl = `/api/attachments/blog-posts/${r.ownerSlug}/${filename}`;

let arts = artifactsBySlug.get(r.ownerSlug);
if (!arts) {
arts = [];
artifactsBySlug.set(r.ownerSlug, arts);
}
arts.push({ filename, bytes: r.bytes });

let subs = substitutionByPost.get(r.ownerSlug);
if (!subs) {
subs = new Map();
substitutionByPost.set(r.ownerSlug, subs);
}
subs.set(mediaPlaceholderUrl(r.asset.mediaId), finalUrl);
}

// Walk records and substitute placeholders in their bodies.
for (const t of blogTranslations) {
const subs = substitutionByPost.get(t.record.slug);
if (!subs || subs.size === 0) continue;
let body = t.record.body;
for (const [placeholder, finalUrl] of subs) {
body = body.split(placeholder).join(finalUrl);
}
// Mutate the record in place — it's been Zod-validated already and
// the schema just requires `body: string`, no need to reparse.
(t.record as { body: string }).body = body;
}

return artifactsBySlug;
}
Loading