From f81947138f8f62fb56aa4c5852387ad81030a9f5 Mon Sep 17 00:00:00 2001
From: tt-a1i <53142663+tt-a1i@users.noreply.github.com>
Date: Sat, 20 Jun 2026 08:53:51 +0800
Subject: [PATCH] fix(tui): save pasted images for text-only models

---
 .changeset/pasted-image-text-model-path.md    |  5 ++
 apps/kimi-code/src/tui/kimi-tui.ts            | 20 +++---
 .../src/tui/utils/image-placeholder.ts        | 68 +++++++++++++++++--
 .../test/tui/input/image-placeholder.test.ts  | 27 ++++++++
 .../test/tui/kimi-tui-message-flow.test.ts    | 44 ++++++++++++
 5 files changed, 152 insertions(+), 12 deletions(-)
 create mode 100644 .changeset/pasted-image-text-model-path.md
diff --git a/.changeset/pasted-image-text-model-path.md b/.changeset/pasted-image-text-model-path.md
new file mode 100644
index 000000000..5f1206bca
--- /dev/null
+++ b/.changeset/pasted-image-text-model-path.md
@@ -0,0 +1,5 @@
+---
+"@moonshot-ai/kimi-code": patch
+---
+
+Save pasted images as local file references when the selected model does not support image input.
diff --git a/apps/kimi-code/src/tui/kimi-tui.ts b/apps/kimi-code/src/tui/kimi-tui.ts
index b0a2094de..7dc458371 100644
--- a/apps/kimi-code/src/tui/kimi-tui.ts
+++ b/apps/kimi-code/src/tui/kimi-tui.ts
@@ -769,13 +769,24 @@ export class KimiTUI {
       this.showError(LLM_NOT_SET_MESSAGE);
       return;
     }
-    const extraction = extractMediaAttachments(text, this.imageStore);
+    let extraction = extractMediaAttachments(text, this.imageStore);
     if (!this.validateMediaCapabilities(extraction)) return;
     const session = this.session;
     if (session === undefined) {
       this.showError(LLM_NOT_SET_MESSAGE);
       return;
     }
+    if (
+      extraction.imageAttachmentIds.length > 0 &&
+      !this.supportsCurrentModelCapability('image_in')
+    ) {
+      try {
+        extraction = extractMediaAttachments(text, this.imageStore, { imageMode: 'file-tag' });
+      } catch (error) {
+        this.showError(`Failed to save pasted image: ${formatErrorMessage(error)}`);
+        return;
+      }
+    }
     if (extraction.hasMedia) {
       this.sendMessage(session, text, {
         hasMedia: true,
@@ -793,13 +804,6 @@ export class KimiTUI {
     extraction: ReturnType<typeof extractMediaAttachments>,
   ): boolean {
     if (!extraction.hasMedia) return true;
-    if (
-      extraction.imageAttachmentIds.length > 0 &&
-      !this.supportsCurrentModelCapability('image_in')
-    ) {
-      this.showError('Current model does not support image input.');
-      return false;
-    }
     if (
       extraction.videoAttachmentIds.length > 0 &&
       !this.supportsCurrentModelCapability('video_in')
diff --git a/apps/kimi-code/src/tui/utils/image-placeholder.ts b/apps/kimi-code/src/tui/utils/image-placeholder.ts
index 11c401f2f..80dca3e0b 100644
--- a/apps/kimi-code/src/tui/utils/image-placeholder.ts
+++ b/apps/kimi-code/src/tui/utils/image-placeholder.ts
@@ -7,14 +7,19 @@
  *     A literal `[image #999 ...]` the user typed themselves stays in
  *     the text (we can't hallucinate files for it).
  *   - Order is preserved for text/image/video segments. Image placeholders
- *     expand to image content parts so the prompt reaches the provider
- *     without relying on a model tool call. Video placeholders still expand
- *     to file-path tags so `ReadMediaFile` can own video upload behavior.
+ *     expand to image content parts when the model can see images. For
+ *     text-only models (`imageMode: 'file-tag'`) the bytes are saved to a
+ *     file and the placeholder expands to a file-path tag, as videos do.
  *   - Adjacent text segments are flattened — empty / whitespace-only
  *     segments drop out so we never emit `{type:'text', text:' '}`
  *     noise between two media parts.
  */
 
+import { randomUUID } from 'node:crypto';
+import { mkdtempSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
 import type { PromptPart } from '@moonshot-ai/kimi-code-sdk';
 
 import type {
@@ -24,6 +29,15 @@ import type {
 } from './image-attachment-store';
 
 const PLACEHOLDER_REGEX = /\[(image|video) #(\d+) (?:(\(\d+×\d+\))|([^\]]+))\]/g;
+const PASTED_IMAGE_DIR_PREFIX = join(tmpdir(), 'kimi-code-pasted-images-'); // prefix given to mkdtempSync
+let pastedImageDir: string | undefined; // assigned on first use by getPastedImageDir()
+
+export type ImageExtractionMode = 'content-part' | 'file-tag'; // image content part vs. <image path> text tag
+
+export interface ExtractionOptions {
+  readonly imageMode?: ImageExtractionMode; // omitted => 'content-part'
+  readonly writeImageAttachment?: (attachment: ImageAttachment) => string; // returns the path to embed
+}
 
 export interface ExtractionResult {
   /** Flat list of parts in input order; empty array when no media matched. */
@@ -42,6 +56,7 @@ export interface ExtractionResult {
 export function extractMediaAttachments(
   text: string,
   store: ImageAttachmentStore,
+  options?: ExtractionOptions,
 ): ExtractionResult {
   const parts: PromptPart[] = [];
   const imageAttachmentIds: number[] = [];
@@ -66,7 +81,11 @@ export function extractMediaAttachments(
       pushText(parts, mediaText);
       videoAttachmentIds.push(id);
     } else {
-      parts.push(imagePartForAttachment(attachment));
+      if ((options?.imageMode ?? 'content-part') === 'file-tag') {
+        pushText(parts, tagTextForImage(attachment, options));
+      } else {
+        parts.push(imagePartForAttachment(attachment));
+      }
       imageAttachmentIds.push(id);
     }
     hasMedia = true;
@@ -113,10 +132,51 @@ function tagTextForVideo(att: VideoAttachment): string {
   return formatMediaTag('video', att.sourcePath);
 }
 
+function tagTextForImage(att: ImageAttachment, options: ExtractionOptions | undefined): string {
+  const path = options?.writeImageAttachment?.(att) ?? writeImageAttachmentToTempFile(att);
+  return formatMediaTag('image', path); // path: injected writer's result, else a new temp file
+}
+
 function formatMediaTag(tag: 'image' | 'video', path: string): string {
   return `<${tag} path="${escapeAttribute(path)}"></${tag}>`;
 }
 
+function writeImageAttachmentToTempFile(att: ImageAttachment): string {
+  // Timestamp, attachment id and a UUID make up the file name.
+  const stamp = String(Date.now());
+  const name = `pasted-${stamp}-${String(att.id)}-${randomUUID()}${extensionForMime(att.mime)}`;
+  const target = join(getPastedImageDir(), name);
+  writeFileSync(target, att.bytes, { mode: 0o600 }); // file mode 0o600
+  return target;
+}
+
+function getPastedImageDir(): string {
+  // Created on the first call, then reused by later calls.
+  return (pastedImageDir ??= mkdtempSync(PASTED_IMAGE_DIR_PREFIX));
+}
+
+/** Extensions for recognized image MIME types; `image/jpg` is a non-standard alias. */
+const EXTENSION_BY_IMAGE_MIME: ReadonlyMap<string, string> = new Map([
+  ['image/png', '.png'],
+  ['image/jpeg', '.jpg'],
+  ['image/jpg', '.jpg'],
+  ['image/gif', '.gif'],
+  ['image/webp', '.webp'],
+  ['image/bmp', '.bmp'],
+  ['image/heic', '.heic'],
+  ['image/heif', '.heif'],
+]);
+
+/**
+ * Maps an image MIME type to the extension used for the saved file.
+ * Parameters after `;`, letter case, and surrounding whitespace are ignored;
+ * unrecognized types fall back to `.img`.
+ */
+function extensionForMime(mime: string): string {
+  const [rawType] = mime.split(';', 1);
+  return EXTENSION_BY_IMAGE_MIME.get((rawType ?? '').trim().toLowerCase()) ?? '.img';
+}
+
 function escapeAttribute(value: string): string {
   return value
     .replaceAll('&', '&amp;')
diff --git a/apps/kimi-code/test/tui/input/image-placeholder.test.ts b/apps/kimi-code/test/tui/input/image-placeholder.test.ts
index cdc74e913..a3cc39a4d 100644
--- a/apps/kimi-code/test/tui/input/image-placeholder.test.ts
+++ b/apps/kimi-code/test/tui/input/image-placeholder.test.ts
@@ -84,6 +84,33 @@ describe('extractMediaAttachments', () => {
     });
   });
 
+  it('expands image placeholders to file-path tags for text-only models', () => {
+    const bytes = new Uint8Array([0x89, 0x50, 0x4e, 0x47]); // first bytes of the PNG signature
+    const { store, placeholder } = storeWith(bytes);
+    const r = extractMediaAttachments(`inspect ${placeholder}`, store, {
+      imageMode: 'file-tag',
+      writeImageAttachment: (att) => `/tmp/kimi/pasted-${String(att.id)}.png`, // stub writer: returns a fixed path
+    });
+
+    expect(r.hasMedia).toBe(true);
+    expect(r.imageAttachmentIds).toEqual([1]);
+    expect(r.parts).toEqual([
+      { type: 'text', text: 'inspect <image path="/tmp/kimi/pasted-1.png"></image>' }, // prose and tag share one text part
+    ]);
+  });
+
+  it('escapes generated image path tags', () => {
+    const { store, placeholder } = storeWith(new Uint8Array([1]));
+    const r = extractMediaAttachments(placeholder, store, {
+      imageMode: 'file-tag',
+      writeImageAttachment: () => '/tmp/a&"<>.png', // path with characters that must be entity-escaped
+    });
+
+    expect(r.parts).toEqual([
+      { type: 'text', text: '<image path="/tmp/a&amp;&quot;&lt;&gt;.png"></image>' },
+    ]);
+  });
+
   it('escapes media paths in generated tags', () => {
     const store = new ImageAttachmentStore();
     const att = store.addVideo('video/mp4', '/tmp/a&"<>.mp4', 'sample.mp4');
diff --git a/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts b/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts
index cab1bfeae..aef90731a 100644
--- a/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts
+++ b/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts
@@ -1164,6 +1164,50 @@ command = "vim"
     ]);
   });
 
+  it('sends pasted images as file tags when the selected model is text-only', async () => {
+    const { driver, session } = await makeDriver(makeSession(), {
+      getConfig: vi.fn(async () => ({
+        models: {
+          k2: {
+            model: 'moonshot-v1',
+            maxContextSize: 100,
+            capabilities: ['thinking', 'tool_use'], // no 'image_in': the model is text-only
+          },
+        },
+      })),
+    });
+    const imageStore = (driver as unknown as { imageStore: ImageAttachmentStore }).imageStore;
+    const attachment = imageStore.addImage(new Uint8Array([0xaa, 0xbb]), 'image/png', 1, 1);
+
+    driver.handleUserInput(`describe ${attachment.placeholder}`);
+
+    const promptCalls = (session.prompt as unknown as { mock: { calls: unknown[][] } }).mock
+      .calls;
+    const promptArg = promptCalls[0]?.[0];
+    expect(promptArg).toEqual([
+      {
+        type: 'text',
+        text: expect.stringMatching(
+          /^describe <image path=".*kimi-code-pasted-images.*\.png"><\/image>$/,
+        ),
+      },
+    ]);
+    expect(JSON.stringify(promptArg)).not.toContain('image_url'); // no image_url payload in the prompt
+    expect(driver.state.transcriptEntries).toEqual([
+      expect.objectContaining({
+        kind: 'user',
+        content: `describe ${attachment.placeholder}`, // transcript keeps the placeholder, not the tag
+        imageAttachmentIds: [attachment.id],
+      }),
+    ]);
+
+    const text = (promptArg as Array<{ text?: string }>)[0]?.text;
+    const path = text?.match(/<image path="([^"]+)"><\/image>/)?.[1];
+    if (path !== undefined) {
+      await rm(path, { force: true }); // remove the file named in the tag
+    }
+  });
+
   it('queues editor input instead of prompting while a turn is already streaming', async () => {
     const { driver, session, harness } = await makeDriver();
     driver.state.appState.streamingPhase = 'waiting';