From f81947138f8f62fb56aa4c5852387ad81030a9f5 Mon Sep 17 00:00:00 2001 From: tt-a1i <53142663+tt-a1i@users.noreply.github.com> Date: Sat, 20 Jun 2026 08:53:51 +0800 Subject: [PATCH] fix(tui): save pasted images for text-only models --- .changeset/pasted-image-text-model-path.md | 5 ++ apps/kimi-code/src/tui/kimi-tui.ts | 20 +++--- .../src/tui/utils/image-placeholder.ts | 68 +++++++++++++++++-- .../test/tui/input/image-placeholder.test.ts | 27 ++++++++ .../test/tui/kimi-tui-message-flow.test.ts | 44 ++++++++++++ 5 files changed, 152 insertions(+), 12 deletions(-) create mode 100644 .changeset/pasted-image-text-model-path.md diff --git a/.changeset/pasted-image-text-model-path.md b/.changeset/pasted-image-text-model-path.md new file mode 100644 index 000000000..5f1206bca --- /dev/null +++ b/.changeset/pasted-image-text-model-path.md @@ -0,0 +1,5 @@ +--- +"@moonshot-ai/kimi-code": patch +--- + +Save pasted images as local file references when the selected model does not support image input. diff --git a/apps/kimi-code/src/tui/kimi-tui.ts b/apps/kimi-code/src/tui/kimi-tui.ts index b0a2094de..7dc458371 100644 --- a/apps/kimi-code/src/tui/kimi-tui.ts +++ b/apps/kimi-code/src/tui/kimi-tui.ts @@ -769,13 +769,24 @@ export class KimiTUI { this.showError(LLM_NOT_SET_MESSAGE); return; } - const extraction = extractMediaAttachments(text, this.imageStore); + let extraction = extractMediaAttachments(text, this.imageStore); if (!this.validateMediaCapabilities(extraction)) return; const session = this.session; if (session === undefined) { this.showError(LLM_NOT_SET_MESSAGE); return; } + if ( + extraction.imageAttachmentIds.length > 0 && + !this.supportsCurrentModelCapability('image_in') + ) { + try { + extraction = extractMediaAttachments(text, this.imageStore, { imageMode: 'file-tag' }); + } catch (error) { + this.showError(`Failed to save pasted image: ${formatErrorMessage(error)}`); + return; + } + } if (extraction.hasMedia) { this.sendMessage(session, text, { hasMedia: true, @@ -793,13 +804,6 @@ export class KimiTUI { extraction: ReturnType, ): boolean { if (!extraction.hasMedia) return true; - if ( - extraction.imageAttachmentIds.length > 0 && - !this.supportsCurrentModelCapability('image_in') - ) { - this.showError('Current model does not support image input.'); - return false; - } if ( extraction.videoAttachmentIds.length > 0 && !this.supportsCurrentModelCapability('video_in') diff --git a/apps/kimi-code/src/tui/utils/image-placeholder.ts b/apps/kimi-code/src/tui/utils/image-placeholder.ts index 11c401f2f..80dca3e0b 100644 --- a/apps/kimi-code/src/tui/utils/image-placeholder.ts +++ b/apps/kimi-code/src/tui/utils/image-placeholder.ts @@ -7,14 +7,19 @@ * A literal `[image #999 ...]` the user typed themselves stays in * the text (we can't hallucinate files for it). * - Order is preserved for text/image/video segments. Image placeholders - * expand to image content parts so the prompt reaches the provider - * without relying on a model tool call. Video placeholders still expand - * to file-path tags so `ReadMediaFile` can own video upload behavior. + * expand to image content parts when the model can see images. For + * text-only models they can instead expand to file-path tags, matching + * the video path flow. * - Adjacent text segments are flattened — empty / whitespace-only * segments drop out so we never emit `{type:'text', text:' '}` * noise between two media parts. */ +import { randomUUID } from 'node:crypto'; +import { mkdtempSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + import type { PromptPart } from '@moonshot-ai/kimi-code-sdk'; import type { @@ -24,6 +29,15 @@ import type { } from './image-attachment-store'; const PLACEHOLDER_REGEX = /\[(image|video) #(\d+) (?:(\(\d+×\d+\))|([^\]]+))\]/g; +const PASTED_IMAGE_DIR_PREFIX = join(tmpdir(), 'kimi-code-pasted-images-'); +let pastedImageDir: string | undefined; + +export type ImageExtractionMode = 'content-part' | 'file-tag'; + +export interface ExtractionOptions { + readonly imageMode?: ImageExtractionMode; + readonly writeImageAttachment?: (attachment: ImageAttachment) => string; +} export interface ExtractionResult { /** Flat list of parts in input order; empty array when no media matched. */ @@ -42,6 +56,7 @@ export interface ExtractionResult { export function extractMediaAttachments( text: string, store: ImageAttachmentStore, + options?: ExtractionOptions, ): ExtractionResult { const parts: PromptPart[] = []; const imageAttachmentIds: number[] = []; @@ -66,7 +81,11 @@ export function extractMediaAttachments( pushText(parts, mediaText); videoAttachmentIds.push(id); } else { - parts.push(imagePartForAttachment(attachment)); + if ((options?.imageMode ?? 'content-part') === 'file-tag') { + pushText(parts, tagTextForImage(attachment, options)); + } else { + parts.push(imagePartForAttachment(attachment)); + } imageAttachmentIds.push(id); } hasMedia = true; @@ -113,10 +132,51 @@ function tagTextForVideo(att: VideoAttachment): string { return formatMediaTag('video', att.sourcePath); } +function tagTextForImage(att: ImageAttachment, options: ExtractionOptions | undefined): string { + const path = options?.writeImageAttachment?.(att) ?? writeImageAttachmentToTempFile(att); + return formatMediaTag('image', path); +} + function formatMediaTag(tag: 'image' | 'video', path: string): string { return `<${tag} path="${escapeAttribute(path)}">`; } +function writeImageAttachmentToTempFile(att: ImageAttachment): string { + const path = join( + getPastedImageDir(), + `pasted-${String(Date.now())}-${String(att.id)}-${randomUUID()}${extensionForMime(att.mime)}`, + ); + writeFileSync(path, att.bytes, { mode: 0o600 }); + return path; +} + +function getPastedImageDir(): string { + pastedImageDir ??= mkdtempSync(PASTED_IMAGE_DIR_PREFIX); + return pastedImageDir; +} + +function extensionForMime(mime: string): string { + const [type] = mime.toLowerCase().split(';', 1); + switch (type ?? '') { + case 'image/png': + return '.png'; + case 'image/jpeg': + return '.jpg'; + case 'image/gif': + return '.gif'; + case 'image/webp': + return '.webp'; + case 'image/bmp': + return '.bmp'; + case 'image/heic': + return '.heic'; + case 'image/heif': + return '.heif'; + default: + return '.img'; + } +} + function escapeAttribute(value: string): string { return value .replaceAll('&', '&') diff --git a/apps/kimi-code/test/tui/input/image-placeholder.test.ts b/apps/kimi-code/test/tui/input/image-placeholder.test.ts index cdc74e913..a3cc39a4d 100644 --- a/apps/kimi-code/test/tui/input/image-placeholder.test.ts +++ b/apps/kimi-code/test/tui/input/image-placeholder.test.ts @@ -84,6 +84,33 @@ describe('extractMediaAttachments', () => { }); }); + it('expands image placeholders to file-path tags for text-only models', () => { + const bytes = new Uint8Array([0x89, 0x50, 0x4e, 0x47]); + const { store, placeholder } = storeWith(bytes); + const r = extractMediaAttachments(`inspect ${placeholder}`, store, { + imageMode: 'file-tag', + writeImageAttachment: (att) => `/tmp/kimi/pasted-${String(att.id)}.png`, + }); + + expect(r.hasMedia).toBe(true); + expect(r.imageAttachmentIds).toEqual([1]); + expect(r.parts).toEqual([ + { type: 'text', text: 'inspect ' }, + ]); + }); + + it('escapes generated image path tags', () => { + const { store, placeholder } = storeWith(new Uint8Array([1])); + const r = extractMediaAttachments(placeholder, store, { + imageMode: 'file-tag', + writeImageAttachment: () => '/tmp/a&"<>.png', + }); + + expect(r.parts).toEqual([ + { type: 'text', text: '' }, + ]); + }); + it('escapes media paths in generated tags', () => { const store = new ImageAttachmentStore(); const att = store.addVideo('video/mp4', '/tmp/a&"<>.mp4', 'sample.mp4'); diff --git a/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts b/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts index cab1bfeae..aef90731a 100644 --- a/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts +++ b/apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts @@ -1164,6 +1164,50 @@ command = "vim" ]); }); + it('sends pasted images as file tags when the selected model is text-only', async () => { + const { driver, session } = await makeDriver(makeSession(), { + getConfig: vi.fn(async () => ({ + models: { + k2: { + model: 'moonshot-v1', + maxContextSize: 100, + capabilities: ['thinking', 'tool_use'], + }, + }, + })), + }); + const imageStore = (driver as unknown as { imageStore: ImageAttachmentStore }).imageStore; + const attachment = imageStore.addImage(new Uint8Array([0xaa, 0xbb]), 'image/png', 1, 1); + + driver.handleUserInput(`describe ${attachment.placeholder}`); + + const promptCalls = (session.prompt as unknown as { mock: { calls: unknown[][] } }).mock + .calls; + const promptArg = promptCalls[0]?.[0]; + expect(promptArg).toEqual([ + { + type: 'text', + text: expect.stringMatching( + /^describe <\/image>$/, + ), + }, + ]); + expect(JSON.stringify(promptArg)).not.toContain('image_url'); + expect(driver.state.transcriptEntries).toEqual([ + expect.objectContaining({ + kind: 'user', + content: `describe ${attachment.placeholder}`, + imageAttachmentIds: [attachment.id], + }), + ]); + + const text = (promptArg as Array<{ text?: string }>)[0]?.text; + const path = text?.match(/<\/image>/)?.[1]; + if (path !== undefined) { + await rm(path, { force: true }); + } + }); + it('queues editor input instead of prompting while a turn is already streaming', async () => { const { driver, session, harness } = await makeDriver(); driver.state.appState.streamingPhase = 'waiting';