Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/pasted-image-text-model-path.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@moonshot-ai/kimi-code": patch
---

Save pasted images as local file references when the selected model does not support image input.
20 changes: 12 additions & 8 deletions apps/kimi-code/src/tui/kimi-tui.ts
Original file line number Diff line number Diff line change
Expand Up @@ -769,13 +769,24 @@ export class KimiTUI {
this.showError(LLM_NOT_SET_MESSAGE);
return;
}
const extraction = extractMediaAttachments(text, this.imageStore);
let extraction = extractMediaAttachments(text, this.imageStore);
if (!this.validateMediaCapabilities(extraction)) return;
const session = this.session;
if (session === undefined) {
this.showError(LLM_NOT_SET_MESSAGE);
return;
}
if (
extraction.imageAttachmentIds.length > 0 &&
!this.supportsCurrentModelCapability('image_in')
) {
try {
extraction = extractMediaAttachments(text, this.imageStore, { imageMode: 'file-tag' });
} catch (error) {
this.showError(`Failed to save pasted image: ${formatErrorMessage(error)}`);
return;
}
}
if (extraction.hasMedia) {
this.sendMessage(session, text, {
hasMedia: true,
Expand All @@ -793,13 +804,6 @@ export class KimiTUI {
extraction: ReturnType<typeof extractMediaAttachments>,
): boolean {
if (!extraction.hasMedia) return true;
if (
extraction.imageAttachmentIds.length > 0 &&
!this.supportsCurrentModelCapability('image_in')
) {
this.showError('Current model does not support image input.');
return false;
}
if (
extraction.videoAttachmentIds.length > 0 &&
!this.supportsCurrentModelCapability('video_in')
Expand Down
68 changes: 64 additions & 4 deletions apps/kimi-code/src/tui/utils/image-placeholder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,19 @@
* A literal `[image #999 ...]` the user typed themselves stays in
* the text (we can't hallucinate files for it).
* - Order is preserved for text/image/video segments. Image placeholders
* expand to image content parts so the prompt reaches the provider
* without relying on a model tool call. Video placeholders still expand
* to file-path tags so `ReadMediaFile` can own video upload behavior.
* expand to image content parts when the model can see images. For
* text-only models they can instead expand to file-path tags, matching
* the video path flow.
* - Adjacent text segments are flattened — empty / whitespace-only
* segments drop out so we never emit `{type:'text', text:' '}`
* noise between two media parts.
*/

import { randomUUID } from 'node:crypto';
import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';

import type { PromptPart } from '@moonshot-ai/kimi-code-sdk';

import type {
Expand All @@ -24,6 +29,15 @@ import type {
} from './image-attachment-store';

/**
 * Matches media placeholders of the form `[image #N …]` or `[video #N …]`.
 * Capture groups: 1 = media kind, 2 = numeric id, then either 3 = a
 * parenthesised `(W×H)` suffix or 4 = any other run of non-`]` text.
 */
const PLACEHOLDER_REGEX = /\[(image|video) #(\d+) (?:(\(\d+×\d+\))|([^\]]+))\]/g;
/** Path prefix (inside the OS temp dir) handed to `mkdtempSync` for the pasted-image directory. */
const PASTED_IMAGE_DIR_PREFIX = join(tmpdir(), 'kimi-code-pasted-images-');
/** Cached pasted-image directory; stays `undefined` until `getPastedImageDir` creates it. */
let pastedImageDir: string | undefined;

/**
 * How an image placeholder is expanded by `extractMediaAttachments`:
 * `'content-part'` pushes an image content part, `'file-tag'` pushes an
 * `<image path="…">` text tag instead.
 */
export type ImageExtractionMode = 'content-part' | 'file-tag';

/** Optional settings accepted by `extractMediaAttachments`. */
export interface ExtractionOptions {
/** Expansion mode for image placeholders; treated as `'content-part'` when omitted. */
readonly imageMode?: ImageExtractionMode;
/**
 * Used in `'file-tag'` mode to persist an image and return the path that is
 * placed in the generated tag. When omitted (or when it returns a nullish
 * value) the image is written by `writeImageAttachmentToTempFile` instead.
 */
readonly writeImageAttachment?: (attachment: ImageAttachment) => string;
}

export interface ExtractionResult {
/** Flat list of parts in input order; empty array when no media matched. */
Expand All @@ -42,6 +56,7 @@ export interface ExtractionResult {
export function extractMediaAttachments(
text: string,
store: ImageAttachmentStore,
options?: ExtractionOptions,
): ExtractionResult {
const parts: PromptPart[] = [];
const imageAttachmentIds: number[] = [];
Expand All @@ -66,7 +81,11 @@ export function extractMediaAttachments(
pushText(parts, mediaText);
videoAttachmentIds.push(id);
} else {
parts.push(imagePartForAttachment(attachment));
if ((options?.imageMode ?? 'content-part') === 'file-tag') {
pushText(parts, tagTextForImage(attachment, options));
} else {
parts.push(imagePartForAttachment(attachment));
}
imageAttachmentIds.push(id);
}
hasMedia = true;
Expand Down Expand Up @@ -113,10 +132,51 @@ function tagTextForVideo(att: VideoAttachment): string {
return formatMediaTag('video', att.sourcePath);
}

/**
 * Produces the `<image path="…">` text tag for a pasted image.
 *
 * The path is taken from `options.writeImageAttachment` when the caller
 * supplied one; otherwise the attachment is persisted through
 * `writeImageAttachmentToTempFile` and that path is used.
 */
function tagTextForImage(att: ImageAttachment, options: ExtractionOptions | undefined): string {
  // Kept as a member call so the writer is still invoked on `options`.
  const injectedPath = options?.writeImageAttachment?.(att);
  const imagePath = injectedPath ?? writeImageAttachmentToTempFile(att);
  return formatMediaTag('image', imagePath);
}

/**
 * Renders an empty `<image>` / `<video>` element whose `path` attribute holds
 * the given file path, escaped with `escapeAttribute`.
 */
function formatMediaTag(tag: 'image' | 'video', path: string): string {
  const safePath = escapeAttribute(path);
  const opening = '<' + tag + ' path="' + safePath + '">';
  const closing = '</' + tag + '>';
  return opening + closing;
}

function writeImageAttachmentToTempFile(att: ImageAttachment): string {
const path = join(
getPastedImageDir(),
`pasted-${String(Date.now())}-${String(att.id)}-${randomUUID()}${extensionForMime(att.mime)}`,
);
writeFileSync(path, att.bytes, { mode: 0o600 });

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Remove pasted-image temp files during cleanup

When a user pastes an image while using a model without image_in, this writes the raw pasted bytes under /tmp/kimi-code-pasted-images-*, but the new temp directory/files are never tracked or removed on successful send, queued-message cancellation, or KimiTUI.stop(). Pasted screenshots often contain sensitive data and can be large, so they will remain on disk across CLI sessions until the OS eventually cleans temp storage; please add lifecycle cleanup for the generated files/directories once they are no longer needed.

Useful? React with 👍 / 👎.

return path;
}

/**
 * Returns the directory that receives pasted images written for text-only
 * models, creating it with `mkdtempSync` on first use and caching it in
 * `pastedImageDir` afterwards.
 *
 * Pasted screenshots can be large and may contain sensitive data, so when the
 * directory is created a one-time `exit` hook is registered that removes the
 * directory and everything inside it. The files stay available for the whole
 * lifetime of the process and are cleaned up when it exits normally.
 *
 * @returns Absolute path of the pasted-image directory.
 * @throws Whatever `mkdtempSync` throws if the directory cannot be created.
 */
function getPastedImageDir(): string {
  if (pastedImageDir !== undefined) return pastedImageDir;

  const dir = mkdtempSync(PASTED_IMAGE_DIR_PREFIX);
  pastedImageDir = dir;

  // `exit` listeners must be synchronous, hence rmSync rather than fs/promises.
  process.once('exit', () => {
    try {
      rmSync(dir, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: nothing useful can be done about a failed
      // removal while the process is already shutting down.
    }
  });

  return dir;
}

/**
 * Maps an image MIME type to the file extension used for saved pasted images.
 *
 * Matching is case-insensitive, ignores any `;`-separated parameters, and
 * tolerates whitespace around the type (e.g. `"image/png ; charset=binary"`),
 * which previously fell through to the generic extension.
 *
 * @param mime - MIME type string, optionally followed by parameters.
 * @returns The extension including its leading dot, or `.img` when the type
 *   is not recognised.
 */
function extensionForMime(mime: string): string {
  const [rawType] = mime.split(';', 1);
  const type = (rawType ?? '').trim().toLowerCase();
  switch (type) {
    case 'image/png':
      return '.png';
    case 'image/jpeg':
    case 'image/jpg': // non-standard but commonly seen alias
      return '.jpg';
    case 'image/gif':
      return '.gif';
    case 'image/webp':
      return '.webp';
    case 'image/bmp':
      return '.bmp';
    case 'image/heic':
      return '.heic';
    case 'image/heif':
      return '.heif';
    default:
      return '.img';
  }
}

function escapeAttribute(value: string): string {
return value
.replaceAll('&', '&amp;')
Expand Down
27 changes: 27 additions & 0 deletions apps/kimi-code/test/tui/input/image-placeholder.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,33 @@ describe('extractMediaAttachments', () => {
});
});

it('expands image placeholders to file-path tags for text-only models', () => {
  // A stub writer keeps the test off the real filesystem.
  const { store, placeholder } = storeWith(new Uint8Array([0x89, 0x50, 0x4e, 0x47]));
  const promptText = `inspect ${placeholder}`;

  const result = extractMediaAttachments(promptText, store, {
    imageMode: 'file-tag',
    writeImageAttachment: (att) => `/tmp/kimi/pasted-${String(att.id)}.png`,
  });

  expect(result.hasMedia).toBe(true);
  expect(result.imageAttachmentIds).toEqual([1]);
  // The placeholder is replaced inline, so the whole prompt stays one text part.
  expect(result.parts).toEqual([
    { type: 'text', text: 'inspect <image path="/tmp/kimi/pasted-1.png"></image>' },
  ]);
});

it('escapes generated image path tags', () => {
  // The stub writer returns a path containing every character that needs escaping.
  const { store, placeholder } = storeWith(new Uint8Array([1]));

  const result = extractMediaAttachments(placeholder, store, {
    imageMode: 'file-tag',
    writeImageAttachment: () => '/tmp/a&"<>.png',
  });

  const expectedParts = [
    { type: 'text', text: '<image path="/tmp/a&amp;&quot;&lt;&gt;.png"></image>' },
  ];
  expect(result.parts).toEqual(expectedParts);
});

it('escapes media paths in generated tags', () => {
const store = new ImageAttachmentStore();
const att = store.addVideo('video/mp4', '/tmp/a&"<>.mp4', 'sample.mp4');
Expand Down
44 changes: 44 additions & 0 deletions apps/kimi-code/test/tui/kimi-tui-message-flow.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1164,6 +1164,50 @@ command = "vim"
]);
});

it('sends pasted images as file tags when the selected model is text-only', async () => {
  // The configured model's capabilities deliberately omit `image_in`.
  const { driver, session } = await makeDriver(makeSession(), {
    getConfig: vi.fn(async () => ({
      models: {
        k2: {
          model: 'moonshot-v1',
          maxContextSize: 100,
          capabilities: ['thinking', 'tool_use'],
        },
      },
    })),
  });
  const { imageStore } = driver as unknown as { imageStore: ImageAttachmentStore };
  const pasted = imageStore.addImage(new Uint8Array([0xaa, 0xbb]), 'image/png', 1, 1);
  const userText = `describe ${pasted.placeholder}`;

  driver.handleUserInput(userText);

  const promptMock = (session.prompt as unknown as { mock: { calls: unknown[][] } }).mock;
  const firstPromptArg = promptMock.calls[0]?.[0];

  // The prompt must carry a file-path tag and no inline image payload.
  expect(firstPromptArg).toEqual([
    {
      type: 'text',
      text: expect.stringMatching(
        /^describe <image path=".*kimi-code-pasted-images.*\.png"><\/image>$/,
      ),
    },
  ]);
  expect(JSON.stringify(firstPromptArg)).not.toContain('image_url');

  // The transcript keeps the user's original placeholder text.
  expect(driver.state.transcriptEntries).toEqual([
    expect.objectContaining({
      kind: 'user',
      content: userText,
      imageAttachmentIds: [pasted.id],
    }),
  ]);

  // Remove the image file this test caused to be written.
  const sentText = (firstPromptArg as Array<{ text?: string }>)[0]?.text;
  const savedPath = sentText?.match(/<image path="([^"]+)"><\/image>/)?.[1];
  if (savedPath !== undefined) {
    await rm(savedPath, { force: true });
  }
});

it('queues editor input instead of prompting while a turn is already streaming', async () => {
const { driver, session, harness } = await makeDriver();
driver.state.appState.streamingPhase = 'waiting';
Expand Down