From f36bd80f8ef25c0cca3eddc3b876921b0946ddf6 Mon Sep 17 00:00:00 2001 From: guglxni Date: Sat, 20 Jun 2026 18:40:25 +0530 Subject: [PATCH 1/4] feat: add azure-foundry provider for Microsoft Foundry model access Introduce a first-class azure-foundry provider that targets Foundry's OpenAI v1-compatible route with api-key authentication, so users no longer need to hand-wire the generic openai provider for Azure deployments. Closes #918 Co-authored-by: Cursor --- .changeset/azure-foundry-provider.md | 5 + docs/en/configuration/config-files.md | 2 +- docs/en/configuration/env-vars.md | 2 + docs/en/configuration/providers.md | 25 +++++ docs/zh/configuration/config-files.md | 2 +- docs/zh/configuration/env-vars.md | 2 + docs/zh/configuration/providers.md | 25 +++++ packages/agent-core/src/config/schema.ts | 1 + .../modelCatalog/modelCatalogService.ts | 2 + .../src/session/provider-manager.ts | 11 ++ .../test/harness/runtime-provider.test.ts | 31 ++++++ packages/kosong/src/catalog.ts | 2 + .../kosong/src/providers/azure-foundry.ts | 72 +++++++++++++ packages/kosong/src/providers/index.ts | 5 + packages/kosong/test/azure-foundry.test.ts | 100 ++++++++++++++++++ packages/kosong/test/catalog.test.ts | 5 + packages/kosong/tsdown.config.ts | 1 + packages/oauth/src/custom-registry.ts | 2 + 18 files changed, 293 insertions(+), 2 deletions(-) create mode 100644 .changeset/azure-foundry-provider.md create mode 100644 packages/kosong/src/providers/azure-foundry.ts create mode 100644 packages/kosong/test/azure-foundry.test.ts diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md new file mode 100644 index 000000000..1c01dcefc --- /dev/null +++ b/.changeset/azure-foundry-provider.md @@ -0,0 +1,5 @@ +--- +"@moonshot-ai/kimi-code": minor +--- + +Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. 
diff --git a/docs/en/configuration/config-files.md b/docs/en/configuration/config-files.md index 0d64f5044..cd20b9dfe 100644 --- a/docs/en/configuration/config-files.md +++ b/docs/en/configuration/config-files.md @@ -100,7 +100,7 @@ Each entry in the `providers` table defines an API provider, keyed by a unique n | Field | Type | Required | Description | | --- | --- | --- | --- | -| `type` | `string` | Yes | Provider type: `kimi`, `anthropic`, `openai`, `openai_responses`, `google-genai`, `vertexai` | +| `type` | `string` | Yes | Provider type: `kimi`, `anthropic`, `openai`, `openai_responses`, `azure-foundry`, `google-genai`, `vertexai` | | `api_key` | `string` | No | API key, written in plain text in the config file | | `base_url` | `string` | No | API base URL | | `oauth` | `table` | No | OAuth credential reference (`storage` and `key` fields); injected automatically by the login flow — normally no need to write this by hand | diff --git a/docs/en/configuration/env-vars.md b/docs/en/configuration/env-vars.md index 933232511..791d0e88a 100644 --- a/docs/en/configuration/env-vars.md +++ b/docs/en/configuration/env-vars.md @@ -58,6 +58,8 @@ Key names per provider: | `ANTHROPIC_BASE_URL` | Anthropic | Follows Anthropic SDK default | | `OPENAI_API_KEY` | OpenAI (`openai` and `openai_responses`) | None | | `OPENAI_BASE_URL` | OpenAI (`openai` and `openai_responses`) | `https://api.openai.com/v1` | +| `AZURE_FOUNDRY_API_KEY` | Microsoft Foundry (`azure-foundry`) | None | +| `AZURE_FOUNDRY_BASE_URL` | Microsoft Foundry (`azure-foundry`) | None | | `GOOGLE_API_KEY` | Google GenAI, Vertex AI | None | | `VERTEXAI_API_KEY` | Vertex AI | None | | `GOOGLE_CLOUD_PROJECT` | Vertex AI | None | diff --git a/docs/en/configuration/providers.md b/docs/en/configuration/providers.md index 8fed5c4e1..89c5f0c65 100644 --- a/docs/en/configuration/providers.md +++ b/docs/en/configuration/providers.md @@ -12,6 +12,7 @@ The `type` field in the `providers` table determines which protocol 
implementati | `anthropic` | Anthropic Messages | Claude model family | | `openai` | OpenAI Chat Completions | OpenAI and compatible services, DeepSeek, Qwen, etc. | | `openai_responses` | OpenAI Responses API | OpenAI's newer Responses interface | +| `azure-foundry` | Microsoft Foundry (OpenAI v1) | Azure AI Foundry model deployments (GPT, DeepSeek, Llama, Mistral, etc.) | | `google-genai` | Google GenAI | Gemini API | | `vertexai` | Google GenAI on Vertex | Google Cloud Vertex AI | @@ -107,6 +108,30 @@ base_url = "https://api.openai.com/v1" api_key = "sk-xxxxx" ``` +## `azure-foundry` + +For connecting to [Microsoft Foundry](https://learn.microsoft.com/en-us/azure/foundry/) model deployments through the OpenAI v1-compatible inference route. Foundry hosts multiple model families — OpenAI GPT, DeepSeek, Meta Llama, Mistral, and others sold directly by Azure — not just OpenAI models. Put the model ID from your Foundry deployment in `[models.<alias>]`. + +Microsoft recommends the OpenAI v1 route for third-party SDKs and custom applications. See [Integrate Microsoft Foundry with your applications](https://learn.microsoft.com/en-us/azure/foundry/how-to/integrate-with-other-apps). + +- Recommended `base_url`: `https://{resource}.openai.azure.com/openai/v1` +- Credential key names: `AZURE_FOUNDRY_API_KEY`, `AZURE_FOUNDRY_BASE_URL` +- Auth: sends the Foundry `api-key` header + +```toml +[providers.foundry] +type = "azure-foundry" +base_url = "https://YOUR-RESOURCE.openai.azure.com/openai/v1" +api_key = "YOUR_KEY" + +[models.foundry-gpt4o] +provider = "foundry" +model = "gpt-4o" +max_context_size = 128000 +``` + +Third-party reasoning models on Foundry work the same way as on the generic `openai` provider: set `reasoning_key` on the model alias when your gateway returns reasoning content under a non-standard field name. + ## `google-genai` For connecting directly to the Google Gemini API. Thinking, vision, and multimodal capabilities are auto-detected by model name.
diff --git a/docs/zh/configuration/config-files.md b/docs/zh/configuration/config-files.md index e0a215f56..f29b2fa22 100644 --- a/docs/zh/configuration/config-files.md +++ b/docs/zh/configuration/config-files.md @@ -100,7 +100,7 @@ timeout = 5 | 字段 | 类型 | 必填 | 说明 | | --- | --- | --- | --- | -| `type` | `string` | 是 | 供应商类型:`kimi`、`anthropic`、`openai`、`openai_responses`、`google-genai`、`vertexai` | +| `type` | `string` | 是 | 供应商类型:`kimi`、`anthropic`、`openai`、`openai_responses`、`azure-foundry`、`google-genai`、`vertexai` | | `api_key` | `string` | 否 | API 密钥,明文写在配置文件里 | | `base_url` | `string` | 否 | API 基础 URL | | `oauth` | `table` | 否 | OAuth 凭据引用(`storage`、`key` 两个字段),由登录流程自动注入,通常无需手写 | diff --git a/docs/zh/configuration/env-vars.md b/docs/zh/configuration/env-vars.md index 227b1ced1..46b87eb4c 100644 --- a/docs/zh/configuration/env-vars.md +++ b/docs/zh/configuration/env-vars.md @@ -58,6 +58,8 @@ KIMI_BASE_URL = "https://api.moonshot.ai/v1" | `ANTHROPIC_BASE_URL` | Anthropic | Anthropic SDK 默认值 | | `OPENAI_API_KEY` | OpenAI(`openai` 和 `openai_responses`) | 无 | | `OPENAI_BASE_URL` | OpenAI(`openai` 和 `openai_responses`) | `https://api.openai.com/v1` | +| `AZURE_FOUNDRY_API_KEY` | Microsoft Foundry(`azure-foundry`) | 无 | +| `AZURE_FOUNDRY_BASE_URL` | Microsoft Foundry(`azure-foundry`) | 无 | | `GOOGLE_API_KEY` | Google GenAI、Vertex AI | 无 | | `VERTEXAI_API_KEY` | Vertex AI | 无 | | `GOOGLE_CLOUD_PROJECT` | Vertex AI | 无 | diff --git a/docs/zh/configuration/providers.md b/docs/zh/configuration/providers.md index 41aae2736..fed4d57ba 100644 --- a/docs/zh/configuration/providers.md +++ b/docs/zh/configuration/providers.md @@ -12,6 +12,7 @@ Kimi Code CLI 支持同时接入多家 LLM 平台——用 Kimi Code 托管服 | `anthropic` | Anthropic Messages | Claude 系列模型 | | `openai` | OpenAI Chat Completions | OpenAI 及兼容服务、DeepSeek、Qwen 等 | | `openai_responses` | OpenAI Responses API | OpenAI 较新的 Responses 接口 | +| `azure-foundry` | Microsoft Foundry(OpenAI v1) | Azure AI Foundry 
模型部署(GPT、DeepSeek、Llama、Mistral 等) | | `google-genai` | Google GenAI | Gemini API | | `vertexai` | Google GenAI on Vertex | Google Cloud Vertex AI | @@ -107,6 +108,30 @@ base_url = "https://api.openai.com/v1" api_key = "sk-xxxxx" ``` +## `azure-foundry` + +用于连接 [Microsoft Foundry](https://learn.microsoft.com/en-us/azure/foundry/) 上的模型部署,走 OpenAI v1 兼容推理路由。Foundry 托管多种模型家族——OpenAI GPT、DeepSeek、Meta Llama、Mistral 等 Azure 直售模型,并非只有 OpenAI。在 `[models.<alias>]` 中填写 Foundry 部署的模型 ID。 + +Microsoft 建议第三方 SDK 和自定义应用使用 OpenAI v1 路由。详见 [Integrate Microsoft Foundry with your applications](https://learn.microsoft.com/en-us/azure/foundry/how-to/integrate-with-other-apps)。 + +- 推荐 `base_url`:`https://{resource}.openai.azure.com/openai/v1` +- 凭证键名:`AZURE_FOUNDRY_API_KEY`、`AZURE_FOUNDRY_BASE_URL` +- 认证:发送 Foundry 的 `api-key` 请求头 + +```toml +[providers.foundry] +type = "azure-foundry" +base_url = "https://YOUR-RESOURCE.openai.azure.com/openai/v1" +api_key = "YOUR_KEY" + +[models.foundry-gpt4o] +provider = "foundry" +model = "gpt-4o" +max_context_size = 128000 +``` + +Foundry 上的第三方推理模型与通用 `openai` 供应商用法相同:若网关以非标准字段返回推理内容,可在模型别名上设置 `reasoning_key`。 + ## `google-genai` 用于直连 Google Gemini API。thinking、视觉及多模态能力按模型名自动识别。 diff --git a/packages/agent-core/src/config/schema.ts b/packages/agent-core/src/config/schema.ts index 9b3d11cf0..d58655c15 100644 --- a/packages/agent-core/src/config/schema.ts +++ b/packages/agent-core/src/config/schema.ts @@ -5,6 +5,7 @@ import { z } from 'zod'; export const ProviderTypeSchema = z.enum([ 'anthropic', + 'azure-foundry', 'openai', 'kimi', 'google-genai', diff --git a/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts b/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts index bd8eb79f3..233a209df 100644 --- a/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts +++ b/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts @@ -214,6 +214,8 @@ function hasConfiguredApiKey(provider: ProviderConfig):
boolean { case 'openai': case 'openai_responses': return nonEmpty(provider.env?.['OPENAI_API_KEY']) !== undefined; + case 'azure-foundry': + return nonEmpty(provider.env?.['AZURE_FOUNDRY_API_KEY']) !== undefined; case 'kimi': return nonEmpty(provider.env?.['KIMI_API_KEY']) !== undefined; case 'google-genai': diff --git a/packages/agent-core/src/session/provider-manager.ts b/packages/agent-core/src/session/provider-manager.ts index 34dc82091..b447b7f8c 100644 --- a/packages/agent-core/src/session/provider-manager.ts +++ b/packages/agent-core/src/session/provider-manager.ts @@ -245,6 +245,15 @@ function toKosongProviderConfig( reasoningKey, ...defaultHeadersField(provider.customHeaders), }; + case 'azure-foundry': + return { + type: 'azure-foundry', + model, + baseUrl: providerValue(provider.baseUrl, provider.env, 'AZURE_FOUNDRY_BASE_URL'), + apiKey: providerApiKey(provider), + reasoningKey, + ...defaultHeadersField(provider.customHeaders), + }; case 'kimi': return { type: 'kimi', @@ -306,6 +315,8 @@ function providerApiKey(provider: ProviderConfig): string | undefined { case 'openai': case 'openai_responses': return providerValue(provider.apiKey, provider.env, 'OPENAI_API_KEY'); + case 'azure-foundry': + return providerValue(provider.apiKey, provider.env, 'AZURE_FOUNDRY_API_KEY'); case 'kimi': return providerValue(provider.apiKey, provider.env, 'KIMI_API_KEY'); case 'google-genai': diff --git a/packages/agent-core/test/harness/runtime-provider.test.ts b/packages/agent-core/test/harness/runtime-provider.test.ts index c93283b65..0eaee9b93 100644 --- a/packages/agent-core/test/harness/runtime-provider.test.ts +++ b/packages/agent-core/test/harness/runtime-provider.test.ts @@ -532,6 +532,37 @@ describe('resolveRuntimeProvider customHeaders propagation', () => { }); }); + it('resolves an azure-foundry provider with env credential keys', () => { + const resolved = resolveRuntimeProvider({ + config: { + defaultModel: 'foundry-alias', + providers: { + foundry: { + type: 
'azure-foundry', + env: { + AZURE_FOUNDRY_API_KEY: 'foundry-key', + AZURE_FOUNDRY_BASE_URL: 'https://example.openai.azure.com/openai/v1', + }, + }, + }, + models: { + 'foundry-alias': { + provider: 'foundry', + model: 'gpt-4o', + maxContextSize: 128000, + }, + }, + }, + }); + + expect(resolved.provider).toMatchObject({ + type: 'azure-foundry', + apiKey: 'foundry-key', + baseUrl: 'https://example.openai.azure.com/openai/v1', + model: 'gpt-4o', + }); + }); + it('forwards customHeaders to an openai_responses provider', () => { const resolved = resolveRuntimeProvider({ config: { diff --git a/packages/kosong/src/catalog.ts b/packages/kosong/src/catalog.ts index 40975430c..40ba29edd 100644 --- a/packages/kosong/src/catalog.ts +++ b/packages/kosong/src/catalog.ts @@ -48,6 +48,7 @@ export interface CatalogModel { const KNOWN_WIRE_TYPES = [ 'anthropic', + 'azure-foundry', 'openai', 'kimi', 'google-genai', @@ -87,6 +88,7 @@ export function inferWireType(entry: CatalogProviderEntry): ProviderType | undef if (npm.includes('anthropic') || id.includes('anthropic') || id.includes('claude')) { return 'anthropic'; } + if (id.includes('azure') || id.includes('foundry')) return 'azure-foundry'; if (id.includes('vertex')) return 'vertexai'; if (npm.includes('google') || id.includes('google') || id.includes('gemini')) { return 'google-genai'; diff --git a/packages/kosong/src/providers/azure-foundry.ts b/packages/kosong/src/providers/azure-foundry.ts new file mode 100644 index 000000000..0fd53b42e --- /dev/null +++ b/packages/kosong/src/providers/azure-foundry.ts @@ -0,0 +1,72 @@ +import type { ProviderRequestAuth } from '#/provider'; +import OpenAI from 'openai'; + +import { + OpenAILegacyChatProvider, + type OpenAILegacyOptions, +} from './openai-legacy'; +import { mergeRequestHeaders, requireProviderApiKey } from './request-auth'; + +export type AzureFoundryOptions = OpenAILegacyOptions; + +function normalizeAzureFoundryBaseUrl(baseUrl: string | undefined): string | undefined { + 
const trimmed = baseUrl?.trim(); + if (trimmed === undefined || trimmed.length === 0) return undefined; + return trimmed.replace(/\/+$/, ''); +} + +function buildAzureFoundryClient( + apiKey: string, + baseUrl: string | undefined, + defaultHeaders: Record | undefined, + httpClient: unknown, + auth?: ProviderRequestAuth, +): OpenAI { + const key = requireProviderApiKey('AzureFoundryChatProvider', auth, apiKey); + const headers: Record = { authorization: null, 'api-key': key }; + const merged = mergeRequestHeaders(defaultHeaders, auth?.headers); + if (merged !== undefined) { + for (const [name, value] of Object.entries(merged)) { + headers[name.toLowerCase()] = value; + } + } + headers['api-key'] = key; + + const clientOpts: Record = { + apiKey: key, + baseURL: baseUrl, + defaultHeaders: headers, + }; + if (httpClient !== undefined) { + clientOpts['httpClient'] = httpClient; + } + return new OpenAI(clientOpts as ConstructorParameters[0]); +} + +/** + * Microsoft Foundry chat provider. + * + * Targets Foundry's OpenAI v1-compatible inference route + * (`https://{resource}.openai.azure.com/openai/v1`) and authenticates with + * the Foundry `api-key` header rather than Bearer auth. + */ +export class AzureFoundryChatProvider extends OpenAILegacyChatProvider { + override readonly name = 'azure-foundry'; + + constructor(options: AzureFoundryOptions) { + const baseUrl = normalizeAzureFoundryBaseUrl(options.baseUrl); + const apiKey = options.apiKey; + super({ + ...options, + baseUrl, + clientFactory: (auth) => + buildAzureFoundryClient( + apiKey ?? 
'', + baseUrl, + options.defaultHeaders, + options.httpClient, + auth, + ), + }); + } +} diff --git a/packages/kosong/src/providers/index.ts b/packages/kosong/src/providers/index.ts index d95e9c58e..3d4f38cb4 100644 --- a/packages/kosong/src/providers/index.ts +++ b/packages/kosong/src/providers/index.ts @@ -1,6 +1,7 @@ import { UNKNOWN_CAPABILITY, type ModelCapability } from '../capability'; import type { ChatProvider } from '../provider'; import { AnthropicChatProvider, type AnthropicOptions } from './anthropic'; +import { AzureFoundryChatProvider, type AzureFoundryOptions } from './azure-foundry'; import { getAnthropicModelCapability, getGoogleGenAIModelCapability, @@ -14,6 +15,7 @@ import { OpenAIResponsesChatProvider, type OpenAIResponsesOptions } from './open export type ProviderConfig = | ({ type: 'anthropic' } & AnthropicOptions) + | ({ type: 'azure-foundry' } & AzureFoundryOptions) | ({ type: 'openai' } & OpenAILegacyOptions) | ({ type: 'kimi' } & KimiOptions) | ({ type: 'google-genai' } & GoogleGenAIOptions) @@ -26,6 +28,8 @@ export function createProvider(config: ProviderConfig): ChatProvider { switch (config.type) { case 'anthropic': return new AnthropicChatProvider(config); + case 'azure-foundry': + return new AzureFoundryChatProvider(config); case 'openai': return new OpenAILegacyChatProvider(config); case 'kimi': @@ -55,6 +59,7 @@ export function getModelCapability(wire: ProviderType, modelName: string): Model switch (wire) { case 'anthropic': return getAnthropicModelCapability(modelName); + case 'azure-foundry': case 'openai': return getOpenAILegacyModelCapability(modelName); case 'openai_responses': diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts new file mode 100644 index 000000000..8db86847e --- /dev/null +++ b/packages/kosong/test/azure-foundry.test.ts @@ -0,0 +1,100 @@ +import { AzureFoundryChatProvider } from '#/providers/azure-foundry'; +import { describe, expect, it } from 'vitest'; + 
+import { createFakeProviderHarness, type FakeProviderHarness } from './e2e/fake-provider-harness'; + +async function withHarness(fn: (harness: FakeProviderHarness) => Promise): Promise { + const harness = await createFakeProviderHarness(); + try { + return await fn(harness); + } finally { + await harness.close(); + } +} + +describe('AzureFoundryChatProvider', () => { + it('uses the azure-foundry provider name', () => { + const provider = new AzureFoundryChatProvider({ + model: 'gpt-4o', + apiKey: 'test-key', + baseUrl: 'https://example.openai.azure.com/openai/v1', + }); + expect(provider.name).toBe('azure-foundry'); + }); + + it('sends Foundry api-key auth instead of Bearer for chat completions', async () => { + await withHarness(async (harness) => { + harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => { + expect(request.headers['api-key']).toBe('foundry-key'); + expect(request.headers['authorization']).toBeUndefined(); + await reply.sseJson(200, [ + { + id: 'chatcmpl-azure-1', + object: 'chat.completion.chunk', + created: 1234567890, + model: 'gpt-4o', + choices: [{ index: 0, delta: { content: 'Hello' }, finish_reason: null }], + }, + { + id: 'chatcmpl-azure-1', + object: 'chat.completion.chunk', + created: 1234567890, + model: 'gpt-4o', + choices: [ + { + index: 0, + delta: {}, + finish_reason: 'stop', + }, + ], + }, + ]); + }); + + const provider = new AzureFoundryChatProvider({ + model: 'gpt-4o', + apiKey: 'foundry-key', + baseUrl: `${harness.baseUrl}/openai/v1`, + }); + const stream = await provider.generate('You are helpful.', [], [ + { role: 'user', content: [{ type: 'text', text: 'Hi' }], toolCalls: [] }, + ]); + const parts = []; + for await (const part of stream) { + parts.push(part); + } + expect(parts).toEqual([{ type: 'text', text: 'Hello' }]); + }); + }); + + it('strips trailing slashes from base_url', async () => { + await withHarness(async (harness) => { + let capturedPath = ''; + harness.route('POST', 
'/openai/v1/chat/completions', async (request, reply) => { + capturedPath = request.pathname; + await reply.sseJson(200, [ + { + id: 'chatcmpl-azure-2', + object: 'chat.completion.chunk', + created: 1234567890, + model: 'gpt-4o', + choices: [{ index: 0, delta: { content: 'ok' }, finish_reason: 'stop' }], + }, + ]); + }); + + const provider = new AzureFoundryChatProvider({ + model: 'gpt-4o', + apiKey: 'foundry-key', + baseUrl: `${harness.baseUrl}/openai/v1/`, + }); + const stream = await provider.generate('', [], [ + { role: 'user', content: [{ type: 'text', text: 'Hi' }], toolCalls: [] }, + ]); + for await (const _part of stream) { + // drain + } + expect(capturedPath).toBe('/openai/v1/chat/completions'); + }); + }); +}); diff --git a/packages/kosong/test/catalog.test.ts b/packages/kosong/test/catalog.test.ts index 5780e5502..7abbb5861 100644 --- a/packages/kosong/test/catalog.test.ts +++ b/packages/kosong/test/catalog.test.ts @@ -23,6 +23,11 @@ describe('inferWireType', () => { expect(inferWireType({ id: 'google-vertex' })).toBe('vertexai'); }); + it('infers azure-foundry from azure or foundry ids', () => { + expect(inferWireType({ id: 'azure-foundry' })).toBe('azure-foundry'); + expect(inferWireType({ id: 'microsoft-foundry' })).toBe('azure-foundry'); + }); + it('returns undefined for unknown / invalid wire types', () => { expect(inferWireType({ id: 'some-proxy' })).toBeUndefined(); expect(inferWireType({ id: 'x', type: 'not-a-wire' })).toBeUndefined(); diff --git a/packages/kosong/tsdown.config.ts b/packages/kosong/tsdown.config.ts index 00783406f..c636ecde0 100644 --- a/packages/kosong/tsdown.config.ts +++ b/packages/kosong/tsdown.config.ts @@ -5,6 +5,7 @@ export default defineConfig({ './src/index.ts', './src/providers/kimi.ts', './src/providers/openai-legacy.ts', + './src/providers/azure-foundry.ts', './src/providers/openai-responses.ts', './src/providers/anthropic.ts', './src/providers/google-genai.ts', diff --git a/packages/oauth/src/custom-registry.ts 
b/packages/oauth/src/custom-registry.ts index 0c5d720f7..c3578175d 100644 --- a/packages/oauth/src/custom-registry.ts +++ b/packages/oauth/src/custom-registry.ts @@ -24,6 +24,7 @@ export interface CustomRegistrySource { */ export type CustomRegistryProviderType = | 'anthropic' + | 'azure-foundry' | 'openai' | 'openai_responses' | 'kimi'; @@ -59,6 +60,7 @@ export const CUSTOM_REGISTRY_DEFAULT_CAPABILITIES = ['tool_use'] as const; const ALLOWED_PROVIDER_TYPES: ReadonlySet = new Set([ 'anthropic', + 'azure-foundry', 'openai', 'openai_responses', 'kimi', From 23ff2e18b83d6c7889326b4bfc3819a042047bd3 Mon Sep 17 00:00:00 2001 From: guglxni Date: Sat, 20 Jun 2026 21:02:26 +0530 Subject: [PATCH 2/4] fix: harden azure-foundry provider for Foundry runtime issues Require base_url before constructing the Foundry client so api-key auth never falls back to the default OpenAI host. Clamp completion budgets against Foundry's shared input+output context window and recover once when a model stalls after tool results without issuing further tool calls. Addresses Codex review on #950. Relates to #918 and #520. 
Co-authored-by: Cursor --- .changeset/azure-foundry-provider.md | 2 +- packages/agent-core/src/agent/turn/index.ts | 29 +++++++ .../src/agent/turn/tool-stall-recovery.ts | 20 +++++ .../src/session/provider-manager.ts | 15 +++- .../agent-core/src/utils/completion-budget.ts | 13 ++-- .../test/agent/tool-stall-recovery.test.ts | 32 ++++++++ packages/agent-core/test/agent/turn.test.ts | 23 ++++++ .../test/harness/runtime-provider.test.ts | 24 ++++++ .../test/utils/completion-budget.test.ts | 18 +++-- .../kosong/src/providers/azure-foundry.ts | 22 +++++- .../kosong/src/providers/openai-legacy.ts | 16 +++- .../src/providers/shared-context-window.ts | 47 +++++++++++ packages/kosong/src/token-estimate.ts | 77 +++++++++++++++++++ packages/kosong/test/azure-foundry.test.ts | 57 ++++++++++++++ .../kosong/test/shared-context-window.test.ts | 32 ++++++++ 15 files changed, 410 insertions(+), 17 deletions(-) create mode 100644 packages/agent-core/src/agent/turn/tool-stall-recovery.ts create mode 100644 packages/agent-core/test/agent/tool-stall-recovery.test.ts create mode 100644 packages/kosong/src/providers/shared-context-window.ts create mode 100644 packages/kosong/src/token-estimate.ts create mode 100644 packages/kosong/test/shared-context-window.test.ts diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md index 1c01dcefc..1863fb07c 100644 --- a/.changeset/azure-foundry-provider.md +++ b/.changeset/azure-foundry-provider.md @@ -2,4 +2,4 @@ "@moonshot-ai/kimi-code": minor --- -Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. +Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window so Foundry-hosted Kimi models do not overflow on the first request. 
diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts index 7fff37876..e6d2182a4 100644 --- a/packages/agent-core/src/agent/turn/index.ts +++ b/packages/agent-core/src/agent/turn/index.ts @@ -40,6 +40,11 @@ import { USER_PROMPT_ORIGIN, type PromptOrigin } from '../context'; import { renderUserPromptHookBlockResult, renderUserPromptHookResult } from '../../session/hooks'; import { canonicalTelemetryArgs, isPlainRecord } from './canonical-args'; import { ToolCallDeduplicator } from './tool-dedup'; +import { + hasToolResultsSinceLastUserMessage, + TOOL_STALL_RECOVERY_NAME, + TOOL_STALL_RECOVERY_TEXT, +} from './tool-stall-recovery'; interface ActiveTurn { readonly turnId: number; @@ -613,6 +618,7 @@ export class TurnFlow { private async runStepLoop(turnId: number, signal: AbortSignal): Promise { let stopHookContinuationUsed = false; let goalOutcomeMessageContinuationUsed = false; + let toolStallContinuationUsed = false; const deduper = new ToolCallDeduplicator({ telemetry: this.agent.telemetry }); await this.agent.mcp?.waitForInitialLoad(signal); // Surface the active goal at the start of the turn (append-only; no-op when @@ -679,6 +685,29 @@ export class TurnFlow { return { continue: true }; } + // 3b. Recover once when the model ends a step without tools after + // tool results already landed in the same turn (common with + // shared-window thinking models that stop after long reasoning). + if ( + !toolStallContinuationUsed && + ctx.stopReason === 'end_turn' && + ctx.stepNumber > 1 && + hasToolResultsSinceLastUserMessage(this.agent.context.messages) + ) { + if (!hasStepBudgetRemaining(loopControl?.maxStepsPerTurn, ctx.stepNumber)) { + return { continue: false }; + } + toolStallContinuationUsed = true; + this.agent.context.appendUserMessage( + [{ type: 'text', text: TOOL_STALL_RECOVERY_TEXT }], + { + kind: 'system_trigger', + name: TOOL_STALL_RECOVERY_NAME, + }, + ); + return { continue: true }; + } + // 3. 
The external Stop hook gets exactly one continuation; the cap // is intentionally separate from (and does not cap) goal mode. if (!stopHookContinuationUsed) { diff --git a/packages/agent-core/src/agent/turn/tool-stall-recovery.ts b/packages/agent-core/src/agent/turn/tool-stall-recovery.ts new file mode 100644 index 000000000..7c6a63f04 --- /dev/null +++ b/packages/agent-core/src/agent/turn/tool-stall-recovery.ts @@ -0,0 +1,20 @@ +import type { Message } from '@moonshot-ai/kosong'; + +export const TOOL_STALL_RECOVERY_NAME = 'tool_stall_recovery'; + +export const TOOL_STALL_RECOVERY_TEXT = + '\n' + + 'Your previous step ended without calling any tools even though more work remains on the user request. ' + + 'Call the appropriate tools now instead of only describing what you plan to do next.\n' + + ''; + +/** True when tool results appear after the latest user message in the turn history. */ +export function hasToolResultsSinceLastUserMessage(messages: readonly Message[]): boolean { + for (let index = messages.length - 1; index >= 0; index -= 1) { + const message = messages[index]; + if (message === undefined) continue; + if (message.role === 'user') return false; + if (message.role === 'tool') return true; + } + return false; +} diff --git a/packages/agent-core/src/session/provider-manager.ts b/packages/agent-core/src/session/provider-manager.ts index b447b7f8c..82209dfd8 100644 --- a/packages/agent-core/src/session/provider-manager.ts +++ b/packages/agent-core/src/session/provider-manager.ts @@ -109,6 +109,7 @@ export class ProviderManager implements ModelProvider { alias.model, this.options.kimiRequestHeaders, alias.maxOutputSize, + alias.maxContextSize, alias.reasoningKey, this.options.promptCacheKey, alias.adaptiveThinking, @@ -221,6 +222,7 @@ function toKosongProviderConfig( model: string, kimiRequestHeaders: Record | undefined, maxOutputSize: number | undefined, + maxContextSize: number | undefined, reasoningKey: string | undefined, promptCacheKey: string | 
undefined, adaptiveThinking: boolean | undefined, @@ -245,15 +247,24 @@ function toKosongProviderConfig( reasoningKey, ...defaultHeadersField(provider.customHeaders), }; - case 'azure-foundry': + case 'azure-foundry': { + const baseUrl = providerValue(provider.baseUrl, provider.env, 'AZURE_FOUNDRY_BASE_URL'); + if (baseUrl === undefined) { + throw new KimiError( + ErrorCodes.MODEL_CONFIG_INVALID, + 'Provider type "azure-foundry" requires base_url (or AZURE_FOUNDRY_BASE_URL in [providers..env]). Example: https://YOUR-RESOURCE.openai.azure.com/openai/v1', + ); + } return { type: 'azure-foundry', model, - baseUrl: providerValue(provider.baseUrl, provider.env, 'AZURE_FOUNDRY_BASE_URL'), + baseUrl, apiKey: providerApiKey(provider), reasoningKey, + sharedContextWindowTokens: maxContextSize, ...defaultHeadersField(provider.customHeaders), }; + } case 'kimi': return { type: 'kimi', diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts index ceb086ef2..17f901c02 100644 --- a/packages/agent-core/src/utils/completion-budget.ts +++ b/packages/agent-core/src/utils/completion-budget.ts @@ -50,18 +50,21 @@ function parseEnvBudget(raw: string | undefined): EnvBudget { /** * Compute the effective `max_completion_tokens` cap. + * + * Uses the explicit hard cap or reserved-context fallback when set, clamped + * to the model context window. Shared-window providers reject requests where + * input tokens plus max_completion_tokens exceed the total window. */ export function computeCompletionBudgetCap(args: { readonly budget: CompletionBudgetConfig; readonly capability: ModelCapability | undefined; }): number { const maxCtx = args.capability?.max_context_tokens ?? 0; - // The provider backend computes the safe request-specific value from the - // serialized prompt. Locally using the largest cap avoids cutting off - // thinking before the model produces a summary. - const cap = + const requested = args.budget.hardCap ?? 
- (maxCtx > 0 ? maxCtx : args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK); + args.budget.fallback ?? + (maxCtx > 0 ? maxCtx : DEFAULT_UNKNOWN_CONTEXT_FALLBACK); + const cap = maxCtx > 0 ? Math.min(requested, maxCtx) : requested; return Math.max(MIN_FLOOR, cap); } diff --git a/packages/agent-core/test/agent/tool-stall-recovery.test.ts b/packages/agent-core/test/agent/tool-stall-recovery.test.ts new file mode 100644 index 000000000..a0b074c9d --- /dev/null +++ b/packages/agent-core/test/agent/tool-stall-recovery.test.ts @@ -0,0 +1,32 @@ +import type { Message } from '@moonshot-ai/kosong'; +import { describe, expect, it } from 'vitest'; + +import { hasToolResultsSinceLastUserMessage } from '../../src/agent/turn/tool-stall-recovery'; + +describe('hasToolResultsSinceLastUserMessage', () => { + it('returns false when the latest user message has no trailing tool results', () => { + const messages: Message[] = [ + { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] }, + { role: 'assistant', content: [{ type: 'text', text: 'hello' }], toolCalls: [] }, + ]; + expect(hasToolResultsSinceLastUserMessage(messages)).toBe(false); + }); + + it('returns true when tool results follow the latest user message', () => { + const messages: Message[] = [ + { role: 'user', content: [{ type: 'text', text: 'explore' }], toolCalls: [] }, + { + role: 'assistant', + content: [{ type: 'text', text: 'reading' }], + toolCalls: [{ type: 'function', id: 'call_1', name: 'Read', arguments: '{}' }], + }, + { role: 'tool', content: [{ type: 'text', text: 'file contents' }], toolCalls: [], toolCallId: 'call_1' }, + { + role: 'assistant', + content: [{ type: 'text', text: 'I will continue' }], + toolCalls: [], + }, + ]; + expect(hasToolResultsSinceLastUserMessage(messages)).toBe(true); + }); +}); diff --git a/packages/agent-core/test/agent/turn.test.ts b/packages/agent-core/test/agent/turn.test.ts index e39094fb4..576bc1cef 100644 --- 
a/packages/agent-core/test/agent/turn.test.ts +++ b/packages/agent-core/test/agent/turn.test.ts @@ -112,6 +112,29 @@ describe('Agent turn flow', () => { }); }); + it('continues once after a post-tool step ends without further tool calls', async () => { + const ctx = testAgent({ kaos: createCommandKaos('ok') }); + ctx.configure({ tools: ['Bash'] }); + await ctx.rpc.setPermission({ mode: 'yolo' }); + + ctx.mockNextResponse( + { type: 'text', text: 'Running first command.' }, + bashCallWithId('call_1', 'printf ok'), + ); + ctx.mockNextResponse({ type: 'text', text: 'I will continue exploring.' }); + ctx.mockNextResponse( + { type: 'text', text: 'Continuing with another command.' }, + bashCallWithId('call_2', 'printf more'), + ); + ctx.mockNextResponse({ type: 'text', text: 'Done.' }); + + await ctx.rpc.prompt({ input: [{ type: 'text', text: 'Explore the repo' }] }); + await ctx.untilTurnEnd(); + + expect(ctx.llmCalls).toHaveLength(4); + expect(JSON.stringify(ctx.llmCalls[2]?.history ?? [])).toContain('system-reminder'); + }); + it('tracks cross-step duplicate tool-call detection telemetry', async () => { const records: TelemetryRecord[] = []; const ctx = testAgent({ diff --git a/packages/agent-core/test/harness/runtime-provider.test.ts b/packages/agent-core/test/harness/runtime-provider.test.ts index 0eaee9b93..b67a877ce 100644 --- a/packages/agent-core/test/harness/runtime-provider.test.ts +++ b/packages/agent-core/test/harness/runtime-provider.test.ts @@ -560,9 +560,33 @@ describe('resolveRuntimeProvider customHeaders propagation', () => { apiKey: 'foundry-key', baseUrl: 'https://example.openai.azure.com/openai/v1', model: 'gpt-4o', + sharedContextWindowTokens: 128000, }); }); + it('rejects azure-foundry providers without a base_url at resolve time', () => { + expect(() => + resolveRuntimeProvider({ + config: { + defaultModel: 'foundry-alias', + providers: { + foundry: { + type: 'azure-foundry', + apiKey: 'foundry-key', + }, + }, + models: { + 'foundry-alias': { + 
provider: 'foundry', + model: 'gpt-4o', + maxContextSize: 128000, + }, + }, + }, + }), + ).toThrow(/requires base_url/); + }); + it('forwards customHeaders to an openai_responses provider', () => { const resolved = resolveRuntimeProvider({ config: { diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts index 7df91f5d0..4760dcb04 100644 --- a/packages/agent-core/test/utils/completion-budget.test.ts +++ b/packages/agent-core/test/utils/completion-budget.test.ts @@ -50,24 +50,32 @@ describe('computeCompletionBudgetCap', () => { ).toBe(1); }); - it('uses the model context window when no hard cap is set', () => { - const maxCtx = 100000; + it('uses the reserved-context fallback when no hard cap is set', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, + capability: makeCapability(100000), + }); + expect(cap).toBe(32000); + }); + + it('uses the model context window when no hard cap or fallback is set', () => { + const maxCtx = 100000; + const cap = computeCompletionBudgetCap({ + budget: {}, capability: makeCapability(maxCtx), }); expect(cap).toBe(maxCtx); }); - it('uses the explicit hard cap when configured', () => { + it('clamps the explicit hard cap to the model context window when it is smaller', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 32000 }, capability: makeCapability(10000), }); - expect(cap).toBe(32000); + expect(cap).toBe(10000); }); - it('ignores fallback when the model context window is known', () => { + it('clamps the fallback to the model context window when it is smaller', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(10000), diff --git a/packages/kosong/src/providers/azure-foundry.ts b/packages/kosong/src/providers/azure-foundry.ts index 0fd53b42e..88b02132e 100644 --- a/packages/kosong/src/providers/azure-foundry.ts +++ b/packages/kosong/src/providers/azure-foundry.ts 
@@ -1,3 +1,4 @@ +import { ChatProviderError } from '#/errors'; import type { ProviderRequestAuth } from '#/provider'; import OpenAI from 'openai'; @@ -15,9 +16,19 @@ function normalizeAzureFoundryBaseUrl(baseUrl: string | undefined): string | und return trimmed.replace(/\/+$/, ''); } +function requireAzureFoundryBaseUrl(baseUrl: string | undefined): string { + const normalized = normalizeAzureFoundryBaseUrl(baseUrl); + if (normalized === undefined) { + throw new ChatProviderError( + 'AzureFoundryChatProvider: baseUrl is required. Set base_url in config.toml or AZURE_FOUNDRY_BASE_URL in [providers..env]. Example: https://YOUR-RESOURCE.openai.azure.com/openai/v1', + ); + } + return normalized; +} + function buildAzureFoundryClient( apiKey: string, - baseUrl: string | undefined, + baseUrl: string, defaultHeaders: Record | undefined, httpClient: unknown, auth?: ProviderRequestAuth, @@ -49,12 +60,17 @@ function buildAzureFoundryClient( * Targets Foundry's OpenAI v1-compatible inference route * (`https://{resource}.openai.azure.com/openai/v1`) and authenticates with * the Foundry `api-key` header rather than Bearer auth. + * + * Foundry-hosted Kimi models use a shared input+output context window. Pass + * `sharedContextWindowTokens` (wired from `max_context_size` in config) so + * completion budgets are clamped against the serialized prompt before each + * request. */ export class AzureFoundryChatProvider extends OpenAILegacyChatProvider { override readonly name = 'azure-foundry'; constructor(options: AzureFoundryOptions) { - const baseUrl = normalizeAzureFoundryBaseUrl(options.baseUrl); + const baseUrl = requireAzureFoundryBaseUrl(options.baseUrl); const apiKey = options.apiKey; super({ ...options, @@ -62,7 +78,7 @@ export class AzureFoundryChatProvider extends OpenAILegacyChatProvider { clientFactory: (auth) => buildAzureFoundryClient( apiKey ?? 
'', - baseUrl, + requireAzureFoundryBaseUrl(baseUrl), options.defaultHeaders, options.httpClient, auth, diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index c9987a5f5..58c8a7e22 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -35,6 +35,7 @@ import { requireProviderApiKey, resolveAuthBackedClient, } from './request-auth'; +import { clampCompletionTokensForSharedContextWindow } from './shared-context-window'; import { normalizeToolCallIdsForProvider, sanitizeToolCallId, @@ -71,6 +72,8 @@ export interface OpenAILegacyOptions { model: string; stream?: boolean | undefined; maxTokens?: number | undefined; + /** Total input+output budget when the backend enforces a shared context window. */ + sharedContextWindowTokens?: number | undefined; reasoningKey?: string | undefined; httpClient?: unknown; defaultHeaders?: Record; @@ -449,6 +452,7 @@ export class OpenAILegacyChatProvider implements ChatProvider { private _client: OpenAI | undefined; private _httpClient: unknown; private _clientFactory: ((auth: ProviderRequestAuth) => OpenAI) | undefined; + private _sharedContextWindowTokens: number | undefined; constructor(options: OpenAILegacyOptions) { const apiKey = options.apiKey ?? process.env['OPENAI_API_KEY']; @@ -472,6 +476,7 @@ export class OpenAILegacyChatProvider implements ChatProvider { this._toolMessageConversion = options.toolMessageConversion ?? null; this._httpClient = options.httpClient; this._clientFactory = options.clientFactory; + this._sharedContextWindowTokens = options.sharedContextWindowTokens; this._client = this._apiKey === undefined ? undefined : this._buildClient(this._apiKey); } @@ -512,7 +517,16 @@ export class OpenAILegacyChatProvider implements ChatProvider { const kwargs: Record = normalizeGenerationKwargs( this._model, - this._generationKwargs, + this._sharedContextWindowTokens === undefined + ? 
this._generationKwargs + : clampCompletionTokensForSharedContextWindow({ + model: this._model, + sharedContextWindowTokens: this._sharedContextWindowTokens, + generationKwargs: this._generationKwargs, + systemPrompt, + history, + tools, + }), ); // Determine reasoning_effort diff --git a/packages/kosong/src/providers/shared-context-window.ts b/packages/kosong/src/providers/shared-context-window.ts new file mode 100644 index 000000000..d14189218 --- /dev/null +++ b/packages/kosong/src/providers/shared-context-window.ts @@ -0,0 +1,47 @@ +import { estimatePromptTokens } from '#/token-estimate'; +import type { Message } from '#/message'; +import type { Tool } from '#/tool'; + +import type { OpenAILegacyGenerationKwargs } from './openai-legacy'; + +const DEFAULT_SERIALIZATION_MARGIN = 512; +const MIN_COMPLETION_TOKENS = 1; + +function completionTokenField(model: string): 'max_completion_tokens' | 'max_tokens' { + const normalized = model.toLowerCase(); + if (/^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized)) { + return 'max_completion_tokens'; + } + return 'max_tokens'; +} + +/** + * Clamp completion budget for providers where input and output share one + * context window (e.g. Microsoft Foundry Kimi deployments). + */ +export function clampCompletionTokensForSharedContextWindow(args: { + readonly model: string; + readonly sharedContextWindowTokens: number; + readonly generationKwargs: OpenAILegacyGenerationKwargs; + readonly systemPrompt: string; + readonly history: readonly Message[]; + readonly tools: readonly Tool[]; + readonly serializationMargin?: number; +}): OpenAILegacyGenerationKwargs { + const margin = args.serializationMargin ?? 
DEFAULT_SERIALIZATION_MARGIN; + const inputEstimate = estimatePromptTokens({ + systemPrompt: args.systemPrompt, + history: args.history, + tools: args.tools, + }); + const remaining = Math.max( + MIN_COMPLETION_TOKENS, + args.sharedContextWindowTokens - inputEstimate - margin, + ); + + const field = completionTokenField(args.model); + const kwargs = { ...args.generationKwargs }; + const requested = kwargs[field]; + kwargs[field] = requested === undefined ? remaining : Math.min(requested, remaining); + return kwargs; +} diff --git a/packages/kosong/src/token-estimate.ts b/packages/kosong/src/token-estimate.ts new file mode 100644 index 000000000..d0aa5f0fc --- /dev/null +++ b/packages/kosong/src/token-estimate.ts @@ -0,0 +1,77 @@ +import type { ContentPart, Message } from './message'; +import type { Tool } from './tool'; + +/** + * Estimate token count from text using a character-based heuristic. + * ASCII (~4 chars/token), CJK and other non-ASCII (~1 char/token). + */ +export function estimateTokens(text: string): number { + let asciiCount = 0; + let nonAsciiCount = 0; + for (const char of text) { + if (char.codePointAt(0)! 
<= 127) { + asciiCount++; + } else { + nonAsciiCount++; + } + } + return Math.ceil(asciiCount / 4) + nonAsciiCount; +} + +export function estimateTokensForMessages(messages: readonly Message[]): number { + let total = 0; + for (const message of messages) { + total += estimateTokensForMessage(message); + } + return total; +} + +export function estimateTokensForTools(tools: readonly Tool[]): number { + let total = 0; + for (const tool of tools) { + total += estimateTokens(tool.name); + total += estimateTokens(tool.description); + total += estimateTokens(JSON.stringify(tool.parameters)); + } + return total; +} + +function estimateTokensForMessage(message: Message): number { + let total = estimateTokens(message.role); + total += estimateTokensForContentParts(message.content); + for (const call of message.toolCalls) { + total += estimateTokens(call.name); + total += estimateTokens(JSON.stringify(call.arguments)); + } + return total; +} + +function estimateTokensForContentParts(parts: readonly ContentPart[]): number { + let total = 0; + for (const part of parts) { + if (part.type === 'text') { + total += estimateTokens(part.text); + } else if (part.type === 'think') { + total += estimateTokens(part.think); + } else if (part.type === 'image_url') { + total += estimateTokens(part.imageUrl.url); + } else if (part.type === 'audio_url') { + total += estimateTokens(part.audioUrl.url); + } else if (part.type === 'video_url') { + total += estimateTokens(part.videoUrl.url); + } + } + return total; +} + +export function estimatePromptTokens(args: { + readonly systemPrompt: string; + readonly history: readonly Message[]; + readonly tools: readonly Tool[]; +}): number { + return ( + estimateTokens(args.systemPrompt) + + estimateTokensForMessages(args.history) + + estimateTokensForTools(args.tools) + ); +} diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts index 8db86847e..667ee6d6a 100644 --- 
a/packages/kosong/test/azure-foundry.test.ts +++ b/packages/kosong/test/azure-foundry.test.ts @@ -22,6 +22,27 @@ describe('AzureFoundryChatProvider', () => { expect(provider.name).toBe('azure-foundry'); }); + it('rejects a missing base_url before constructing the client', () => { + expect( + () => + new AzureFoundryChatProvider({ + model: 'gpt-4o', + apiKey: 'test-key', + }), + ).toThrow(/baseUrl is required/); + }); + + it('rejects a blank base_url before constructing the client', () => { + expect( + () => + new AzureFoundryChatProvider({ + model: 'gpt-4o', + apiKey: 'test-key', + baseUrl: ' ', + }), + ).toThrow(/baseUrl is required/); + }); + it('sends Foundry api-key auth instead of Bearer for chat completions', async () => { await withHarness(async (harness) => { harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => { @@ -97,4 +118,40 @@ describe('AzureFoundryChatProvider', () => { expect(capturedPath).toBe('/openai/v1/chat/completions'); }); }); + + it('clamps max_tokens against the shared Foundry context window before sending', async () => { + await withHarness(async (harness) => { + let capturedBody: Record | undefined; + harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => { + capturedBody = request.bodyJson as Record; + await reply.sseJson(200, [ + { + id: 'chatcmpl-azure-cap', + object: 'chat.completion.chunk', + created: 1234567890, + model: 'Kimi-K2.6', + choices: [{ index: 0, delta: { content: 'ok' }, finish_reason: 'stop' }], + }, + ]); + }); + + const provider = new AzureFoundryChatProvider({ + model: 'Kimi-K2.6', + apiKey: 'foundry-key', + baseUrl: `${harness.baseUrl}/openai/v1`, + sharedContextWindowTokens: 262144, + }).withMaxCompletionTokens(262144); + const stream = await provider.generate('system prompt', [], [ + { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] }, + ]); + for await (const _part of stream) { + // drain + } + + expect(capturedBody).toBeDefined(); + 
expect(capturedBody!['max_tokens']).toBeTypeOf('number'); + expect(capturedBody!['max_tokens'] as number).toBeLessThan(262144); + expect(capturedBody!['max_tokens'] as number).toBeGreaterThan(0); + }); + }); }); diff --git a/packages/kosong/test/shared-context-window.test.ts b/packages/kosong/test/shared-context-window.test.ts new file mode 100644 index 000000000..604b01b1c --- /dev/null +++ b/packages/kosong/test/shared-context-window.test.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from 'vitest'; + +import { clampCompletionTokensForSharedContextWindow } from '#/providers/shared-context-window'; + +describe('clampCompletionTokensForSharedContextWindow', () => { + it('lowers an oversized completion cap to fit the remaining shared window', () => { + const kwargs = clampCompletionTokensForSharedContextWindow({ + model: 'Kimi-K2.6', + sharedContextWindowTokens: 262144, + generationKwargs: { max_tokens: 262144 }, + systemPrompt: 'x'.repeat(40_000), + history: [{ role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] }], + tools: [], + }); + + expect(kwargs.max_tokens).toBeLessThan(262144); + expect(kwargs.max_tokens).toBeGreaterThan(0); + }); + + it('keeps a smaller explicit cap unchanged when it already fits', () => { + const kwargs = clampCompletionTokensForSharedContextWindow({ + model: 'Kimi-K2.6', + sharedContextWindowTokens: 262144, + generationKwargs: { max_tokens: 1024 }, + systemPrompt: 'short prompt', + history: [], + tools: [], + }); + + expect(kwargs.max_tokens).toBe(1024); + }); +}); From 70ae10cbafe553737991054085b294d173c91ba9 Mon Sep 17 00:00:00 2001 From: guglxni Date: Sat, 20 Jun 2026 23:30:43 +0530 Subject: [PATCH 3/4] fix: send Kimi wire format for Foundry-hosted reasoning models Foundry deployments of Kimi-K2.x were using max_tokens, which shares the output budget with reasoning_content and can yield think-only responses. 
Use max_completion_tokens and thinking enablement like the native Kimi provider, honor explicit thinking-off over history auto-injection, and apply shared-window clamping against the correct completion field. --- .changeset/azure-foundry-provider.md | 2 +- .../agent-core/src/config/kimi-env-params.ts | 14 ++++- .../test/config/kimi-env-params.test.ts | 16 +++++- packages/kosong/src/index.ts | 1 + .../kosong/src/providers/kimi-reasoning.ts | 47 +++++++++++++++ .../kosong/src/providers/openai-legacy.ts | 57 +++++++++++++++++-- .../src/providers/shared-context-window.ts | 16 ++++-- packages/kosong/test/azure-foundry.test.ts | 51 +++++++++++++++-- packages/kosong/test/kimi-reasoning.test.ts | 52 +++++++++++++++++ packages/kosong/test/openai-legacy.test.ts | 22 +++++++ .../kosong/test/shared-context-window.test.ts | 25 ++++++-- 11 files changed, 280 insertions(+), 23 deletions(-) create mode 100644 packages/kosong/src/providers/kimi-reasoning.ts create mode 100644 packages/kosong/test/kimi-reasoning.test.ts diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md index 1863fb07c..cfb09cec8 100644 --- a/.changeset/azure-foundry-provider.md +++ b/.changeset/azure-foundry-provider.md @@ -2,4 +2,4 @@ "@moonshot-ai/kimi-code": minor --- -Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window so Foundry-hosted Kimi models do not overflow on the first request. +Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window. For Foundry-hosted Kimi reasoning models, send `max_completion_tokens` and `thinking: { type: 'enabled' }` like the native Kimi provider so reasoning and visible output use separate budgets. 
diff --git a/packages/agent-core/src/config/kimi-env-params.ts b/packages/agent-core/src/config/kimi-env-params.ts index 8aa65455c..1cc726d5a 100644 --- a/packages/agent-core/src/config/kimi-env-params.ts +++ b/packages/agent-core/src/config/kimi-env-params.ts @@ -1,5 +1,6 @@ import { type ChatProvider, + isKimiReasoningModel, type GenerationKwargs, KimiChatProvider, type ThinkingEffort, @@ -50,8 +51,17 @@ export function applyKimiEnvThinkingKeep( thinkingLevel: ThinkingEffort, env: Env = process.env, ): ChatProvider { - if (!(provider instanceof KimiChatProvider)) return provider; const keep = env['KIMI_MODEL_THINKING_KEEP']?.trim(); if (keep === undefined || keep.length === 0 || thinkingLevel === 'off') return provider; - return provider.withExtraBody({ thinking: { keep } }); + if (provider instanceof KimiChatProvider) { + return provider.withExtraBody({ thinking: { keep } }); + } + if ( + provider.name === 'azure-foundry' && + isKimiReasoningModel(provider.modelName) && + provider.withGenerationKwargs !== undefined + ) { + return provider.withGenerationKwargs({ extra_body: { thinking: { keep } } }); + } + return provider; } diff --git a/packages/agent-core/test/config/kimi-env-params.test.ts b/packages/agent-core/test/config/kimi-env-params.test.ts index 723679bff..158256718 100644 --- a/packages/agent-core/test/config/kimi-env-params.test.ts +++ b/packages/agent-core/test/config/kimi-env-params.test.ts @@ -1,4 +1,4 @@ -import { type ChatProvider, KimiChatProvider } from '@moonshot-ai/kosong'; +import { createProvider, type ChatProvider, KimiChatProvider } from '@moonshot-ai/kosong'; import { describe, expect, it } from 'vitest'; import { applyKimiEnvSamplingParams, applyKimiEnvThinkingKeep } from '../../src/config/kimi-env-params'; @@ -8,6 +8,15 @@ function kimi(): KimiChatProvider { return new KimiChatProvider({ model: 'kimi-k2', apiKey: 'k' }); } +function foundryKimi(): ChatProvider { + return createProvider({ + type: 'azure-foundry', + model: 'Kimi-K2.6', + 
apiKey: 'k', + baseUrl: 'https://example.openai.azure.com/openai/v1', + }); +} + interface KimiGenerationState { temperature?: number; top_p?: number; @@ -63,6 +72,11 @@ describe('applyKimiEnvThinkingKeep', () => { expect(genState(out).extra_body?.thinking?.keep).toBe('all'); }); + it('injects thinking.keep for Foundry-hosted Kimi models', () => { + const out = applyKimiEnvThinkingKeep(foundryKimi(), 'high', { KIMI_MODEL_THINKING_KEEP: 'all' }); + expect(genState(out).extra_body?.thinking?.keep).toBe('all'); + }); + it('does NOT inject thinking.keep when thinking is off', () => { const out = applyKimiEnvThinkingKeep(kimi(), 'off', { KIMI_MODEL_THINKING_KEEP: 'all' }); expect(genState(out).extra_body).toBeUndefined(); diff --git a/packages/kosong/src/index.ts b/packages/kosong/src/index.ts index b8bd9bdcb..a5f02aecf 100644 --- a/packages/kosong/src/index.ts +++ b/packages/kosong/src/index.ts @@ -32,6 +32,7 @@ export type { ProviderConfig, ProviderType } from './providers'; // kwargs, `thinking.keep` extra body). export { KimiChatProvider } from './providers/kimi'; export type { ExtraBody, GenerationKwargs, KimiOptions, ThinkingConfig } from './providers/kimi'; +export { isKimiReasoningModel } from './providers/kimi-reasoning'; // Model capability matrix export { UNKNOWN_CAPABILITY, isUnknownCapability } from './capability'; diff --git a/packages/kosong/src/providers/kimi-reasoning.ts b/packages/kosong/src/providers/kimi-reasoning.ts new file mode 100644 index 000000000..1936283c3 --- /dev/null +++ b/packages/kosong/src/providers/kimi-reasoning.ts @@ -0,0 +1,47 @@ +/** + * Kimi reasoning models hosted on OpenAI-compatible gateways (Moonshot API, + * Microsoft Foundry, etc.) require `max_completion_tokens` instead of + * `max_tokens`. On reasoning models, `max_tokens` shares the budget with + * `reasoning_content`, so the model can exhaust the entire cap during thinking + * and return no visible content or tool calls. 
+ * + * Native {@link KimiChatProvider} already normalizes this; openai-legacy paths + * (including azure-foundry) must apply the same rules when the deployment id + * identifies a Kimi reasoning model. + */ + +export function isKimiReasoningModel(model: string): boolean { + const normalized = model.toLowerCase(); + return ( + normalized.includes('kimi') || + normalized.includes('moonshot') || + /^k2(?:[-_.]|$)/.test(normalized) + ); +} + +/** Whether outbound requests should use `max_completion_tokens` on the wire. */ +export function usesMaxCompletionTokensOnWire(model: string): boolean { + if (isKimiReasoningModel(model)) return true; + const normalized = model.toLowerCase(); + return /^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized); +} + +export interface KimiThinkingWireParams { + readonly type: 'enabled' | 'disabled'; + readonly keep?: unknown; +} + +/** Top-level `thinking` object for Kimi reasoning models. */ +export function kimiThinkingWireParams(args: { + readonly reasoningEffort: string | undefined; + readonly thinkingExplicitlyOff: boolean; + readonly thinkingKeep?: unknown; +}): KimiThinkingWireParams | undefined { + if (args.thinkingExplicitlyOff) { + return { type: 'disabled' }; + } + if (args.reasoningEffort === undefined) return undefined; + return args.thinkingKeep === undefined + ? 
{ type: 'enabled' } + : { type: 'enabled', keep: args.thinkingKeep }; +} diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index 58c8a7e22..de433d03a 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -36,6 +36,10 @@ import { resolveAuthBackedClient, } from './request-auth'; import { clampCompletionTokensForSharedContextWindow } from './shared-context-window'; +import { + isKimiReasoningModel, + usesMaxCompletionTokensOnWire, +} from './kimi-reasoning'; import { normalizeToolCallIdsForProvider, sanitizeToolCallId, @@ -108,8 +112,7 @@ interface OpenAIToolCallOut { } function usesMaxCompletionTokens(model: string): boolean { - const normalized = model.toLowerCase(); - return /^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized); + return usesMaxCompletionTokensOnWire(model); } function completionTokenKwargs( @@ -447,6 +450,8 @@ export class OpenAILegacyChatProvider implements ChatProvider { private _defaultHeaders: Record | undefined; private _reasoningKey: string | undefined; private _reasoningEffort: string | undefined; + /** When true, reasoning is explicitly disabled and must not be auto-enabled from history. */ + private _thinkingExplicitlyOff: boolean; private _generationKwargs: OpenAILegacyGenerationKwargs; private _toolMessageConversion: ToolMessageConversion; private _client: OpenAI | undefined; @@ -471,6 +476,7 @@ export class OpenAILegacyChatProvider implements ChatProvider { ? normalizedReasoningKey : undefined; this._reasoningEffort = undefined; + this._thinkingExplicitlyOff = false; this._generationKwargs = options.maxTokens !== undefined ? completionTokenKwargs(this._model, options.maxTokens) : {}; this._toolMessageConversion = options.toolMessageConversion ?? 
null; @@ -486,6 +492,7 @@ export class OpenAILegacyChatProvider implements ChatProvider { } get thinkingEffort(): ThinkingEffort | null { + if (this._thinkingExplicitlyOff) return 'off'; return reasoningEffortToThinkingEffort(this._reasoningEffort); } @@ -538,7 +545,11 @@ export class OpenAILegacyChatProvider implements ChatProvider { // Skip when the caller already pinned reasoning_effort via withGenerationKwargs — // their value would otherwise be silently overwritten below. // See: https://github.com/MoonshotAI/kimi-code/issues/1616 - if (reasoningEffort === undefined && kwargs['reasoning_effort'] === undefined) { + if ( + !this._thinkingExplicitlyOff && + reasoningEffort === undefined && + kwargs['reasoning_effort'] === undefined + ) { const hasThinkPart = history.some((message) => message.content.some((part) => part.type === 'think'), ); @@ -575,6 +586,37 @@ export class OpenAILegacyChatProvider implements ChatProvider { createParams['reasoning_effort'] = reasoningEffort; } + if (isKimiReasoningModel(this._model)) { + const extraBody = kwargs['extra_body']; + const extraRecord = + typeof extraBody === 'object' && extraBody !== null + ? (extraBody as Record) + : undefined; + const extraThinking = + typeof extraRecord?.thinking === 'object' && extraRecord.thinking !== null + ? (extraRecord.thinking as Record) + : undefined; + let thinkingType: 'enabled' | 'disabled' | undefined; + if (this._thinkingExplicitlyOff) { + thinkingType = 'disabled'; + } else if (reasoningEffort !== undefined) { + thinkingType = 'enabled'; + } + if (thinkingType !== undefined || extraThinking !== undefined) { + createParams['thinking'] = { + ...extraThinking, + ...(thinkingType !== undefined ? { type: thinkingType } : {}), + }; + } + if (extraRecord !== undefined) { + const { thinking: _, extra_body: __, ...restExtra } = extraRecord; + Object.assign(createParams, restExtra); + } + // Kimi gateways expect extra_body fields hoisted to the top level. 
+ // eslint-disable-next-line @typescript-eslint/no-dynamic-delete + delete createParams['extra_body']; + } + try { const client = this._createClient(options?.auth); const response = (await client.chat.completions.create( @@ -588,9 +630,14 @@ export class OpenAILegacyChatProvider implements ChatProvider { } withThinking(effort: ThinkingEffort): OpenAILegacyChatProvider { - const reasoningEffort = thinkingEffortToReasoningEffort(effort); const clone = this._clone(); - clone._reasoningEffort = reasoningEffort; + if (effort === 'off') { + clone._thinkingExplicitlyOff = true; + clone._reasoningEffort = undefined; + } else { + clone._thinkingExplicitlyOff = false; + clone._reasoningEffort = thinkingEffortToReasoningEffort(effort); + } return clone; } diff --git a/packages/kosong/src/providers/shared-context-window.ts b/packages/kosong/src/providers/shared-context-window.ts index d14189218..9648c7133 100644 --- a/packages/kosong/src/providers/shared-context-window.ts +++ b/packages/kosong/src/providers/shared-context-window.ts @@ -2,22 +2,23 @@ import { estimatePromptTokens } from '#/token-estimate'; import type { Message } from '#/message'; import type { Tool } from '#/tool'; +import { usesMaxCompletionTokensOnWire } from './kimi-reasoning'; import type { OpenAILegacyGenerationKwargs } from './openai-legacy'; const DEFAULT_SERIALIZATION_MARGIN = 512; const MIN_COMPLETION_TOKENS = 1; function completionTokenField(model: string): 'max_completion_tokens' | 'max_tokens' { - const normalized = model.toLowerCase(); - if (/^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized)) { - return 'max_completion_tokens'; - } - return 'max_tokens'; + return usesMaxCompletionTokensOnWire(model) ? 'max_completion_tokens' : 'max_tokens'; } /** * Clamp completion budget for providers where input and output share one * context window (e.g. Microsoft Foundry Kimi deployments). 
+ * + * Kimi reasoning models use `max_completion_tokens` for visible output; reasoning + * tokens are billed separately within the shared window. Do not apply a separate + * reasoning output cap — that defeats the purpose of the split field. */ export function clampCompletionTokensForSharedContextWindow(args: { readonly model: string; @@ -43,5 +44,10 @@ export function clampCompletionTokensForSharedContextWindow(args: { const kwargs = { ...args.generationKwargs }; const requested = kwargs[field]; kwargs[field] = requested === undefined ? remaining : Math.min(requested, remaining); + // Drop legacy alias when the wire field is max_completion_tokens. + if (field === 'max_completion_tokens') { + // eslint-disable-next-line @typescript-eslint/no-dynamic-delete + delete kwargs.max_tokens; + } return kwargs; } diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts index 667ee6d6a..1edaf9f1e 100644 --- a/packages/kosong/test/azure-foundry.test.ts +++ b/packages/kosong/test/azure-foundry.test.ts @@ -119,7 +119,7 @@ describe('AzureFoundryChatProvider', () => { }); }); - it('clamps max_tokens against the shared Foundry context window before sending', async () => { + it('clamps max_completion_tokens against the shared Foundry context window for Kimi models', async () => { await withHarness(async (harness) => { let capturedBody: Record | undefined; harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => { @@ -149,9 +149,52 @@ describe('AzureFoundryChatProvider', () => { } expect(capturedBody).toBeDefined(); - expect(capturedBody!['max_tokens']).toBeTypeOf('number'); - expect(capturedBody!['max_tokens'] as number).toBeLessThan(262144); - expect(capturedBody!['max_tokens'] as number).toBeGreaterThan(0); + expect(capturedBody!['max_completion_tokens']).toBeTypeOf('number'); + expect(capturedBody!['max_completion_tokens'] as number).toBeLessThan(262144); + expect(capturedBody!['max_completion_tokens'] as 
number).toBeGreaterThan(0); + expect(capturedBody!['max_tokens']).toBeUndefined(); + }); + }); + + it('sends Kimi thinking enablement alongside reasoning_effort', async () => { + await withHarness(async (harness) => { + let capturedBody: Record | undefined; + harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => { + capturedBody = request.bodyJson as Record; + await reply.sseJson(200, [ + { + id: 'chatcmpl-azure-kimi', + object: 'chat.completion.chunk', + created: 1234567890, + model: 'Kimi-K2.6', + choices: [ + { + index: 0, + delta: { reasoning_content: 'thinking', content: 'done' }, + finish_reason: 'stop', + }, + ], + }, + ]); + }); + + const provider = new AzureFoundryChatProvider({ + model: 'Kimi-K2.6', + apiKey: 'foundry-key', + baseUrl: `${harness.baseUrl}/openai/v1`, + sharedContextWindowTokens: 262144, + }).withThinking('medium'); + const stream = await provider.generate('', [], [ + { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] }, + ]); + for await (const _part of stream) { + // drain + } + + expect(capturedBody!['reasoning_effort']).toBe('medium'); + expect(capturedBody!['thinking']).toEqual({ type: 'enabled' }); + expect(capturedBody!['max_completion_tokens']).toBeTypeOf('number'); + expect(capturedBody!['max_tokens']).toBeUndefined(); }); }); }); diff --git a/packages/kosong/test/kimi-reasoning.test.ts b/packages/kosong/test/kimi-reasoning.test.ts new file mode 100644 index 000000000..7e4c3b97c --- /dev/null +++ b/packages/kosong/test/kimi-reasoning.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from 'vitest'; + +import { + isKimiReasoningModel, + kimiThinkingWireParams, + usesMaxCompletionTokensOnWire, +} from '#/providers/kimi-reasoning'; + +describe('isKimiReasoningModel', () => { + it('detects Kimi deployment ids on Foundry', () => { + expect(isKimiReasoningModel('Kimi-K2.6')).toBe(true); + expect(isKimiReasoningModel('kimi-k2.5')).toBe(true); + 
expect(isKimiReasoningModel('moonshot-v1')).toBe(true); + }); + + it('does not match unrelated models', () => { + expect(isKimiReasoningModel('gpt-4o')).toBe(false); + expect(isKimiReasoningModel('deepseek-v3')).toBe(false); + }); +}); + +describe('usesMaxCompletionTokensOnWire', () => { + it('uses max_completion_tokens for Kimi and OpenAI reasoning models', () => { + expect(usesMaxCompletionTokensOnWire('Kimi-K2.6')).toBe(true); + expect(usesMaxCompletionTokensOnWire('gpt-5')).toBe(true); + expect(usesMaxCompletionTokensOnWire('o3-mini')).toBe(true); + }); + + it('uses max_tokens for generic chat models', () => { + expect(usesMaxCompletionTokensOnWire('gpt-4o')).toBe(false); + }); +}); + +describe('kimiThinkingWireParams', () => { + it('enables thinking when reasoning is configured', () => { + expect( + kimiThinkingWireParams({ + reasoningEffort: 'medium', + thinkingExplicitlyOff: false, + }), + ).toEqual({ type: 'enabled' }); + }); + + it('disables thinking when explicitly off', () => { + expect( + kimiThinkingWireParams({ + reasoningEffort: 'medium', + thinkingExplicitlyOff: true, + }), + ).toEqual({ type: 'disabled' }); + }); +}); diff --git a/packages/kosong/test/openai-legacy.test.ts b/packages/kosong/test/openai-legacy.test.ts index 8335a8228..63def863d 100644 --- a/packages/kosong/test/openai-legacy.test.ts +++ b/packages/kosong/test/openai-legacy.test.ts @@ -986,6 +986,28 @@ describe('OpenAILegacyChatProvider', () => { expect(body['reasoning_effort']).toBe('high'); }); + + it('does not auto-inject reasoning_effort when thinking is explicitly off', async () => { + const provider = createProvider({ model: 'kimi-k2.5', reasoningKey: 'reasoning_content' }).withThinking( + 'off', + ); + const history: Message[] = [ + { role: 'user', content: [{ type: 'text', text: 'Hello' }], toolCalls: [] }, + { + role: 'assistant', + content: [ + { type: 'think', think: 'Let me think...' }, + { type: 'text', text: 'Hi!' 
}, + ], + toolCalls: [], + }, + { role: 'user', content: [{ type: 'text', text: 'How are you?' }], toolCalls: [] }, + ]; + const body = await captureRequestBody(provider, '', [], history); + + expect(body['reasoning_effort']).toBeUndefined(); + expect(provider.thinkingEffort).toBe('off'); + }); }); describe('default reasoning protocol (no explicit reasoningKey)', () => { diff --git a/packages/kosong/test/shared-context-window.test.ts b/packages/kosong/test/shared-context-window.test.ts index 604b01b1c..ff991359a 100644 --- a/packages/kosong/test/shared-context-window.test.ts +++ b/packages/kosong/test/shared-context-window.test.ts @@ -7,26 +7,41 @@ describe('clampCompletionTokensForSharedContextWindow', () => { const kwargs = clampCompletionTokensForSharedContextWindow({ model: 'Kimi-K2.6', sharedContextWindowTokens: 262144, - generationKwargs: { max_tokens: 262144 }, + generationKwargs: { max_completion_tokens: 262144 }, systemPrompt: 'x'.repeat(40_000), history: [{ role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] }], tools: [], }); - expect(kwargs.max_tokens).toBeLessThan(262144); - expect(kwargs.max_tokens).toBeGreaterThan(0); + expect(kwargs.max_completion_tokens).toBeLessThan(262144); + expect(kwargs.max_completion_tokens).toBeGreaterThan(0); + expect(kwargs.max_tokens).toBeUndefined(); }); it('keeps a smaller explicit cap unchanged when it already fits', () => { const kwargs = clampCompletionTokensForSharedContextWindow({ model: 'Kimi-K2.6', sharedContextWindowTokens: 262144, - generationKwargs: { max_tokens: 1024 }, + generationKwargs: { max_completion_tokens: 1024 }, systemPrompt: 'short prompt', history: [], tools: [], }); - expect(kwargs.max_tokens).toBe(1024); + expect(kwargs.max_completion_tokens).toBe(1024); + }); + + it('uses max_tokens for non-Kimi shared-window models', () => { + const kwargs = clampCompletionTokensForSharedContextWindow({ + model: 'gpt-4o', + sharedContextWindowTokens: 128000, + generationKwargs: { max_tokens: 
4096 }, + systemPrompt: 'short prompt', + history: [], + tools: [], + }); + + expect(kwargs.max_tokens).toBe(4096); + expect(kwargs.max_completion_tokens).toBeUndefined(); }); }); From 1207f072871d83e5e08e01c9cf33a07fac18a350 Mon Sep 17 00:00:00 2001 From: guglxni Date: Sat, 20 Jun 2026 23:35:02 +0530 Subject: [PATCH 4/4] fix: omit Moonshot thinking param on Foundry Kimi requests Microsoft Foundry exposes Kimi through the OpenAI chat-completions schema and rejects the Moonshot-proprietary `thinking` argument. Keep reasoning enabled via `reasoning_effort` and the max_completion_tokens split; only KimiChatProvider sends `thinking` on the native Moonshot API. --- .changeset/azure-foundry-provider.md | 2 +- .../agent-core/src/config/kimi-env-params.ts | 8 ---- .../test/config/kimi-env-params.test.ts | 7 ++-- .../kosong/src/providers/kimi-reasoning.ts | 37 +++++-------------- .../kosong/src/providers/openai-legacy.ts | 32 ---------------- packages/kosong/test/azure-foundry.test.ts | 4 +- packages/kosong/test/kimi-reasoning.test.ts | 26 +------------ 7 files changed, 17 insertions(+), 99 deletions(-) diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md index cfb09cec8..2b14488e0 100644 --- a/.changeset/azure-foundry-provider.md +++ b/.changeset/azure-foundry-provider.md @@ -2,4 +2,4 @@ "@moonshot-ai/kimi-code": minor --- -Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window. For Foundry-hosted Kimi reasoning models, send `max_completion_tokens` and `thinking: { type: 'enabled' }` like the native Kimi provider so reasoning and visible output use separate budgets. +Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window. 
For Foundry-hosted Kimi reasoning models, send `max_completion_tokens` (not `max_tokens`) so reasoning and visible output use separate budgets, and enable reasoning via `reasoning_effort` only — Foundry rejects Moonshot's proprietary `thinking` parameter. diff --git a/packages/agent-core/src/config/kimi-env-params.ts b/packages/agent-core/src/config/kimi-env-params.ts index 1cc726d5a..8f4af5fec 100644 --- a/packages/agent-core/src/config/kimi-env-params.ts +++ b/packages/agent-core/src/config/kimi-env-params.ts @@ -1,6 +1,5 @@ import { type ChatProvider, - isKimiReasoningModel, type GenerationKwargs, KimiChatProvider, type ThinkingEffort, @@ -56,12 +55,5 @@ export function applyKimiEnvThinkingKeep( if (provider instanceof KimiChatProvider) { return provider.withExtraBody({ thinking: { keep } }); } - if ( - provider.name === 'azure-foundry' && - isKimiReasoningModel(provider.modelName) && - provider.withGenerationKwargs !== undefined - ) { - return provider.withGenerationKwargs({ extra_body: { thinking: { keep } } }); - } return provider; } diff --git a/packages/agent-core/test/config/kimi-env-params.test.ts b/packages/agent-core/test/config/kimi-env-params.test.ts index 158256718..ebcd0f526 100644 --- a/packages/agent-core/test/config/kimi-env-params.test.ts +++ b/packages/agent-core/test/config/kimi-env-params.test.ts @@ -72,9 +72,10 @@ describe('applyKimiEnvThinkingKeep', () => { expect(genState(out).extra_body?.thinking?.keep).toBe('all'); }); - it('injects thinking.keep for Foundry-hosted Kimi models', () => { - const out = applyKimiEnvThinkingKeep(foundryKimi(), 'high', { KIMI_MODEL_THINKING_KEEP: 'all' }); - expect(genState(out).extra_body?.thinking?.keep).toBe('all'); + it('does not inject thinking.keep for Foundry-hosted Kimi models', () => { + const provider = foundryKimi(); + const out = applyKimiEnvThinkingKeep(provider, 'high', { KIMI_MODEL_THINKING_KEEP: 'all' }); + expect(out).toBe(provider); }); it('does NOT inject thinking.keep when thinking is 
off', () => { diff --git a/packages/kosong/src/providers/kimi-reasoning.ts b/packages/kosong/src/providers/kimi-reasoning.ts index 1936283c3..0a21206de 100644 --- a/packages/kosong/src/providers/kimi-reasoning.ts +++ b/packages/kosong/src/providers/kimi-reasoning.ts @@ -1,13 +1,14 @@ /** - * Kimi reasoning models hosted on OpenAI-compatible gateways (Moonshot API, - * Microsoft Foundry, etc.) require `max_completion_tokens` instead of - * `max_tokens`. On reasoning models, `max_tokens` shares the budget with - * `reasoning_content`, so the model can exhaust the entire cap during thinking - * and return no visible content or tool calls. + * Kimi reasoning models hosted on OpenAI-compatible gateways require + * `max_completion_tokens` instead of `max_tokens`. On reasoning models, + * `max_tokens` shares the budget with `reasoning_content`, so the model can + * exhaust the entire cap during thinking and return no visible content or tool + * calls. * - * Native {@link KimiChatProvider} already normalizes this; openai-legacy paths - * (including azure-foundry) must apply the same rules when the deployment id - * identifies a Kimi reasoning model. + * The Moonshot-proprietary `thinking: { type: 'enabled' }` parameter is only + * sent by {@link KimiChatProvider}. Gateways such as Microsoft Foundry expose + * Kimi through the OpenAI chat-completions schema and enable reasoning via + * `reasoning_effort` alone — sending `thinking` yields 400. */ export function isKimiReasoningModel(model: string): boolean { @@ -25,23 +26,3 @@ export function usesMaxCompletionTokensOnWire(model: string): boolean { const normalized = model.toLowerCase(); return /^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized); } - -export interface KimiThinkingWireParams { - readonly type: 'enabled' | 'disabled'; - readonly keep?: unknown; -} - -/** Top-level `thinking` object for Kimi reasoning models. 
*/ -export function kimiThinkingWireParams(args: { - readonly reasoningEffort: string | undefined; - readonly thinkingExplicitlyOff: boolean; - readonly thinkingKeep?: unknown; -}): KimiThinkingWireParams | undefined { - if (args.thinkingExplicitlyOff) { - return { type: 'disabled' }; - } - if (args.reasoningEffort === undefined) return undefined; - return args.thinkingKeep === undefined - ? { type: 'enabled' } - : { type: 'enabled', keep: args.thinkingKeep }; -} diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts index de433d03a..497a2391f 100644 --- a/packages/kosong/src/providers/openai-legacy.ts +++ b/packages/kosong/src/providers/openai-legacy.ts @@ -37,7 +37,6 @@ import { } from './request-auth'; import { clampCompletionTokensForSharedContextWindow } from './shared-context-window'; import { - isKimiReasoningModel, usesMaxCompletionTokensOnWire, } from './kimi-reasoning'; import { @@ -586,37 +585,6 @@ export class OpenAILegacyChatProvider implements ChatProvider { createParams['reasoning_effort'] = reasoningEffort; } - if (isKimiReasoningModel(this._model)) { - const extraBody = kwargs['extra_body']; - const extraRecord = - typeof extraBody === 'object' && extraBody !== null - ? (extraBody as Record<string, unknown>) - : undefined; - const extraThinking = - typeof extraRecord?.thinking === 'object' && extraRecord.thinking !== null - ? (extraRecord.thinking as Record<string, unknown>) - : undefined; - let thinkingType: 'enabled' | 'disabled' | undefined; - if (this._thinkingExplicitlyOff) { - thinkingType = 'disabled'; - } else if (reasoningEffort !== undefined) { - thinkingType = 'enabled'; - } - if (thinkingType !== undefined || extraThinking !== undefined) { - createParams['thinking'] = { - ...extraThinking, - ...(thinkingType !== undefined ?
{ type: thinkingType } : {}), - }; - } - if (extraRecord !== undefined) { - const { thinking: _, extra_body: __, ...restExtra } = extraRecord; - Object.assign(createParams, restExtra); - } - // Kimi gateways expect extra_body fields hoisted to the top level. - // eslint-disable-next-line @typescript-eslint/no-dynamic-delete - delete createParams['extra_body']; - } - try { const client = this._createClient(options?.auth); const response = (await client.chat.completions.create( diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts index 1edaf9f1e..155de8374 100644 --- a/packages/kosong/test/azure-foundry.test.ts +++ b/packages/kosong/test/azure-foundry.test.ts @@ -156,7 +156,7 @@ describe('AzureFoundryChatProvider', () => { }); }); - it('sends Kimi thinking enablement alongside reasoning_effort', async () => { + it('sends reasoning_effort and max_completion_tokens without Moonshot thinking param', async () => { await withHarness(async (harness) => { let capturedBody: Record<string, unknown> | undefined; harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => { @@ -192,7 +192,7 @@ describe('AzureFoundryChatProvider', () => { } expect(capturedBody!['reasoning_effort']).toBe('medium'); - expect(capturedBody!['thinking']).toEqual({ type: 'enabled' }); + expect(capturedBody!['thinking']).toBeUndefined(); expect(capturedBody!['max_completion_tokens']).toBeTypeOf('number'); expect(capturedBody!['max_tokens']).toBeUndefined(); }); diff --git a/packages/kosong/test/kimi-reasoning.test.ts b/packages/kosong/test/kimi-reasoning.test.ts index 7e4c3b97c..e797d99fa 100644 --- a/packages/kosong/test/kimi-reasoning.test.ts +++ b/packages/kosong/test/kimi-reasoning.test.ts @@ -1,10 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { - isKimiReasoningModel, - kimiThinkingWireParams, - usesMaxCompletionTokensOnWire, -} from '#/providers/kimi-reasoning'; +import { isKimiReasoningModel, usesMaxCompletionTokensOnWire } from
'#/providers/kimi-reasoning'; describe('isKimiReasoningModel', () => { it('detects Kimi deployment ids on Foundry', () => { @@ -30,23 +26,3 @@ describe('usesMaxCompletionTokensOnWire', () => { expect(usesMaxCompletionTokensOnWire('gpt-4o')).toBe(false); }); }); - -describe('kimiThinkingWireParams', () => { - it('enables thinking when reasoning is configured', () => { - expect( - kimiThinkingWireParams({ - reasoningEffort: 'medium', - thinkingExplicitlyOff: false, - }), - ).toEqual({ type: 'enabled' }); - }); - - it('disables thinking when explicitly off', () => { - expect( - kimiThinkingWireParams({ - reasoningEffort: 'medium', - thinkingExplicitlyOff: true, - }), - ).toEqual({ type: 'disabled' }); - }); -});