From f36bd80f8ef25c0cca3eddc3b876921b0946ddf6 Mon Sep 17 00:00:00 2001
From: guglxni <aaryanguglani.cs21@rvce.edu.in>
Date: Sat, 20 Jun 2026 18:40:25 +0530
Subject: [PATCH 1/4] feat: add azure-foundry provider for Microsoft Foundry
 model access

Introduce a first-class azure-foundry provider that targets Foundry's OpenAI
v1-compatible route with api-key authentication, so users no longer need to
hand-wire the generic openai provider for Azure deployments.

Closes #918

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .changeset/azure-foundry-provider.md          |   5 +
 docs/en/configuration/config-files.md         |   2 +-
 docs/en/configuration/env-vars.md             |   2 +
 docs/en/configuration/providers.md            |  25 +++++
 docs/zh/configuration/config-files.md         |   2 +-
 docs/zh/configuration/env-vars.md             |   2 +
 docs/zh/configuration/providers.md            |  25 +++++
 packages/agent-core/src/config/schema.ts      |   1 +
 .../modelCatalog/modelCatalogService.ts       |   2 +
 .../src/session/provider-manager.ts           |  11 ++
 .../test/harness/runtime-provider.test.ts     |  31 ++++++
 packages/kosong/src/catalog.ts                |   2 +
 .../kosong/src/providers/azure-foundry.ts     |  72 +++++++++++++
 packages/kosong/src/providers/index.ts        |   5 +
 packages/kosong/test/azure-foundry.test.ts    | 100 ++++++++++++++++++
 packages/kosong/test/catalog.test.ts          |   5 +
 packages/kosong/tsdown.config.ts              |   1 +
 packages/oauth/src/custom-registry.ts         |   2 +
 18 files changed, 293 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/azure-foundry-provider.md
 create mode 100644 packages/kosong/src/providers/azure-foundry.ts
 create mode 100644 packages/kosong/test/azure-foundry.test.ts

diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md
new file mode 100644
index 000000000..1c01dcefc
--- /dev/null
+++ b/.changeset/azure-foundry-provider.md
@@ -0,0 +1,5 @@
+---
+"@moonshot-ai/kimi-code": minor
+---
+
+Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route.
diff --git a/docs/en/configuration/config-files.md b/docs/en/configuration/config-files.md
index 0d64f5044..cd20b9dfe 100644
--- a/docs/en/configuration/config-files.md
+++ b/docs/en/configuration/config-files.md
@@ -100,7 +100,7 @@ Each entry in the `providers` table defines an API provider, keyed by a unique n
 
 | Field | Type | Required | Description |
 | --- | --- | --- | --- |
-| `type` | `string` | Yes | Provider type: `kimi`, `anthropic`, `openai`, `openai_responses`, `google-genai`, `vertexai` |
+| `type` | `string` | Yes | Provider type: `kimi`, `anthropic`, `openai`, `openai_responses`, `azure-foundry`, `google-genai`, `vertexai` |
 | `api_key` | `string` | No | API key, written in plain text in the config file |
 | `base_url` | `string` | No | API base URL |
 | `oauth` | `table` | No | OAuth credential reference (`storage` and `key` fields); injected automatically by the login flow — normally no need to write this by hand |
diff --git a/docs/en/configuration/env-vars.md b/docs/en/configuration/env-vars.md
index 933232511..791d0e88a 100644
--- a/docs/en/configuration/env-vars.md
+++ b/docs/en/configuration/env-vars.md
@@ -58,6 +58,8 @@ Key names per provider:
 | `ANTHROPIC_BASE_URL` | Anthropic | Follows Anthropic SDK default |
 | `OPENAI_API_KEY` | OpenAI (`openai` and `openai_responses`) | None |
 | `OPENAI_BASE_URL` | OpenAI (`openai` and `openai_responses`) | `https://api.openai.com/v1` |
+| `AZURE_FOUNDRY_API_KEY` | Microsoft Foundry (`azure-foundry`) | None |
+| `AZURE_FOUNDRY_BASE_URL` | Microsoft Foundry (`azure-foundry`) | None |
 | `GOOGLE_API_KEY` | Google GenAI, Vertex AI | None |
 | `VERTEXAI_API_KEY` | Vertex AI | None |
 | `GOOGLE_CLOUD_PROJECT` | Vertex AI | None |
diff --git a/docs/en/configuration/providers.md b/docs/en/configuration/providers.md
index 8fed5c4e1..89c5f0c65 100644
--- a/docs/en/configuration/providers.md
+++ b/docs/en/configuration/providers.md
@@ -12,6 +12,7 @@ The `type` field in the `providers` table determines which protocol implementati
 | `anthropic` | Anthropic Messages | Claude model family |
 | `openai` | OpenAI Chat Completions | OpenAI and compatible services, DeepSeek, Qwen, etc. |
 | `openai_responses` | OpenAI Responses API | OpenAI's newer Responses interface |
+| `azure-foundry` | Microsoft Foundry (OpenAI v1) | Microsoft Foundry model deployments (GPT, DeepSeek, Llama, Mistral, etc.) |
 | `google-genai` | Google GenAI | Gemini API |
 | `vertexai` | Google GenAI on Vertex | Google Cloud Vertex AI |
 
@@ -107,6 +108,30 @@ base_url = "https://api.openai.com/v1"
 api_key = "sk-xxxxx"
 ```
 
+## `azure-foundry`
+
+For connecting to [Microsoft Foundry](https://learn.microsoft.com/en-us/azure/foundry/) model deployments through the OpenAI v1-compatible inference route. Foundry hosts multiple model families — OpenAI GPT, DeepSeek, Meta Llama, Mistral, and others sold directly by Azure — not just OpenAI models. Put the model ID from your Foundry deployment in `[models.<alias>]`.
+
+Microsoft recommends the OpenAI v1 route for third-party SDKs and custom applications. See [Integrate Microsoft Foundry with your applications](https://learn.microsoft.com/en-us/azure/foundry/how-to/integrate-with-other-apps).
+
+- Recommended `base_url`: `https://{resource}.openai.azure.com/openai/v1`
+- Credential key names: `AZURE_FOUNDRY_API_KEY`, `AZURE_FOUNDRY_BASE_URL`
+- Auth: sends the Foundry `api-key` header
+
+```toml
+[providers.foundry]
+type = "azure-foundry"
+base_url = "https://YOUR-RESOURCE.openai.azure.com/openai/v1"
+api_key = "YOUR_KEY"
+
+[models.foundry-gpt4o]
+provider = "foundry"
+model = "gpt-4o"
+max_context_size = 128000
+```
+
+Third-party reasoning models on Foundry work the same way as on the generic `openai` provider: set `reasoning_key` on the model alias when your deployment returns reasoning content under a non-standard field name.
+
 ## `google-genai`
 
 For connecting directly to the Google Gemini API. Thinking, vision, and multimodal capabilities are auto-detected by model name.
diff --git a/docs/zh/configuration/config-files.md b/docs/zh/configuration/config-files.md
index e0a215f56..f29b2fa22 100644
--- a/docs/zh/configuration/config-files.md
+++ b/docs/zh/configuration/config-files.md
@@ -100,7 +100,7 @@ timeout = 5
 
 | 字段 | 类型 | 必填 | 说明 |
 | --- | --- | --- | --- |
-| `type` | `string` | 是 | 供应商类型：`kimi`、`anthropic`、`openai`、`openai_responses`、`google-genai`、`vertexai` |
+| `type` | `string` | 是 | 供应商类型：`kimi`、`anthropic`、`openai`、`openai_responses`、`azure-foundry`、`google-genai`、`vertexai` |
 | `api_key` | `string` | 否 | API 密钥，明文写在配置文件里 |
 | `base_url` | `string` | 否 | API 基础 URL |
 | `oauth` | `table` | 否 | OAuth 凭据引用（`storage`、`key` 两个字段），由登录流程自动注入，通常无需手写 |
diff --git a/docs/zh/configuration/env-vars.md b/docs/zh/configuration/env-vars.md
index 227b1ced1..46b87eb4c 100644
--- a/docs/zh/configuration/env-vars.md
+++ b/docs/zh/configuration/env-vars.md
@@ -58,6 +58,8 @@ KIMI_BASE_URL = "https://api.moonshot.ai/v1"
 | `ANTHROPIC_BASE_URL` | Anthropic | Anthropic SDK 默认值 |
 | `OPENAI_API_KEY` | OpenAI（`openai` 和 `openai_responses`） | 无 |
 | `OPENAI_BASE_URL` | OpenAI（`openai` 和 `openai_responses`） | `https://api.openai.com/v1` |
+| `AZURE_FOUNDRY_API_KEY` | Microsoft Foundry（`azure-foundry`） | 无 |
+| `AZURE_FOUNDRY_BASE_URL` | Microsoft Foundry（`azure-foundry`） | 无 |
 | `GOOGLE_API_KEY` | Google GenAI、Vertex AI | 无 |
 | `VERTEXAI_API_KEY` | Vertex AI | 无 |
 | `GOOGLE_CLOUD_PROJECT` | Vertex AI | 无 |
diff --git a/docs/zh/configuration/providers.md b/docs/zh/configuration/providers.md
index 41aae2736..fed4d57ba 100644
--- a/docs/zh/configuration/providers.md
+++ b/docs/zh/configuration/providers.md
@@ -12,6 +12,7 @@ Kimi Code CLI 支持同时接入多家 LLM 平台——用 Kimi Code 托管服
 | `anthropic` | Anthropic Messages | Claude 系列模型 |
 | `openai` | OpenAI Chat Completions | OpenAI 及兼容服务、DeepSeek、Qwen 等 |
 | `openai_responses` | OpenAI Responses API | OpenAI 较新的 Responses 接口 |
+| `azure-foundry` | Microsoft Foundry（OpenAI v1） | Microsoft Foundry 模型部署（GPT、DeepSeek、Llama、Mistral 等） |
 | `google-genai` | Google GenAI | Gemini API |
 | `vertexai` | Google GenAI on Vertex | Google Cloud Vertex AI |
 
@@ -107,6 +108,30 @@ base_url = "https://api.openai.com/v1"
 api_key = "sk-xxxxx"
 ```
 
+## `azure-foundry`
+
+用于连接 [Microsoft Foundry](https://learn.microsoft.com/en-us/azure/foundry/) 上的模型部署，走 OpenAI v1 兼容推理路由。Foundry 托管多种模型家族——OpenAI GPT、DeepSeek、Meta Llama、Mistral 等 Azure 直售模型，并非只有 OpenAI。在 `[models.<alias>]` 中填写 Foundry 部署的模型 ID。
+
+Microsoft 建议第三方 SDK 和自定义应用使用 OpenAI v1 路由。详见 [Integrate Microsoft Foundry with your applications](https://learn.microsoft.com/en-us/azure/foundry/how-to/integrate-with-other-apps)。
+
+- 推荐 `base_url`：`https://{resource}.openai.azure.com/openai/v1`
+- 凭证键名：`AZURE_FOUNDRY_API_KEY`、`AZURE_FOUNDRY_BASE_URL`
+- 认证：发送 Foundry 的 `api-key` 请求头
+
+```toml
+[providers.foundry]
+type = "azure-foundry"
+base_url = "https://YOUR-RESOURCE.openai.azure.com/openai/v1"
+api_key = "YOUR_KEY"
+
+[models.foundry-gpt4o]
+provider = "foundry"
+model = "gpt-4o"
+max_context_size = 128000
+```
+
+Foundry 上的第三方推理模型与通用 `openai` 供应商用法相同：若部署以非标准字段返回推理内容，可在模型别名上设置 `reasoning_key`。
+
 ## `google-genai`
 
 用于直连 Google Gemini API。thinking、视觉及多模态能力按模型名自动识别。
diff --git a/packages/agent-core/src/config/schema.ts b/packages/agent-core/src/config/schema.ts
index 9b3d11cf0..d58655c15 100644
--- a/packages/agent-core/src/config/schema.ts
+++ b/packages/agent-core/src/config/schema.ts
@@ -5,6 +5,7 @@ import { z } from 'zod';
 
 export const ProviderTypeSchema = z.enum([
   'anthropic',
+  'azure-foundry',
   'openai',
   'kimi',
   'google-genai',
diff --git a/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts b/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts
index bd8eb79f3..233a209df 100644
--- a/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts
+++ b/packages/agent-core/src/services/modelCatalog/modelCatalogService.ts
@@ -214,6 +214,8 @@ function hasConfiguredApiKey(provider: ProviderConfig): boolean {
     case 'openai':
     case 'openai_responses':
       return nonEmpty(provider.env?.['OPENAI_API_KEY']) !== undefined;
+    case 'azure-foundry':
+      return nonEmpty(provider.env?.['AZURE_FOUNDRY_API_KEY']) !== undefined;
     case 'kimi':
       return nonEmpty(provider.env?.['KIMI_API_KEY']) !== undefined;
     case 'google-genai':
diff --git a/packages/agent-core/src/session/provider-manager.ts b/packages/agent-core/src/session/provider-manager.ts
index 34dc82091..b447b7f8c 100644
--- a/packages/agent-core/src/session/provider-manager.ts
+++ b/packages/agent-core/src/session/provider-manager.ts
@@ -245,6 +245,15 @@ function toKosongProviderConfig(
         reasoningKey,
         ...defaultHeadersField(provider.customHeaders),
       };
+    case 'azure-foundry':
+      return {
+        type: 'azure-foundry',
+        model,
+        baseUrl: providerValue(provider.baseUrl, provider.env, 'AZURE_FOUNDRY_BASE_URL'),
+        apiKey: providerApiKey(provider),
+        reasoningKey,
+        ...defaultHeadersField(provider.customHeaders),
+      };
     case 'kimi':
       return {
         type: 'kimi',
@@ -306,6 +315,8 @@ function providerApiKey(provider: ProviderConfig): string | undefined {
     case 'openai':
     case 'openai_responses':
       return providerValue(provider.apiKey, provider.env, 'OPENAI_API_KEY');
+    case 'azure-foundry':
+      return providerValue(provider.apiKey, provider.env, 'AZURE_FOUNDRY_API_KEY');
     case 'kimi':
       return providerValue(provider.apiKey, provider.env, 'KIMI_API_KEY');
     case 'google-genai':
diff --git a/packages/agent-core/test/harness/runtime-provider.test.ts b/packages/agent-core/test/harness/runtime-provider.test.ts
index c93283b65..0eaee9b93 100644
--- a/packages/agent-core/test/harness/runtime-provider.test.ts
+++ b/packages/agent-core/test/harness/runtime-provider.test.ts
@@ -532,6 +532,37 @@ describe('resolveRuntimeProvider customHeaders propagation', () => {
     });
   });
 
+  it('resolves an azure-foundry provider with env credential keys', () => {
+    const resolved = resolveRuntimeProvider({
+      config: {
+        defaultModel: 'foundry-alias',
+        providers: {
+          foundry: {
+            type: 'azure-foundry',
+            env: {
+              AZURE_FOUNDRY_API_KEY: 'foundry-key',
+              AZURE_FOUNDRY_BASE_URL: 'https://example.openai.azure.com/openai/v1',
+            },
+          },
+        },
+        models: {
+          'foundry-alias': {
+            provider: 'foundry',
+            model: 'gpt-4o',
+            maxContextSize: 128000,
+          },
+        },
+      },
+    });
+
+    expect(resolved.provider).toMatchObject({
+      type: 'azure-foundry',
+      apiKey: 'foundry-key',
+      baseUrl: 'https://example.openai.azure.com/openai/v1',
+      model: 'gpt-4o',
+    });
+  });
+
   it('forwards customHeaders to an openai_responses provider', () => {
     const resolved = resolveRuntimeProvider({
       config: {
diff --git a/packages/kosong/src/catalog.ts b/packages/kosong/src/catalog.ts
index 40975430c..40ba29edd 100644
--- a/packages/kosong/src/catalog.ts
+++ b/packages/kosong/src/catalog.ts
@@ -48,6 +48,7 @@ export interface CatalogModel {
 
 const KNOWN_WIRE_TYPES = [
   'anthropic',
+  'azure-foundry',
   'openai',
   'kimi',
   'google-genai',
@@ -87,6 +88,7 @@ export function inferWireType(entry: CatalogProviderEntry): ProviderType | undef
   if (npm.includes('anthropic') || id.includes('anthropic') || id.includes('claude')) {
     return 'anthropic';
   }
+  if (id.includes('azure') || id.includes('foundry')) return 'azure-foundry';
   if (id.includes('vertex')) return 'vertexai';
   if (npm.includes('google') || id.includes('google') || id.includes('gemini')) {
     return 'google-genai';
diff --git a/packages/kosong/src/providers/azure-foundry.ts b/packages/kosong/src/providers/azure-foundry.ts
new file mode 100644
index 000000000..0fd53b42e
--- /dev/null
+++ b/packages/kosong/src/providers/azure-foundry.ts
@@ -0,0 +1,72 @@
+import type { ProviderRequestAuth } from '#/provider';
+import OpenAI from 'openai';
+
+import {
+  OpenAILegacyChatProvider,
+  type OpenAILegacyOptions,
+} from './openai-legacy';
+import { mergeRequestHeaders, requireProviderApiKey } from './request-auth';
+
+export type AzureFoundryOptions = OpenAILegacyOptions;
+
+function normalizeAzureFoundryBaseUrl(baseUrl: string | undefined): string | undefined {
+  const trimmed = baseUrl?.trim();
+  if (trimmed === undefined || trimmed.length === 0) return undefined;
+  return trimmed.replace(/\/+$/, '');
+}
+
+function buildAzureFoundryClient(
+  apiKey: string,
+  baseUrl: string | undefined,
+  defaultHeaders: Record<string, string> | undefined,
+  httpClient: unknown,
+  auth?: ProviderRequestAuth,
+): OpenAI {
+  const key = requireProviderApiKey('AzureFoundryChatProvider', auth, apiKey);
+  const headers: Record<string, string | null> = { authorization: null, 'api-key': key };
+  const merged = mergeRequestHeaders(defaultHeaders, auth?.headers);
+  if (merged !== undefined) {
+    for (const [name, value] of Object.entries(merged)) {
+      headers[name.toLowerCase()] = value;
+    }
+  }
+  headers['api-key'] = key;
+
+  const clientOpts: Record<string, unknown> = {
+    apiKey: key,
+    baseURL: baseUrl,
+    defaultHeaders: headers,
+  };
+  if (httpClient !== undefined) {
+    clientOpts['httpClient'] = httpClient;
+  }
+  return new OpenAI(clientOpts as ConstructorParameters<typeof OpenAI>[0]);
+}
+
+/**
+ * Microsoft Foundry chat provider.
+ *
+ * Targets Foundry's OpenAI v1-compatible inference route
+ * (`https://{resource}.openai.azure.com/openai/v1`) and authenticates with
+ * the Foundry `api-key` header rather than Bearer auth.
+ */
+export class AzureFoundryChatProvider extends OpenAILegacyChatProvider {
+  override readonly name = 'azure-foundry';
+
+  constructor(options: AzureFoundryOptions) {
+    const baseUrl = normalizeAzureFoundryBaseUrl(options.baseUrl);
+    const apiKey = options.apiKey;
+    super({
+      ...options,
+      baseUrl,
+      clientFactory: (auth) =>
+        buildAzureFoundryClient(
+          apiKey ?? '',
+          baseUrl,
+          options.defaultHeaders,
+          options.httpClient,
+          auth,
+        ),
+    });
+  }
+}
diff --git a/packages/kosong/src/providers/index.ts b/packages/kosong/src/providers/index.ts
index d95e9c58e..3d4f38cb4 100644
--- a/packages/kosong/src/providers/index.ts
+++ b/packages/kosong/src/providers/index.ts
@@ -1,6 +1,7 @@
 import { UNKNOWN_CAPABILITY, type ModelCapability } from '../capability';
 import type { ChatProvider } from '../provider';
 import { AnthropicChatProvider, type AnthropicOptions } from './anthropic';
+import { AzureFoundryChatProvider, type AzureFoundryOptions } from './azure-foundry';
 import {
   getAnthropicModelCapability,
   getGoogleGenAIModelCapability,
@@ -14,6 +15,7 @@ import { OpenAIResponsesChatProvider, type OpenAIResponsesOptions } from './open
 
 export type ProviderConfig =
   | ({ type: 'anthropic' } & AnthropicOptions)
+  | ({ type: 'azure-foundry' } & AzureFoundryOptions)
   | ({ type: 'openai' } & OpenAILegacyOptions)
   | ({ type: 'kimi' } & KimiOptions)
   | ({ type: 'google-genai' } & GoogleGenAIOptions)
@@ -26,6 +28,8 @@ export function createProvider(config: ProviderConfig): ChatProvider {
   switch (config.type) {
     case 'anthropic':
       return new AnthropicChatProvider(config);
+    case 'azure-foundry':
+      return new AzureFoundryChatProvider(config);
     case 'openai':
       return new OpenAILegacyChatProvider(config);
     case 'kimi':
@@ -55,6 +59,7 @@ export function getModelCapability(wire: ProviderType, modelName: string): Model
   switch (wire) {
     case 'anthropic':
       return getAnthropicModelCapability(modelName);
+    case 'azure-foundry':
     case 'openai':
       return getOpenAILegacyModelCapability(modelName);
     case 'openai_responses':
diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts
new file mode 100644
index 000000000..8db86847e
--- /dev/null
+++ b/packages/kosong/test/azure-foundry.test.ts
@@ -0,0 +1,100 @@
+import { AzureFoundryChatProvider } from '#/providers/azure-foundry';
+import { describe, expect, it } from 'vitest';
+
+import { createFakeProviderHarness, type FakeProviderHarness } from './e2e/fake-provider-harness';
+
+async function withHarness<T>(fn: (harness: FakeProviderHarness) => Promise<T>): Promise<T> {
+  const harness = await createFakeProviderHarness();
+  try {
+    return await fn(harness);
+  } finally {
+    await harness.close();
+  }
+}
+
+describe('AzureFoundryChatProvider', () => {
+  it('uses the azure-foundry provider name', () => {
+    const provider = new AzureFoundryChatProvider({
+      model: 'gpt-4o',
+      apiKey: 'test-key',
+      baseUrl: 'https://example.openai.azure.com/openai/v1',
+    });
+    expect(provider.name).toBe('azure-foundry');
+  });
+
+  it('sends Foundry api-key auth instead of Bearer for chat completions', async () => {
+    await withHarness(async (harness) => {
+      harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => {
+        expect(request.headers['api-key']).toBe('foundry-key');
+        expect(request.headers['authorization']).toBeUndefined();
+        await reply.sseJson(200, [
+          {
+            id: 'chatcmpl-azure-1',
+            object: 'chat.completion.chunk',
+            created: 1234567890,
+            model: 'gpt-4o',
+            choices: [{ index: 0, delta: { content: 'Hello' }, finish_reason: null }],
+          },
+          {
+            id: 'chatcmpl-azure-1',
+            object: 'chat.completion.chunk',
+            created: 1234567890,
+            model: 'gpt-4o',
+            choices: [
+              {
+                index: 0,
+                delta: {},
+                finish_reason: 'stop',
+              },
+            ],
+          },
+        ]);
+      });
+
+      const provider = new AzureFoundryChatProvider({
+        model: 'gpt-4o',
+        apiKey: 'foundry-key',
+        baseUrl: `${harness.baseUrl}/openai/v1`,
+      });
+      const stream = await provider.generate('You are helpful.', [], [
+        { role: 'user', content: [{ type: 'text', text: 'Hi' }], toolCalls: [] },
+      ]);
+      const parts = [];
+      for await (const part of stream) {
+        parts.push(part);
+      }
+      expect(parts).toEqual([{ type: 'text', text: 'Hello' }]);
+    });
+  });
+
+  it('strips trailing slashes from base_url', async () => {
+    await withHarness(async (harness) => {
+      let capturedPath = '';
+      harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => {
+        capturedPath = request.pathname;
+        await reply.sseJson(200, [
+          {
+            id: 'chatcmpl-azure-2',
+            object: 'chat.completion.chunk',
+            created: 1234567890,
+            model: 'gpt-4o',
+            choices: [{ index: 0, delta: { content: 'ok' }, finish_reason: 'stop' }],
+          },
+        ]);
+      });
+
+      const provider = new AzureFoundryChatProvider({
+        model: 'gpt-4o',
+        apiKey: 'foundry-key',
+        baseUrl: `${harness.baseUrl}/openai/v1/`,
+      });
+      const stream = await provider.generate('', [], [
+        { role: 'user', content: [{ type: 'text', text: 'Hi' }], toolCalls: [] },
+      ]);
+      for await (const _part of stream) {
+        // drain
+      }
+      expect(capturedPath).toBe('/openai/v1/chat/completions');
+    });
+  });
+});
diff --git a/packages/kosong/test/catalog.test.ts b/packages/kosong/test/catalog.test.ts
index 5780e5502..7abbb5861 100644
--- a/packages/kosong/test/catalog.test.ts
+++ b/packages/kosong/test/catalog.test.ts
@@ -23,6 +23,11 @@ describe('inferWireType', () => {
     expect(inferWireType({ id: 'google-vertex' })).toBe('vertexai');
   });
 
+  it('infers azure-foundry from azure or foundry ids', () => {
+    expect(inferWireType({ id: 'azure-foundry' })).toBe('azure-foundry');
+    expect(inferWireType({ id: 'microsoft-foundry' })).toBe('azure-foundry');
+  });
+
   it('returns undefined for unknown / invalid wire types', () => {
     expect(inferWireType({ id: 'some-proxy' })).toBeUndefined();
     expect(inferWireType({ id: 'x', type: 'not-a-wire' })).toBeUndefined();
diff --git a/packages/kosong/tsdown.config.ts b/packages/kosong/tsdown.config.ts
index 00783406f..c636ecde0 100644
--- a/packages/kosong/tsdown.config.ts
+++ b/packages/kosong/tsdown.config.ts
@@ -5,6 +5,7 @@ export default defineConfig({
     './src/index.ts',
     './src/providers/kimi.ts',
     './src/providers/openai-legacy.ts',
+    './src/providers/azure-foundry.ts',
     './src/providers/openai-responses.ts',
     './src/providers/anthropic.ts',
     './src/providers/google-genai.ts',
diff --git a/packages/oauth/src/custom-registry.ts b/packages/oauth/src/custom-registry.ts
index 0c5d720f7..c3578175d 100644
--- a/packages/oauth/src/custom-registry.ts
+++ b/packages/oauth/src/custom-registry.ts
@@ -24,6 +24,7 @@ export interface CustomRegistrySource {
  */
 export type CustomRegistryProviderType =
   | 'anthropic'
+  | 'azure-foundry'
   | 'openai'
   | 'openai_responses'
   | 'kimi';
@@ -59,6 +60,7 @@ export const CUSTOM_REGISTRY_DEFAULT_CAPABILITIES = ['tool_use'] as const;
 
 const ALLOWED_PROVIDER_TYPES: ReadonlySet<CustomRegistryProviderType> = new Set([
   'anthropic',
+  'azure-foundry',
   'openai',
   'openai_responses',
   'kimi',

From 23ff2e18b83d6c7889326b4bfc3819a042047bd3 Mon Sep 17 00:00:00 2001
From: guglxni <aaryanguglani.cs21@rvce.edu.in>
Date: Sat, 20 Jun 2026 21:02:26 +0530
Subject: [PATCH 2/4] fix: harden azure-foundry provider for Foundry runtime
 issues

Require base_url before constructing the Foundry client so api-key auth
never falls back to the default OpenAI host. Clamp completion budgets against
Foundry's shared input+output context window and recover once when a model
stalls after tool results without issuing further tool calls.

Addresses Codex review on #950. Relates to #918 and #520.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .changeset/azure-foundry-provider.md          |  2 +-
 packages/agent-core/src/agent/turn/index.ts   | 29 +++++++
 .../src/agent/turn/tool-stall-recovery.ts     | 20 +++++
 .../src/session/provider-manager.ts           | 15 +++-
 .../agent-core/src/utils/completion-budget.ts | 13 ++--
 .../test/agent/tool-stall-recovery.test.ts    | 32 ++++++++
 packages/agent-core/test/agent/turn.test.ts   | 23 ++++++
 .../test/harness/runtime-provider.test.ts     | 24 ++++++
 .../test/utils/completion-budget.test.ts      | 18 +++--
 .../kosong/src/providers/azure-foundry.ts     | 22 +++++-
 .../kosong/src/providers/openai-legacy.ts     | 16 +++-
 .../src/providers/shared-context-window.ts    | 47 +++++++++++
 packages/kosong/src/token-estimate.ts         | 77 +++++++++++++++++++
 packages/kosong/test/azure-foundry.test.ts    | 57 ++++++++++++++
 .../kosong/test/shared-context-window.test.ts | 32 ++++++++
 15 files changed, 410 insertions(+), 17 deletions(-)
 create mode 100644 packages/agent-core/src/agent/turn/tool-stall-recovery.ts
 create mode 100644 packages/agent-core/test/agent/tool-stall-recovery.test.ts
 create mode 100644 packages/kosong/src/providers/shared-context-window.ts
 create mode 100644 packages/kosong/src/token-estimate.ts
 create mode 100644 packages/kosong/test/shared-context-window.test.ts

diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md
index 1c01dcefc..1863fb07c 100644
--- a/.changeset/azure-foundry-provider.md
+++ b/.changeset/azure-foundry-provider.md
@@ -2,4 +2,4 @@
 "@moonshot-ai/kimi-code": minor
 ---
 
-Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route.
+Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window so Foundry-hosted Kimi models do not overflow on the first request.
diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts
index 7fff37876..e6d2182a4 100644
--- a/packages/agent-core/src/agent/turn/index.ts
+++ b/packages/agent-core/src/agent/turn/index.ts
@@ -40,6 +40,11 @@ import { USER_PROMPT_ORIGIN, type PromptOrigin } from '../context';
 import { renderUserPromptHookBlockResult, renderUserPromptHookResult } from '../../session/hooks';
 import { canonicalTelemetryArgs, isPlainRecord } from './canonical-args';
 import { ToolCallDeduplicator } from './tool-dedup';
+import {
+  hasToolResultsSinceLastUserMessage,
+  TOOL_STALL_RECOVERY_NAME,
+  TOOL_STALL_RECOVERY_TEXT,
+} from './tool-stall-recovery';
 
 interface ActiveTurn {
   readonly turnId: number;
@@ -613,6 +618,7 @@ export class TurnFlow {
   private async runStepLoop(turnId: number, signal: AbortSignal): Promise<LoopTurnStopReason> {
     let stopHookContinuationUsed = false;
     let goalOutcomeMessageContinuationUsed = false;
+    let toolStallContinuationUsed = false;
     const deduper = new ToolCallDeduplicator({ telemetry: this.agent.telemetry });
     await this.agent.mcp?.waitForInitialLoad(signal);
     // Surface the active goal at the start of the turn (append-only; no-op when
@@ -679,6 +685,29 @@ export class TurnFlow {
                 return { continue: true };
               }
 
+              // 3b. Recover once when the model ends a step without tools after
+              //     tool results already landed in the same turn (common with
+              //     shared-window thinking models that stop after long reasoning).
+              if (
+                !toolStallContinuationUsed &&
+                ctx.stopReason === 'end_turn' &&
+                ctx.stepNumber > 1 &&
+                hasToolResultsSinceLastUserMessage(this.agent.context.messages)
+              ) {
+                if (!hasStepBudgetRemaining(loopControl?.maxStepsPerTurn, ctx.stepNumber)) {
+                  return { continue: false };
+                }
+                toolStallContinuationUsed = true;
+                this.agent.context.appendUserMessage(
+                  [{ type: 'text', text: TOOL_STALL_RECOVERY_TEXT }],
+                  {
+                    kind: 'system_trigger',
+                    name: TOOL_STALL_RECOVERY_NAME,
+                  },
+                );
+                return { continue: true };
+              }
+
               // 3. The external Stop hook gets exactly one continuation; the cap
               //    is intentionally separate from (and does not cap) goal mode.
               if (!stopHookContinuationUsed) {
diff --git a/packages/agent-core/src/agent/turn/tool-stall-recovery.ts b/packages/agent-core/src/agent/turn/tool-stall-recovery.ts
new file mode 100644
index 000000000..7c6a63f04
--- /dev/null
+++ b/packages/agent-core/src/agent/turn/tool-stall-recovery.ts
@@ -0,0 +1,20 @@
+import type { Message } from '@moonshot-ai/kosong';
+
+export const TOOL_STALL_RECOVERY_NAME = 'tool_stall_recovery';
+
+/** Reminder appended at most once per turn by the tool-stall recovery; worded conditionally because the trigger cannot tell a stall from a finished turn. */
+export const TOOL_STALL_RECOVERY_TEXT =
+  '<system-reminder>\n' +
+  'Your previous step ended without calling any tools. If more work remains on the user request, call the appropriate tools now instead of only describing what you plan to do next; if the request is already complete, say so briefly.\n' +
+  '</system-reminder>';
+
+/** Scans from the newest message backward: true if a `tool` message is reached before any `user` message, otherwise false. */
+export function hasToolResultsSinceLastUserMessage(messages: readonly Message[]): boolean {
+  for (let index = messages.length - 1; index >= 0; index -= 1) {
+    const message = messages[index];
+    if (message === undefined) continue;
+    if (message.role === 'user') return false;
+    if (message.role === 'tool') return true;
+  }
+  return false;
+}
diff --git a/packages/agent-core/src/session/provider-manager.ts b/packages/agent-core/src/session/provider-manager.ts
index b447b7f8c..82209dfd8 100644
--- a/packages/agent-core/src/session/provider-manager.ts
+++ b/packages/agent-core/src/session/provider-manager.ts
@@ -109,6 +109,7 @@ export class ProviderManager implements ModelProvider {
       alias.model,
       this.options.kimiRequestHeaders,
       alias.maxOutputSize,
+      alias.maxContextSize,
       alias.reasoningKey,
       this.options.promptCacheKey,
       alias.adaptiveThinking,
@@ -221,6 +222,7 @@ function toKosongProviderConfig(
   model: string,
   kimiRequestHeaders: Record<string, string> | undefined,
   maxOutputSize: number | undefined,
+  maxContextSize: number | undefined,
   reasoningKey: string | undefined,
   promptCacheKey: string | undefined,
   adaptiveThinking: boolean | undefined,
@@ -245,15 +247,24 @@ function toKosongProviderConfig(
         reasoningKey,
         ...defaultHeadersField(provider.customHeaders),
       };
-    case 'azure-foundry':
+    case 'azure-foundry': {
+      const baseUrl = providerValue(provider.baseUrl, provider.env, 'AZURE_FOUNDRY_BASE_URL');
+      if (baseUrl === undefined) {
+        throw new KimiError(
+          ErrorCodes.MODEL_CONFIG_INVALID,
+          'Provider type "azure-foundry" requires base_url (or AZURE_FOUNDRY_BASE_URL in [providers.<name>.env]). Example: https://YOUR-RESOURCE.openai.azure.com/openai/v1',
+        );
+      }
       return {
         type: 'azure-foundry',
         model,
-        baseUrl: providerValue(provider.baseUrl, provider.env, 'AZURE_FOUNDRY_BASE_URL'),
+        baseUrl,
         apiKey: providerApiKey(provider),
         reasoningKey,
+        sharedContextWindowTokens: maxContextSize,
         ...defaultHeadersField(provider.customHeaders),
       };
+    }
     case 'kimi':
       return {
         type: 'kimi',
diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts
index ceb086ef2..17f901c02 100644
--- a/packages/agent-core/src/utils/completion-budget.ts
+++ b/packages/agent-core/src/utils/completion-budget.ts
@@ -50,18 +50,21 @@ function parseEnvBudget(raw: string | undefined): EnvBudget {
 
 /**
  * Compute the effective `max_completion_tokens` cap.
+ *
+ * Order: hard cap, else reserved-context fallback, else context window (or the
+ * unknown-context default); clamped to a known window and floored at MIN_FLOOR.
+ * Shared-window providers reject input + max_completion_tokens over the window.
  */
 export function computeCompletionBudgetCap(args: {
   readonly budget: CompletionBudgetConfig;
   readonly capability: ModelCapability | undefined;
 }): number {
   const maxCtx = args.capability?.max_context_tokens ?? 0;
-  // The provider backend computes the safe request-specific value from the
-  // serialized prompt. Locally using the largest cap avoids cutting off
-  // thinking before the model produces a summary.
-  const cap =
+  const requested =
     args.budget.hardCap ??
-    (maxCtx > 0 ? maxCtx : args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK);
+    args.budget.fallback ??
+    (maxCtx > 0 ? maxCtx : DEFAULT_UNKNOWN_CONTEXT_FALLBACK);
+  const cap = maxCtx > 0 ? Math.min(requested, maxCtx) : requested;
   return Math.max(MIN_FLOOR, cap);
 }
 
diff --git a/packages/agent-core/test/agent/tool-stall-recovery.test.ts b/packages/agent-core/test/agent/tool-stall-recovery.test.ts
new file mode 100644
index 000000000..a0b074c9d
--- /dev/null
+++ b/packages/agent-core/test/agent/tool-stall-recovery.test.ts
@@ -0,0 +1,32 @@
+import type { Message } from '@moonshot-ai/kosong';
+import { describe, expect, it } from 'vitest';
+
+import { hasToolResultsSinceLastUserMessage } from '../../src/agent/turn/tool-stall-recovery';
+
+describe('hasToolResultsSinceLastUserMessage', () => {
+  it('returns false when the latest user message has no trailing tool results', () => {
+    const messages: Message[] = [
+      { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+      { role: 'assistant', content: [{ type: 'text', text: 'hello' }], toolCalls: [] },
+    ];
+    expect(hasToolResultsSinceLastUserMessage(messages)).toBe(false);
+  });
+
+  it('returns true when tool results follow the latest user message', () => {
+    const messages: Message[] = [
+      { role: 'user', content: [{ type: 'text', text: 'explore' }], toolCalls: [] },
+      {
+        role: 'assistant',
+        content: [{ type: 'text', text: 'reading' }],
+        toolCalls: [{ type: 'function', id: 'call_1', name: 'Read', arguments: '{}' }],
+      },
+      { role: 'tool', content: [{ type: 'text', text: 'file contents' }], toolCalls: [], toolCallId: 'call_1' },
+      {
+        role: 'assistant',
+        content: [{ type: 'text', text: 'I will continue' }],
+        toolCalls: [],
+      },
+    ];
+    expect(hasToolResultsSinceLastUserMessage(messages)).toBe(true);
+  });
+});
diff --git a/packages/agent-core/test/agent/turn.test.ts b/packages/agent-core/test/agent/turn.test.ts
index e39094fb4..576bc1cef 100644
--- a/packages/agent-core/test/agent/turn.test.ts
+++ b/packages/agent-core/test/agent/turn.test.ts
@@ -112,6 +112,29 @@ describe('Agent turn flow', () => {
     });
   });
 
+  it('continues once after a post-tool step ends without further tool calls', async () => {
+    const ctx = testAgent({ kaos: createCommandKaos('ok') });
+    ctx.configure({ tools: ['Bash'] });
+    await ctx.rpc.setPermission({ mode: 'yolo' });
+
+    ctx.mockNextResponse(
+      { type: 'text', text: 'Running first command.' },
+      bashCallWithId('call_1', 'printf ok'),
+    );
+    ctx.mockNextResponse({ type: 'text', text: 'I will continue exploring.' });
+    ctx.mockNextResponse(
+      { type: 'text', text: 'Continuing with another command.' },
+      bashCallWithId('call_2', 'printf more'),
+    );
+    ctx.mockNextResponse({ type: 'text', text: 'Done.' });
+
+    await ctx.rpc.prompt({ input: [{ type: 'text', text: 'Explore the repo' }] });
+    await ctx.untilTurnEnd();
+
+    expect(ctx.llmCalls).toHaveLength(4);
+    expect(JSON.stringify(ctx.llmCalls[2]?.history ?? [])).toContain('system-reminder');
+  });
+
   it('tracks cross-step duplicate tool-call detection telemetry', async () => {
     const records: TelemetryRecord[] = [];
     const ctx = testAgent({
diff --git a/packages/agent-core/test/harness/runtime-provider.test.ts b/packages/agent-core/test/harness/runtime-provider.test.ts
index 0eaee9b93..b67a877ce 100644
--- a/packages/agent-core/test/harness/runtime-provider.test.ts
+++ b/packages/agent-core/test/harness/runtime-provider.test.ts
@@ -560,9 +560,33 @@ describe('resolveRuntimeProvider customHeaders propagation', () => {
       apiKey: 'foundry-key',
       baseUrl: 'https://example.openai.azure.com/openai/v1',
       model: 'gpt-4o',
+      sharedContextWindowTokens: 128000,
     });
   });
 
+  it('rejects azure-foundry providers without a base_url at resolve time', () => {
+    expect(() =>
+      resolveRuntimeProvider({
+        config: {
+          defaultModel: 'foundry-alias',
+          providers: {
+            foundry: {
+              type: 'azure-foundry',
+              apiKey: 'foundry-key',
+            },
+          },
+          models: {
+            'foundry-alias': {
+              provider: 'foundry',
+              model: 'gpt-4o',
+              maxContextSize: 128000,
+            },
+          },
+        },
+      }),
+    ).toThrow(/requires base_url/);
+  });
+
   it('forwards customHeaders to an openai_responses provider', () => {
     const resolved = resolveRuntimeProvider({
       config: {
diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts
index 7df91f5d0..4760dcb04 100644
--- a/packages/agent-core/test/utils/completion-budget.test.ts
+++ b/packages/agent-core/test/utils/completion-budget.test.ts
@@ -50,24 +50,32 @@ describe('computeCompletionBudgetCap', () => {
     ).toBe(1);
   });
 
-  it('uses the model context window when no hard cap is set', () => {
-    const maxCtx = 100000;
+  it('uses the reserved-context fallback when no hard cap is set', () => {
     const cap = computeCompletionBudgetCap({
       budget: { fallback: 32000 },
+      capability: makeCapability(100000),
+    });
+    expect(cap).toBe(32000);
+  });
+
+  it('uses the model context window when no hard cap or fallback is set', () => {
+    const maxCtx = 100000;
+    const cap = computeCompletionBudgetCap({
+      budget: {},
       capability: makeCapability(maxCtx),
     });
     expect(cap).toBe(maxCtx);
   });
 
-  it('uses the explicit hard cap when configured', () => {
+  it('clamps the explicit hard cap to the model context window when it is smaller', () => {
     const cap = computeCompletionBudgetCap({
       budget: { hardCap: 32000 },
       capability: makeCapability(10000),
     });
-    expect(cap).toBe(32000);
+    expect(cap).toBe(10000);
   });
 
-  it('ignores fallback when the model context window is known', () => {
+  it('clamps the fallback to the model context window when it is smaller', () => {
     const cap = computeCompletionBudgetCap({
       budget: { fallback: 32000 },
       capability: makeCapability(10000),
diff --git a/packages/kosong/src/providers/azure-foundry.ts b/packages/kosong/src/providers/azure-foundry.ts
index 0fd53b42e..88b02132e 100644
--- a/packages/kosong/src/providers/azure-foundry.ts
+++ b/packages/kosong/src/providers/azure-foundry.ts
@@ -1,3 +1,4 @@
+import { ChatProviderError } from '#/errors';
 import type { ProviderRequestAuth } from '#/provider';
 import OpenAI from 'openai';
 
@@ -15,9 +16,19 @@ function normalizeAzureFoundryBaseUrl(baseUrl: string | undefined): string | und
   return trimmed.replace(/\/+$/, '');
 }
 
+/** Normalizes `baseUrl` and throws `ChatProviderError` when the normalized value is undefined. */
+function requireAzureFoundryBaseUrl(baseUrl: string | undefined): string {
+  const resolvedUrl = normalizeAzureFoundryBaseUrl(baseUrl);
+  // Happy path first; the only other outcome is the configuration error below.
+  if (resolvedUrl !== undefined) return resolvedUrl;
+  throw new ChatProviderError(
+    'AzureFoundryChatProvider: baseUrl is required. Set base_url in config.toml or AZURE_FOUNDRY_BASE_URL in [providers.<name>.env]. Example: https://YOUR-RESOURCE.openai.azure.com/openai/v1',
+  );
+}
+
 function buildAzureFoundryClient(
   apiKey: string,
-  baseUrl: string | undefined,
+  baseUrl: string,
   defaultHeaders: Record<string, string> | undefined,
   httpClient: unknown,
   auth?: ProviderRequestAuth,
@@ -49,12 +60,17 @@ function buildAzureFoundryClient(
  * Targets Foundry's OpenAI v1-compatible inference route
  * (`https://{resource}.openai.azure.com/openai/v1`) and authenticates with
  * the Foundry `api-key` header rather than Bearer auth.
+ *
+ * Foundry-hosted Kimi models use a shared input+output context window. Pass
+ * `sharedContextWindowTokens` (wired from `max_context_size` in config) so
+ * completion budgets are clamped against the serialized prompt before each
+ * request.
  */
 export class AzureFoundryChatProvider extends OpenAILegacyChatProvider {
   override readonly name = 'azure-foundry';
 
   constructor(options: AzureFoundryOptions) {
-    const baseUrl = normalizeAzureFoundryBaseUrl(options.baseUrl);
+    const baseUrl = requireAzureFoundryBaseUrl(options.baseUrl);
     const apiKey = options.apiKey;
     super({
       ...options,
@@ -62,7 +78,7 @@ export class AzureFoundryChatProvider extends OpenAILegacyChatProvider {
       clientFactory: (auth) =>
         buildAzureFoundryClient(
           apiKey ?? '',
-          baseUrl,
+          requireAzureFoundryBaseUrl(baseUrl),
           options.defaultHeaders,
           options.httpClient,
           auth,
diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts
index c9987a5f5..58c8a7e22 100644
--- a/packages/kosong/src/providers/openai-legacy.ts
+++ b/packages/kosong/src/providers/openai-legacy.ts
@@ -35,6 +35,7 @@ import {
   requireProviderApiKey,
   resolveAuthBackedClient,
 } from './request-auth';
+import { clampCompletionTokensForSharedContextWindow } from './shared-context-window';
 import {
   normalizeToolCallIdsForProvider,
   sanitizeToolCallId,
@@ -71,6 +72,8 @@ export interface OpenAILegacyOptions {
   model: string;
   stream?: boolean | undefined;
   maxTokens?: number | undefined;
+  /** Total input+output budget when the backend enforces a shared context window. */
+  sharedContextWindowTokens?: number | undefined;
   reasoningKey?: string | undefined;
   httpClient?: unknown;
   defaultHeaders?: Record<string, string>;
@@ -449,6 +452,7 @@ export class OpenAILegacyChatProvider implements ChatProvider {
   private _client: OpenAI | undefined;
   private _httpClient: unknown;
   private _clientFactory: ((auth: ProviderRequestAuth) => OpenAI) | undefined;
+  private _sharedContextWindowTokens: number | undefined;
 
   constructor(options: OpenAILegacyOptions) {
     const apiKey = options.apiKey ?? process.env['OPENAI_API_KEY'];
@@ -472,6 +476,7 @@ export class OpenAILegacyChatProvider implements ChatProvider {
     this._toolMessageConversion = options.toolMessageConversion ?? null;
     this._httpClient = options.httpClient;
     this._clientFactory = options.clientFactory;
+    this._sharedContextWindowTokens = options.sharedContextWindowTokens;
 
     this._client = this._apiKey === undefined ? undefined : this._buildClient(this._apiKey);
   }
@@ -512,7 +517,16 @@ export class OpenAILegacyChatProvider implements ChatProvider {
 
     const kwargs: Record<string, unknown> = normalizeGenerationKwargs(
       this._model,
-      this._generationKwargs,
+      this._sharedContextWindowTokens === undefined
+        ? this._generationKwargs
+        : clampCompletionTokensForSharedContextWindow({
+            model: this._model,
+            sharedContextWindowTokens: this._sharedContextWindowTokens,
+            generationKwargs: this._generationKwargs,
+            systemPrompt,
+            history,
+            tools,
+          }),
     );
 
     // Determine reasoning_effort
diff --git a/packages/kosong/src/providers/shared-context-window.ts b/packages/kosong/src/providers/shared-context-window.ts
new file mode 100644
index 000000000..d14189218
--- /dev/null
+++ b/packages/kosong/src/providers/shared-context-window.ts
@@ -0,0 +1,47 @@
+import { estimatePromptTokens } from '#/token-estimate';
+import type { Message } from '#/message';
+import type { Tool } from '#/tool';
+
+import type { OpenAILegacyGenerationKwargs } from './openai-legacy';
+
+const DEFAULT_SERIALIZATION_MARGIN = 512;
+const MIN_COMPLETION_TOKENS = 1;
+
+function completionTokenField(model: string): 'max_completion_tokens' | 'max_tokens' {
+  const normalized = model.toLowerCase();
+  if (/^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized)) {
+    return 'max_completion_tokens';
+  }
+  return 'max_tokens';
+}
+
+/**
+ * Clamp completion budget for providers where input and output share one
+ * context window (e.g. Microsoft Foundry Kimi deployments).
+ */
+export function clampCompletionTokensForSharedContextWindow(args: {
+  readonly model: string;
+  readonly sharedContextWindowTokens: number;
+  readonly generationKwargs: OpenAILegacyGenerationKwargs;
+  readonly systemPrompt: string;
+  readonly history: readonly Message[];
+  readonly tools: readonly Tool[];
+  readonly serializationMargin?: number;
+}): OpenAILegacyGenerationKwargs {
+  const margin = args.serializationMargin ?? DEFAULT_SERIALIZATION_MARGIN;
+  const inputEstimate = estimatePromptTokens({
+    systemPrompt: args.systemPrompt,
+    history: args.history,
+    tools: args.tools,
+  });
+  const remaining = Math.max(
+    MIN_COMPLETION_TOKENS,
+    args.sharedContextWindowTokens - inputEstimate - margin,
+  );
+
+  const field = completionTokenField(args.model);
+  const kwargs = { ...args.generationKwargs };
+  const requested = kwargs[field];
+  kwargs[field] = requested === undefined ? remaining : Math.min(requested, remaining);
+  return kwargs;
+}
diff --git a/packages/kosong/src/token-estimate.ts b/packages/kosong/src/token-estimate.ts
new file mode 100644
index 000000000..d0aa5f0fc
--- /dev/null
+++ b/packages/kosong/src/token-estimate.ts
@@ -0,0 +1,77 @@
+import type { ContentPart, Message } from './message';
+import type { Tool } from './tool';
+
+/**
+ * Cheap token estimate that needs no tokenizer. ASCII code points are costed
+ * at four per token (rounded up); every other code point (CJK and the like)
+ * is costed at one token.
+ */
+export function estimateTokens(text: string): number {
+  let asciiPoints = 0;
+  let widePoints = 0;
+  // for...of iterates by code point, so a surrogate pair is counted once.
+  for (const glyph of text) {
+    const isAscii = glyph.codePointAt(0)! <= 127;
+    asciiPoints += isAscii ? 1 : 0;
+    widePoints += isAscii ? 0 : 1;
+  }
+  return Math.ceil(asciiPoints / 4) + widePoints;
+}
+
+/** Sum the token estimates of every message in `messages`. */
+export function estimateTokensForMessages(messages: readonly Message[]): number {
+  return messages.reduce(
+    (runningTotal, message) => runningTotal + estimateTokensForMessage(message),
+    0,
+  );
+}
+
+/** Estimate tool definitions: each tool's name, description and JSON-encoded parameters. */
+export function estimateTokensForTools(tools: readonly Tool[]): number {
+  return tools.reduce((sum, { name, description, parameters }) => {
+    const nameCost = estimateTokens(name);
+    const descriptionCost = estimateTokens(description);
+    const schemaCost = estimateTokens(JSON.stringify(parameters));
+    return sum + nameCost + descriptionCost + schemaCost;
+  }, 0);
+}
+
+/** Estimate one message: its role, its content parts, and each tool call's name and arguments. */
+function estimateTokensForMessage(message: Message): number {
+  const { role, content, toolCalls } = message;
+  let sum = estimateTokens(role) + estimateTokensForContentParts(content);
+  for (const { name, arguments: rawArguments } of toolCalls) {
+    sum += estimateTokens(name) + estimateTokens(JSON.stringify(rawArguments));
+  }
+  return sum;
+}
+
+/** Sum token estimates over content parts; unrecognized part types contribute nothing. */
+function estimateTokensForContentParts(parts: readonly ContentPart[]): number {
+  const measuredText = (part: ContentPart): string | null => {
+    if (part.type === 'text') return part.text;
+    if (part.type === 'think') return part.think;
+    // Media parts are sized by their URL string.
+    // NOTE(review): inline `data:` URLs would inflate the estimate - confirm this is intended.
+    if (part.type === 'image_url') return part.imageUrl.url;
+    if (part.type === 'audio_url') return part.audioUrl.url;
+    if (part.type === 'video_url') return part.videoUrl.url;
+    return null;
+  };
+  return parts.reduce((sum, part) => {
+    const text = measuredText(part);
+    return text === null ? sum : sum + estimateTokens(text);
+  }, 0);
+}
+
+/** Estimate a full request: system prompt, message history and tool definitions. */
+export function estimatePromptTokens(args: {
+  readonly systemPrompt: string;
+  readonly history: readonly Message[];
+  readonly tools: readonly Tool[];
+}): number {
+  const promptCost = estimateTokens(args.systemPrompt);
+  const historyCost = estimateTokensForMessages(args.history);
+  const toolCost = estimateTokensForTools(args.tools);
+  return promptCost + historyCost + toolCost;
+}
diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts
index 8db86847e..667ee6d6a 100644
--- a/packages/kosong/test/azure-foundry.test.ts
+++ b/packages/kosong/test/azure-foundry.test.ts
@@ -22,6 +22,27 @@ describe('AzureFoundryChatProvider', () => {
     expect(provider.name).toBe('azure-foundry');
   });
 
+  it('rejects a missing base_url before constructing the client', () => {
+    expect(
+      () =>
+        new AzureFoundryChatProvider({
+          model: 'gpt-4o',
+          apiKey: 'test-key',
+        }),
+    ).toThrow(/baseUrl is required/);
+  });
+
+  it('rejects a blank base_url before constructing the client', () => {
+    expect(
+      () =>
+        new AzureFoundryChatProvider({
+          model: 'gpt-4o',
+          apiKey: 'test-key',
+          baseUrl: '   ',
+        }),
+    ).toThrow(/baseUrl is required/);
+  });
+
   it('sends Foundry api-key auth instead of Bearer for chat completions', async () => {
     await withHarness(async (harness) => {
       harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => {
@@ -97,4 +118,40 @@ describe('AzureFoundryChatProvider', () => {
       expect(capturedPath).toBe('/openai/v1/chat/completions');
     });
   });
+
+  it('clamps max_tokens against the shared Foundry context window before sending', async () => {
+    await withHarness(async (harness) => {
+      let capturedBody: Record<string, unknown> | undefined;
+      harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => {
+        capturedBody = request.bodyJson as Record<string, unknown>;
+        await reply.sseJson(200, [
+          {
+            id: 'chatcmpl-azure-cap',
+            object: 'chat.completion.chunk',
+            created: 1234567890,
+            model: 'Kimi-K2.6',
+            choices: [{ index: 0, delta: { content: 'ok' }, finish_reason: 'stop' }],
+          },
+        ]);
+      });
+
+      const provider = new AzureFoundryChatProvider({
+        model: 'Kimi-K2.6',
+        apiKey: 'foundry-key',
+        baseUrl: `${harness.baseUrl}/openai/v1`,
+        sharedContextWindowTokens: 262144,
+      }).withMaxCompletionTokens(262144);
+      const stream = await provider.generate('system prompt', [], [
+        { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+      ]);
+      for await (const _part of stream) {
+        // drain
+      }
+
+      expect(capturedBody).toBeDefined();
+      expect(capturedBody!['max_tokens']).toBeTypeOf('number');
+      expect(capturedBody!['max_tokens'] as number).toBeLessThan(262144);
+      expect(capturedBody!['max_tokens'] as number).toBeGreaterThan(0);
+    });
+  });
 });
diff --git a/packages/kosong/test/shared-context-window.test.ts b/packages/kosong/test/shared-context-window.test.ts
new file mode 100644
index 000000000..604b01b1c
--- /dev/null
+++ b/packages/kosong/test/shared-context-window.test.ts
@@ -0,0 +1,32 @@
+import { describe, expect, it } from 'vitest';
+
+import { clampCompletionTokensForSharedContextWindow } from '#/providers/shared-context-window';
+
+describe('clampCompletionTokensForSharedContextWindow', () => {
+  it('lowers an oversized completion cap to fit the remaining shared window', () => {
+    const kwargs = clampCompletionTokensForSharedContextWindow({
+      model: 'Kimi-K2.6',
+      sharedContextWindowTokens: 262144,
+      generationKwargs: { max_tokens: 262144 },
+      systemPrompt: 'x'.repeat(40_000),
+      history: [{ role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] }],
+      tools: [],
+    });
+
+    expect(kwargs.max_tokens).toBeLessThan(262144);
+    expect(kwargs.max_tokens).toBeGreaterThan(0);
+  });
+
+  it('keeps a smaller explicit cap unchanged when it already fits', () => {
+    const kwargs = clampCompletionTokensForSharedContextWindow({
+      model: 'Kimi-K2.6',
+      sharedContextWindowTokens: 262144,
+      generationKwargs: { max_tokens: 1024 },
+      systemPrompt: 'short prompt',
+      history: [],
+      tools: [],
+    });
+
+    expect(kwargs.max_tokens).toBe(1024);
+  });
+});

From 70ae10cbafe553737991054085b294d173c91ba9 Mon Sep 17 00:00:00 2001
From: guglxni <aaryanguglani.cs21@rvce.edu.in>
Date: Sat, 20 Jun 2026 23:30:43 +0530
Subject: [PATCH 3/4] fix: send Kimi wire format for Foundry-hosted reasoning
 models

Foundry deployments of Kimi-K2.x were using max_tokens, which shares the
output budget with reasoning_content and can yield think-only responses.
Use max_completion_tokens and thinking enablement like the native Kimi
provider, honor explicit thinking-off over history auto-injection, and
apply shared-window clamping against the correct completion field.
---
 .changeset/azure-foundry-provider.md          |  2 +-
 .../agent-core/src/config/kimi-env-params.ts  | 14 ++++-
 .../test/config/kimi-env-params.test.ts       | 16 +++++-
 packages/kosong/src/index.ts                  |  1 +
 .../kosong/src/providers/kimi-reasoning.ts    | 47 +++++++++++++++
 .../kosong/src/providers/openai-legacy.ts     | 57 +++++++++++++++++--
 .../src/providers/shared-context-window.ts    | 16 ++++--
 packages/kosong/test/azure-foundry.test.ts    | 51 +++++++++++++++--
 packages/kosong/test/kimi-reasoning.test.ts   | 52 +++++++++++++++++
 packages/kosong/test/openai-legacy.test.ts    | 22 +++++++
 .../kosong/test/shared-context-window.test.ts | 25 ++++++--
 11 files changed, 280 insertions(+), 23 deletions(-)
 create mode 100644 packages/kosong/src/providers/kimi-reasoning.ts
 create mode 100644 packages/kosong/test/kimi-reasoning.test.ts

diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md
index 1863fb07c..cfb09cec8 100644
--- a/.changeset/azure-foundry-provider.md
+++ b/.changeset/azure-foundry-provider.md
@@ -2,4 +2,4 @@
 "@moonshot-ai/kimi-code": minor
 ---
 
-Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window so Foundry-hosted Kimi models do not overflow on the first request.
+Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window. For Foundry-hosted Kimi reasoning models, send `max_completion_tokens` and `thinking: { type: 'enabled' }` like the native Kimi provider so reasoning and visible output use separate budgets.
diff --git a/packages/agent-core/src/config/kimi-env-params.ts b/packages/agent-core/src/config/kimi-env-params.ts
index 8aa65455c..1cc726d5a 100644
--- a/packages/agent-core/src/config/kimi-env-params.ts
+++ b/packages/agent-core/src/config/kimi-env-params.ts
@@ -1,5 +1,6 @@
 import {
   type ChatProvider,
+  isKimiReasoningModel,
   type GenerationKwargs,
   KimiChatProvider,
   type ThinkingEffort,
@@ -50,8 +51,17 @@ export function applyKimiEnvThinkingKeep(
   thinkingLevel: ThinkingEffort,
   env: Env = process.env,
 ): ChatProvider {
-  if (!(provider instanceof KimiChatProvider)) return provider;
   const keep = env['KIMI_MODEL_THINKING_KEEP']?.trim();
   if (keep === undefined || keep.length === 0 || thinkingLevel === 'off') return provider;
-  return provider.withExtraBody({ thinking: { keep } });
+  if (provider instanceof KimiChatProvider) {
+    return provider.withExtraBody({ thinking: { keep } });
+  }
+  if (
+    provider.name === 'azure-foundry' &&
+    isKimiReasoningModel(provider.modelName) &&
+    provider.withGenerationKwargs !== undefined
+  ) {
+    return provider.withGenerationKwargs({ extra_body: { thinking: { keep } } });
+  }
+  return provider;
 }
diff --git a/packages/agent-core/test/config/kimi-env-params.test.ts b/packages/agent-core/test/config/kimi-env-params.test.ts
index 723679bff..158256718 100644
--- a/packages/agent-core/test/config/kimi-env-params.test.ts
+++ b/packages/agent-core/test/config/kimi-env-params.test.ts
@@ -1,4 +1,4 @@
-import { type ChatProvider, KimiChatProvider } from '@moonshot-ai/kosong';
+import { createProvider, type ChatProvider, KimiChatProvider } from '@moonshot-ai/kosong';
 import { describe, expect, it } from 'vitest';
 
 import { applyKimiEnvSamplingParams, applyKimiEnvThinkingKeep } from '../../src/config/kimi-env-params';
@@ -8,6 +8,15 @@ function kimi(): KimiChatProvider {
   return new KimiChatProvider({ model: 'kimi-k2', apiKey: 'k' });
 }
 
+function foundryKimi(): ChatProvider {
+  return createProvider({
+    type: 'azure-foundry',
+    model: 'Kimi-K2.6',
+    apiKey: 'k',
+    baseUrl: 'https://example.openai.azure.com/openai/v1',
+  });
+}
+
 interface KimiGenerationState {
   temperature?: number;
   top_p?: number;
@@ -63,6 +72,11 @@ describe('applyKimiEnvThinkingKeep', () => {
     expect(genState(out).extra_body?.thinking?.keep).toBe('all');
   });
 
+  it('injects thinking.keep for Foundry-hosted Kimi models', () => {
+    const out = applyKimiEnvThinkingKeep(foundryKimi(), 'high', { KIMI_MODEL_THINKING_KEEP: 'all' });
+    expect(genState(out).extra_body?.thinking?.keep).toBe('all');
+  });
+
   it('does NOT inject thinking.keep when thinking is off', () => {
     const out = applyKimiEnvThinkingKeep(kimi(), 'off', { KIMI_MODEL_THINKING_KEEP: 'all' });
     expect(genState(out).extra_body).toBeUndefined();
diff --git a/packages/kosong/src/index.ts b/packages/kosong/src/index.ts
index b8bd9bdcb..a5f02aecf 100644
--- a/packages/kosong/src/index.ts
+++ b/packages/kosong/src/index.ts
@@ -32,6 +32,7 @@ export type { ProviderConfig, ProviderType } from './providers';
 // kwargs, `thinking.keep` extra body).
 export { KimiChatProvider } from './providers/kimi';
 export type { ExtraBody, GenerationKwargs, KimiOptions, ThinkingConfig } from './providers/kimi';
+export { isKimiReasoningModel } from './providers/kimi-reasoning';
 
 // Model capability matrix
 export { UNKNOWN_CAPABILITY, isUnknownCapability } from './capability';
diff --git a/packages/kosong/src/providers/kimi-reasoning.ts b/packages/kosong/src/providers/kimi-reasoning.ts
new file mode 100644
index 000000000..1936283c3
--- /dev/null
+++ b/packages/kosong/src/providers/kimi-reasoning.ts
@@ -0,0 +1,47 @@
+/**
+ * Kimi reasoning models hosted on OpenAI-compatible gateways (Moonshot API,
+ * Microsoft Foundry, etc.) require `max_completion_tokens` instead of
+ * `max_tokens`. On reasoning models, `max_tokens` shares the budget with
+ * `reasoning_content`, so the model can exhaust the entire cap during thinking
+ * and return no visible content or tool calls.
+ *
+ * Native {@link KimiChatProvider} already normalizes this; openai-legacy paths
+ * (including azure-foundry) must apply the same rules when the deployment id
+ * identifies a Kimi reasoning model.
+ */
+
+/** True when the lowercased id contains "kimi" or "moonshot", or starts with a "k2" segment. */
+export function isKimiReasoningModel(model: string): boolean {
+  const id = model.toLowerCase();
+  if (id.includes('kimi') || id.includes('moonshot')) {
+    return true;
+  }
+  return /^k2(?:[-_.]|$)/.test(id);
+}
+
+/** Whether the wire request uses `max_completion_tokens`: Kimi ids, `o<digit>` ids and `gpt-5` ids. */
+export function usesMaxCompletionTokensOnWire(model: string): boolean {
+  const id = model.toLowerCase();
+  const prefixPatterns = [/^o\d(?:$|[-.])/, /^gpt-5(?:$|[-.])/];
+  return isKimiReasoningModel(model) || prefixPatterns.some((pattern) => pattern.test(id));
+}
+
+export interface KimiThinkingWireParams {
+  readonly type: 'enabled' | 'disabled';
+  readonly keep?: unknown;
+}
+
+/**
+ * Top-level `thinking` object: `disabled` when explicitly off, omitted without an effort, else `enabled`.
+ */
+export function kimiThinkingWireParams(args: {
+  readonly reasoningEffort: string | undefined;
+  readonly thinkingExplicitlyOff: boolean;
+  readonly thinkingKeep?: unknown;
+}): KimiThinkingWireParams | undefined {
+  const { reasoningEffort, thinkingExplicitlyOff, thinkingKeep } = args;
+  if (thinkingExplicitlyOff) return { type: 'disabled' };
+  if (reasoningEffort === undefined) return undefined;
+  if (thinkingKeep === undefined) return { type: 'enabled' };
+  return { type: 'enabled', keep: thinkingKeep };
+}
diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts
index 58c8a7e22..de433d03a 100644
--- a/packages/kosong/src/providers/openai-legacy.ts
+++ b/packages/kosong/src/providers/openai-legacy.ts
@@ -36,6 +36,10 @@ import {
   resolveAuthBackedClient,
 } from './request-auth';
 import { clampCompletionTokensForSharedContextWindow } from './shared-context-window';
+import {
+  isKimiReasoningModel,
+  usesMaxCompletionTokensOnWire,
+} from './kimi-reasoning';
 import {
   normalizeToolCallIdsForProvider,
   sanitizeToolCallId,
@@ -108,8 +112,7 @@ interface OpenAIToolCallOut {
 }
 
 function usesMaxCompletionTokens(model: string): boolean {
-  const normalized = model.toLowerCase();
-  return /^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized);
+  return usesMaxCompletionTokensOnWire(model);
 }
 
 function completionTokenKwargs(
@@ -447,6 +450,8 @@ export class OpenAILegacyChatProvider implements ChatProvider {
   private _defaultHeaders: Record<string, string> | undefined;
   private _reasoningKey: string | undefined;
   private _reasoningEffort: string | undefined;
+  /** When true, reasoning is explicitly disabled and must not be auto-enabled from history. */
+  private _thinkingExplicitlyOff: boolean;
   private _generationKwargs: OpenAILegacyGenerationKwargs;
   private _toolMessageConversion: ToolMessageConversion;
   private _client: OpenAI | undefined;
@@ -471,6 +476,7 @@ export class OpenAILegacyChatProvider implements ChatProvider {
         ? normalizedReasoningKey
         : undefined;
     this._reasoningEffort = undefined;
+    this._thinkingExplicitlyOff = false;
     this._generationKwargs =
       options.maxTokens !== undefined ? completionTokenKwargs(this._model, options.maxTokens) : {};
     this._toolMessageConversion = options.toolMessageConversion ?? null;
@@ -486,6 +492,7 @@ export class OpenAILegacyChatProvider implements ChatProvider {
   }
 
   get thinkingEffort(): ThinkingEffort | null {
+    if (this._thinkingExplicitlyOff) return 'off';
     return reasoningEffortToThinkingEffort(this._reasoningEffort);
   }
 
@@ -538,7 +545,11 @@ export class OpenAILegacyChatProvider implements ChatProvider {
     // Skip when the caller already pinned reasoning_effort via withGenerationKwargs —
     // their value would otherwise be silently overwritten below.
     // See: https://github.com/MoonshotAI/kimi-code/issues/1616
-    if (reasoningEffort === undefined && kwargs['reasoning_effort'] === undefined) {
+    if (
+      !this._thinkingExplicitlyOff &&
+      reasoningEffort === undefined &&
+      kwargs['reasoning_effort'] === undefined
+    ) {
       const hasThinkPart = history.some((message) =>
         message.content.some((part) => part.type === 'think'),
       );
@@ -575,6 +586,37 @@ export class OpenAILegacyChatProvider implements ChatProvider {
       createParams['reasoning_effort'] = reasoningEffort;
     }
 
+    if (isKimiReasoningModel(this._model)) {
+      const extraBody = kwargs['extra_body'];
+      const extraRecord =
+        typeof extraBody === 'object' && extraBody !== null
+          ? (extraBody as Record<string, unknown>)
+          : undefined;
+      const extraThinking =
+        typeof extraRecord?.thinking === 'object' && extraRecord.thinking !== null
+          ? (extraRecord.thinking as Record<string, unknown>)
+          : undefined;
+      let thinkingType: 'enabled' | 'disabled' | undefined;
+      if (this._thinkingExplicitlyOff) {
+        thinkingType = 'disabled';
+      } else if (reasoningEffort !== undefined) {
+        thinkingType = 'enabled';
+      }
+      if (thinkingType !== undefined || extraThinking !== undefined) {
+        createParams['thinking'] = {
+          ...extraThinking,
+          ...(thinkingType !== undefined ? { type: thinkingType } : {}),
+        };
+      }
+      if (extraRecord !== undefined) {
+        const { thinking: _, extra_body: __, ...restExtra } = extraRecord;
+        Object.assign(createParams, restExtra);
+      }
+      // Kimi gateways expect extra_body fields hoisted to the top level.
+      // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
+      delete createParams['extra_body'];
+    }
+
     try {
       const client = this._createClient(options?.auth);
       const response = (await client.chat.completions.create(
@@ -588,9 +630,14 @@ export class OpenAILegacyChatProvider implements ChatProvider {
   }
 
   withThinking(effort: ThinkingEffort): OpenAILegacyChatProvider {
-    const reasoningEffort = thinkingEffortToReasoningEffort(effort);
     const clone = this._clone();
-    clone._reasoningEffort = reasoningEffort;
+    if (effort === 'off') {
+      clone._thinkingExplicitlyOff = true;
+      clone._reasoningEffort = undefined;
+    } else {
+      clone._thinkingExplicitlyOff = false;
+      clone._reasoningEffort = thinkingEffortToReasoningEffort(effort);
+    }
     return clone;
   }
 
diff --git a/packages/kosong/src/providers/shared-context-window.ts b/packages/kosong/src/providers/shared-context-window.ts
index d14189218..9648c7133 100644
--- a/packages/kosong/src/providers/shared-context-window.ts
+++ b/packages/kosong/src/providers/shared-context-window.ts
@@ -2,22 +2,23 @@ import { estimatePromptTokens } from '#/token-estimate';
 import type { Message } from '#/message';
 import type { Tool } from '#/tool';
 
+import { usesMaxCompletionTokensOnWire } from './kimi-reasoning';
 import type { OpenAILegacyGenerationKwargs } from './openai-legacy';
 
 const DEFAULT_SERIALIZATION_MARGIN = 512;
 const MIN_COMPLETION_TOKENS = 1;
 
 function completionTokenField(model: string): 'max_completion_tokens' | 'max_tokens' {
-  const normalized = model.toLowerCase();
-  if (/^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized)) {
-    return 'max_completion_tokens';
-  }
-  return 'max_tokens';
+  return usesMaxCompletionTokensOnWire(model) ? 'max_completion_tokens' : 'max_tokens';
 }
 
 /**
  * Clamp completion budget for providers where input and output share one
  * context window (e.g. Microsoft Foundry Kimi deployments).
+ *
+ * Kimi reasoning models use `max_completion_tokens` for visible output; reasoning
+ * tokens are billed separately within the shared window. Do not apply a separate
+ * reasoning output cap — that defeats the purpose of the split field.
  */
 export function clampCompletionTokensForSharedContextWindow(args: {
   readonly model: string;
@@ -43,5 +44,10 @@ export function clampCompletionTokensForSharedContextWindow(args: {
   const kwargs = { ...args.generationKwargs };
   const requested = kwargs[field];
   kwargs[field] = requested === undefined ? remaining : Math.min(requested, remaining);
+  // Drop legacy alias when the wire field is max_completion_tokens.
+  if (field === 'max_completion_tokens') {
+    // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
+    delete kwargs.max_tokens;
+  }
   return kwargs;
 }
diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts
index 667ee6d6a..1edaf9f1e 100644
--- a/packages/kosong/test/azure-foundry.test.ts
+++ b/packages/kosong/test/azure-foundry.test.ts
@@ -119,7 +119,7 @@ describe('AzureFoundryChatProvider', () => {
     });
   });
 
-  it('clamps max_tokens against the shared Foundry context window before sending', async () => {
+  it('clamps max_completion_tokens against the shared Foundry context window for Kimi models', async () => {
     await withHarness(async (harness) => {
       let capturedBody: Record<string, unknown> | undefined;
       harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => {
@@ -149,9 +149,52 @@ describe('AzureFoundryChatProvider', () => {
       }
 
       expect(capturedBody).toBeDefined();
-      expect(capturedBody!['max_tokens']).toBeTypeOf('number');
-      expect(capturedBody!['max_tokens'] as number).toBeLessThan(262144);
-      expect(capturedBody!['max_tokens'] as number).toBeGreaterThan(0);
+      expect(capturedBody!['max_completion_tokens']).toBeTypeOf('number');
+      expect(capturedBody!['max_completion_tokens'] as number).toBeLessThan(262144);
+      expect(capturedBody!['max_completion_tokens'] as number).toBeGreaterThan(0);
+      expect(capturedBody!['max_tokens']).toBeUndefined();
+    });
+  });
+
+  it('sends Kimi thinking enablement alongside reasoning_effort', async () => {
+    await withHarness(async (harness) => {
+      let capturedBody: Record<string, unknown> | undefined;
+      harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => {
+        capturedBody = request.bodyJson as Record<string, unknown>;
+        await reply.sseJson(200, [
+          {
+            id: 'chatcmpl-azure-kimi',
+            object: 'chat.completion.chunk',
+            created: 1234567890,
+            model: 'Kimi-K2.6',
+            choices: [
+              {
+                index: 0,
+                delta: { reasoning_content: 'thinking', content: 'done' },
+                finish_reason: 'stop',
+              },
+            ],
+          },
+        ]);
+      });
+
+      const provider = new AzureFoundryChatProvider({
+        model: 'Kimi-K2.6',
+        apiKey: 'foundry-key',
+        baseUrl: `${harness.baseUrl}/openai/v1`,
+        sharedContextWindowTokens: 262144,
+      }).withThinking('medium');
+      const stream = await provider.generate('', [], [
+        { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
+      ]);
+      for await (const _part of stream) {
+        // drain
+      }
+
+      expect(capturedBody!['reasoning_effort']).toBe('medium');
+      expect(capturedBody!['thinking']).toEqual({ type: 'enabled' });
+      expect(capturedBody!['max_completion_tokens']).toBeTypeOf('number');
+      expect(capturedBody!['max_tokens']).toBeUndefined();
     });
   });
 });
diff --git a/packages/kosong/test/kimi-reasoning.test.ts b/packages/kosong/test/kimi-reasoning.test.ts
new file mode 100644
index 000000000..7e4c3b97c
--- /dev/null
+++ b/packages/kosong/test/kimi-reasoning.test.ts
@@ -0,0 +1,52 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  isKimiReasoningModel,
+  kimiThinkingWireParams,
+  usesMaxCompletionTokensOnWire,
+} from '#/providers/kimi-reasoning';
+
+describe('isKimiReasoningModel', () => {
+  it('detects Kimi deployment ids on Foundry', () => {
+    expect(isKimiReasoningModel('Kimi-K2.6')).toBe(true);
+    expect(isKimiReasoningModel('kimi-k2.5')).toBe(true);
+    expect(isKimiReasoningModel('moonshot-v1')).toBe(true);
+  });
+
+  it('does not match unrelated models', () => {
+    expect(isKimiReasoningModel('gpt-4o')).toBe(false);
+    expect(isKimiReasoningModel('deepseek-v3')).toBe(false);
+  });
+});
+
+describe('usesMaxCompletionTokensOnWire', () => {
+  it('uses max_completion_tokens for Kimi and OpenAI reasoning models', () => {
+    expect(usesMaxCompletionTokensOnWire('Kimi-K2.6')).toBe(true);
+    expect(usesMaxCompletionTokensOnWire('gpt-5')).toBe(true);
+    expect(usesMaxCompletionTokensOnWire('o3-mini')).toBe(true);
+  });
+
+  it('uses max_tokens for generic chat models', () => {
+    expect(usesMaxCompletionTokensOnWire('gpt-4o')).toBe(false);
+  });
+});
+
+describe('kimiThinkingWireParams', () => {
+  it('enables thinking when reasoning is configured', () => {
+    expect(
+      kimiThinkingWireParams({
+        reasoningEffort: 'medium',
+        thinkingExplicitlyOff: false,
+      }),
+    ).toEqual({ type: 'enabled' });
+  });
+
+  it('disables thinking when explicitly off', () => {
+    expect(
+      kimiThinkingWireParams({
+        reasoningEffort: 'medium',
+        thinkingExplicitlyOff: true,
+      }),
+    ).toEqual({ type: 'disabled' });
+  });
+});
diff --git a/packages/kosong/test/openai-legacy.test.ts b/packages/kosong/test/openai-legacy.test.ts
index 8335a8228..63def863d 100644
--- a/packages/kosong/test/openai-legacy.test.ts
+++ b/packages/kosong/test/openai-legacy.test.ts
@@ -986,6 +986,28 @@ describe('OpenAILegacyChatProvider', () => {
 
       expect(body['reasoning_effort']).toBe('high');
     });
+
+    it('does not auto-inject reasoning_effort when thinking is explicitly off', async () => {
+      const provider = createProvider({ model: 'kimi-k2.5', reasoningKey: 'reasoning_content' }).withThinking(
+        'off',
+      );
+      const history: Message[] = [
+        { role: 'user', content: [{ type: 'text', text: 'Hello' }], toolCalls: [] },
+        {
+          role: 'assistant',
+          content: [
+            { type: 'think', think: 'Let me think...' },
+            { type: 'text', text: 'Hi!' },
+          ],
+          toolCalls: [],
+        },
+        { role: 'user', content: [{ type: 'text', text: 'How are you?' }], toolCalls: [] },
+      ];
+      const body = await captureRequestBody(provider, '', [], history);
+
+      expect(body['reasoning_effort']).toBeUndefined();
+      expect(provider.thinkingEffort).toBe('off');
+    });
   });
 
   describe('default reasoning protocol (no explicit reasoningKey)', () => {
diff --git a/packages/kosong/test/shared-context-window.test.ts b/packages/kosong/test/shared-context-window.test.ts
index 604b01b1c..ff991359a 100644
--- a/packages/kosong/test/shared-context-window.test.ts
+++ b/packages/kosong/test/shared-context-window.test.ts
@@ -7,26 +7,41 @@ describe('clampCompletionTokensForSharedContextWindow', () => {
     const kwargs = clampCompletionTokensForSharedContextWindow({
       model: 'Kimi-K2.6',
       sharedContextWindowTokens: 262144,
-      generationKwargs: { max_tokens: 262144 },
+      generationKwargs: { max_completion_tokens: 262144 },
       systemPrompt: 'x'.repeat(40_000),
       history: [{ role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] }],
       tools: [],
     });
 
-    expect(kwargs.max_tokens).toBeLessThan(262144);
-    expect(kwargs.max_tokens).toBeGreaterThan(0);
+    expect(kwargs.max_completion_tokens).toBeLessThan(262144);
+    expect(kwargs.max_completion_tokens).toBeGreaterThan(0);
+    expect(kwargs.max_tokens).toBeUndefined();
   });
 
   it('keeps a smaller explicit cap unchanged when it already fits', () => {
     const kwargs = clampCompletionTokensForSharedContextWindow({
       model: 'Kimi-K2.6',
       sharedContextWindowTokens: 262144,
-      generationKwargs: { max_tokens: 1024 },
+      generationKwargs: { max_completion_tokens: 1024 },
       systemPrompt: 'short prompt',
       history: [],
       tools: [],
     });
 
-    expect(kwargs.max_tokens).toBe(1024);
+    expect(kwargs.max_completion_tokens).toBe(1024);
+  });
+
+  it('uses max_tokens for non-Kimi shared-window models', () => {
+    const kwargs = clampCompletionTokensForSharedContextWindow({
+      model: 'gpt-4o',
+      sharedContextWindowTokens: 128000,
+      generationKwargs: { max_tokens: 4096 },
+      systemPrompt: 'short prompt',
+      history: [],
+      tools: [],
+    });
+
+    expect(kwargs.max_tokens).toBe(4096);
+    expect(kwargs.max_completion_tokens).toBeUndefined();
   });
 });

From 1207f072871d83e5e08e01c9cf33a07fac18a350 Mon Sep 17 00:00:00 2001
From: guglxni <aaryanguglani.cs21@rvce.edu.in>
Date: Sat, 20 Jun 2026 23:35:02 +0530
Subject: [PATCH 4/4] fix: omit Moonshot thinking param on Foundry Kimi
 requests

Microsoft Foundry exposes Kimi through the OpenAI chat-completions
schema and rejects the Moonshot-proprietary `thinking` parameter with an
HTTP 400. Keep reasoning enabled via `reasoning_effort` and the
`max_completion_tokens` split; only KimiChatProvider sends `thinking`,
and only on the native Moonshot API.
---
 .changeset/azure-foundry-provider.md          |  2 +-
 .../agent-core/src/config/kimi-env-params.ts  |  8 ----
 .../test/config/kimi-env-params.test.ts       |  7 ++--
 .../kosong/src/providers/kimi-reasoning.ts    | 37 +++++--------------
 .../kosong/src/providers/openai-legacy.ts     | 32 ----------------
 packages/kosong/test/azure-foundry.test.ts    |  4 +-
 packages/kosong/test/kimi-reasoning.test.ts   | 26 +------------
 7 files changed, 17 insertions(+), 99 deletions(-)

diff --git a/.changeset/azure-foundry-provider.md b/.changeset/azure-foundry-provider.md
index cfb09cec8..2b14488e0 100644
--- a/.changeset/azure-foundry-provider.md
+++ b/.changeset/azure-foundry-provider.md
@@ -2,4 +2,4 @@
 "@moonshot-ai/kimi-code": minor
 ---
 
-Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window. For Foundry-hosted Kimi reasoning models, send `max_completion_tokens` and `thinking: { type: 'enabled' }` like the native Kimi provider so reasoning and visible output use separate budgets.
+Add an `azure-foundry` provider type for Microsoft Foundry model deployments via the OpenAI v1-compatible route. Clamp completion budgets against the model's shared input+output context window. For Foundry-hosted Kimi reasoning models, send `max_completion_tokens` (not `max_tokens`) so reasoning and visible output use separate budgets, and enable reasoning via `reasoning_effort` only — Foundry rejects Moonshot's proprietary `thinking` parameter.
diff --git a/packages/agent-core/src/config/kimi-env-params.ts b/packages/agent-core/src/config/kimi-env-params.ts
index 1cc726d5a..8f4af5fec 100644
--- a/packages/agent-core/src/config/kimi-env-params.ts
+++ b/packages/agent-core/src/config/kimi-env-params.ts
@@ -1,6 +1,5 @@
 import {
   type ChatProvider,
-  isKimiReasoningModel,
   type GenerationKwargs,
   KimiChatProvider,
   type ThinkingEffort,
@@ -56,12 +55,5 @@ export function applyKimiEnvThinkingKeep(
   if (provider instanceof KimiChatProvider) {
     return provider.withExtraBody({ thinking: { keep } });
   }
-  if (
-    provider.name === 'azure-foundry' &&
-    isKimiReasoningModel(provider.modelName) &&
-    provider.withGenerationKwargs !== undefined
-  ) {
-    return provider.withGenerationKwargs({ extra_body: { thinking: { keep } } });
-  }
   return provider;
 }
diff --git a/packages/agent-core/test/config/kimi-env-params.test.ts b/packages/agent-core/test/config/kimi-env-params.test.ts
index 158256718..ebcd0f526 100644
--- a/packages/agent-core/test/config/kimi-env-params.test.ts
+++ b/packages/agent-core/test/config/kimi-env-params.test.ts
@@ -72,9 +72,10 @@ describe('applyKimiEnvThinkingKeep', () => {
     expect(genState(out).extra_body?.thinking?.keep).toBe('all');
   });
 
-  it('injects thinking.keep for Foundry-hosted Kimi models', () => {
-    const out = applyKimiEnvThinkingKeep(foundryKimi(), 'high', { KIMI_MODEL_THINKING_KEEP: 'all' });
-    expect(genState(out).extra_body?.thinking?.keep).toBe('all');
+  it('does not inject thinking.keep for Foundry-hosted Kimi models', () => {
+    const provider = foundryKimi();
+    const out = applyKimiEnvThinkingKeep(provider, 'high', { KIMI_MODEL_THINKING_KEEP: 'all' });
+    expect(out).toBe(provider);
   });
 
   it('does NOT inject thinking.keep when thinking is off', () => {
diff --git a/packages/kosong/src/providers/kimi-reasoning.ts b/packages/kosong/src/providers/kimi-reasoning.ts
index 1936283c3..0a21206de 100644
--- a/packages/kosong/src/providers/kimi-reasoning.ts
+++ b/packages/kosong/src/providers/kimi-reasoning.ts
@@ -1,13 +1,14 @@
 /**
- * Kimi reasoning models hosted on OpenAI-compatible gateways (Moonshot API,
- * Microsoft Foundry, etc.) require `max_completion_tokens` instead of
- * `max_tokens`. On reasoning models, `max_tokens` shares the budget with
- * `reasoning_content`, so the model can exhaust the entire cap during thinking
- * and return no visible content or tool calls.
+ * Kimi reasoning models hosted on OpenAI-compatible gateways require
+ * `max_completion_tokens` instead of `max_tokens`. On reasoning models,
+ * `max_tokens` shares the budget with `reasoning_content`, so the model can
+ * exhaust the entire cap during thinking and return no visible content or tool
+ * calls.
  *
- * Native {@link KimiChatProvider} already normalizes this; openai-legacy paths
- * (including azure-foundry) must apply the same rules when the deployment id
- * identifies a Kimi reasoning model.
+ * The Moonshot-proprietary `thinking: { type: 'enabled' }` parameter is only
+ * sent by {@link KimiChatProvider}. Gateways such as Microsoft Foundry expose
+ * Kimi through the OpenAI chat-completions schema and enable reasoning via
+ * `reasoning_effort` alone — sending `thinking` yields an HTTP 400 error.
  */
 
 export function isKimiReasoningModel(model: string): boolean {
@@ -25,23 +26,3 @@ export function usesMaxCompletionTokensOnWire(model: string): boolean {
   const normalized = model.toLowerCase();
   return /^o\d(?:$|[-.])/.test(normalized) || /^gpt-5(?:$|[-.])/.test(normalized);
 }
-
-export interface KimiThinkingWireParams {
-  readonly type: 'enabled' | 'disabled';
-  readonly keep?: unknown;
-}
-
-/** Top-level `thinking` object for Kimi reasoning models. */
-export function kimiThinkingWireParams(args: {
-  readonly reasoningEffort: string | undefined;
-  readonly thinkingExplicitlyOff: boolean;
-  readonly thinkingKeep?: unknown;
-}): KimiThinkingWireParams | undefined {
-  if (args.thinkingExplicitlyOff) {
-    return { type: 'disabled' };
-  }
-  if (args.reasoningEffort === undefined) return undefined;
-  return args.thinkingKeep === undefined
-    ? { type: 'enabled' }
-    : { type: 'enabled', keep: args.thinkingKeep };
-}
diff --git a/packages/kosong/src/providers/openai-legacy.ts b/packages/kosong/src/providers/openai-legacy.ts
index de433d03a..497a2391f 100644
--- a/packages/kosong/src/providers/openai-legacy.ts
+++ b/packages/kosong/src/providers/openai-legacy.ts
@@ -37,7 +37,6 @@ import {
 } from './request-auth';
 import { clampCompletionTokensForSharedContextWindow } from './shared-context-window';
 import {
-  isKimiReasoningModel,
   usesMaxCompletionTokensOnWire,
 } from './kimi-reasoning';
 import {
@@ -586,37 +585,6 @@ export class OpenAILegacyChatProvider implements ChatProvider {
       createParams['reasoning_effort'] = reasoningEffort;
     }
 
-    if (isKimiReasoningModel(this._model)) {
-      const extraBody = kwargs['extra_body'];
-      const extraRecord =
-        typeof extraBody === 'object' && extraBody !== null
-          ? (extraBody as Record<string, unknown>)
-          : undefined;
-      const extraThinking =
-        typeof extraRecord?.thinking === 'object' && extraRecord.thinking !== null
-          ? (extraRecord.thinking as Record<string, unknown>)
-          : undefined;
-      let thinkingType: 'enabled' | 'disabled' | undefined;
-      if (this._thinkingExplicitlyOff) {
-        thinkingType = 'disabled';
-      } else if (reasoningEffort !== undefined) {
-        thinkingType = 'enabled';
-      }
-      if (thinkingType !== undefined || extraThinking !== undefined) {
-        createParams['thinking'] = {
-          ...extraThinking,
-          ...(thinkingType !== undefined ? { type: thinkingType } : {}),
-        };
-      }
-      if (extraRecord !== undefined) {
-        const { thinking: _, extra_body: __, ...restExtra } = extraRecord;
-        Object.assign(createParams, restExtra);
-      }
-      // Kimi gateways expect extra_body fields hoisted to the top level.
-      // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
-      delete createParams['extra_body'];
-    }
-
     try {
       const client = this._createClient(options?.auth);
       const response = (await client.chat.completions.create(
diff --git a/packages/kosong/test/azure-foundry.test.ts b/packages/kosong/test/azure-foundry.test.ts
index 1edaf9f1e..155de8374 100644
--- a/packages/kosong/test/azure-foundry.test.ts
+++ b/packages/kosong/test/azure-foundry.test.ts
@@ -156,7 +156,7 @@ describe('AzureFoundryChatProvider', () => {
     });
   });
 
-  it('sends Kimi thinking enablement alongside reasoning_effort', async () => {
+  it('sends reasoning_effort and max_completion_tokens without Moonshot thinking param', async () => {
     await withHarness(async (harness) => {
       let capturedBody: Record<string, unknown> | undefined;
       harness.route('POST', '/openai/v1/chat/completions', async (request, reply) => {
@@ -192,7 +192,7 @@ describe('AzureFoundryChatProvider', () => {
       }
 
       expect(capturedBody!['reasoning_effort']).toBe('medium');
-      expect(capturedBody!['thinking']).toEqual({ type: 'enabled' });
+      expect(capturedBody!['thinking']).toBeUndefined();
       expect(capturedBody!['max_completion_tokens']).toBeTypeOf('number');
       expect(capturedBody!['max_tokens']).toBeUndefined();
     });
diff --git a/packages/kosong/test/kimi-reasoning.test.ts b/packages/kosong/test/kimi-reasoning.test.ts
index 7e4c3b97c..e797d99fa 100644
--- a/packages/kosong/test/kimi-reasoning.test.ts
+++ b/packages/kosong/test/kimi-reasoning.test.ts
@@ -1,10 +1,6 @@
 import { describe, expect, it } from 'vitest';
 
-import {
-  isKimiReasoningModel,
-  kimiThinkingWireParams,
-  usesMaxCompletionTokensOnWire,
-} from '#/providers/kimi-reasoning';
+import { isKimiReasoningModel, usesMaxCompletionTokensOnWire } from '#/providers/kimi-reasoning';
 
 describe('isKimiReasoningModel', () => {
   it('detects Kimi deployment ids on Foundry', () => {
@@ -30,23 +26,3 @@ describe('usesMaxCompletionTokensOnWire', () => {
     expect(usesMaxCompletionTokensOnWire('gpt-4o')).toBe(false);
   });
 });
-
-describe('kimiThinkingWireParams', () => {
-  it('enables thinking when reasoning is configured', () => {
-    expect(
-      kimiThinkingWireParams({
-        reasoningEffort: 'medium',
-        thinkingExplicitlyOff: false,
-      }),
-    ).toEqual({ type: 'enabled' });
-  });
-
-  it('disables thinking when explicitly off', () => {
-    expect(
-      kimiThinkingWireParams({
-        reasoningEffort: 'medium',
-        thinkingExplicitlyOff: true,
-      }),
-    ).toEqual({ type: 'disabled' });
-  });
-});