diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
index 5d23b33f6..c6fd2b67e 100644
--- a/src/app/endpoints/rlsapi_v1.py
+++ b/src/app/endpoints/rlsapi_v1.py
@@ -51,6 +51,7 @@
     extract_provider_and_model_from_model_id,
     handle_known_apistatus_errors,
     is_context_length_error,
+    normalize_vertex_ai_model_id,
 )
 from utils.quota import check_tokens_available
 from utils.responses import (
@@ -343,9 +344,12 @@ async def _call_llm(
 
     logger.debug("Using model %s for rlsapi v1 inference", resolved_model_id)
 
+    # Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
+    normalized_model = normalize_vertex_ai_model_id(resolved_model_id)
+
     response = await client.responses.create(
         input=question,
-        model=resolved_model_id,
+        model=normalized_model,
         instructions=instructions,
         tools=tools or [],
         stream=False,
diff --git a/src/utils/compaction.py b/src/utils/compaction.py
index b87085ff0..de3603e1d 100644
--- a/src/utils/compaction.py
+++ b/src/utils/compaction.py
@@ -34,6 +34,7 @@
 
 from log import get_logger
 from models.compaction import ConversationSummary
+from utils.query import normalize_vertex_ai_model_id
 from utils.token_estimator import (
     estimate_conversation_tokens,
     estimate_tokens,
@@ -266,10 +267,14 @@ async def summarize_chunk(
     # by utils.responses.get_topic_summary and protects the directives from
     # prompt-injection via user message content that ends up in the
     # transcript.
+
+    # Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
+    normalized_model = normalize_vertex_ai_model_id(model)
+
     response = await client.responses.create(
         input=f"Conversation:\n{transcript}",
         instructions=SUMMARIZATION_PROMPT,
-        model=model,
+        model=normalized_model,
         stream=False,
         store=False,
     )
@@ -374,10 +379,14 @@ async def recursively_resummarize(
         model,
     )
     # Same instructions/input split as summarize_chunk — see comment there.
+
+    # Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
+    normalized_model = normalize_vertex_ai_model_id(model)
+
     response = await client.responses.create(
         input=transcript,
         instructions=RECURSIVE_RESUMMARIZATION_PROMPT,
-        model=model,
+        model=normalized_model,
         stream=False,
         store=False,
     )
diff --git a/src/utils/query.py b/src/utils/query.py
index 32b9673f0..51e246d91 100644
--- a/src/utils/query.py
+++ b/src/utils/query.py
@@ -534,6 +534,34 @@ def extract_provider_and_model_from_model_id(model_id: str) -> tuple[str, str]:
     return "", model_id
 
 
+def normalize_vertex_ai_model_id(model_id: str) -> str:
+    """Normalize Vertex AI model ID to work around llama-stack 0.6.x bug.
+
+    llama-stack 0.6.x has a bug in the inline::meta-reference responses provider
+    where it normalizes model IDs before checking against allowed_models, but doesn't
+    normalize the allowed_models list itself. This causes Vertex AI models to fail
+    validation because:
+    - Model is registered as: publishers/google/models/gemini-2.5-flash
+    - llama-stack strips to: google/gemini-2.5-flash internally
+    - Checks against allowed list: ['publishers/google/models/gemini-2.5-flash']
+    - Mismatch → 500 error
+
+    This workaround rewrites a leading publishers/google/models/ to google/ so the
+    ID already has the form llama-stack uses internally.
+
+    Fixed in llama-stack 0.7.0 via https://github.com/ogx-ai/ogx/pull/5169
+
+    Args:
+        model_id: The model ID, possibly in Vertex AI format
+
+    Returns:
+        The ID with publishers/google/models/ rewritten to google/, or unchanged
+    """
+    if model_id.startswith("publishers/google/models/"):
+        return model_id.replace("publishers/google/models/", "google/", 1)
+    return model_id
+
+
 def handle_known_apistatus_errors(
     error: LLSApiStatusError | OpenAIAPIStatusError, model_id: str
 ) -> AbstractErrorResponse:
diff --git a/src/utils/responses.py b/src/utils/responses.py
index 6c06e8cbb..a6d098199 100644
--- a/src/utils/responses.py
+++ b/src/utils/responses.py
@@ -118,6 +118,7 @@
 from utils.query import (
     extract_provider_and_model_from_model_id,
     handle_known_apistatus_errors,
+    normalize_vertex_ai_model_id,
     prepare_input,
 )
 from utils.suid import to_llama_stack_conversation_id
@@ -178,11 +179,14 @@ async def get_topic_summary(  # pylint: disable=too-many-nested-blocks
         The topic summary for the question
     """
     try:
+        # Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
+        normalized_model = normalize_vertex_ai_model_id(model_id)
+
         response = cast(
             ResponseObject,
             await client.responses.create(
                 input=question,
-                model=model_id,
+                model=normalized_model,
                 instructions=get_topic_summary_system_prompt(),
                 stream=False,
                 store=False,  # Don't store topic summary requests
@@ -389,9 +393,13 @@ async def prepare_responses_params(  # pylint: disable=too-many-arguments,too-ma
 
     # Build x-llamastack-provider-data header from MCP tool headers
     extra_headers = _build_provider_data_headers(tools)
+
+    # Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
+    normalized_model = normalize_vertex_ai_model_id(model)
+
     return ResponsesApiParams(
         input=input_text,
-        model=model,
+        model=normalized_model,
         instructions=system_prompt,
         tools=tools,
         conversation=llama_stack_conv_id,
diff --git a/tests/unit/utils/test_responses.py b/tests/unit/utils/test_responses.py
index 3ca93a907..8666cfd93 100644
--- a/tests/unit/utils/test_responses.py
+++ b/tests/unit/utils/test_responses.py
@@ -57,6 +57,7 @@
 from models.api.requests import QueryRequest
 from models.common.responses.types import InputTool, InputToolMCP
 from models.config import ApprovalFilter, ByokRag, ModelContextProtocolServer
+from utils.query import normalize_vertex_ai_model_id
 from utils.responses import (
     _build_chunk_attributes,
     _merge_tools,
@@ -3577,3 +3578,32 @@ async def test_merge_header_no_server_tools_returns_client_only(
     )
     assert tools is not None
     assert len(tools) == 1
+
+
+class TestNormalizeVertexAIModelId:
+    """Tests for normalize_vertex_ai_model_id function."""
+
+    def test_normalizes_vertex_ai_model_id(self) -> None:
+        """Test that Vertex AI model IDs are normalized correctly."""
+        input_model = "publishers/google/models/gemini-2.5-flash"
+        expected = "google/gemini-2.5-flash"
+        assert normalize_vertex_ai_model_id(input_model) == expected
+
+    def test_normalizes_vertex_ai_model_id_with_version(self) -> None:
+        """Test normalization with versioned Vertex AI model ID."""
+        input_model = "publishers/google/models/gemini-1.5-pro-001"
+        expected = "google/gemini-1.5-pro-001"
+        assert normalize_vertex_ai_model_id(input_model) == expected
+
+    def test_preserves_non_vertex_ai_model_ids(self) -> None:
+        """Test that non-Vertex AI model IDs are returned unchanged."""
+        # Regular model IDs should pass through
+        assert normalize_vertex_ai_model_id("gpt-4") == "gpt-4"
+        assert normalize_vertex_ai_model_id("openai/gpt-4") == "openai/gpt-4"
+        assert normalize_vertex_ai_model_id("watsonx/model") == "watsonx/model"
+
+    def test_preserves_gemini_api_format(self) -> None:
+        """Test that Gemini API format (models/...) is preserved."""
+        # Gemini API format doesn't have the publishers prefix
+        gemini_api_format = "models/gemini-2.5-flash"
+        assert normalize_vertex_ai_model_id(gemini_api_format) == gemini_api_format