87 changes: 87 additions & 0 deletions docs/openapi.json
@@ -375,6 +375,16 @@
}
}
},
"429": {
"description": "The quota has been exceeded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuotaExceededResponse"
}
}
}
},
Comment on lines +378 to +387

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

429 responses added across endpoints

Good. Ensure the documented "description" and examples match the actual detail.response value returned at runtime ("Model quota exceeded" in code/tests vs. "The quota has been exceeded" here).

Run this to check for message mismatches:

Also applies to: 474-483, 1279-1288


🏁 Script executed:

#!/bin/bash
# Find runtime 429 messages vs docs wording
rg -n -C1 -S "Model quota exceeded|The quota has been exceeded" --type py --type json

Length of output: 6321


429 response messages in docs/openapi.json do not match actual runtime behavior

The OpenAPI documentation uses "The quota has been exceeded", but the endpoints actually return "Model quota exceeded". Update the descriptions and examples in docs/openapi.json at lines 378-387, 474-483, and 1279-1288 to use "Model quota exceeded", matching the responses sent by src/app/endpoints/streaming_query.py:939 and src/app/endpoints/query.py:435, as confirmed by tests.

🤖 Prompt for AI Agents
In docs/openapi.json around lines 378-387, 474-483, and 1279-1288, the 429
response descriptions and examples currently read "The quota has been exceeded"
but must match runtime behavior which returns "Model quota exceeded"; update the
description fields and any example values at those line ranges to use the exact
text "Model quota exceeded" (and adjust any surrounding punctuation/formatting
to remain valid JSON), ensuring the schema references remain unchanged.
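
As a guard against future drift, here is a minimal pytest sketch (a hypothetical test, not part of this PR; the file path and function names are illustrative) that fails whenever a documented 429 description diverges from the runtime wording, i.e., it passes once the fix above lands:

# tests/unit/docs/test_openapi_429_wording.py — hypothetical guard test, not in this PR
import json
from pathlib import Path

# Wording raised at runtime in query.py / streaming_query.py
RUNTIME_MESSAGE = "Model quota exceeded"

def iter_429_descriptions(node):
    """Recursively yield the description of every documented 429 response."""
    if isinstance(node, dict):
        resp = node.get("429")
        if isinstance(resp, dict) and "description" in resp:
            yield resp["description"]
        for value in node.values():
            yield from iter_429_descriptions(value)
    elif isinstance(node, list):
        for item in node:
            yield from iter_429_descriptions(item)

def test_429_descriptions_match_runtime_message():
    spec = json.loads(Path("docs/openapi.json").read_text())
    descriptions = list(iter_429_descriptions(spec))
    assert descriptions, "expected at least one documented 429 response"
    assert all(d == RUNTIME_MESSAGE for d in descriptions)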

"500": {
"description": "Internal Server Error",
"detail": {
@@ -461,6 +471,16 @@
}
}
},
"429": {
"description": "The quota has been exceeded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuotaExceededResponse"
}
}
}
},
"500": {
"description": "Internal Server Error",
"detail": {
@@ -1256,6 +1276,16 @@
}
}
},
"429": {
"description": "The quota has been exceeded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuotaExceededResponse"
}
}
}
},
"500": {
"description": "Internal Server Error",
"detail": {
@@ -3577,6 +3607,63 @@
}
]
},
"QuotaExceededResponse": {
"properties": {
"detail": {
"$ref": "#/components/schemas/DetailModel"
}
},
"type": "object",
"required": [
"detail"
],
"title": "QuotaExceededResponse",
"description": "429 Too Many Requests - LLM quota exceeded.",
"examples": [
{
"detail": {
"cause": "User 123 has no available tokens.",
"response": "The quota has been exceeded"
}
},
{
"detail": {
"cause": "Cluster has no available tokens.",
"response": "The quota has been exceeded"
}
},
{
"detail": {
"cause": "Unknown subject 999 has no available tokens.",
"response": "The quota has been exceeded"
}
},
{
"detail": {
"cause": "User 123 has 5 tokens, but 10 tokens are needed.",
"response": "The quota has been exceeded"
}
},
{
"detail": {
"cause": "Cluster has 500 tokens, but 900 tokens are needed.",
"response": "The quota has been exceeded"
}
},
{
"detail": {
"cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
"response": "The quota has been exceeded"
}
},
{
"detail": {
"cause": "The token quota for model gpt-4-turbo has been exceeded.",
"response": "The model quota has been exceeded"
}
}
]
},
"QuotaHandlersConfiguration": {
"properties": {
"sqlite": {
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
# Used by authorization resolvers
"jsonpath-ng>=1.6.1",
"psycopg2-binary>=2.9.10",
"litellm>=1.75.5.post1",
]


@@ -129,8 +130,6 @@ llslibdev = [
"langdetect>=1.0.9",
"emoji>=2.1.0",
"nltk>=3.8.1",
# API inference: remote::gemini
"litellm>=1.75.5.post1",
# API inference: inline::sentence-transformers
"sentence-transformers>=5.0.0",
# API vector_io: inline::faiss
15 changes: 15 additions & 0 deletions src/app/endpoints/query.py
@@ -8,6 +8,7 @@
from typing import Annotated, Any, Optional, cast

from fastapi import APIRouter, Depends, HTTPException, Request, status
from litellm.exceptions import RateLimitError
from llama_stack_client import (
APIConnectionError,
AsyncLlamaStackClient, # type: ignore
@@ -42,6 +43,7 @@
ReferencedDocument,
ToolCall,
UnauthorizedResponse,
QuotaExceededResponse,
)
from utils.endpoints import (
check_configuration_loaded,
@@ -86,6 +88,10 @@
"description": "Client does not have permission to access conversation",
"model": ForbiddenResponse,
},
429: {
"description": "The quota has been exceeded",
"model": QuotaExceededResponse,
},
500: {
"detail": {
"response": "Unable to connect to Llama Stack",
@@ -421,6 +427,15 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
"cause": str(e),
},
) from e
except RateLimitError as e:
used_model = getattr(e, "model", "unknown")
raise HTTPException(
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
detail={
"response": "Model quota exceeded",
"cause": f"The token quota for model {used_model} has been exceeded.",
},
) from e


@router.post("/query", responses=query_response)
5 changes: 5 additions & 0 deletions src/app/endpoints/query_v2.py
@@ -27,6 +27,7 @@
QueryResponse,
ReferencedDocument,
UnauthorizedResponse,
QuotaExceededResponse,
)
from utils.endpoints import (
get_system_prompt,
@@ -59,6 +60,10 @@
"description": "Client does not have permission to access conversation",
"model": ForbiddenResponse,
},
429: {
"description": "The quota has been exceeded",
"model": QuotaExceededResponse,
},
500: {
"detail": {
"response": "Unable to connect to Llama Stack",
20 changes: 19 additions & 1 deletion src/app/endpoints/streaming_query.py
@@ -8,6 +8,7 @@
from datetime import UTC, datetime
from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast

from litellm.exceptions import RateLimitError
from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from llama_stack_client import (
@@ -48,7 +49,11 @@
from models.config import Action
from models.database.conversations import UserConversation
from models.requests import QueryRequest
from models.responses import ForbiddenResponse, UnauthorizedResponse
from models.responses import (
ForbiddenResponse,
UnauthorizedResponse,
QuotaExceededResponse,
)
from utils.endpoints import (
check_configuration_loaded,
create_referenced_documents_with_metadata,
@@ -104,6 +109,10 @@
"description": "Client does not have permission to access conversation",
"model": ForbiddenResponse,
},
429: {
"description": "The quota has been exceeded",
"model": QuotaExceededResponse,
},
500: {
"detail": {
"response": "Unable to connect to Llama Stack",
@@ -922,6 +931,15 @@ async def response_generator(
"cause": str(e),
},
) from e
except RateLimitError as e:
used_model = getattr(e, "model", "unknown")
raise HTTPException(
status_code=status.HTTP_429_TOO_MANY_REQUESTS,
detail={
"response": "Model quota exceeded",
"cause": f"The token quota for model {used_model} has been exceeded.",
},
) from e
except Exception as e: # pylint: disable=broad-except
# Handle other errors with OLS-compatible error response
# This broad exception catch is intentional to ensure all errors
68 changes: 68 additions & 0 deletions src/models/responses.py
@@ -1142,6 +1142,74 @@ def __init__(self, user_id: str, resource: str, resource_id: str):
}


class QuotaExceededResponse(AbstractErrorResponse):
"""429 Too Many Requests - LLM quota exceeded."""

def __init__(
self,
user_id: str,
model_name: str, # pylint: disable=unused-argument
limit: int, # pylint: disable=unused-argument
):
"""Initialize a QuotaExceededResponse."""
super().__init__(
detail=DetailModel(
response="The quota has been exceeded",
cause=(f"User {user_id} has no available tokens."),
)
)
# TODO(LCORE-837): add factories for custom cause creation

model_config = {
"json_schema_extra": {
"examples": [
{
"detail": {
"response": "The quota has been exceeded",
"cause": "User 123 has no available tokens.",
}
},
{
"detail": {
"response": "The quota has been exceeded",
"cause": "Cluster has no available tokens.",
}
},
{
"detail": {
"response": "The quota has been exceeded",
"cause": "Unknown subject 999 has no available tokens.",
}
},
{
"detail": {
"response": "The quota has been exceeded",
"cause": "User 123 has 5 tokens, but 10 tokens are needed.",
}
},
{
"detail": {
"response": "The quota has been exceeded",
"cause": "Cluster has 500 tokens, but 900 tokens are needed.",
}
},
{
"detail": {
"response": "The quota has been exceeded",
"cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
}
},
{
"detail": {
"response": "The model quota has been exceeded",
"cause": "The token quota for model gpt-4-turbo has been exceeded.",
}
},
]
}
}


class InvalidFeedbackStoragePathResponse(AbstractErrorResponse):
"""500 Internal Error - Invalid feedback storage path."""

44 changes: 44 additions & 0 deletions tests/unit/app/endpoints/test_query.py
@@ -10,6 +10,7 @@
import pytest
from pytest_mock import MockerFixture
from fastapi import HTTPException, Request, status
from litellm.exceptions import RateLimitError

from llama_stack_client import APIConnectionError
from llama_stack_client.types import UserMessage # type: ignore
@@ -2261,3 +2262,46 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -
stream=False,
toolgroups=None,
)


@pytest.mark.asyncio
async def test_query_endpoint_quota_exceeded(
mocker: MockerFixture, dummy_request: Request
) -> None:
"""Test that query endpoint raises HTTP 429 when model quota is exceeded."""
query_request = QueryRequest(
query="What is OpenStack?",
provider="openai",
model="gpt-4-turbo",
) # type: ignore
mock_client = mocker.AsyncMock()
mock_agent = mocker.AsyncMock()
mock_agent.create_turn.side_effect = RateLimitError(
model="gpt-4-turbo", llm_provider="openai", message=""
)
mocker.patch(
"app.endpoints.query.get_agent",
return_value=(mock_agent, "conv-123", "sess-123"),
)
mocker.patch(
"app.endpoints.query.select_model_and_provider_id",
return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
)
mocker.patch("app.endpoints.query.validate_model_provider_override")
mocker.patch(
"client.AsyncLlamaStackClientHolder.get_client",
return_value=mock_client,
)
mocker.patch(
"app.endpoints.query.handle_mcp_headers_with_toolgroups", return_value={}
)

with pytest.raises(HTTPException) as exc_info:
await query_endpoint_handler(
dummy_request, query_request=query_request, auth=MOCK_AUTH
)
assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
detail = exc_info.value.detail
assert isinstance(detail, dict)
assert detail["response"] == "Model quota exceeded" # type: ignore
assert "gpt-4-turbo" in detail["cause"] # type: ignore