From 84487e73d1d906aebb19a46532c870270ee2fd53 Mon Sep 17 00:00:00 2001 From: Roberto Date: Thu, 25 Jun 2026 22:14:33 -0300 Subject: [PATCH 1/5] feat: curated MCP tool surface over the REST API The MCP catalog is built from a small curated FastAPI app (mcp_app) with 24 well-described tools, instead of auto-generating one tool per REST route (95 of them), which floods agent context and hurts tool selection. Related routes fold behind a dataset/kind selector, and an optional code-mode tool stays gated off by default (FINDATA_MCP_CODE_MODE). The 95 REST routes are untouched. Co-Authored-By: Claude Opus 4.8 --- docs/MCP_SURFACE.md | 115 +++++ pyproject.toml | 3 + src/findata/api/app.py | 12 +- src/findata/api/mcp_app.py | 905 +++++++++++++++++++++++++++++++++++++ tests/test_mcp_surface.py | 125 +++++ 5 files changed, 1158 insertions(+), 2 deletions(-) create mode 100644 docs/MCP_SURFACE.md create mode 100644 src/findata/api/mcp_app.py create mode 100644 tests/test_mcp_surface.py diff --git a/docs/MCP_SURFACE.md b/docs/MCP_SURFACE.md new file mode 100644 index 0000000..b64e34b --- /dev/null +++ b/docs/MCP_SURFACE.md @@ -0,0 +1,115 @@ +# MCP surface — curated tools over the REST API + +> Status: prototype / design proposal (alpha 0.3.x). Non-breaking: the REST API +> is untouched. Implemented in [`src/findata/api/mcp_app.py`](../src/findata/api/mcp_app.py). + +## Problem + +The MCP server used to be auto-generated **1:1 from the FastAPI app**: +`FastApiMCP(app)` turns every route into a tool, so the catalog was **94 tools** +— one per dataset/endpoint. From a client/agent's point of view that means: + +- **~21k tokens of `tools/list`** loaded at the start of every session, before a + single call. +- **Worse tool selection** — a model picks worse among 94 near-duplicate names + (one tool per SGS series, per CVM fund facet…) than among ~two dozen + well-described tools. 
+ +## Approach (A + B + C) + +A separate FastAPI app, `mcp_app`, is the **only** source of the tool catalog. +It exposes a small, hand-curated set of tools that dispatch to the same +`findata.sources.*` functions the REST routers already use. + +```python +# app.py — tools come from mcp_app; transport is served on the public app +_mcp = FastApiMCP(mcp_app, name=..., description=...) +_mcp.mount_http(router=app) # /mcp on the public app; REST routes untouched +``` + +`FastApiMCP(mcp_app)` builds the catalog from `mcp_app`'s OpenAPI and executes +each tool via `httpx.ASGITransport(app=mcp_app)`. Because the routers carry no +app-state/rate-limiter coupling, reusing the source functions in a second app is +safe. **The 94 REST routes that back the CLI and HTTP consumers never change.** + +- **A — curation.** Each tool has an explicit `operation_id`, an agent-oriented + one-line `summary`, and a docstring written *for an agent deciding whether to + call it* — not the raw route docstring. `response_model=None` + `-> Any` keeps + response schemas out of the catalog (they would re-inflate it). +- **B — consolidation.** Sprawly clusters collapse behind a `dataset`/`kind` + selector (see table). The work moves from "many thin tools" to "few tools with + good docs". +- **C — code mode.** One optional tool, `findata_run_code`, runs a Python + snippet against the `findata` library in an isolated child interpreter. It + replaces dozens of fine-grained calls for filter/join/aggregate flows that + would otherwise stream every intermediate result through the model's context. + **Gated off by default** (`FINDATA_MCP_CODE_MODE=1` to enable). 
+ +## Result + +| | 1:1 (old) | curated (new) | +|---|---:|---:| +| MCP tools | 94 | **24** (25 with code mode) | +| `tools/list` size | ~85k chars (~21k tok) | **~29k chars (~7k tok)** | +| REST operations | 94 | **94 (unchanged)** | + +## The 24 curated tools + +``` +registry_lookup ← start here: CNPJ / ticker / code / name → entities + +bcb_series bcb_ptax bcb_focus (BCB: 12 → 3) +cvm_company cvm_financials cvm_fund cvm_structured_fund (CVM: 22 → 4) +b3_quote b3_cotahist b3_index (B3: 9 → 3) +tesouro_bonds tesouro_siconfi (Tesouro: 6 → 2) +ibge_indicator ibge_ipca_breakdown (IBGE: 4 → 2) +ipea_series ipea_search (IPEA: 4 → 2) +anbima (ANBIMA: 3 → 1) +openfinance_directory (Open Finance: 15 → 1) +basedosdados_search basedosdados_sql (BdD: 7 → 2) +receita_arrecadacao aneel_leiloes susep_empresas +findata_run_code (code mode, opt-in) +``` + +### Consolidation map + +| Tool | Folds in | Selector | +|---|---|---| +| `bcb_series` | `/series`, `/series/code/{code}`, `/series/name/{name}` | `code` / `name` / none=catalog | +| `bcb_ptax` | `/ptax/usd`, `/ptax/usd/period`, `/ptax/{currency}` | `start`+`end` → period | +| `bcb_focus` | `/focus/{indicators,annual,monthly,selic,top5}` | `horizon`, `panel`, `indicator` | +| `cvm_company` | companies search/list, `fca/*`, `ipe` | `dataset=search\|list\|fca_*\|filings` | +| `cvm_fund` | `funds`, `funds/{daily,holdings,lamina,profile,periods}`, returns | `dataset` | +| `cvm_structured_fund` | `funds/{fii,fidc,fip}/*` | `kind` + `dataset` | +| `b3_index` | index portfolio + monthly + list | `dataset`, omit `symbol` to list | +| `tesouro_bonds` | bonds list/search/history | `dataset` | +| `tesouro_siconfi` | `rreo`, `rgf`, `entes` | `report` | +| `openfinance_directory` | participants/endpoints/resources/roles | `dataset` | + +## Tradeoffs + +- **Fewer but "fatter" tools.** Each carries a `dataset` enum and more doc. 
The + whole bet is that good descriptions beat tool count — so the docstrings are the + deliverable, not an afterthought. +- **Consolidation can hide endpoint-specific params behind an enum.** Mitigated + by documenting each `dataset`/`kind` value and validating bad combinations with + a `400` (e.g. `cvm_fund dataset=holdings` requires `cnpj`+`month`), matching the + REST API's `ValueError → 400` behaviour. +- **Discoverability of rare endpoints.** A handful of niche REST routes are not + individually surfaced as tools. They remain fully reachable over REST and via + `findata_run_code`. + +## Code mode — security + +`findata_run_code` is a **prototype, not a hardened sandbox**. The snippet runs +in a child `python -I` (isolated mode, cwd in a tempdir) with a wall-clock +timeout and a 20k-char output cap, but it has full library and network access. +It is **disabled unless `FINDATA_MCP_CODE_MODE=1`** and is intended for trusted, +local/agent use. A production deployment should run it in a real sandbox +(container/seccomp/network egress controls) before enabling. + +## Example flows (verified through the curated MCP) + +- `registry_lookup(q="PETR4")` → PETROBRAS, CNPJ `33.000.167/0001-01`, `[PETR3, PETR4]` (offline). +- `bcb_ptax(start=2024-01-02, end=2024-01-05)` → daily PTAX USD series (the handoff's headline flow). +- `findata_run_code("import findata; ...")` → runs in the isolated child interpreter (not a hardened sandbox), returns captured stdout. diff --git a/pyproject.toml b/pyproject.toml index 7c47c40..edbe9aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,6 +134,9 @@ max-statements = 50 ] # FastAPI idiom: Query() / Depends() calls in argument defaults. "src/findata/api/routers/**" = ["B008", "PLR0913"] +# Curated MCP layer: FastAPI Query() defaults (B008), wide consolidated tools +# (PLR0913), and intentional flat dataset-dispatch switches (C901/PLR0912/PLR0911).
+"src/findata/api/mcp_app.py" = ["B008", "PLR0913", "C901", "PLR0912", "PLR0911"] # CLI commands are naturally wide (many typer.Option flags). "src/findata/cli.py" = ["PLR0913"] # Banner uses rich + sys.stdout directly — not a print-statement debug. diff --git a/src/findata/api/app.py b/src/findata/api/app.py index a570afe..fda6e24 100644 --- a/src/findata/api/app.py +++ b/src/findata/api/app.py @@ -148,15 +148,23 @@ async def _value_error_handler(_: Request, exc: ValueError) -> JSONResponse: try: from fastapi_mcp import FastApiMCP + from findata.api.mcp_app import mcp_app + + # The MCP tool catalog is built from the *curated* `mcp_app` (a separate + # FastAPI app, ~24 well-described tools), not from the public `app` — that + # would expose one near-duplicate tool per REST route (~94) and bloat every + # agent's context. `mount_http(router=app)` serves the /mcp transport on the + # public app, while the tools are generated from and executed against + # `mcp_app` (via its ASGI transport). The 94 REST routes stay untouched. _mcp = FastApiMCP( - app, + mcp_app, name=_PROJECT_SLUG, description=( f"{_PROJECT_STATEMENT} MCP para BCB, CVM, B3, IBGE, IPEA, " "Tesouro, Base dos Dados, Open Finance e gráficos experimentais." ), ) - _mcp.mount_http() # Serves MCP at /mcp (fastapi-mcp >=0.4) + _mcp.mount_http(router=app) # Serves MCP at /mcp (fastapi-mcp >=0.4) _MCP_ENABLED = True except Exception: # optional subsystem must never break core API _MCP_ENABLED = False diff --git a/src/findata/api/mcp_app.py b/src/findata/api/mcp_app.py new file mode 100644 index 0000000..52a4c49 --- /dev/null +++ b/src/findata/api/mcp_app.py @@ -0,0 +1,905 @@ +"""Curated MCP surface for the findata-br server. + +The public REST API (``findata.api.app``) exposes ~94 fine-grained routes — one +per upstream dataset/endpoint. Mapping those 1:1 to MCP tools floods an agent's +context with ~94 near-duplicate tool schemas before it makes a single call, and +hurts tool-selection accuracy. 
+ +This module is a *separate* FastAPI app whose only purpose is to be the source +of the MCP tool catalog. It exposes a small, hand-curated set of tools, each +with an agent-oriented description, that dispatch to the same +``findata.sources.*`` functions the REST routers use. Consolidated tools collapse +sprawly clusters (e.g. the 12 BCB and 14 CVM-fund endpoints) behind a few +``dataset``/``kind`` selectors. + +Wiring lives in ``app.py``: ``FastApiMCP(mcp_app).mount_http(router=app)`` builds +the tool catalog from *this* app while serving ``/mcp`` on the public app. The +94 REST routes are never touched. + + A — curation: only the headline tools are exposed, with real descriptions. + B — consolidation: ``bcb_*``/``cvm_*``/``tesouro_*``… fold many routes into one. + C — code mode: optional ``findata_run_code`` runs a Python snippet against the + library (gated by ``FINDATA_MCP_CODE_MODE=1``; off by default). +""" + +from __future__ import annotations + +import asyncio +import os +import sys +import tempfile +from datetime import date +from typing import Any, Literal + +from fastapi import APIRouter, FastAPI, HTTPException, Query, Request +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field + +from findata.registry import lookup +from findata.sources.anbima import indices as anbima_src +from findata.sources.aneel import leiloes +from findata.sources.b3 import cotahist, indices +from findata.sources.basedosdados import catalog +from findata.sources.bcb import focus, ptax, sgs +from findata.sources.cvm import ( + companies, + fca, + fidc, + fii, + financials, + fip, + funds, + holdings, + ipe, + lamina, + profile, +) +from findata.sources.ibge import indicators +from findata.sources.ipea import series as ipea_series +from findata.sources.openfinance import directory as of_dir +from findata.sources.receita import arrecadacao +from findata.sources.susep import empresas +from findata.sources.tesouro import bonds, siconfi + +router = 
APIRouter() + +_MAX_TICKERS = 20 +_MIN_YEAR_BCB_SGS = 1986 + + +# ── Registry — the entry point ───────────────────────────────────── + + +@router.get( + "/registry/lookup", + operation_id="registry_lookup", + response_model=None, + summary="Resolve a CNPJ, B3 ticker, CVM/SUSEP code, or company name to canonical entities", +) +async def registry_lookup( + q: str = Query( + ..., + min_length=2, + description="CNPJ (masked or not), ticker (PETR4), CVM/SUSEP code, or name fragment", + ), + limit: int = Query(20, ge=1, le=100), +) -> Any: + """Offline cross-source resolver backed by an embedded FTS5 catalog. + + Start here to turn a fuzzy identifier into a CNPJ + tickers + source codes + before calling the source-specific tools. The BM25 ``rank`` indicates match + strength (very negative = strong exact hit; near zero = fuzzy name match). + """ + return await lookup(q, limit=limit) + + +# ── BCB — Banco Central ──────────────────────────────────────────── + + +@router.get( + "/bcb/series", + operation_id="bcb_series", + response_model=None, + summary="BCB time series (Selic, IPCA, câmbio…): list the catalog or fetch by code/name", +) +async def bcb_series( + code: int | None = Query(None, description="SGS numeric code, e.g. 432=Selic meta, 433=IPCA"), + name: str | None = Query(None, description="Catalog alias, e.g. selic, ipca, dolar_ptax"), + start: date | None = Query(None, description="Start date YYYY-MM-DD (code mode only)"), + end: date | None = Query(None, description="End date YYYY-MM-DD (code mode only)"), + last_n: int | None = Query(None, ge=1, le=1000, description="Return only the last N values"), +) -> Any: + """Three modes in one tool. Pass nothing to list the curated catalog; pass + ``code`` for a series by SGS code (optionally ``start``/``end`` or ``last_n``); + or pass ``name`` for the most recent values of a named series. 
+ """ + if code is not None: + if last_n is not None: + return await sgs.get_series_last(code, last_n) + return await sgs.get_series(code, start, end) + if name is not None: + return await sgs.get_series_by_name(name, last_n or 10) + return sgs.SERIES_CATALOG + + +@router.get( + "/bcb/ptax", + operation_id="bcb_ptax", + response_model=None, + summary="PTAX official exchange rate for any currency — single date or a date range", +) +async def bcb_ptax( + currency: str = Query("USD", description="ISO currency code, e.g. USD, EUR, GBP"), + date_: date | None = Query(None, alias="date", description="Single date (default: latest)"), + start: date | None = Query(None, description="Range start (use with end; USD only)"), + end: date | None = Query(None, description="Range end (use with start; USD only)"), +) -> Any: + """Official PTAX from BCB. Pass ``start``+``end`` for a daily series over a + range (USD only), or ``date`` (or nothing) for a single day. ``currency=USD`` + is the common case; other currencies support single-date queries only. + """ + if start is not None and end is not None: + if currency.upper() != "USD": + raise HTTPException(400, "Range queries are USD-only; use `date` for other currencies") + return await ptax.get_ptax_usd_period(start, end) + if currency.upper() == "USD": + return await ptax.get_ptax_usd(date_) + return await ptax.get_ptax_currency(currency, date_) + + +@router.get( + "/bcb/focus", + operation_id="bcb_focus", + response_model=None, + summary="Boletim Focus expectations — annual/monthly, market or Top-5, or Selic per COPOM", +) +async def bcb_focus( + indicator: str = Query( + "IPCA", + description="Indicator, e.g. IPCA, 'PIB Total', Câmbio. 
Use 'Selic' for COPOM path, " + "'list' to see available indicators.", + ), + horizon: Literal["annual", "monthly"] = Query("annual"), + panel: Literal["market", "top5"] = Query( + "market", description="market = all forecasters; top5 = Top-5 ranked (annual only)" + ), + top: int = Query(20, ge=1, le=100, description="Max rows to return"), +) -> Any: + """Consolidates the Focus endpoints. ``indicator='list'`` returns the available + indicators; ``indicator='Selic'`` returns the Selic expectation per COPOM + meeting (horizon/panel ignored). Otherwise pick ``horizon`` and ``panel``. + """ + key = indicator.strip().lower() + if key == "list": + return focus.FOCUS_INDICATORS + if key == "selic": + return await focus.get_focus_selic(top) + if panel == "top5": + return await focus.get_focus_top5_annual(indicator, top) + if horizon == "monthly": + return await focus.get_focus_monthly(indicator, top) + return await focus.get_focus_annual(indicator, top) + + +# ── CVM — companies & funds ──────────────────────────────────────── + + +@router.get( + "/cvm/company", + operation_id="cvm_company", + response_model=None, + summary="CVM-listed companies: search/list, registration facts (FCA), and filings (IPE)", +) +async def cvm_company( + dataset: Literal[ + "search", "list", "fca_general", "fca_securities", "fca_dri", "filings" + ] = Query("search"), + query: str | None = Query(None, min_length=2, description="Name search (dataset=search)"), + cnpj: str | None = Query( + None, description="Company CNPJ filter (recommended for fca_*/filings)" + ), + year: int | None = Query( + None, ge=2003, description="Reference year (required for fca_*/filings)" + ), + ticker: str | None = Query(None, description="B3 ticker filter (dataset=fca_securities)"), + categoria: str | None = Query( + None, description="Filing category (dataset=filings), e.g. 'Fato Relevante'" + ), + limit: int = Query(100, ge=1, le=2000), +) -> Any: + """The company side of CVM. 
``search`` needs ``query``; ``list`` is the full + registry. ``fca_general|fca_securities|fca_dri`` are cadastral facets needing + ``year`` (+ optional ``cnpj``/``ticker``). ``filings`` (IPE — fatos relevantes, + comunicados) needs ``year`` (+ optional ``cnpj``/``categoria``). + """ + if dataset == "search": + if not query: + raise HTTPException(400, "dataset=search requires `query`") + return await companies.search_company(query, True) + if dataset == "list": + return (await companies.get_companies(True))[:limit] + if dataset == "filings": + if year is None: + raise HTTPException(400, "dataset=filings requires `year`") + return (await ipe.get_ipe(year, cnpj=cnpj, categoria=categoria))[:limit] + if year is None: + raise HTTPException(400, f"dataset={dataset} requires `year`") + if dataset == "fca_general": + return await fca.get_fca_geral(year, cnpj) + if dataset == "fca_securities": + return await fca.get_fca_valores_mobiliarios(year, cnpj=cnpj, ticker=ticker) + return await fca.get_fca_dri(year, cnpj) + + +@router.get( + "/cvm/financials", + operation_id="cvm_financials", + response_model=None, + summary="CVM financial statements — annual (DFP) or quarterly (ITR) for a company", +) +async def cvm_financials( + year: int = Query(..., ge=2010, description="Fiscal year"), + period: Literal["annual", "quarterly"] = Query( + "annual", description="annual=DFP, quarterly=ITR" + ), + statement: financials.StatementType = Query( + financials.StatementType.DRE_CON, + description="Statement type: BPA/BPP/DRE/DFC_MI/DMPL/DVA, _con (consolidated) or _ind", + ), + cnpj: str | None = Query( + None, description="Company CNPJ — strongly recommended (avoids the full dataset)" + ), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """Annual DFP or quarterly ITR statements. Statement types: BPA_con, BPP_con, + DRE_con, DFC_MI_con, DMPL_con, DVA_con (+ ``_ind`` variants). Always pass ``cnpj``. 
+ """ + if period == "quarterly": + return (await financials.get_itr(year, statement, cnpj))[:limit] + return (await financials.get_dfp(year, statement, cnpj))[:limit] + + +@router.get( + "/cvm/fund", + operation_id="cvm_fund", + response_model=None, + summary="Open-ended CVM funds (FI): catalog, daily NAV, holdings, factsheet, returns, profile", +) +async def cvm_fund( + dataset: Literal[ + "catalog", "daily", "holdings", "lamina", "returns", "profile", "periods" + ] = Query("catalog"), + cnpj: str | None = Query( + None, description="Fund CNPJ (required for holdings; recommended elsewhere)" + ), + year: int | None = Query(None, description="Reference year (required except catalog/periods)"), + month: int | None = Query(None, ge=1, le=12, description="Reference month (monthly datasets)"), + horizon: Literal["monthly", "yearly"] = Query( + "monthly", description="returns granularity (dataset=returns)" + ), + blocks: str | None = Query( + None, + description="holdings: block whitelist, e.g. BLC_1,BLC_4 (of BLC_1..BLC_8,CONFID,PL,FIE)", + ), + product: str = Query( + "INF_DIARIO", + description="periods: INF_DIARIO|CDA|LAMINA|PERFIL_MENSAL|BALANCETE|EVENTUAL|EXTRATO", + ), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """Open funds in one tool. ``catalog`` lists registered funds; ``periods`` lists + the YYYYMM stamps available upstream for ``product``. The rest need ``year``; + ``daily``/``holdings``/``lamina``/``returns``/``profile`` need ``month`` too, and + ``holdings`` requires ``cnpj`` (the monthly CDA file is huge). 
+ """ + if dataset == "catalog": + return (await funds.get_fund_catalog(True, None))[:limit] + if dataset == "periods": + from findata.sources.cvm import _directory + + return await _directory.list_periods("FI", f"DOC/{product}") + if year is None: + raise HTTPException(400, f"dataset={dataset} requires `year`") + if dataset == "holdings": + if not cnpj or month is None: + raise HTTPException(400, "dataset=holdings requires `cnpj` and `month`") + block_list = [b.strip() for b in blocks.split(",")] if blocks else None + return await holdings.get_fund_holdings(cnpj, year, month, block_list) + if month is None: + raise HTTPException(400, f"dataset={dataset} requires `month`") + if dataset == "daily": + return (await funds.get_fund_daily(year, month, cnpj))[:limit] + if dataset == "lamina": + return (await lamina.get_fund_lamina(year, month, cnpj))[:limit] + if dataset == "profile": + return (await profile.get_fund_profile(year, month, cnpj))[:limit] + if horizon == "yearly": + return await lamina.get_fund_yearly_returns(year, month, cnpj) + return await lamina.get_fund_monthly_returns(year, month, cnpj) + + +async def _structured_fii( + dataset: str | None, cnpj: str | None, year: int, month: int | None +) -> Any: + if dataset in (None, "geral"): + return await fii.get_fii_geral(year, cnpj=cnpj, month=month) + if dataset == "complemento": + return await fii.get_fii_complemento(year, cnpj=cnpj, month=month) + raise HTTPException(400, f"unknown FII dataset {dataset!r} (use geral|complemento)") + + +async def _structured_fidc( + dataset: str | None, cnpj: str | None, year: int, month: int | None +) -> Any: + if month is None: + raise HTTPException(400, "FIDC datasets require `month`") + if dataset in (None, "geral"): + return await fidc.get_fidc_geral(year, month, cnpj=cnpj) + if dataset == "pl": + return await fidc.get_fidc_pl(year, month, cnpj=cnpj) + if dataset in ("direitos", "direitos-creditorios"): + return await fidc.get_fidc_direitos_creditorios(year, month, 
cnpj=cnpj) + raise HTTPException(400, f"unknown FIDC dataset {dataset!r} (use geral|pl|direitos)") + + +@router.get( + "/cvm/structured-fund", + operation_id="cvm_structured_fund", + response_model=None, + summary="Structured CVM funds — FII (real estate), FIDC (receivables), FIP (private equity)", +) +async def cvm_structured_fund( + kind: Literal["fii", "fidc", "fip"] = Query(...), + dataset: str | None = Query( + None, description="fii: geral|complemento; fidc: geral|pl|direitos; fip: (n/a)" + ), + cnpj: str | None = Query(None, description="Fund CNPJ filter"), + year: int = Query(..., description="Reference year"), + month: int | None = Query(None, ge=1, le=12, description="Required for FIDC; optional for FII"), + quarter: int | None = Query(None, ge=1, le=4, description="FIP only — informe quarter"), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """Structured funds by ``kind``. FII has ``geral`` (cadastral) and ``complemento`` + (cotistas/PL/taxa) facets. FIDC has ``geral``/``pl``/``direitos`` (needs ``month``). + FIP returns the quarterly informe (optional ``quarter``). + """ + if kind == "fii": + return await _structured_fii(dataset, cnpj, year, month) + if kind == "fidc": + return await _structured_fidc(dataset, cnpj, year, month) + return (await fip.get_fip(year, cnpj=cnpj, quarter=quarter))[:limit] + + +# ── B3 — Bolsa ───────────────────────────────────────────────────── + + +def _b3_quotes() -> Any: + try: + from findata.sources.b3 import quotes + except ImportError as exc: # pragma: no cover — only without the [b3] extra + raise HTTPException( + 503, "Live quotes need the optional extra: pip install 'findata-br[b3]'" + ) from exc + return quotes + + +@router.get( + "/b3/quote", + operation_id="b3_quote", + response_model=None, + summary="Live B3 stock quote(s) (optional [b3] extra) — prefer b3_cotahist for official EOD", +) +async def b3_quote( + tickers: str = Query( + ..., description="One ticker or comma-separated list (max 20), e.g. 
PETR4,VALE3" + ), +) -> Any: + """Current quote(s) from the optional yfinance-backed source. For canonical, + official end-of-day history use ``b3_cotahist`` instead. + """ + quotes = _b3_quotes() + ticker_list = [t.strip() for t in tickers.split(",") if t.strip()] + if not ticker_list: + raise HTTPException(400, "at least one ticker is required") + if len(ticker_list) > _MAX_TICKERS: + raise HTTPException(400, f"max {_MAX_TICKERS} tickers per request") + if len(ticker_list) == 1: + return await quotes.get_quote(ticker_list[0]) + return await quotes.get_multiple_quotes(ticker_list) + + +@router.get( + "/b3/cotahist", + operation_id="b3_cotahist", + response_model=None, + summary="Official B3 COTAHIST daily quotes — by year, month, or single day", +) +async def b3_cotahist( + year: int = Query(..., ge=_MIN_YEAR_BCB_SGS, description="Year (B3 publishes since 1986)"), + month: int | None = Query(None, ge=1, le=12), + day: int | None = Query(None, ge=1, le=31), + ticker: str | None = Query( + None, description="CODNEG filter, e.g. PETR4 — recommended (annual files are ~85 MB)" + ), + market_codes: str | None = Query( + None, description="CODBDI whitelist, comma-separated, e.g. 02,96" + ), +) -> Any: + """Granularity follows the args: ``day`` (needs ``month``) → one trading day, + ``month`` → one month, otherwise the whole ``year``. Pass ``ticker`` for + single-issuer queries. 
+ """ + codes = [c.strip() for c in market_codes.split(",")] if market_codes else None + if day is not None: + if month is None: + raise HTTPException(400, "`day` requires `month`") + return await cotahist.get_cotahist_day(year, month, day, ticker, codes) + if month is not None: + return await cotahist.get_cotahist_month(year, month, ticker, codes) + return await cotahist.get_cotahist_year(year, ticker, codes) + + +@router.get( + "/b3/index", + operation_id="b3_index", + response_model=None, + summary="B3 index theoretical portfolio & monthly history (IBOV, IBrX, SMLL, IDIV, IFIX…)", +) +async def b3_index( + symbol: str | None = Query( + None, description="Index symbol, e.g. IBOV; omit to list known indices" + ), + dataset: Literal["portfolio", "monthly"] = Query( + "portfolio", description="portfolio=current composição; monthly=closing levels" + ), + start: date | None = Query(None, description="monthly: start date YYYY-MM-DD"), + end: date | None = Query(None, description="monthly: end date YYYY-MM-DD"), + months: int = Query(120, ge=1, le=360, description="monthly window when start omitted"), +) -> Any: + """Omit ``symbol`` to list the indices we can fetch. With ``symbol``, + ``portfolio`` returns the current composição (constituents + weights); + ``monthly`` returns closing levels for charting. + """ + if symbol is None: + return await indices.list_known_indices() + if dataset == "monthly": + return await indices.get_index_monthly_evolution( + symbol, start=start, end=end, months=months + ) + return await indices.get_index_portfolio(symbol) + + +# ── Tesouro / SICONFI ────────────────────────────────────────────── + + +@router.get( + "/tesouro/bonds", + operation_id="tesouro_bonds", + response_model=None, + summary="Tesouro Direto bonds — list/filter, search names, or price+rate history", +) +async def tesouro_bonds( + dataset: Literal["list", "search", "history"] = Query("list"), + titulo: str | None = Query( + None, description="Bond name for history, e.g. 
'Tesouro IPCA+ 2035'" + ), + q: str | None = Query(None, min_length=2, description="Search query (dataset=search)"), + tipo: str | None = Query(None, description="Type filter (dataset=list), e.g. 'Tesouro IPCA+'"), + start: date | None = Query(None), + end: date | None = Query(None), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """``list`` returns current bond prices/rates (filter by ``tipo``/date); + ``search`` finds bond names from ``q``; ``history`` returns the series for a + single ``titulo``. + """ + if dataset == "search": + if not q: + raise HTTPException(400, "dataset=search requires `q`") + return await bonds.search_bonds(q) + if dataset == "history": + if not titulo: + raise HTTPException(400, "dataset=history requires `titulo`") + return await bonds.get_bond_history(titulo, start, end) + return await bonds.get_treasury_bonds(tipo, start, end, limit) + + +@router.get( + "/tesouro/siconfi", + operation_id="tesouro_siconfi", + response_model=None, + summary="SICONFI public-finance reports — RREO, RGF, or the federation-entity list", +) +async def tesouro_siconfi( + report: Literal["rreo", "rgf", "entes"] = Query("entes"), + year: int | None = Query(None, ge=2013), + period: int | None = Query( + None, ge=1, le=6, description="RREO: bimestre 1-6; RGF: quadrimestre 1-3" + ), + cod_ibge: int | None = Query( + None, description="IBGE entity code (1=União); discover via report=entes" + ), + poder: str = Query("E", description="RGF only: E/L/J/M/D power branch"), + anexo: str | None = Query(None, description='e.g. "RREO-Anexo 01"'), +) -> Any: + """``entes`` lists every federation entity with its IBGE code (start here). + ``rreo`` (bimestral) and ``rgf`` (quadrimestral) need ``year``, ``period``, and + ``cod_ibge``. 
+ """ + if report == "entes": + return await siconfi.get_entes() + if year is None or period is None or cod_ibge is None: + raise HTTPException(400, f"report={report} requires year, period, and cod_ibge") + if report == "rgf": + return await siconfi.get_rgf(year, period, cod_ibge, poder=poder) # type: ignore[arg-type] + return await siconfi.get_rreo(year, period, cod_ibge, anexo=anexo) + + +# ── IBGE ─────────────────────────────────────────────────────────── + + +@router.get( + "/ibge/indicator", + operation_id="ibge_indicator", + response_model=None, + summary="IBGE economic indicators — list the catalog or fetch one by name (e.g. ipca_mensal)", +) +async def ibge_indicator( + name: str | None = Query(None, description="Indicator name; omit to list all available"), + periods: int = Query(12, ge=1, le=120, description="Recent periods to return"), +) -> Any: + """Omit ``name`` to list every IBGE indicator we expose; pass ``name`` to fetch + its recent values. + """ + if name is None: + return indicators.IBGE_INDICATORS + return await indicators.get_indicator(name, periods) + + +@router.get( + "/ibge/ipca-breakdown", + operation_id="ibge_ipca_breakdown", + response_model=None, + summary="IPCA monthly variation broken down by the major groups (not available from BCB SGS)", +) +async def ibge_ipca_breakdown( + periods: int = Query(6, ge=1, le=60, description="Recent months to return"), +) -> Any: + """IPCA monthly variation for all major groups (food, housing, transport, + health, …) — granularity BCB SGS does not provide. + """ + return await indicators.get_ipca_breakdown(periods) + + +# ── IPEA ─────────────────────────────────────────────────────────── + + +@router.get( + "/ipea/series", + operation_id="ipea_series", + response_model=None, + summary="IPEA series — curated catalog, series values, or metadata by SERCODIGO", +) +async def ipea_series_tool( + sercodigo: str | None = Query( + None, description="Series code, e.g. 
BM12_TJOVER12; omit to list the curated catalog" + ), + dataset: Literal["values", "metadata"] = Query("values"), + top: int | None = Query(None, ge=1, le=5000, description="Most recent N values"), +) -> Any: + """Omit ``sercodigo`` to list the curated catalog. With it, ``values`` returns + the observations and ``metadata`` returns name/unit/periodicity/source. For + discovery across the full ~8k-series catalog use ``ipea_search``. + """ + if sercodigo is None: + return ipea_series.IPEA_CATALOG + if dataset == "metadata": + meta = await ipea_series.get_metadata(sercodigo) + if meta is None: + raise HTTPException(404, f"unknown SERCODIGO: {sercodigo}") + return meta + return await ipea_series.get_series_values(sercodigo, top) + + +@router.get( + "/ipea/search", + operation_id="ipea_search", + response_model=None, + summary="Full-text search across the ~8k-series IPEA catalog", +) +async def ipea_search( + q: str = Query(..., min_length=2, description="Search query"), + top: int = Query(25, ge=1, le=200), +) -> Any: + """Find IPEA series by free-text query; returns metadata you can feed back to + ``ipea_series`` as ``sercodigo``. + """ + return await ipea_series.search_series(q, top) + + +# ── ANBIMA ───────────────────────────────────────────────────────── + + +@router.get( + "/anbima", + operation_id="anbima", + response_model=None, + summary="ANBIMA public data — IMA index family, ETTJ yield curve, or debenture quotes", +) +async def anbima_tool( + dataset: Literal["ima", "ettj", "debentures"] = Query("ima"), + family: str | None = Query( + None, description="ima: filter to one IMA family, e.g. 
IRF-M, IMA-B" + ), + data: date | None = Query(None, description="Reference date (ettj/debentures; default latest)"), + emissor: str | None = Query(None, description="debentures: issuer-name substring filter"), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """``ima`` returns the latest IMA snapshot (optionally one ``family``); ``ettj`` + returns the zero-coupon yield curve for ``data``; ``debentures`` returns daily + secondary-market quotes (optionally filtered by ``emissor``). + """ + if dataset == "ettj": + return await anbima_src.get_ettj(data) + if dataset == "debentures": + rows = await anbima_src.get_debentures(data) + if emissor: + needle = emissor.upper() + rows = [r for r in rows if needle in r.emissor.upper()] + return rows[:limit] + fam = anbima_src.IMAFamily(family) if family else None + return await anbima_src.get_ima(fam) + + +# ── Open Finance Brasil ──────────────────────────────────────────── + + +@router.get( + "/openfinance/directory", + operation_id="openfinance_directory", + response_model=None, + summary="Open Finance Brasil Directory — participants, API endpoints, resources, or roles", +) +async def openfinance_directory( + dataset: Literal["participants", "endpoints", "resources", "roles"] = Query("participants"), + role: str | None = Query(None, description="participants: Directory role filter, e.g. DADOS"), + status: str | None = Query( + "Active", description="participants/endpoints: status; empty for all" + ), + api_family: str | None = Query( + None, description="participants/endpoints: API family substring" + ), + q: str | None = Query(None, min_length=2, description="participants: name/CNPJ substring"), + limit: int = Query(100, ge=1, le=1000), +) -> Any: + """``participants`` lists ecosystem participants (summarised); ``endpoints`` + flattens their advertised API endpoints; ``resources`` lists supported public + resources; ``roles`` lists Directory roles. 
+ """ + env: of_dir.Environment = "production" + if dataset == "resources": + return of_dir.public_resources(env) + if dataset == "roles": + return (await of_dir.get_roles(env))[:limit] + raw = await of_dir.get_participants(env) + if dataset == "endpoints": + return of_dir.flatten_api_endpoints(raw, api_family=api_family, status=status or None)[ + :limit + ] + filtered = of_dir.filter_participants( + raw, role=role, status=status or None, api_family=api_family, query=q + ) + return of_dir.summarise_participants(filtered[:limit]) + + +# ── Base dos Dados ───────────────────────────────────────────────── + + +@router.get( + "/basedosdados/search", + operation_id="basedosdados_search", + response_model=None, + summary="Search the Base dos Dados catalog (free BigQuery datasets)", +) +async def basedosdados_search( + q: str | None = Query(None, min_length=2, description="Free-text query"), + theme: str | None = Query(None, description="Theme filter, e.g. economics"), + only_free_download: bool = Query( + False, description="Restrict to datasets marked free direct-download" + ), + page: int = Query(1, ge=1), +) -> Any: + """Search the public catalog. Set ``only_free_download=true`` to restrict to + datasets you can download without BigQuery. Use ``basedosdados_sql`` to get a + starter query for a chosen table. + """ + if only_free_download: + return await catalog.search_direct_download_free(theme=theme, page=page) + return await catalog.search_datasets(q=q, theme=theme, page=page) + + +@router.get( + "/basedosdados/sql", + operation_id="basedosdados_sql", + response_model=None, + summary="Generate a starter BigQuery SQL snippet for a Base dos Dados table", +) +async def basedosdados_sql( + dataset_id: str = Query(..., min_length=1), + table_id: str = Query(..., min_length=1), + limit: int = Query(100, ge=1, le=10_000), +) -> Any: + """Returns a ready-to-run BigQuery reference (project.dataset.table + a LIMITed + SELECT) for the given Base dos Dados table. 
+ """ + return catalog.table_ref(dataset_id, table_id, limit) + + +# ── Receita Federal ──────────────────────────────────────────────── + + +@router.get( + "/receita/arrecadacao", + operation_id="receita_arrecadacao", + response_model=None, + summary="Receita Federal monthly tax revenue (arrecadação) by period, UF, and tributo", +) +async def receita_arrecadacao( + year: int | None = Query(None, ge=2000), + month: int | None = Query(None, ge=1, le=12), + uf: str | None = Query(None, description="State UF, e.g. SP, RJ"), + tributo: str | None = Query(None, description="Tax-category substring, e.g. IRPF, COFINS"), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """Federal-tax revenue in long form (one row per period × UF × tributo). + Filter by any combination of ``year``/``month``/``uf``/``tributo``. + """ + rows = await arrecadacao.get_arrecadacao(year, month, uf, tributo) + return rows[:limit] + + +# ── ANEEL ────────────────────────────────────────────────────────── + + +@router.get( + "/aneel/leiloes", + operation_id="aneel_leiloes", + response_model=None, + summary="ANEEL energy-auction results — generation or transmission", +) +async def aneel_leiloes( + kind: Literal["geracao", "transmissao"] = Query("geracao"), + year: int | None = Query(None), + fonte: str | None = Query( + None, description="geracao: energy-source substring, e.g. Eólica, Solar" + ), + uf: str | None = Query(None), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """Winning bids per auction. ``geracao`` (since 2005) supports a ``fonte`` filter; + ``transmissao`` (since 1999) does not. Both support ``year``/``uf``. 
+ """ + if kind == "transmissao": + return (await leiloes.get_aneel_leiloes_transmissao(year=year, uf=uf))[:limit] + return (await leiloes.get_aneel_leiloes_geracao(year=year, fonte=fonte, uf=uf))[:limit] + + +# ── SUSEP ────────────────────────────────────────────────────────── + + +@router.get( + "/susep/empresas", + operation_id="susep_empresas", + response_model=None, + summary="SUSEP-supervised entities (insurance, previdência, capitalização) — list or search", +) +async def susep_empresas( + q: str | None = Query(None, min_length=2, description="Name substring; omit to list all"), + limit: int = Query(500, ge=1, le=5000), +) -> Any: + """Pass ``q`` to search SUSEP entities by name; omit it to list all (paginated).""" + if q: + return await empresas.search_susep_empresa(q) + return (await empresas.get_susep_empresas())[:limit] + + +# ── C — Code mode (optional, gated) ──────────────────────────────── + +_CODE_MODE_ENABLED = os.getenv("FINDATA_MCP_CODE_MODE", "").strip().lower() in { + "1", + "true", + "yes", + "on", +} +_CODE_OUTPUT_CAP = 20_000 +_CODE_TIMEOUT_MAX = 120 + + +class RunCodeRequest(BaseModel): + """Input for the code-mode tool.""" + + code: str = Field( + ..., + description="Python source to execute. The `findata` library is importable. " + "Source functions are async — wrap calls in asyncio.run(). Print results to stdout.", + ) + timeout_s: int = Field( + 30, ge=1, le=_CODE_TIMEOUT_MAX, description="Wall-clock timeout in seconds" + ) + + +async def _execute_code(code: str, timeout_s: int) -> dict[str, Any]: + """Run ``code`` in an isolated child interpreter, capturing combined output. + + PROTOTYPE — this is NOT a security sandbox: the child runs arbitrary Python + with full library and network access. It is gated off by default and intended + for trusted, local/agent use only. 
+ """ + timeout = max(1, min(timeout_s, _CODE_TIMEOUT_MAX)) + proc = await asyncio.create_subprocess_exec( + sys.executable, + "-I", # isolated mode: ignore env vars and user site, don't add cwd to path + "-c", + code, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + cwd=tempfile.gettempdir(), + ) + try: + stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout) + except TimeoutError: + proc.kill() + await proc.wait() + return {"timed_out": True, "exit_code": None, "output": f"(killed: exceeded {timeout}s)"} + text = stdout.decode("utf-8", errors="replace") + return { + "timed_out": False, + "exit_code": proc.returncode, + "truncated": len(text) > _CODE_OUTPUT_CAP, + "output": text[:_CODE_OUTPUT_CAP], + } + + +if _CODE_MODE_ENABLED: + + @router.post( + "/run-code", + operation_id="findata_run_code", + response_model=None, + summary="Run a Python snippet against the findata library and return its stdout", + ) + async def findata_run_code(payload: RunCodeRequest) -> Any: + """Execute arbitrary Python with the ``findata`` library available, returning + captured stdout/stderr. This replaces dozens of fine-grained calls: filter, + join, and aggregate across sources in one round-trip instead of streaming + every intermediate result through the model's context. + + Example:: + + import asyncio + from findata.sources.bcb import ptax + + print(asyncio.run(ptax.get_ptax_usd())) + + Security: runs in an isolated child interpreter with a timeout and output + cap, but is NOT a hardened sandbox. Enabled only when the server sets + FINDATA_MCP_CODE_MODE=1. 
+ """ + return await _execute_code(payload.code, payload.timeout_s) + + +# ── The MCP-only FastAPI app ─────────────────────────────────────── + +mcp_app = FastAPI( + title="findata-br (MCP)", + description="Curated MCP tool surface for findata-br.", + version="1", +) + + +@mcp_app.exception_handler(ValueError) +async def _value_error_handler(_: Request, exc: ValueError) -> JSONResponse: + return JSONResponse(status_code=400, content={"detail": str(exc)}) + + +mcp_app.include_router(router) diff --git a/tests/test_mcp_surface.py b/tests/test_mcp_surface.py new file mode 100644 index 0000000..ba999b3 --- /dev/null +++ b/tests/test_mcp_surface.py @@ -0,0 +1,125 @@ +"""Tests for the curated MCP surface (findata.api.mcp_app). + +Guards the three promises of the MCP curation: + 1. the tool catalog is small and curated, not 1:1 with the 94 REST routes, + 2. the public REST API still exposes all 94 routes (curation is MCP-only), + 3. consolidated tools dispatch by their ``dataset``/``kind`` selector and + validate bad combinations with a 400. + +All assertions are offline — no live gov-API calls. 
+""" + +from __future__ import annotations + +import importlib + +import pytest +from fastapi.testclient import TestClient + +from findata.api.app import app +from findata.api.mcp_app import mcp_app + +EXPECTED_TOOLS = 24 # curated tools with code mode OFF (the default) +EXPECTED_REST_OPERATIONS = 95 # all REST routes (unconditional); bump when the surface changes + +_HTTP_METHODS = {"get", "post", "put", "delete", "patch"} + + +def _operation_ids(fastapi_app: object) -> set[str]: + ids: set[str] = set() + for path, methods in fastapi_app.openapi()["paths"].items(): # type: ignore[attr-defined] + for method, spec in methods.items(): + if method in _HTTP_METHODS: + ids.add(spec.get("operationId") or f"{method} {path}") + return ids + + +# ── catalog size & REST integrity ────────────────────────────────── + + +def test_curated_mcp_is_a_small_fraction_of_the_rest_surface() -> None: + mcp_ids = _operation_ids(mcp_app) + rest_ids = _operation_ids(app) + assert len(mcp_ids) == EXPECTED_TOOLS + assert len(rest_ids) == EXPECTED_REST_OPERATIONS + # the whole point of curation: catalog << REST surface + assert len(mcp_ids) < len(rest_ids) // 3 + + +def test_rest_api_untouched_by_curation() -> None: + # the consolidated REST routes that MCP tools fold together must still exist — + # they back the CLI and HTTP consumers. 
+ paths = set(app.openapi()["paths"]) + for p in ( + "/bcb/ptax/usd/period", + "/bcb/focus/selic", + "/cvm/funds/holdings", + "/cvm/funds/fidc/direitos-creditorios", + ): + assert p in paths, f"REST route {p} disappeared" + + +def test_mcp_transport_mounted_on_public_app() -> None: + paths = {getattr(r, "path", None) for r in app.routes} + assert "/mcp" in paths + + +def test_every_tool_has_an_agent_oriented_summary() -> None: + for _path, methods in mcp_app.openapi()["paths"].items(): + for method, spec in methods.items(): + if method not in _HTTP_METHODS: + continue + summary = spec.get("summary", "") + # a real description, not the auto-generated "GET /path" + assert summary and not summary.startswith(("GET ", "POST ")) + assert len(summary) > 20 + + +# ── consolidated-tool dispatch (offline) ─────────────────────────── + + +def test_bcb_series_lists_catalog_with_no_args() -> None: + r = TestClient(mcp_app).get("/bcb/series") + assert r.status_code == 200 + assert len(r.json()) > 10 # the curated SGS catalog + + +def test_registry_lookup_resolves_ticker_offline() -> None: + r = TestClient(mcp_app).get("/registry/lookup", params={"q": "PETR4", "limit": 3}) + assert r.status_code == 200 + body = r.json() + assert body["entities"], "expected at least one match for PETR4" + assert body["entities"][0]["cnpj"].startswith("33.000.167") + + +def test_consolidated_tool_validates_missing_selector_args() -> None: + # cvm_company dataset=filings requires `year` -> 400 (not a 500) + r = TestClient(mcp_app).get("/cvm/company", params={"dataset": "filings"}) + assert r.status_code == 400 + assert "year" in r.json()["detail"] + + +def test_cvm_fund_holdings_requires_cnpj_and_month() -> None: + r = TestClient(mcp_app).get("/cvm/fund", params={"dataset": "holdings", "year": 2024}) + assert r.status_code == 400 + assert "cnpj" in r.json()["detail"] + + +# ── code-mode gating ─────────────────────────────────────────────── + + +def test_code_mode_is_off_by_default() -> None: + 
assert "findata_run_code" not in _operation_ids(mcp_app) + + +def test_code_mode_registers_tool_when_enabled(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FINDATA_MCP_CODE_MODE", "1") + import findata.api.mcp_app as fresh + + reloaded = importlib.reload(fresh) + try: + assert "findata_run_code" in _operation_ids(reloaded.mcp_app) + finally: + # restore the canonical (code-mode off) module for any later imports + monkeypatch.delenv("FINDATA_MCP_CODE_MODE", raising=False) + importlib.reload(fresh) From 6b0a21a98ddaeb53f51b44bbe1746acebbb12288 Mon Sep 17 00:00:00 2001 From: Roberto Date: Thu, 25 Jun 2026 22:16:19 -0300 Subject: [PATCH 2/5] chore: normalize AGENTS.md title punctuation Co-Authored-By: Claude Opus 4.8 --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 20db87b..805a5bd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ -# AGENTS.md — Dados Financeiros Abertos +# AGENTS.md: Dados Financeiros Abertos This file is for coding agents working in this repository. Keep it practical: follow the project conventions, avoid speculative dependencies, and produce From 6fdf25f1f1dabb2342d78438c73867348a0aab49 Mon Sep 17 00:00:00 2001 From: Roberto Date: Thu, 25 Jun 2026 22:24:49 -0300 Subject: [PATCH 3/5] docs: drop em dashes and correct stale REST count (94 to 95) in MCP surface Co-Authored-By: Claude Opus 4.8 --- docs/MCP_SURFACE.md | 28 ++++++++-------- src/findata/api/app.py | 6 ++-- src/findata/api/mcp_app.py | 66 +++++++++++++++++++------------------- tests/test_mcp_surface.py | 8 ++--- 4 files changed, 54 insertions(+), 54 deletions(-) diff --git a/docs/MCP_SURFACE.md b/docs/MCP_SURFACE.md index b64e34b..db9d067 100644 --- a/docs/MCP_SURFACE.md +++ b/docs/MCP_SURFACE.md @@ -1,4 +1,4 @@ -# MCP surface — curated tools over the REST API +# MCP surface: curated tools over the REST API > Status: prototype / design proposal (alpha 0.3.x). Non-breaking: the REST API > is untouched. 
Implemented in [`src/findata/api/mcp_app.py`](../src/findata/api/mcp_app.py). @@ -6,12 +6,12 @@ ## Problem The MCP server used to be auto-generated **1:1 from the FastAPI app**: -`FastApiMCP(app)` turns every route into a tool, so the catalog was **94 tools** -— one per dataset/endpoint. From a client/agent's point of view that means: +`FastApiMCP(app)` turns every route into a tool, so the catalog was **95 tools**, +one per dataset/endpoint. From a client/agent's point of view that means: - **~21k tokens of `tools/list`** loaded at the start of every session, before a single call. -- **Worse tool selection** — a model picks worse among 94 near-duplicate names +- **Worse tool selection**, a model picks worse among 95 near-duplicate names (one tool per SGS series, per CVM fund facet…) than among ~two dozen well-described tools. @@ -22,7 +22,7 @@ It exposes a small, hand-curated set of tools that dispatch to the same `findata.sources.*` functions the REST routers already use. ```python -# app.py — tools come from mcp_app; transport is served on the public app +# app.py: tools come from mcp_app; transport is served on the public app _mcp = FastApiMCP(mcp_app, name=..., description=...) _mcp.mount_http(router=app) # /mcp on the public app; REST routes untouched ``` @@ -30,16 +30,16 @@ _mcp.mount_http(router=app) # /mcp on the public app; REST routes untouched `FastApiMCP(mcp_app)` builds the catalog from `mcp_app`'s OpenAPI and executes each tool via `httpx.ASGITransport(app=mcp_app)`. Because the routers carry no app-state/rate-limiter coupling, reusing the source functions in a second app is -safe. **The 94 REST routes that back the CLI and HTTP consumers never change.** +safe. 
**The 95 REST routes that back the CLI and HTTP consumers never change.** -- **A — curation.** Each tool has an explicit `operation_id`, an agent-oriented +- **A, curation.** Each tool has an explicit `operation_id`, an agent-oriented one-line `summary`, and a docstring written *for an agent deciding whether to - call it* — not the raw route docstring. `response_model=None` + `-> Any` keeps + call it*, not the raw route docstring. `response_model=None` + `-> Any` keeps response schemas out of the catalog (they would re-inflate it). -- **B — consolidation.** Sprawly clusters collapse behind a `dataset`/`kind` +- **B, consolidation.** Sprawly clusters collapse behind a `dataset`/`kind` selector (see table). The work moves from "many thin tools" to "few tools with good docs". -- **C — code mode.** One optional tool, `findata_run_code`, runs a Python +- **C, code mode.** One optional tool, `findata_run_code`, runs a Python snippet against the `findata` library in an isolated child interpreter. It replaces dozens of fine-grained calls for filter/join/aggregate flows that would otherwise stream every intermediate result through the model's context. @@ -49,9 +49,9 @@ safe. **The 94 REST routes that back the CLI and HTTP consumers never change.** | | 1:1 (old) | curated (new) | |---|---:|---:| -| MCP tools | 94 | **24** (25 with code mode) | +| MCP tools | 95 | **24** (25 with code mode) | | `tools/list` size | ~85k chars (~21k tok) | **~29k chars (~7k tok)** | -| REST operations | 94 | **94 (unchanged)** | +| REST operations | 95 | **95 (unchanged)** | ## The 24 curated tools @@ -89,7 +89,7 @@ findata_run_code (code mode, opt-in) ## Tradeoffs - **Fewer but "fatter" tools.** Each carries a `dataset` enum and more doc. The - whole bet is that good descriptions beat tool count — so the docstrings are the + whole bet is that good descriptions beat tool count, so the docstrings are the deliverable, not an afterthought. 
- **Consolidation can hide endpoint-specific params behind an enum.** Mitigated by documenting each `dataset`/`kind` value and validating bad combinations with @@ -99,7 +99,7 @@ findata_run_code (code mode, opt-in) individually surfaced as tools. They remain fully reachable over REST and via `findata_run_code`. -## Code mode — security +## Code mode: security `findata_run_code` is a **prototype, not a hardened sandbox**. The snippet runs in a child `python -I` (isolated mode, cwd in a tempdir) with a wall-clock diff --git a/src/findata/api/app.py b/src/findata/api/app.py index fda6e24..741683b 100644 --- a/src/findata/api/app.py +++ b/src/findata/api/app.py @@ -151,11 +151,11 @@ async def _value_error_handler(_: Request, exc: ValueError) -> JSONResponse: from findata.api.mcp_app import mcp_app # The MCP tool catalog is built from the *curated* `mcp_app` (a separate - # FastAPI app, ~24 well-described tools), not from the public `app` — that - # would expose one near-duplicate tool per REST route (~94) and bloat every + # FastAPI app, ~24 well-described tools), not from the public `app`, which + # would expose one near-duplicate tool per REST route (~95) and bloat every # agent's context. `mount_http(router=app)` serves the /mcp transport on the # public app, while the tools are generated from and executed against - # `mcp_app` (via its ASGI transport). The 94 REST routes stay untouched. + # `mcp_app` (via its ASGI transport). The 95 REST routes stay untouched. _mcp = FastApiMCP( mcp_app, name=_PROJECT_SLUG, diff --git a/src/findata/api/mcp_app.py b/src/findata/api/mcp_app.py index 52a4c49..6e5f9d7 100644 --- a/src/findata/api/mcp_app.py +++ b/src/findata/api/mcp_app.py @@ -1,8 +1,8 @@ """Curated MCP surface for the findata-br server. -The public REST API (``findata.api.app``) exposes ~94 fine-grained routes — one +The public REST API (``findata.api.app``) exposes ~95 fine-grained routes, one per upstream dataset/endpoint. 
Mapping those 1:1 to MCP tools floods an agent's -context with ~94 near-duplicate tool schemas before it makes a single call, and +context with ~95 near-duplicate tool schemas before it makes a single call, and hurts tool-selection accuracy. This module is a *separate* FastAPI app whose only purpose is to be the source @@ -14,11 +14,11 @@ Wiring lives in ``app.py``: ``FastApiMCP(mcp_app).mount_http(router=app)`` builds the tool catalog from *this* app while serving ``/mcp`` on the public app. The -94 REST routes are never touched. +95 REST routes are never touched. - A — curation: only the headline tools are exposed, with real descriptions. - B — consolidation: ``bcb_*``/``cvm_*``/``tesouro_*``… fold many routes into one. - C — code mode: optional ``findata_run_code`` runs a Python snippet against the + A, curation: only the headline tools are exposed, with real descriptions. + B, consolidation: ``bcb_*``/``cvm_*``/``tesouro_*``… fold many routes into one. + C, code mode: optional ``findata_run_code`` runs a Python snippet against the library (gated by ``FINDATA_MCP_CODE_MODE=1``; off by default). """ @@ -67,7 +67,7 @@ _MIN_YEAR_BCB_SGS = 1986 -# ── Registry — the entry point ───────────────────────────────────── +# ── Registry: the entry point ───────────────────────────────────── @router.get( @@ -93,7 +93,7 @@ async def registry_lookup( return await lookup(q, limit=limit) -# ── BCB — Banco Central ──────────────────────────────────────────── +# ── BCB: Banco Central ──────────────────────────────────────────── @router.get( @@ -126,7 +126,7 @@ async def bcb_series( "/bcb/ptax", operation_id="bcb_ptax", response_model=None, - summary="PTAX official exchange rate for any currency — single date or a date range", + summary="PTAX official exchange rate for any currency, single date or a date range", ) async def bcb_ptax( currency: str = Query("USD", description="ISO currency code, e.g. 
USD, EUR, GBP"), @@ -151,7 +151,7 @@ async def bcb_ptax( "/bcb/focus", operation_id="bcb_focus", response_model=None, - summary="Boletim Focus expectations — annual/monthly, market or Top-5, or Selic per COPOM", + summary="Boletim Focus expectations, annual/monthly, market or Top-5, or Selic per COPOM", ) async def bcb_focus( indicator: str = Query( @@ -181,7 +181,7 @@ async def bcb_focus( return await focus.get_focus_annual(indicator, top) -# ── CVM — companies & funds ──────────────────────────────────────── +# ── CVM: companies & funds ──────────────────────────────────────── @router.get( @@ -209,7 +209,7 @@ async def cvm_company( ) -> Any: """The company side of CVM. ``search`` needs ``query``; ``list`` is the full registry. ``fca_general|fca_securities|fca_dri`` are cadastral facets needing - ``year`` (+ optional ``cnpj``/``ticker``). ``filings`` (IPE — fatos relevantes, + ``year`` (+ optional ``cnpj``/``ticker``). ``filings`` (IPE, fatos relevantes, comunicados) needs ``year`` (+ optional ``cnpj``/``categoria``). 
""" if dataset == "search": @@ -235,7 +235,7 @@ async def cvm_company( "/cvm/financials", operation_id="cvm_financials", response_model=None, - summary="CVM financial statements — annual (DFP) or quarterly (ITR) for a company", + summary="CVM financial statements, annual (DFP) or quarterly (ITR) for a company", ) async def cvm_financials( year: int = Query(..., ge=2010, description="Fiscal year"), @@ -247,7 +247,7 @@ async def cvm_financials( description="Statement type: BPA/BPP/DRE/DFC_MI/DMPL/DVA, _con (consolidated) or _ind", ), cnpj: str | None = Query( - None, description="Company CNPJ — strongly recommended (avoids the full dataset)" + None, description="Company CNPJ, strongly recommended (avoids the full dataset)" ), limit: int = Query(500, ge=1, le=5000), ) -> Any: @@ -346,7 +346,7 @@ async def _structured_fidc( "/cvm/structured-fund", operation_id="cvm_structured_fund", response_model=None, - summary="Structured CVM funds — FII (real estate), FIDC (receivables), FIP (private equity)", + summary="Structured CVM funds, FII (real estate), FIDC (receivables), FIP (private equity)", ) async def cvm_structured_fund( kind: Literal["fii", "fidc", "fip"] = Query(...), @@ -356,7 +356,7 @@ async def cvm_structured_fund( cnpj: str | None = Query(None, description="Fund CNPJ filter"), year: int = Query(..., description="Reference year"), month: int | None = Query(None, ge=1, le=12, description="Required for FIDC; optional for FII"), - quarter: int | None = Query(None, ge=1, le=4, description="FIP only — informe quarter"), + quarter: int | None = Query(None, ge=1, le=4, description="FIP only, informe quarter"), limit: int = Query(500, ge=1, le=5000), ) -> Any: """Structured funds by ``kind``. 
FII has ``geral`` (cadastral) and ``complemento`` @@ -370,13 +370,13 @@ async def cvm_structured_fund( return (await fip.get_fip(year, cnpj=cnpj, quarter=quarter))[:limit] -# ── B3 — Bolsa ───────────────────────────────────────────────────── +# ── B3: Bolsa ───────────────────────────────────────────────────── def _b3_quotes() -> Any: try: from findata.sources.b3 import quotes - except ImportError as exc: # pragma: no cover — only without the [b3] extra + except ImportError as exc: # pragma: no cover, only without the [b3] extra raise HTTPException( 503, "Live quotes need the optional extra: pip install 'findata-br[b3]'" ) from exc @@ -387,7 +387,7 @@ def _b3_quotes() -> Any: "/b3/quote", operation_id="b3_quote", response_model=None, - summary="Live B3 stock quote(s) (optional [b3] extra) — prefer b3_cotahist for official EOD", + summary="Live B3 stock quote(s) (optional [b3] extra), prefer b3_cotahist for official EOD", ) async def b3_quote( tickers: str = Query( @@ -412,14 +412,14 @@ async def b3_quote( "/b3/cotahist", operation_id="b3_cotahist", response_model=None, - summary="Official B3 COTAHIST daily quotes — by year, month, or single day", + summary="Official B3 COTAHIST daily quotes, by year, month, or single day", ) async def b3_cotahist( year: int = Query(..., ge=_MIN_YEAR_BCB_SGS, description="Year (B3 publishes since 1986)"), month: int | None = Query(None, ge=1, le=12), day: int | None = Query(None, ge=1, le=31), ticker: str | None = Query( - None, description="CODNEG filter, e.g. PETR4 — recommended (annual files are ~85 MB)" + None, description="CODNEG filter, e.g. PETR4, recommended (annual files are ~85 MB)" ), market_codes: str | None = Query( None, description="CODBDI whitelist, comma-separated, e.g. 
02,96" @@ -476,7 +476,7 @@ async def b3_index( "/tesouro/bonds", operation_id="tesouro_bonds", response_model=None, - summary="Tesouro Direto bonds — list/filter, search names, or price+rate history", + summary="Tesouro Direto bonds, list/filter, search names, or price+rate history", ) async def tesouro_bonds( dataset: Literal["list", "search", "history"] = Query("list"), @@ -508,7 +508,7 @@ async def tesouro_bonds( "/tesouro/siconfi", operation_id="tesouro_siconfi", response_model=None, - summary="SICONFI public-finance reports — RREO, RGF, or the federation-entity list", + summary="SICONFI public-finance reports, RREO, RGF, or the federation-entity list", ) async def tesouro_siconfi( report: Literal["rreo", "rgf", "entes"] = Query("entes"), @@ -542,7 +542,7 @@ async def tesouro_siconfi( "/ibge/indicator", operation_id="ibge_indicator", response_model=None, - summary="IBGE economic indicators — list the catalog or fetch one by name (e.g. ipca_mensal)", + summary="IBGE economic indicators, list the catalog or fetch one by name (e.g. ipca_mensal)", ) async def ibge_indicator( name: str | None = Query(None, description="Indicator name; omit to list all available"), @@ -566,7 +566,7 @@ async def ibge_ipca_breakdown( periods: int = Query(6, ge=1, le=60, description="Recent months to return"), ) -> Any: """IPCA monthly variation for all major groups (food, housing, transport, - health, …) — granularity BCB SGS does not provide. + health, …), granularity BCB SGS does not provide. 
""" return await indicators.get_ipca_breakdown(periods) @@ -578,7 +578,7 @@ async def ibge_ipca_breakdown( "/ipea/series", operation_id="ipea_series", response_model=None, - summary="IPEA series — curated catalog, series values, or metadata by SERCODIGO", + summary="IPEA series, curated catalog, series values, or metadata by SERCODIGO", ) async def ipea_series_tool( sercodigo: str | None = Query( @@ -624,7 +624,7 @@ async def ipea_search( "/anbima", operation_id="anbima", response_model=None, - summary="ANBIMA public data — IMA index family, ETTJ yield curve, or debenture quotes", + summary="ANBIMA public data, IMA index family, ETTJ yield curve, or debenture quotes", ) async def anbima_tool( dataset: Literal["ima", "ettj", "debentures"] = Query("ima"), @@ -658,7 +658,7 @@ async def anbima_tool( "/openfinance/directory", operation_id="openfinance_directory", response_model=None, - summary="Open Finance Brasil Directory — participants, API endpoints, resources, or roles", + summary="Open Finance Brasil Directory, participants, API endpoints, resources, or roles", ) async def openfinance_directory( dataset: Literal["participants", "endpoints", "resources", "roles"] = Query("participants"), @@ -765,7 +765,7 @@ async def receita_arrecadacao( "/aneel/leiloes", operation_id="aneel_leiloes", response_model=None, - summary="ANEEL energy-auction results — generation or transmission", + summary="ANEEL energy-auction results, generation or transmission", ) async def aneel_leiloes( kind: Literal["geracao", "transmissao"] = Query("geracao"), @@ -791,7 +791,7 @@ async def aneel_leiloes( "/susep/empresas", operation_id="susep_empresas", response_model=None, - summary="SUSEP-supervised entities (insurance, previdência, capitalização) — list or search", + summary="SUSEP-supervised entities (insurance, previdência, capitalização), list or search", ) async def susep_empresas( q: str | None = Query(None, min_length=2, description="Name substring; omit to list all"), @@ -803,7 +803,7 
@@ async def susep_empresas( return (await empresas.get_susep_empresas())[:limit] -# ── C — Code mode (optional, gated) ──────────────────────────────── +# ── C: Code mode (optional, gated) ──────────────────────────────── _CODE_MODE_ENABLED = os.getenv("FINDATA_MCP_CODE_MODE", "").strip().lower() in { "1", @@ -821,7 +821,7 @@ class RunCodeRequest(BaseModel): code: str = Field( ..., description="Python source to execute. The `findata` library is importable. " - "Source functions are async — wrap calls in asyncio.run(). Print results to stdout.", + "Source functions are async, wrap calls in asyncio.run(). Print results to stdout.", ) timeout_s: int = Field( 30, ge=1, le=_CODE_TIMEOUT_MAX, description="Wall-clock timeout in seconds" @@ -831,7 +831,7 @@ class RunCodeRequest(BaseModel): async def _execute_code(code: str, timeout_s: int) -> dict[str, Any]: """Run ``code`` in an isolated child interpreter, capturing combined output. - PROTOTYPE — this is NOT a security sandbox: the child runs arbitrary Python + PROTOTYPE, this is NOT a security sandbox: the child runs arbitrary Python with full library and network access. It is gated off by default and intended for trusted, local/agent use only. """ diff --git a/tests/test_mcp_surface.py b/tests/test_mcp_surface.py index ba999b3..65b82a0 100644 --- a/tests/test_mcp_surface.py +++ b/tests/test_mcp_surface.py @@ -1,12 +1,12 @@ """Tests for the curated MCP surface (findata.api.mcp_app). Guards the three promises of the MCP curation: - 1. the tool catalog is small and curated, not 1:1 with the 94 REST routes, - 2. the public REST API still exposes all 94 routes (curation is MCP-only), + 1. the tool catalog is small and curated, not 1:1 with the 95 REST routes, + 2. the public REST API still exposes all 95 routes (curation is MCP-only), 3. consolidated tools dispatch by their ``dataset``/``kind`` selector and validate bad combinations with a 400. -All assertions are offline — no live gov-API calls. 
+All assertions are offline, no live gov-API calls. """ from __future__ import annotations @@ -47,7 +47,7 @@ def test_curated_mcp_is_a_small_fraction_of_the_rest_surface() -> None: def test_rest_api_untouched_by_curation() -> None: - # the consolidated REST routes that MCP tools fold together must still exist — + # the consolidated REST routes that MCP tools fold together must still exist, # they back the CLI and HTTP consumers. paths = set(app.openapi()["paths"]) for p in ( From 25e21d13246b9df24355729fcd8796756adba8d9 Mon Sep 17 00:00:00 2001 From: Roberto Date: Thu, 25 Jun 2026 22:36:57 -0300 Subject: [PATCH 4/5] fix(mcp): bound unbounded tool outputs and correct stale b3 install hint - cvm_fund dataset=returns, cvm_company dataset=search and anbima dataset=ima now apply the documented [:limit] like their sibling branches, so an agent cannot pull the whole-market dataset by omitting a filter. - b3_quote install hint said 'findata-br[b3]'; the package is 'openfindata', matching the REST router and source message. - cvm_fund dataset=periods uses the public list_periods re-export instead of reaching into the private _directory module. - blocks and market_codes splits drop empty elements (trailing comma) like the tickers split already did. 
Co-Authored-By: Claude Opus 4.8 --- src/findata/api/mcp_app.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/findata/api/mcp_app.py b/src/findata/api/mcp_app.py index 6e5f9d7..6f59453 100644 --- a/src/findata/api/mcp_app.py +++ b/src/findata/api/mcp_app.py @@ -52,6 +52,7 @@ holdings, ipe, lamina, + list_periods, profile, ) from findata.sources.ibge import indicators @@ -215,7 +216,7 @@ async def cvm_company( if dataset == "search": if not query: raise HTTPException(400, "dataset=search requires `query`") - return await companies.search_company(query, True) + return (await companies.search_company(query, True))[:limit] if dataset == "list": return (await companies.get_companies(True))[:limit] if dataset == "filings": @@ -295,15 +296,13 @@ async def cvm_fund( if dataset == "catalog": return (await funds.get_fund_catalog(True, None))[:limit] if dataset == "periods": - from findata.sources.cvm import _directory - - return await _directory.list_periods("FI", f"DOC/{product}") + return await list_periods("FI", f"DOC/{product}") if year is None: raise HTTPException(400, f"dataset={dataset} requires `year`") if dataset == "holdings": if not cnpj or month is None: raise HTTPException(400, "dataset=holdings requires `cnpj` and `month`") - block_list = [b.strip() for b in blocks.split(",")] if blocks else None + block_list = [b.strip() for b in blocks.split(",") if b.strip()] if blocks else None return await holdings.get_fund_holdings(cnpj, year, month, block_list) if month is None: raise HTTPException(400, f"dataset={dataset} requires `month`") @@ -314,8 +313,8 @@ async def cvm_fund( if dataset == "profile": return (await profile.get_fund_profile(year, month, cnpj))[:limit] if horizon == "yearly": - return await lamina.get_fund_yearly_returns(year, month, cnpj) - return await lamina.get_fund_monthly_returns(year, month, cnpj) + return (await lamina.get_fund_yearly_returns(year, month, cnpj))[:limit] + return (await 
lamina.get_fund_monthly_returns(year, month, cnpj))[:limit] async def _structured_fii( @@ -378,7 +377,7 @@ def _b3_quotes() -> Any: from findata.sources.b3 import quotes except ImportError as exc: # pragma: no cover, only without the [b3] extra raise HTTPException( - 503, "Live quotes need the optional extra: pip install 'findata-br[b3]'" + 503, "Live quotes need the optional extra: pip install 'openfindata[b3]'" ) from exc return quotes @@ -429,7 +428,7 @@ async def b3_cotahist( ``month`` → one month, otherwise the whole ``year``. Pass ``ticker`` for single-issuer queries. """ - codes = [c.strip() for c in market_codes.split(",")] if market_codes else None + codes = [c.strip() for c in market_codes.split(",") if c.strip()] if market_codes else None if day is not None: if month is None: raise HTTPException(400, "`day` requires `month`") @@ -648,7 +647,7 @@ async def anbima_tool( rows = [r for r in rows if needle in r.emissor.upper()] return rows[:limit] fam = anbima_src.IMAFamily(family) if family else None - return await anbima_src.get_ima(fam) + return (await anbima_src.get_ima(fam))[:limit] # ── Open Finance Brasil ──────────────────────────────────────────── From 0efe01d852c9ecf4ca9802cab32dbec8365b5c41 Mon Sep 17 00:00:00 2001 From: Roberto Date: Thu, 25 Jun 2026 23:05:03 -0300 Subject: [PATCH 5/5] fix(mcp): tighten consolidated-tool validation and harden code-mode - tesouro_siconfi rejects an RGF period outside 1-3 (the quadrimestre range) with a 400 instead of querying SICONFI with a bad period. - bcb_focus rejects panel=top5 with horizon=monthly (Top-5 is annual-only) instead of silently downgrading to annual. - cvm_structured_fund rejects a dataset for kind=fip (FIP has no facet). - cvm_fund product is now a Literal, so the schema rejects unknown values. - code-mode: the child runs in its own process group and a timeout kills the whole tree (start_new_session + killpg), so a spawned grandchild cannot orphan past the timeout. 
Reading output fully before the cap stays a documented limit. - _MIN_YEAR_B3_COTAHIST replaces the mislabeled _MIN_YEAR_BCB_SGS for the cotahist lower bound. - 3 offline tests for the new validations. Co-Authored-By: Claude Opus 4.8 --- src/findata/api/mcp_app.py | 35 ++++++++++++++++++++++++++++------- tests/test_mcp_surface.py | 27 +++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/src/findata/api/mcp_app.py b/src/findata/api/mcp_app.py index 6f59453..d43a48d 100644 --- a/src/findata/api/mcp_app.py +++ b/src/findata/api/mcp_app.py @@ -25,7 +25,9 @@ from __future__ import annotations import asyncio +import contextlib import os +import signal import sys import tempfile from datetime import date @@ -65,7 +67,8 @@ router = APIRouter() _MAX_TICKERS = 20 -_MIN_YEAR_BCB_SGS = 1986 +_MIN_YEAR_B3_COTAHIST = 1986 # B3 publishes COTAHIST since 1986 +_RGF_MAX_PERIOD = 3 # RGF quadrimestre runs 1..3 # ── Registry: the entry point ───────────────────────────────────── @@ -175,6 +178,8 @@ async def bcb_focus( return focus.FOCUS_INDICATORS if key == "selic": return await focus.get_focus_selic(top) + if panel == "top5" and horizon == "monthly": + raise HTTPException(400, "panel=top5 is annual-only; use horizon=annual") if panel == "top5": return await focus.get_focus_top5_annual(indicator, top) if horizon == "monthly": @@ -282,10 +287,15 @@ async def cvm_fund( None, description="holdings: block whitelist, e.g. BLC_1,BLC_4 (of BLC_1..BLC_8,CONFID,PL,FIE)", ), - product: str = Query( + product: Literal[ "INF_DIARIO", - description="periods: INF_DIARIO|CDA|LAMINA|PERFIL_MENSAL|BALANCETE|EVENTUAL|EXTRATO", - ), + "CDA", + "LAMINA", + "PERFIL_MENSAL", + "BALANCETE", + "EVENTUAL", + "EXTRATO", + ] = Query("INF_DIARIO", description="periods: which CVM document set to list stamps for"), limit: int = Query(500, ge=1, le=5000), ) -> Any: """Open funds in one tool. 
``catalog`` lists registered funds; ``periods`` lists @@ -366,6 +376,8 @@ async def cvm_structured_fund( return await _structured_fii(dataset, cnpj, year, month) if kind == "fidc": return await _structured_fidc(dataset, cnpj, year, month) + if dataset is not None: + raise HTTPException(400, "kind=fip takes no `dataset` (use `quarter`)") return (await fip.get_fip(year, cnpj=cnpj, quarter=quarter))[:limit] @@ -414,7 +426,7 @@ async def b3_quote( summary="Official B3 COTAHIST daily quotes, by year, month, or single day", ) async def b3_cotahist( - year: int = Query(..., ge=_MIN_YEAR_BCB_SGS, description="Year (B3 publishes since 1986)"), + year: int = Query(..., ge=_MIN_YEAR_B3_COTAHIST, description="Year (B3 publishes since 1986)"), month: int | None = Query(None, ge=1, le=12), day: int | None = Query(None, ge=1, le=31), ticker: str | None = Query( @@ -530,6 +542,8 @@ async def tesouro_siconfi( if year is None or period is None or cod_ibge is None: raise HTTPException(400, f"report={report} requires year, period, and cod_ibge") if report == "rgf": + if not 1 <= period <= _RGF_MAX_PERIOD: + raise HTTPException(400, "RGF period is the quadrimestre 1-3") return await siconfi.get_rgf(year, period, cod_ibge, poder=poder) # type: ignore[arg-type] return await siconfi.get_rreo(year, period, cod_ibge, anexo=anexo) @@ -832,7 +846,10 @@ async def _execute_code(code: str, timeout_s: int) -> dict[str, Any]: PROTOTYPE, this is NOT a security sandbox: the child runs arbitrary Python with full library and network access. It is gated off by default and intended - for trusted, local/agent use only. + for trusted, local/agent use only. The child runs in its own process group so + a timeout kills the whole tree, not just the direct child. Output is read in + full before the cap is applied, so a deployment that enables this should add + OS-level memory/output limits for the child. 
""" timeout = max(1, min(timeout_s, _CODE_TIMEOUT_MAX)) proc = await asyncio.create_subprocess_exec( @@ -843,11 +860,15 @@ async def _execute_code(code: str, timeout_s: int) -> dict[str, Any]: stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT, cwd=tempfile.gettempdir(), + start_new_session=True, # own process group so a timeout can kill the tree ) try: stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout) except TimeoutError: - proc.kill() + # Kill the whole process GROUP: a snippet that spawned its own subprocesses + # must not outlive the timeout as an orphan. + with contextlib.suppress(ProcessLookupError, PermissionError): + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) await proc.wait() return {"timed_out": True, "exit_code": None, "output": f"(killed: exceeded {timeout}s)"} text = stdout.decode("utf-8", errors="replace") diff --git a/tests/test_mcp_surface.py b/tests/test_mcp_surface.py index 65b82a0..ebffc61 100644 --- a/tests/test_mcp_surface.py +++ b/tests/test_mcp_surface.py @@ -123,3 +123,30 @@ def test_code_mode_registers_tool_when_enabled(monkeypatch: pytest.MonkeyPatch) # restore the canonical (code-mode off) module for any later imports monkeypatch.delenv("FINDATA_MCP_CODE_MODE", raising=False) importlib.reload(fresh) + + +# -- added validations (offline) ------------------------------------ + + +def test_siconfi_rgf_rejects_out_of_range_period() -> None: + # RGF is the quadrimestre 1-3; period 6 is valid only for RREO bimestre. + r = TestClient(mcp_app).get( + "/tesouro/siconfi", + params={"report": "rgf", "year": 2024, "period": 6, "cod_ibge": 1}, + ) + assert r.status_code == 400 + assert "1-3" in r.json()["detail"] + + +def test_focus_rejects_top5_monthly() -> None: + # Top-5 panel exists only for the annual horizon. 
+ r = TestClient(mcp_app).get("/bcb/focus", params={"panel": "top5", "horizon": "monthly"}) + assert r.status_code == 400 + + +def test_structured_fund_fip_rejects_dataset() -> None: + # FIP has no dataset facet; passing one is a client error, not silently ignored. + r = TestClient(mcp_app).get( + "/cvm/structured-fund", params={"kind": "fip", "year": 2024, "dataset": "geral"} + ) + assert r.status_code == 400