Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@ adheres to [Semantic Versioning](https://semver.org/).

## [Unreleased]

### Fixed

- **`cvm holdings` ignored bare-digit CNPJs.** The CDA reader compared the
CNPJ argument to the stored value as raw strings, but CVM stores CNPJs
punctuated (`22.187.946/0001-41`). A user passing bare digits
(`22187946000141`) — exactly what the `--help` text advertises — got
"No holdings". Both sides are now normalized to digits before comparing,
so punctuated and bare forms return identical results.

## [0.3.1] — 2026-04-29

Patch release fixing 5 bugs caught in adversarial review of v0.3.0 by
Expand Down
12 changes: 10 additions & 2 deletions src/findata/sources/cvm/holdings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

import csv
import io
import re
import sys
import zipfile

Expand All @@ -43,6 +44,13 @@

CDA_URL = f"{CVM_BASE}/FI/DOC/CDA/DADOS/cda_fi_{{ym}}.zip"

_NON_DIGIT = re.compile(r"\D")


def _digits(value: str | None) -> str:
"""Strip a CNPJ down to its digits so punctuated and bare forms compare equal."""
return _NON_DIGIT.sub("", value or "")


class FundHolding(BaseModel):
"""One holding row (one asset position) inside one fund's portfolio."""
Expand Down Expand Up @@ -139,7 +147,7 @@ async def get_fund_holdings(
"""
ym = f"{year}{month:02d}"
raw = await get_bytes(CDA_URL.format(ym=ym), cache_ttl=86400)
cnpj_norm = cnpj.strip()
cnpj_norm = _digits(cnpj)
Comment on lines 148 to +150

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

We can optimize this function by validating the CNPJ and pre-computing both its bare and punctuated forms before downloading the large ZIP file. This avoids unnecessary network requests for invalid inputs and allows us to perform fast string comparisons in the hot loop instead of running regex normalization on every single row.

    cnpj_bare = _digits(cnpj)
    if not cnpj_bare:
        return []
    cnpj_punctuated = (
        f"{cnpj_bare[:2]}.{cnpj_bare[2:5]}.{cnpj_bare[5:8]}/{cnpj_bare[8:12]}-{cnpj_bare[12:]}"
        if len(cnpj_bare) == 14
        else cnpj_bare
    )
    ym = f"{year}{month:02d}"
    raw = await get_bytes(CDA_URL.format(ym=ym), cache_ttl=86400)

wanted_blocks = {b.upper() for b in blocks} if blocks else None
holdings: list[FundHolding] = []
with zipfile.ZipFile(io.BytesIO(raw)) as zf:
Expand All @@ -152,7 +160,7 @@ async def get_fund_holdings(
with zf.open(entry) as f:
reader = csv.DictReader(io.StringIO(f.read().decode("iso-8859-1")), delimiter=";")
for row in reader:
row_cnpj = (row.get("CNPJ_FUNDO_CLASSE") or row.get("CNPJ_FUNDO", "")).strip()
row_cnpj = _digits(row.get("CNPJ_FUNDO_CLASSE") or row.get("CNPJ_FUNDO"))
if row_cnpj != cnpj_norm:
continue
Comment on lines +163 to 165

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Instead of running the regex-based _digits normalization on every single row of the CSV (which can contain hundreds of thousands of rows and becomes a major CPU bottleneck), we can perform a direct string comparison against the pre-computed bare and punctuated CNPJ forms.

Suggested change
row_cnpj = _digits(row.get("CNPJ_FUNDO_CLASSE") or row.get("CNPJ_FUNDO"))
if row_cnpj != cnpj_norm:
continue
row_cnpj = (row.get("CNPJ_FUNDO_CLASSE") or row.get("CNPJ_FUNDO") or "").strip()
if row_cnpj != cnpj_punctuated and row_cnpj != cnpj_bare:
continue

h = _row_to_holding(row, block, include_raw=include_raw)
Expand Down
16 changes: 16 additions & 0 deletions tests/test_cvm_funds.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,22 @@ async def test_holdings_block_whitelist() -> None:
assert all(r.bloco == "BLC_4" for r in rows)


@respx.mock
async def test_holdings_bare_digit_cnpj_matches_punctuated() -> None:
    """A CNPJ given as bare digits must find rows stored in punctuated form.

    Regression: the reader used to compare raw strings, so ``12345678000199``
    never matched the stored ``12.345.678/0001-99`` and returned no holdings.
    """
    # Serve the fixture archive for any URL ending in the March 2026 CDA zip.
    cda_route = respx.get(re.compile(r"https://.*cda_fi_202603\.zip"))
    cda_route.mock(return_value=httpx.Response(200, content=_make_cda_zip()))

    holdings = await get_fund_holdings("12345678000199", year=2026, month=3)

    # Same result the punctuated query is expected to give.
    assert len(holdings) == 3
    blocks_seen = {h.bloco for h in holdings}
    assert blocks_seen == {"BLC_4", "BLC_8"}
    # The returned rows keep the CNPJ in the punctuated form, not the query's.
    assert all(h.cnpj == "12.345.678/0001-99" for h in holdings)


# ── LAMINA ───────────────────────────────────────────────────────


Expand Down
Loading