Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ adheres to [Semantic Versioning](https://semver.org/).
(`22187946000141`) — exactly what the `--help` text advertises — got
"No holdings". Both sides are now normalized to digits before comparing,
so punctuated and bare forms return identical results.
- **`cvm holdings` silently dropped BLC_2 (cotas de fundos).** CDA free-text
fund names contain stray double-quotes; the default `csv` dialect treated
them as quote chars and swallowed delimiters/newlines across rows, so whole
blocks parsed to garbage and matched nothing — the CLI printed a partial
table that looked complete. For a fund-of-funds (FIC) this hid ~99.8% of the
portfolio (e.g. Verde FIC's R$ 850.657.058,68 Verde Master position). The
reader now passes `quoting=csv.QUOTE_NONE`, keeping every `"` literal.

## [0.3.1] — 2026-04-29

Expand Down
10 changes: 9 additions & 1 deletion src/findata/sources/cvm/holdings.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,15 @@ async def get_fund_holdings(
if wanted_blocks is not None and block not in wanted_blocks:
continue
with zf.open(entry) as f:
reader = csv.DictReader(io.StringIO(f.read().decode("iso-8859-1")), delimiter=";")
# CDA free-text names (esp. BLC_2 cotas de fundos) carry stray
# double-quotes; the default dialect would treat them as quote
# chars and swallow delimiters across rows, silently dropping
# whole blocks. QUOTE_NONE keeps every '"' as a literal char.
reader = csv.DictReader(
io.StringIO(f.read().decode("iso-8859-1")),
delimiter=";",
quoting=csv.QUOTE_NONE,
)
Comment on lines +165 to +169

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The module docstring states that we line-stream every CSV inside the ZIP to avoid burning RAM and CPU. However, `f.read().decode("iso-8859-1")` reads and decodes the entire uncompressed CSV file into memory at once before wrapping it in `io.StringIO`.

For large CSV files (which can be tens of megabytes), this defeats the streaming design and can cause significant memory spikes.

We can achieve true line-streaming by wrapping the binary stream `f` in `io.TextIOWrapper` instead of reading the whole file into memory.

Suggested change
reader = csv.DictReader(
io.StringIO(f.read().decode("iso-8859-1")),
delimiter=";",
quoting=csv.QUOTE_NONE,
)
reader = csv.DictReader(
io.TextIOWrapper(f, encoding="iso-8859-1"),
delimiter=";",
quoting=csv.QUOTE_NONE,
)

for row in reader:
row_cnpj = _digits(row.get("CNPJ_FUNDO_CLASSE") or row.get("CNPJ_FUNDO"))
if row_cnpj != cnpj_norm:
Expand Down
40 changes: 40 additions & 0 deletions tests/test_cvm_funds.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,46 @@ async def test_holdings_bare_digit_cnpj_matches_punctuated() -> None:
assert all(r.cnpj == "12.345.678/0001-99" for r in rows) # output keeps CVM format


def _make_cda_blc2_zip() -> bytes:
"""BLC_2 (cotas de fundos) zip whose free-text name has a stray quote.

The first invested-fund name opens a double-quote that is only closed on
the next row. The default csv dialect treats ``"`` as a quote char, so it
swallows the delimiter/newline and merges both rows into one — silently
dropping the second fund-of-funds position. Real CDA files carry exactly
this pattern, which made an entire FIC portfolio vanish.
"""
header = (
"TP_FUNDO_CLASSE;CNPJ_FUNDO_CLASSE;DENOM_SOCIAL;DT_COMPTC;TP_APLIC;"
"TP_ATIVO;EMISSOR_LIGADO;QT_POS_FINAL;VL_MERC_POS_FINAL;DS_ATIVO\n"
)
rows = (
"FIC;22.187.946/0001-41;VERDE FIC;2025-12-31;Cotas de Fundos;Cota de FI;"
'N;850657.05;850657058.68;"VERDE MASTER FIC\n'
"FIC;22.187.946/0001-41;VERDE FIC;2025-12-31;Cotas de Fundos;Cota de FI;"
'N;100;500000.00;OUTRO" FUNDO INVESTIDO\n'
)
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
zf.writestr("cda_fi_BLC_2_202512.csv", (header + rows).encode("iso-8859-1"))
return buf.getvalue()


@respx.mock
async def test_holdings_blc2_stray_quote_not_dropped() -> None:
    """A BLC_2 row with a stray quote in the fund name must not be dropped.

    Serves the fixture zip for the 2025-12 CDA download and checks that both
    fund-of-funds rows come back as separate holdings, with the stray quotes
    preserved as literal characters in the description.
    """
    respx.get(re.compile(r"https://.*cda_fi_202512\.zip")).mock(
        return_value=httpx.Response(200, content=_make_cda_blc2_zip())
    )
    rows = await get_fund_holdings("22.187.946/0001-41", year=2025, month=12)
    assert len(rows) == 2  # both fund-of-funds positions survive parsing
    assert all(r.bloco == "BLC_2" for r in rows)
    assert rows[0].valor_mercado == 850657058.68  # ~99.8% of PL — the big one
    assert rows[1].valor_mercado == 500000.00
    # stray quotes are kept literally, not treated as CSV quote chars
    assert rows[0].descricao == '"VERDE MASTER FIC'
    assert rows[1].descricao == 'OUTRO" FUNDO INVESTIDO'


# ── LAMINA ───────────────────────────────────────────────────────


Expand Down
Loading