From bfdd7c37ac93a6657bfe20349be2fd19338937c7 Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 11 Jun 2026 23:51:04 +0800 Subject: [PATCH 1/5] test: add parse/export/search performance benchmarks and CI artifacts --- .github/workflows/ci.yml | 35 ++++++++++++ CONTRIBUTING.md | 1 + benchmarks/README.md | 36 ++++++++++++ benchmarks/baselines.json | 10 ++++ pyproject.toml | 5 +- requirements-dev.txt | 1 + tests/benchmarks/__init__.py | 0 tests/benchmarks/conftest.py | 79 +++++++++++++++++++++++++++ tests/benchmarks/test_export_bench.py | 38 +++++++++++++ tests/benchmarks/test_parse_bench.py | 24 ++++++++ tests/benchmarks/test_parse_memory.py | 23 ++++++++ tests/benchmarks/test_search_bench.py | 18 ++++++ 12 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/baselines.json create mode 100644 tests/benchmarks/__init__.py create mode 100644 tests/benchmarks/conftest.py create mode 100644 tests/benchmarks/test_export_bench.py create mode 100644 tests/benchmarks/test_parse_bench.py create mode 100644 tests/benchmarks/test_parse_memory.py create mode 100644 tests/benchmarks/test_search_bench.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 545dfa5..e7372ff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -204,3 +204,38 @@ jobs: esac npm install --no-save "${PKG}@${ROLLUP_VERSION}" - run: npm test + + benchmarks: + name: Performance benchmarks (informational) + runs-on: ubuntu-latest + permissions: + contents: read + actions: write + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + persist-credentials: false + + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.12" + cache: pip + cache-dependency-path: | + requirements.txt + requirements-dev.txt + + - name: Install dev dependencies + run: pip install -r requirements-dev.txt + + - name: Run benchmarks + run: > + pytest tests/benchmarks/ + --benchmark-only + --benchmark-json=benchmark-results.json + --benchmark-columns=min,max,mean,stddev,rounds + -o addopts= + + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + with: + name: benchmark-results + path: benchmark-results.json diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 48bc61d..a123b2f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,6 +66,7 @@ pytest tests/test_api_integration.py -v pytest tests/test_search.py -v pytest tests/test_api_routes.py -v pytest tests/test_error_codes.py -v +pytest tests/benchmarks/ --benchmark-only -o addopts= -v # performance baselines (see benchmarks/README.md) ``` ### JavaScript (vitest) diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..b8d9e05 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,36 @@ +# Performance benchmarks + +Repeatable local measurements for parse, bulk export, and search hot paths. + +## Run locally + +```bash +pip install -r requirements-dev.txt +pytest tests/benchmarks/ --benchmark-only -o addopts= -v +``` + +## Memory check + +```bash +pytest tests/benchmarks/test_parse_memory.py -v +``` + +The memory test also runs as part of the normal `pytest` suite (timing benchmarks are skipped via `--benchmark-skip` in `pyproject.toml`). + +## Scenarios + +| Group | What | +|-------|------| +| parse | `parse_session` on 10 / 500 / 5000+ line JSONL | +| export | `run_bulk_export` over 10 / 50 / 100 sessions | +| search | `GET /api/search` over a 50-session synthetic corpus | + +Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git. + +## CI + +The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet. + +## Refresh baselines + +After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it. diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json new file mode 100644 index 0000000..123a2b4 --- /dev/null +++ b/benchmarks/baselines.json @@ -0,0 +1,10 @@ +{ + "_note": "Informational snapshot only — CI does not gate on these values.", + "updated": null, + "machine": null, + "groups": { + "parse": {}, + "export": {}, + "search": {} + } +} diff --git a/pyproject.toml b/pyproject.toml index 5e8f63b..b71fd6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,8 +5,11 @@ packages = ["api", "utils", "models"] exclude = ["tests/"] [tool.pytest.ini_options] -addopts = "--cov=api --cov=utils --cov-report=term-missing --cov-report=xml:coverage.xml" +addopts = "--cov=api --cov=utils --cov-report=term-missing --cov-report=xml:coverage.xml --benchmark-skip" testpaths = ["tests"] +markers = [ + "benchmark: performance benchmarks (pytest-benchmark)", +] [tool.coverage.run] omit = [ diff --git a/requirements-dev.txt b/requirements-dev.txt index 7e83784..e4ef069 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,3 +6,4 @@ pytest-cov>=5.0 ruff>=0.9.0 pip-audit>=2.7.0 hypothesis>=6.100.0 +pytest-benchmark>=4.0.0 diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py new file mode 100644 index 0000000..fd419e8 --- /dev/null +++ b/tests/benchmarks/conftest.py @@ -0,0 +1,79 @@ +"""Synthetic corpora for parse/export/search performance benchmarks.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from app import create_app + +FIXTURES = Path(__file__).resolve().parents[1] / "fixtures" +TEMPLATE_LINE = (FIXTURES / "session_with_tools.jsonl").read_text(encoding="utf-8").splitlines()[0] + + +def write_jsonl(path: Path, line_count: int) -> Path: + """Write a JSONL session file with *line_count* rows derived from the template fixture.""" + with path.open("w", encoding="utf-8") as f: + for i in range(line_count): + entry = json.loads(TEMPLATE_LINE) + entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z" + if i % 3 == 1: + msg = entry.setdefault("message", {}) + if isinstance(msg, dict) and "content" in msg: + msg["content"] = [{"type": "text", "text": f"benchmark token {i} searchable"}] + f.write(json.dumps(entry, separators=(",", ":")) + "\n") + return path + + +def seed_search_corpus( + base_dir: Path, + *, + session_count: int = 50, + lines_per_session: int = 20, +) -> Path: + """Create a multi-session project tree under *base_dir* for search benchmarks.""" + project = base_dir / "bench-project" + project.mkdir(parents=True) + for i in range(session_count): + write_jsonl(project / f"session_{i:04d}.jsonl", lines_per_session) + return base_dir + + +@pytest.fixture(scope="session") +def parse_small_file(tmp_path_factory: pytest.TempPathFactory) -> Path: + root = tmp_path_factory.mktemp("bench") + return write_jsonl(root / "small.jsonl", 10) + + +@pytest.fixture(scope="session") +def parse_medium_file(tmp_path_factory: pytest.TempPathFactory) -> Path: + root = tmp_path_factory.mktemp("bench") + return write_jsonl(root / "medium.jsonl", 500) + + +@pytest.fixture(scope="session") +def parse_large_file(tmp_path_factory: pytest.TempPathFactory) -> Path: + root = tmp_path_factory.mktemp("bench") + return write_jsonl(root / "large.jsonl", 5000) + + +@pytest.fixture +def export_corpus(tmp_path: Path, request: pytest.FixtureRequest) -> Path: + """Project dir with N session files. Parametrize N via indirect fixture.""" + count = request.param + project = tmp_path / "bench-project" + project.mkdir() + for i in range(count): + write_jsonl(project / f"session_{i:04d}.jsonl", 20) + return project + + +@pytest.fixture +def bench_client_search_corpus(tmp_path: Path): + """Flask test client backed by a 50-session synthetic project tree.""" + seed_search_corpus(tmp_path) + app = create_app(base_dir=str(tmp_path)) + app.config["TESTING"] = True + return app.test_client() diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py new file mode 100644 index 0000000..c33bf53 --- /dev/null +++ b/tests/benchmarks/test_export_bench.py @@ -0,0 +1,38 @@ +"""Benchmark run_bulk_export over 10, 50, and 100 session corpora.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from utils.export_engine import NoopSink, run_bulk_export + + +@pytest.mark.benchmark(group="export") +@pytest.mark.parametrize( + "export_corpus", + [10, 50, 100], + indirect=True, + ids=["sessions-10", "sessions-50", "sessions-100"], +) +def test_bulk_export_session_count( + benchmark, + export_corpus: Path, +) -> None: + projects = [{"name": "bench-project", "path": str(export_corpus), "display_name": "Bench"}] + + def _run() -> object: + return run_bulk_export( + projects=projects, + since="all", + rules=[], + last_export_sessions={}, + sink=NoopSink(), + fmt="md", + path_layout="api", + manifest_style="api", + ) + + result = benchmark(_run) + assert result.exported_session_count > 0 diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py new file mode 100644 index 0000000..d400cca --- /dev/null +++ b/tests/benchmarks/test_parse_bench.py @@ -0,0 +1,24 @@ +"""Benchmark parse_session on small, medium, and large JSONL corpora.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from utils.jsonl_parser import parse_session + + +@pytest.mark.benchmark(group="parse") +def test_parse_session_small(benchmark, parse_small_file: Path) -> None: + benchmark(parse_session, str(parse_small_file)) + + +@pytest.mark.benchmark(group="parse") +def test_parse_session_medium(benchmark, parse_medium_file: Path) -> None: + benchmark(parse_session, str(parse_medium_file)) + + +@pytest.mark.benchmark(group="parse") +def test_parse_session_large(benchmark, parse_large_file: Path) -> None: + benchmark(parse_session, str(parse_large_file)) diff --git a/tests/benchmarks/test_parse_memory.py b/tests/benchmarks/test_parse_memory.py new file mode 100644 index 0000000..ad2d1e9 --- /dev/null +++ b/tests/benchmarks/test_parse_memory.py @@ -0,0 +1,23 @@ +"""Peak memory ceiling for large-file parse_session (regular pytest, not benchmark-only).""" + +from __future__ import annotations + +import tracemalloc +from pathlib import Path + +from utils.jsonl_parser import parse_session + + +def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None: + path = parse_large_file + file_bytes = path.stat().st_size + ceiling = file_bytes * 10 + + tracemalloc.start() + try: + parse_session(str(path)) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + + assert peak < ceiling, f"peak {peak} bytes exceeds 10x file size {file_bytes}" diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py new file mode 100644 index 0000000..16b9b87 --- /dev/null +++ b/tests/benchmarks/test_search_bench.py @@ -0,0 +1,18 @@ +"""Benchmark full-corpus search via the HTTP test client.""" + +from __future__ import annotations + +import pytest +from flask.testing import FlaskClient + + +@pytest.mark.benchmark(group="search") +def test_search_full_corpus( + benchmark, + bench_client_search_corpus: FlaskClient, +) -> None: + def _run() -> object: + return bench_client_search_corpus.get("/api/search?q=searchable&limit=50") + + resp = benchmark(_run) + assert resp.status_code == 200 From 68ba23cefc7e914c5beb266ac5eb0e92fd21fb4f Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 12 Jun 2026 00:44:43 +0800 Subject: [PATCH 2/5] chore: pin pytest-benchmark 5.2.3 and clarify json.dumps in bench fixtures Pin pytest-benchmark to 5.2.3 after verifying compatibility with pytest 9.0 and the benchmark suite. Annotate benchmark JSONL serialization to document that json.dumps is intentional for file I/O, not Flask jsonify. --- requirements-dev.txt | 2 +- tests/benchmarks/conftest.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index e4ef069..b54a2d1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,4 +6,4 @@ pytest-cov>=5.0 ruff>=0.9.0 pip-audit>=2.7.0 hypothesis>=6.100.0 -pytest-benchmark>=4.0.0 +pytest-benchmark==5.2.3 diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index fd419e8..6d07f88 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -23,7 +23,11 @@ def write_jsonl(path: Path, line_count: int) -> Path: msg = entry.setdefault("message", {}) if isinstance(msg, dict) and "content" in msg: msg["content"] = [{"type": "text", "text": f"benchmark token {i} searchable"}] - f.write(json.dumps(entry, separators=(",", ":")) + "\n") + # json.dumps for file I/O — jsonify is Flask's HTTP helper, not file serialization. + serialized = ( + json.dumps(entry, separators=(",", ":")) + "\n" # linters-ignore: prefer-jsonify + ) + f.write(serialized) return path From c1d303007d01c1e58338796c12a5ac927d0b7188 Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 12 Jun 2026 03:18:50 +0800 Subject: [PATCH 3/5] refactor(benchmarks): harden corpus helpers in conftest Parse the JSONL template once per write_jsonl call and deepcopy entries in the loop. Use exist_ok=True when creating bench-project in seed_search_corpus. --- tests/benchmarks/conftest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index 6d07f88..cd4369c 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +from copy import deepcopy from pathlib import Path import pytest @@ -15,9 +16,10 @@ def write_jsonl(path: Path, line_count: int) -> Path: """Write a JSONL session file with *line_count* rows derived from the template fixture.""" + template = json.loads(TEMPLATE_LINE) with path.open("w", encoding="utf-8") as f: for i in range(line_count): - entry = json.loads(TEMPLATE_LINE) + entry = deepcopy(template) entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z" if i % 3 == 1: msg = entry.setdefault("message", {}) @@ -39,7 +41,7 @@ def seed_search_corpus( ) -> Path: """Create a multi-session project tree under *base_dir* for search benchmarks.""" project = base_dir / "bench-project" - project.mkdir(parents=True) + project.mkdir(parents=True, exist_ok=True) for i in range(session_count): write_jsonl(project / f"session_{i:04d}.jsonl", lines_per_session) return base_dir From ea3b1ca6e0821c822ebb3cf7e57ad40206c6cc41 Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 12 Jun 2026 05:27:17 +0800 Subject: [PATCH 4/5] fix(benchmarks): harden memory test and ruff test glob Reset tracemalloc peak before measuring large-file parse, assert non-empty message count, extend E402 per-file-ignores to tests/**, and clarify README that benchmark tests live under tests/benchmarks/. --- benchmarks/README.md | 4 +++- pyproject.toml | 2 +- tests/benchmarks/test_parse_memory.py | 4 +++- tests/benchmarks/test_search_bench.py | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index b8d9e05..e62da18 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,5 +1,7 @@ # Performance benchmarks +Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot. + Repeatable local measurements for parse, bulk export, and search hot paths. ## Run locally @@ -12,7 +14,7 @@ pytest tests/benchmarks/ --benchmark-only -o addopts= -v ## Memory check ```bash -pytest tests/benchmarks/test_parse_memory.py -v +pytest tests/benchmarks/test_parse_memory.py -v -o addopts= ``` The memory test also runs as part of the normal `pytest` suite (timing benchmarks are skipped via `--benchmark-skip` in `pyproject.toml`). diff --git a/pyproject.toml b/pyproject.toml index b71fd6f..7203ef0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,4 +34,4 @@ combine-as-imports = true # CLI bootstrap: sys.path must be set before local imports. "scripts/export.py" = ["E402"] # Tests mirror the same path bootstrap before importing app/utils. -"tests/*.py" = ["E402"] +"tests/**/*.py" = ["E402"] diff --git a/tests/benchmarks/test_parse_memory.py b/tests/benchmarks/test_parse_memory.py index ad2d1e9..fcba372 100644 --- a/tests/benchmarks/test_parse_memory.py +++ b/tests/benchmarks/test_parse_memory.py @@ -14,8 +14,10 @@ def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None: ceiling = file_bytes * 10 tracemalloc.start() + tracemalloc.clear_traces() try: - parse_session(str(path)) + result = parse_session(str(path)) + assert len(result["messages"]) > 0, "parse_session returned no messages" _, peak = tracemalloc.get_traced_memory() finally: tracemalloc.stop() diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py index 16b9b87..2d51a68 100644 --- a/tests/benchmarks/test_search_bench.py +++ b/tests/benchmarks/test_search_bench.py @@ -16,3 +16,4 @@ def _run() -> object: resp = benchmark(_run) assert resp.status_code == 200 + assert resp.get_json(), "expected search hits from synthetic searchable tokens" From 800244ac58aa94757d5e9b2cddcb9453e3c25bd3 Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 12 Jun 2026 21:34:44 +0800 Subject: [PATCH 5/5] fix(benchmarks): address PR #76 review feedback from @timon0305 Drop unnecessary actions: write on benchmarks CI job; assert explicit search hit count on list response; document 10x memory ceiling and v1 template limitations in README; note NoopSink export rounds are stateless. --- .github/workflows/ci.yml | 1 - benchmarks/README.md | 4 ++++ tests/benchmarks/test_export_bench.py | 1 + tests/benchmarks/test_parse_memory.py | 3 +++ tests/benchmarks/test_search_bench.py | 3 ++- 5 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e7372ff..f157984 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -210,7 +210,6 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - actions: write steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: diff --git a/benchmarks/README.md b/benchmarks/README.md index e62da18..75ff9cc 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -29,6 +29,10 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git. +Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/export numbers measure steady-state throughput on a narrow schema slice — not full parser branch coverage. Treat as v1 baselines, not exhaustive perf proof. + +The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session. + ## CI The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet. diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py index c33bf53..46c0eaf 100644 --- a/tests/benchmarks/test_export_bench.py +++ b/tests/benchmarks/test_export_bench.py @@ -23,6 +23,7 @@ def test_bulk_export_session_count( projects = [{"name": "bench-project", "path": str(export_corpus), "display_name": "Bench"}] def _run() -> object: + # NoopSink + since="all" + empty last_export_sessions: no disk/state writes per round. return run_bulk_export( projects=projects, since="all", diff --git a/tests/benchmarks/test_parse_memory.py b/tests/benchmarks/test_parse_memory.py index fcba372..de1c886 100644 --- a/tests/benchmarks/test_parse_memory.py +++ b/tests/benchmarks/test_parse_memory.py @@ -11,6 +11,9 @@ def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None: path = parse_large_file file_bytes = path.stat().st_size + # Issue #7 ceiling: Python heap peak (tracemalloc) vs on-disk JSONL size. Parsed + # dict/str objects often exceed raw bytes; 10x is a generous v1 guard — relax with + # a comment here if the parser legitimately grows. ceiling = file_bytes * 10 tracemalloc.start() diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py index 2d51a68..95c5c5b 100644 --- a/tests/benchmarks/test_search_bench.py +++ b/tests/benchmarks/test_search_bench.py @@ -16,4 +16,5 @@ def _run() -> object: resp = benchmark(_run) assert resp.status_code == 200 - assert resp.get_json(), "expected search hits from synthetic searchable tokens" + hits = resp.get_json() + assert isinstance(hits, list) and len(hits) > 0, "expected search hits from synthetic corpus"