diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 545dfa5..f157984 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -204,3 +204,37 @@ jobs: esac npm install --no-save "${PKG}@${ROLLUP_VERSION}" - run: npm test + + benchmarks: + name: Performance benchmarks (informational) + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + persist-credentials: false + + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.12" + cache: pip + cache-dependency-path: | + requirements.txt + requirements-dev.txt + + - name: Install dev dependencies + run: pip install -r requirements-dev.txt + + - name: Run benchmarks + run: > + pytest tests/benchmarks/ + --benchmark-only + --benchmark-json=benchmark-results.json + --benchmark-columns=min,max,mean,stddev,rounds + -o addopts= + + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + with: + name: benchmark-results + path: benchmark-results.json diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 48bc61d..a123b2f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,6 +66,7 @@ pytest tests/test_api_integration.py -v pytest tests/test_search.py -v pytest tests/test_api_routes.py -v pytest tests/test_error_codes.py -v +pytest tests/benchmarks/ --benchmark-only -o addopts= -v # performance baselines (see benchmarks/README.md) ``` ### JavaScript (vitest) diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..75ff9cc --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,42 @@ +# Performance benchmarks + +Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot. + +Repeatable local measurements for parse, bulk export, and search hot paths. 
+ +## Run locally + +```bash +pip install -r requirements-dev.txt +pytest tests/benchmarks/ --benchmark-only -o addopts= -v +``` + +## Memory check + +```bash +pytest tests/benchmarks/test_parse_memory.py -v -o addopts= +``` + +The memory test also runs as part of the normal `pytest` suite (timing benchmarks are skipped via `--benchmark-skip` in `pyproject.toml`). + +## Scenarios + +| Group | What | +|-------|------| +| parse | `parse_session` on 10 / 500 / 5000+ line JSONL | +| export | `run_bulk_export` over 10 / 50 / 100 sessions | +| search | `GET /api/search` over a 50-session synthetic corpus | + +Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git. + +Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/export numbers measure steady-state throughput on a narrow schema slice — not full parser branch coverage. Treat as v1 baselines, not exhaustive perf proof. + +The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session. + +## CI + +The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet. + +## Refresh baselines + +After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it. 
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json new file mode 100644 index 0000000..123a2b4 --- /dev/null +++ b/benchmarks/baselines.json @@ -0,0 +1,10 @@ +{ + "_note": "Informational snapshot only — CI does not gate on these values.", + "updated": null, + "machine": null, + "groups": { + "parse": {}, + "export": {}, + "search": {} + } +} diff --git a/pyproject.toml b/pyproject.toml index 5e8f63b..7203ef0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,8 +5,11 @@ packages = ["api", "utils", "models"] exclude = ["tests/"] [tool.pytest.ini_options] -addopts = "--cov=api --cov=utils --cov-report=term-missing --cov-report=xml:coverage.xml" +addopts = "--cov=api --cov=utils --cov-report=term-missing --cov-report=xml:coverage.xml --benchmark-skip" testpaths = ["tests"] +markers = [ + "benchmark: performance benchmarks (pytest-benchmark)", +] [tool.coverage.run] omit = [ @@ -31,4 +34,4 @@ combine-as-imports = true # CLI bootstrap: sys.path must be set before local imports. "scripts/export.py" = ["E402"] # Tests mirror the same path bootstrap before importing app/utils. 
-"tests/*.py" = ["E402"] +"tests/**/*.py" = ["E402"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 7e83784..b54a2d1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,3 +6,4 @@ pytest-cov>=5.0 ruff>=0.9.0 pip-audit>=2.7.0 hypothesis>=6.100.0 +pytest-benchmark==5.2.3 diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py new file mode 100644 index 0000000..cd4369c --- /dev/null +++ b/tests/benchmarks/conftest.py @@ -0,0 +1,85 @@ +"""Synthetic corpora for parse/export/search performance benchmarks.""" + +from __future__ import annotations + +import json +from copy import deepcopy +from pathlib import Path + +import pytest + +from app import create_app + +FIXTURES = Path(__file__).resolve().parents[1] / "fixtures" +TEMPLATE_LINE = (FIXTURES / "session_with_tools.jsonl").read_text(encoding="utf-8").splitlines()[0] + + +def write_jsonl(path: Path, line_count: int) -> Path: + """Write a JSONL session file with *line_count* rows derived from the template fixture.""" + template = json.loads(TEMPLATE_LINE) + with path.open("w", encoding="utf-8") as f: + for i in range(line_count): + entry = deepcopy(template) + entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z" + if i % 3 == 1: + msg = entry.setdefault("message", {}) + if isinstance(msg, dict) and "content" in msg: + msg["content"] = [{"type": "text", "text": f"benchmark token {i} searchable"}] + # json.dumps for file I/O — jsonify is Flask's HTTP helper, not file serialization. 
def seed_search_corpus(
    base_dir: Path,
    *,
    session_count: int = 50,
    lines_per_session: int = 20,
) -> Path:
    """Populate *base_dir* with the synthetic project used by the search benchmarks.

    A ``bench-project`` directory is created beneath *base_dir* (missing parents
    are created and an existing directory is reused) and filled with
    *session_count* files named ``session_0000.jsonl``, ``session_0001.jsonl``,
    and so on. Each file is produced by ``write_jsonl`` with
    *lines_per_session* rows.

    Returns *base_dir* itself, not the project directory, so the result can be
    handed on as the data root that contains the project.
    """
    project_dir = base_dir.joinpath("bench-project")
    project_dir.mkdir(parents=True, exist_ok=True)

    # Zero-padded names keep the session files in creation order when sorted.
    session_paths = (
        project_dir / f"session_{index:04d}.jsonl" for index in range(session_count)
    )
    for session_path in session_paths:
        write_jsonl(session_path, lines_per_session)

    return base_dir
@pytest.fixture
def bench_client_search_corpus(tmp_path: Path):
    """Return a Flask test client whose data root is a freshly seeded corpus.

    The per-test ``tmp_path`` is filled by ``seed_search_corpus`` using its
    defaults (50 session files of 20 rows each), then passed to ``create_app``
    as ``base_dir``. ``TESTING`` is switched on before the client is created.
    """
    # seed_search_corpus returns the base directory it was given.
    corpus_root = seed_search_corpus(tmp_path)
    flask_app = create_app(base_dir=str(corpus_root))
    flask_app.config.update(TESTING=True)
    return flask_app.test_client()
def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None:
    """Check that parsing the large fixture keeps peak Python-heap use under 10x its size.

    ``parse_large_file`` is the JSONL path supplied by the benchmark conftest.
    The test parses it once with ``parse_session`` while ``tracemalloc`` is
    active and compares the peak traced allocation, measured relative to what
    was already allocated before the parse, against ten times the on-disk size.

    Tracing that was already enabled before the test (``python -X tracemalloc``
    or ``PYTHONTRACEMALLOC``) is left running and its collected traces are kept.
    """
    path = parse_large_file
    file_bytes = path.stat().st_size
    # Issue #7 ceiling: Python heap peak (tracemalloc) vs on-disk JSONL size. Parsed
    # dict/str objects often exceed raw bytes; 10x is a generous v1 guard — relax with
    # a comment here if the parser legitimately grows.
    ceiling = file_bytes * 10

    # Only start/stop tracing when this test owns it; stopping unconditionally
    # would switch tracing off for the rest of the session when it was enabled
    # from outside.
    owns_tracing = not tracemalloc.is_tracing()
    if owns_tracing:
        tracemalloc.start()
    try:
        # reset_peak() (Python 3.9+) restarts peak tracking without discarding
        # existing traces; the baseline removes pre-existing allocations from
        # the measurement.
        tracemalloc.reset_peak()
        baseline, _ = tracemalloc.get_traced_memory()
        result = parse_session(str(path))
        assert len(result["messages"]) > 0, "parse_session returned no messages"
        _, peak_total = tracemalloc.get_traced_memory()
    finally:
        if owns_tracing:
            tracemalloc.stop()

    peak = peak_total - baseline
    assert peak < ceiling, f"peak {peak} bytes exceeds 10x file size {file_bytes}"