cppalliance · wpak-ai · Jun 12, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -204,3 +204,37 @@ jobs:
           esac
           npm install --no-save "${PKG}@${ROLLUP_VERSION}"
       - run: npm test
+
+  # Informational job: runs the pytest-benchmark suite and uploads the JSON report.
+  # No step here compares the results against a baseline.
+  benchmarks:
+    name: Performance benchmarks (informational)
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+        with:
+          persist-credentials: false
+
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
+        with:
+          python-version: "3.12"
+          cache: pip
+          cache-dependency-path: |
+            requirements.txt
+            requirements-dev.txt
+
+      - name: Install dev dependencies
+        run: pip install -r requirements-dev.txt
+
+      # `-o addopts=` blanks the addopts set in pyproject.toml (coverage flags and
+      # --benchmark-skip) so this run can use --benchmark-only instead.
+      - name: Run benchmarks
+        run: >
+          pytest tests/benchmarks/
+          --benchmark-only
+          --benchmark-json=benchmark-results.json
+          --benchmark-columns=min,max,mean,stddev,rounds
+          -o addopts=
+
+      # Publish the JSON written by --benchmark-json as a downloadable artifact.
+      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        with:
+          name: benchmark-results
+          path: benchmark-results.json
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -66,6 +66,7 @@ pytest tests/test_api_integration.py -v
 pytest tests/test_search.py -v
 pytest tests/test_api_routes.py -v
 pytest tests/test_error_codes.py -v
+pytest tests/benchmarks/ --benchmark-only -o addopts= -v   # performance baselines (see benchmarks/README.md)
 ```
 
 ### JavaScript (vitest)

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,42 @@
+# Performance benchmarks
+
+Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot.
+
+The benchmarks give repeatable local measurements of the parse, bulk-export, and search hot paths.
+
+## Run locally
+
+```bash
+pip install -r requirements-dev.txt
+pytest tests/benchmarks/ --benchmark-only -o addopts= -v
+```
+
+## Memory check
+
+```bash
+pytest tests/benchmarks/test_parse_memory.py -v -o addopts=
+```
+
+The memory test also runs as part of the normal `pytest` suite (timing benchmarks are skipped via `--benchmark-skip` in `pyproject.toml`).
+
+## Scenarios
+
+| Group | What |
+|-------|------|
+| parse | `parse_session` on 10 / 500 / 5000-line JSONL |
+| export | `run_bulk_export` over 10 / 50 / 100 sessions |
+| search | `GET /api/search` over a 50-session synthetic corpus |
+
+The large JSONL file (5000 lines) is generated by a session-scoped fixture under pytest's temp directory; it is not committed to git.
+
+Every corpus row is derived from the first line of `tests/fixtures/session_with_tools.jsonl` (only the timestamp and, on every third row, the message text change), so parse/export numbers measure steady-state throughput on a narrow schema slice — not full parser branch coverage. Treat them as v1 baselines, not exhaustive perf proof.
+
+The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session.
+
+## CI
+
+The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet.
+
+## Refresh baselines
+
+After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it.
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
@@ -0,0 +1,10 @@
+{
+  "_note": "Informational snapshot only — CI does not gate on these values.",
+  "updated": null,
+  "machine": null,
+  "groups": {
+    "parse": {},
+    "export": {},
+    "search": {}
+  }
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,8 +5,11 @@ packages = ["api", "utils", "models"]
 exclude = ["tests/"]
 
 [tool.pytest.ini_options]
-addopts = "--cov=api --cov=utils --cov-report=term-missing --cov-report=xml:coverage.xml"
+addopts = "--cov=api --cov=utils --cov-report=term-missing --cov-report=xml:coverage.xml --benchmark-skip"
 testpaths = ["tests"]
+markers = [
+    "benchmark: performance benchmarks (pytest-benchmark)",
+]
 
 [tool.coverage.run]
 omit = [
@@ -31,4 +34,4 @@ combine-as-imports = true
 # CLI bootstrap: sys.path must be set before local imports.
 "scripts/export.py" = ["E402"]
 # Tests mirror the same path bootstrap before importing app/utils.
-"tests/*.py" = ["E402"]
+"tests/**/*.py" = ["E402"]
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -6,3 +6,4 @@ pytest-cov>=5.0
 ruff>=0.9.0
 pip-audit>=2.7.0
 hypothesis>=6.100.0
+pytest-benchmark==5.2.3
diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
@@ -0,0 +1,85 @@
+"""Synthetic corpora for parse/export/search performance benchmarks."""
+
+from __future__ import annotations
+
+import json
+from copy import deepcopy
+from pathlib import Path
+
+import pytest
+
+from app import create_app
+
+# Shared fixture directory: tests/fixtures, one level above this benchmarks package.
+FIXTURES = Path(__file__).resolve().parents[1] / "fixtures"
+# First line of session_with_tools.jsonl, read once at import time; write_jsonl()
+# derives every synthetic row from it.
+TEMPLATE_LINE = (FIXTURES / "session_with_tools.jsonl").read_text(encoding="utf-8").splitlines()[0]
+
+
+def write_jsonl(path: Path, line_count: int) -> Path:
+    """Write *line_count* synthetic JSONL rows to *path* and return *path*.
+
+    Every row is a deep copy of the template fixture row with a rewritten
+    ``timestamp`` (the minute field cycles through 00-59). Rows where
+    ``i % 3 == 1`` and whose ``message`` is a dict with a ``content`` key get a
+    unique text block containing the token ``searchable``, which the search
+    benchmark queries for. A *line_count* of zero or less yields an empty file.
+    """
+    template = json.loads(TEMPLATE_LINE)
+    with path.open("w", encoding="utf-8") as f:
+        for i in range(line_count):
+            entry = deepcopy(template)
+            entry["timestamp"] = f"2026-06-12T10:{i % 60:02d}:00Z"
+            if i % 3 == 1:
+                # Read-only lookup: setdefault() would add an empty "message" object
+                # to rows whose template has none, silently changing the row shape.
+                msg = entry.get("message")
+                if isinstance(msg, dict) and "content" in msg:
+                    msg["content"] = [{"type": "text", "text": f"benchmark token {i} searchable"}]
+            # json.dumps for file I/O — jsonify is Flask's HTTP helper, not file serialization.
+            serialized = (
+                json.dumps(entry, separators=(",", ":")) + "\n"  # linters-ignore: prefer-jsonify
+            )
+            f.write(serialized)
+    return path
+
+
+def seed_search_corpus(
+    base_dir: Path,
+    *,
+    session_count: int = 50,
+    lines_per_session: int = 20,
+) -> Path:
+    """Populate ``base_dir/bench-project`` with synthetic session files and return *base_dir*."""
+    project_dir = base_dir.joinpath("bench-project")
+    project_dir.mkdir(parents=True, exist_ok=True)
+    # One file per session, named session_0000.jsonl, session_0001.jsonl, ...
+    session_paths = (project_dir / f"session_{n:04d}.jsonl" for n in range(session_count))
+    for session_path in session_paths:
+        write_jsonl(session_path, lines_per_session)
+    return base_dir
+
+
+@pytest.fixture(scope="session")
+def parse_small_file(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """Session-scoped 10-line JSONL file used by the small parse benchmark."""
+    target = tmp_path_factory.mktemp("bench") / "small.jsonl"
+    return write_jsonl(target, 10)
+
+
+@pytest.fixture(scope="session")
+def parse_medium_file(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """Session-scoped 500-line JSONL file used by the medium parse benchmark."""
+    target = tmp_path_factory.mktemp("bench") / "medium.jsonl"
+    return write_jsonl(target, 500)
+
+
+@pytest.fixture(scope="session")
+def parse_large_file(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """Session-scoped 5000-line JSONL file shared by the large parse benchmark and memory test."""
+    target = tmp_path_factory.mktemp("bench") / "large.jsonl"
+    return write_jsonl(target, 5000)
+
+
+@pytest.fixture
+def export_corpus(tmp_path: Path, request: pytest.FixtureRequest) -> Path:
+    """Build ``tmp_path/bench-project`` holding ``request.param`` 20-line sessions.
+
+    The session count is supplied through indirect parametrization of this fixture.
+    """
+    session_names = [f"session_{n:04d}.jsonl" for n in range(request.param)]
+    project_dir = tmp_path / "bench-project"
+    project_dir.mkdir()
+    for session_name in session_names:
+        write_jsonl(project_dir / session_name, 20)
+    return project_dir
+
+
+@pytest.fixture
+def bench_client_search_corpus(tmp_path: Path):
+    """Return a Flask test client whose base dir holds the default 50-session corpus."""
+    # seed_search_corpus() hands back the base directory it was given.
+    corpus_root = seed_search_corpus(tmp_path)
+    flask_app = create_app(base_dir=str(corpus_root))
+    flask_app.config["TESTING"] = True
+    return flask_app.test_client()
diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py
@@ -0,0 +1,39 @@
+"""Benchmark run_bulk_export over 10, 50, and 100 session corpora."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from utils.export_engine import NoopSink, run_bulk_export
+
+
+@pytest.mark.benchmark(group="export")
+@pytest.mark.parametrize(
+    "export_corpus",
+    [10, 50, 100],
+    indirect=True,
+    ids=["sessions-10", "sessions-50", "sessions-100"],
+)
+def test_bulk_export_session_count(
+    benchmark,
+    export_corpus: Path,
+) -> None:
+    """Time ``run_bulk_export`` over the parametrized corpus and check that sessions were exported."""
+    bench_project = {
+        "name": "bench-project",
+        "path": str(export_corpus),
+        "display_name": "Bench",
+    }
+    projects = [bench_project]
+
+    def _export_once() -> object:
+        # NoopSink + since="all" + empty last_export_sessions: no disk/state writes per round.
+        export_kwargs = {
+            "projects": projects,
+            "since": "all",
+            "rules": [],
+            "last_export_sessions": {},
+            "sink": NoopSink(),
+            "fmt": "md",
+            "path_layout": "api",
+            "manifest_style": "api",
+        }
+        return run_bulk_export(**export_kwargs)
+
+    outcome = benchmark(_export_once)
+    assert outcome.exported_session_count > 0
diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py
@@ -0,0 +1,24 @@
+"""Benchmark parse_session on small, medium, and large JSONL corpora."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from utils.jsonl_parser import parse_session
+
+
+@pytest.mark.benchmark(group="parse")
+def test_parse_session_small(benchmark, parse_small_file: Path) -> None:
+    """Time ``parse_session`` on the 10-line fixture file."""
+    jsonl_path = str(parse_small_file)
+    benchmark(parse_session, jsonl_path)
+
+
+@pytest.mark.benchmark(group="parse")
+def test_parse_session_medium(benchmark, parse_medium_file: Path) -> None:
+    """Time ``parse_session`` on the 500-line fixture file."""
+    jsonl_path = str(parse_medium_file)
+    benchmark(parse_session, jsonl_path)
+
+
+@pytest.mark.benchmark(group="parse")
+def test_parse_session_large(benchmark, parse_large_file: Path) -> None:
+    """Time ``parse_session`` on the 5000-line fixture file."""
+    jsonl_path = str(parse_large_file)
+    benchmark(parse_session, jsonl_path)
diff --git a/tests/benchmarks/test_parse_memory.py b/tests/benchmarks/test_parse_memory.py
@@ -0,0 +1,28 @@
+"""Peak memory ceiling for large-file parse_session (regular pytest, not benchmark-only)."""
+
+from __future__ import annotations
+
+import tracemalloc
+from pathlib import Path
+
+from utils.jsonl_parser import parse_session
+
+
+def test_large_parse_peak_memory_under_ceiling(parse_large_file: Path) -> None:
+    """Check that parsing the large fixture keeps traced heap growth under 10x its size.
+
+    tracemalloc state is process-global, so tracing that was already active
+    before this test (for example via ``-X tracemalloc``) is left running
+    afterwards rather than being stopped, and its existing traces are kept.
+    """
+    path = parse_large_file
+    file_bytes = path.stat().st_size
+    # Issue #7 ceiling: Python heap peak (tracemalloc) vs on-disk JSONL size. Parsed
+    # dict/str objects often exceed raw bytes; 10x is a generous v1 guard — relax with
+    # a comment here if the parser legitimately grows.
+    ceiling = file_bytes * 10
+
+    already_tracing = tracemalloc.is_tracing()
+    if not already_tracing:
+        tracemalloc.start()
+    # Measure growth relative to what is traced right now, so allocations made
+    # before the parse (when tracing was already on) do not count against it.
+    tracemalloc.reset_peak()
+    baseline, _ = tracemalloc.get_traced_memory()
+    try:
+        result = parse_session(str(path))
+        assert len(result["messages"]) > 0, "parse_session returned no messages"
+        _, peak = tracemalloc.get_traced_memory()
+    finally:
+        # Only stop the tracer if this test was the one that started it.
+        if not already_tracing:
+            tracemalloc.stop()
+
+    peak -= baseline
+    assert peak < ceiling, f"peak {peak} bytes exceeds 10x file size {file_bytes}"
diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py
@@ -0,0 +1,20 @@
+"""Benchmark full-corpus search via the HTTP test client."""
+
+from __future__ import annotations
+
+import pytest
+from flask.testing import FlaskClient
+
+
+@pytest.mark.benchmark(group="search")
+def test_search_full_corpus(
+    benchmark,
+    bench_client_search_corpus: FlaskClient,
+) -> None:
+    """Time ``GET /api/search`` for the token ``searchable`` and check that hits come back."""
+    search_url = "/api/search?q=searchable&limit=50"
+
+    def _query() -> object:
+        return bench_client_search_corpus.get(search_url)
+
+    response = benchmark(_query)
+    assert response.status_code == 200
+    payload = response.get_json()
+    assert isinstance(payload, list) and len(payload) > 0, "expected search hits from synthetic corpus"