From 672c66886f3acf98ce9ccc7f9bc3a53d664d83df Mon Sep 17 00:00:00 2001 From: Vedant Khandelwal Date: Tue, 9 Jun 2026 13:40:30 -0400 Subject: [PATCH] Add public AssetOpsBench competition starter kit Signed-off-by: Vedant Khandelwal --- .gitignore | 4 + competition/__init__.py | 2 + competition/dataset_utils.py | 133 +++++++ competition/eval_framework.py | 344 ++++++++++++++++++ competition/examples/baseline_predictor.py | 14 + competition/examples/public_scenarios.jsonl | 1 + competition/metadata_config_phase1.json | 20 + competition/metadata_config_phase2.json | 20 + competition/readme.md | 75 ++++ competition/run.py | 53 +++ competition/tests/test_competition_starter.py | 114 ++++++ 11 files changed, 780 insertions(+) create mode 100644 competition/__init__.py create mode 100644 competition/dataset_utils.py create mode 100644 competition/eval_framework.py create mode 100644 competition/examples/baseline_predictor.py create mode 100644 competition/examples/public_scenarios.jsonl create mode 100644 competition/metadata_config_phase1.json create mode 100644 competition/metadata_config_phase2.json create mode 100644 competition/run.py create mode 100644 competition/tests/test_competition_starter.py diff --git a/.gitignore b/.gitignore index a775beb4f..98a322fa4 100644 --- a/.gitignore +++ b/.gitignore @@ -204,3 +204,7 @@ src/tmp/ # Observability artifacts (OTLP-JSON traces + per-run trajectory JSON). traces/ + +# Public competition submission artifacts. 
@dataclass(frozen=True)
class AssetOpsScenario:
    """Immutable record describing a single public competition scenario."""

    # Scenario identifier, kept as a string.
    id: str
    # Question / prompt text shown to the predictor.
    text: str
    # Optional extra public fields attached to the scenario.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a flat mapping: ``id`` and ``text`` first, then metadata.

        Metadata entries are merged last, so a metadata key named ``id`` or
        ``text`` overrides the corresponding attribute value.
        """
        flattened: dict[str, Any] = {"id": self.id, "text": self.text}
        flattened.update(self.metadata)
        return flattened
def load_public_scenarios(
    path: str | Path,
    *,
    allow_private_fields: bool = False,
) -> list[AssetOpsScenario]:
    """Load scenarios from a public dataset file.

    Every record must be a JSON object. The scenario id is read from ``id``
    (falling back to ``scenario_id``) and the question text from ``text``
    (falling back to ``question``, then ``prompt``). A fallback key is also
    used when the preferred key is present but null/empty. Only keys listed
    in ``PUBLIC_EXTRA_FIELDS`` are kept as metadata, in the order in which
    they appear in the record.

    By default this raises if any record includes ground-truth-like fields.
    Set ``allow_private_fields=True`` only for local organizer-side conversion
    scripts, never for a public Kaggle data artifact.

    Args:
        path: JSON or JSONL file readable by ``read_json_records``.
        allow_private_fields: Skip the private-field guard when true.

    Returns:
        One ``AssetOpsScenario`` per record, in file order.

    Raises:
        ValueError: If a record is not an object, carries a private field
            (while the guard is active), or lacks an id or a text.
    """

    scenarios: list[AssetOpsScenario] = []
    for index, raw in enumerate(read_json_records(path)):
        if not isinstance(raw, dict):
            raise ValueError(f"Record {index} must be an object, got {type(raw).__name__}")

        private = sorted(PRIVATE_FIELD_NAMES.intersection(raw))
        if private and not allow_private_fields:
            joined = ", ".join(private)
            raise ValueError(
                f"Record {index} contains private evaluation field(s): {joined}. "
                "Remove ground truth before publishing or submitting."
            )

        # ``dict.get(key, default)`` only falls back when the key is absent,
        # so scan the aliases explicitly to also cover null/empty values.
        scenario_id = next(
            (raw[key] for key in ("id", "scenario_id") if raw.get(key) is not None),
            None,
        )
        text = next(
            (raw[key] for key in ("text", "question", "prompt") if raw.get(key)),
            None,
        )
        if scenario_id is None:
            raise ValueError(f"Record {index} is missing required field 'id'.")
        if not text:
            raise ValueError(f"Record {index} is missing required field 'text'.")

        # Walk the record rather than the set of allowed names: set iteration
        # order varies between processes (string hash randomization), which
        # made the metadata key order, and anything serialized from it,
        # unstable from run to run.
        metadata = {key: value for key, value in raw.items() if key in PUBLIC_EXTRA_FIELDS}
        scenarios.append(
            AssetOpsScenario(id=str(scenario_id), text=str(text), metadata=metadata)
        )

    return scenarios
def _clean_cell(value: Any, fallback: str = "NOTAVALUE") -> str:
    """Turn ``value`` into a trimmed single CSV cell string.

    Carriage returns are replaced by spaces and surrounding whitespace is
    stripped. ``fallback`` is returned when ``value`` is ``None`` or when
    nothing is left after cleaning.
    """
    cleaned = "" if value is None else str(value).replace("\r", " ").strip()
    return cleaned or fallback
def _load_module_from_file(path: Path):
    """Import the Python source file at ``path`` and return the module.

    The module is registered in ``sys.modules`` under the file's stem before
    its body is executed, so it can be looked up by name while it runs. If
    executing the file raises, that registration is rolled back: a previously
    registered module of the same name is restored, otherwise the entry is
    removed. This keeps a failed load from leaving a half-initialised module
    behind or permanently replacing an unrelated module that shares the stem.

    Args:
        path: Location of the ``.py`` file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``path``.
        Exception: Whatever the module body raises is propagated unchanged.
    """
    module_name = path.stem
    spec = importlib.util.spec_from_file_location(module_name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Could not load module from {path}")
    module = importlib.util.module_from_spec(spec)

    # Sentinel distinguishes "no previous entry" from an entry that is None.
    absent = object()
    previous = sys.modules.get(module_name, absent)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        if previous is absent:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
class CompetitionKit:
    """Small public starter-kit class for generating submissions.

    The kit reads an optional JSON config, runs a predictor over the public
    scenarios and packages the predictions CSV together with
    ``meta_data.json`` into a zip archive. No scoring happens here.
    """

    def __init__(self, config_path: str | None = None):
        """Load the optional config and make sure the output directory exists.

        Args:
            config_path: Path to a JSON config file; when omitted an empty
                config is used and output goes to ``competition_results``.
        """
        self.config_path = config_path
        self.config = load_config_file(config_path) if config_path else {}
        self.output_dir = Path(self.config.get("output_dir", "competition_results"))
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.dataset_config = self.config.get("dataset", {})
        self.metadata_config = self.config.get("metadata", {})

    def list_datasets(self) -> None:
        """Print the configured dataset name and description."""
        name = self.dataset_config.get("dataset_name", "assetopsbench")
        description = self.dataset_config.get("description", "")
        print(f"{name}: {description}")

    def run_predictions(
        self,
        predictor: PredictionFunc,
        *,
        subset_size: int | None = None,
        dataset_path: str | None = None,
    ) -> SubmissionResult:
        """Run ``predictor`` on every scenario and collect normalized rows.

        Args:
            predictor: Callable invoked once per scenario.
            subset_size: When a positive integer, only the first
                ``subset_size`` scenarios are processed.
            dataset_path: Overrides ``dataset.dataset_path`` from the config.

        Returns:
            A ``SubmissionResult`` with one row per processed scenario. A
            predictor exception does not abort the run; it is logged and
            recorded as an ``"Error occurred"`` row.

        Raises:
            ValueError: If no dataset path is configured or passed.
        """
        dataset_path = dataset_path or self.dataset_config.get("dataset_path")
        if not dataset_path:
            raise ValueError("Dataset path is required. Set dataset.dataset_path or pass --dataset-path.")

        dataset_name = self.dataset_config.get("dataset_name", Path(dataset_path).stem)
        scenarios = load_public_scenarios(dataset_path)
        if subset_size is not None and subset_size > 0:
            scenarios = scenarios[:subset_size]

        predictions: list[dict[str, str]] = []
        total = len(scenarios)
        for index, scenario in enumerate(scenarios, start=1):
            logger.info("Predicting %s/%s: %s", index, total, scenario.id)
            try:
                normalized = _normalize_prediction(predictor(scenario))
            except Exception as exc:
                logger.exception("Predictor failed for scenario %s", scenario.id)
                # Clean the error text like every other cell; an exception
                # with an empty message still yields a non-empty reasoning.
                normalized = {
                    "prediction": "Error occurred",
                    "reasoning": _clean_cell(
                        str(exc) or type(exc).__name__, "No reasoning provided"
                    ),
                    "trajectory": "No trajectory provided",
                }
            predictions.append({"id": scenario.id, **normalized})

        return SubmissionResult(dataset_name=dataset_name, predictions=predictions)

    def save_submission(
        self,
        result: SubmissionResult,
        *,
        filename: str = "submission.csv",
        metadata: dict[str, Any] | None = None,
    ) -> Path:
        """Write the CSV and ``meta_data.json``, then zip both.

        Args:
            result: Predictions to write.
            filename: Name of the CSV inside the output directory; it is also
                the member name used inside the archive.
            metadata: Optional overrides merged over the config metadata.

        Returns:
            Path of the created zip archive.

        Raises:
            ValueError: If a required metadata field is missing or empty.
        """
        metadata = self.get_metadata(result.dataset_name, metadata)
        self._validate_metadata(metadata)

        csv_path = self.output_dir / filename
        # ``filename`` may contain sub-directories, and ``output_dir`` may
        # have been reassigned after construction.
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        with csv_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=["id", "prediction", "reasoning", "trajectory"],
                quoting=csv.QUOTE_ALL,
            )
            writer.writeheader()
            writer.writerows(result.predictions)

        metadata_path = self.output_dir / "meta_data.json"
        metadata_path.write_text(
            json.dumps({"meta_data": metadata}, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

        # Swap the suffix instead of string-replacing ".csv": a name without
        # a lowercase ".csv" used to map onto itself, so the archive was
        # opened over the CSV it was about to pack.
        csv_name = Path(filename)
        zip_name = csv_name.with_suffix(".zip")
        if zip_name == csv_name:
            zip_name = Path(f"{filename}.zip")
        zip_path = self.output_dir / zip_name
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(csv_path, filename)
            zipf.write(metadata_path, "meta_data.json")

        return zip_path

    def get_metadata(
        self,
        dataset_name: str,
        fallback_metadata: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Build submission metadata from defaults, config and overrides.

        Later sources win: built-in defaults, then the config's ``metadata``
        section, then the non-``None`` entries of ``fallback_metadata``.
        """
        metadata = {
            "model_name": "unknown",
            "track": "agentic_reasoning",
            "base_model_type": "API",
            "base_model_name": "unknown",
            "dataset": dataset_name,
            "additional_info": "",
        }
        metadata.update(self.metadata_config)
        if fallback_metadata:
            metadata.update({k: v for k, v in fallback_metadata.items() if v is not None})
        return metadata

    @staticmethod
    def _validate_metadata(metadata: dict[str, Any]) -> None:
        """Raise ``ValueError`` when a required metadata field is missing or empty."""
        missing = [name for name in REQUIRED_METADATA_FIELDS if not metadata.get(name)]
        if missing:
            raise ValueError(f"Missing required metadata field(s): {', '.join(missing)}")
Use {question_json} for the prompt.", + ) + parser.add_argument("--output-dir", type=str, default=None) + parser.add_argument("--output-file", type=str, default="submission.csv") + parser.add_argument("--subset-size", type=int, default=None) + parser.add_argument("--model-name", type=str) + parser.add_argument("--track", type=str, choices=["internal_reasoning", "agentic_reasoning"]) + parser.add_argument("--base-model-type", type=str, choices=["API", "OpenWeighted", "Hybrid"]) + parser.add_argument("--base-model-name", type=str) + parser.add_argument("--dataset", type=str) + parser.add_argument("--additional-info", type=str) + parser.add_argument("-v", "--verbose", action="store_true") + return parser + + +def load_config_file(config_path: str | None) -> dict[str, Any]: + if not config_path: + return {} + path = Path(config_path) + if not path.exists(): + raise FileNotFoundError(f"Configuration file not found: {path}") + return json.loads(path.read_text(encoding="utf-8")) + + +def load_and_merge_config(args: argparse.Namespace) -> argparse.Namespace: + config = load_config_file(args.config) if args.config else {} + + if args.output_dir is None and config.get("output_dir"): + args.output_dir = config["output_dir"] + if args.output_file == "submission.csv" and config.get("output_file"): + args.output_file = config["output_file"] + + predictor = config.get("predictor", {}) + if args.predictor is None and predictor.get("path"): + args.predictor = predictor["path"] + if args.agent_command is None and predictor.get("agent_command"): + args.agent_command = predictor["agent_command"] + + dataset = config.get("dataset", {}) + if args.dataset_path is None and dataset.get("dataset_path"): + args.dataset_path = dataset["dataset_path"] + if args.dataset is None and dataset.get("dataset_name"): + args.dataset = dataset["dataset_name"] + + metadata = config.get("metadata", {}) + for field in ( + "model_name", + "track", + "base_model_type", + "base_model_name", + 
def metadata_from_args(args: argparse.Namespace) -> dict[str, Any]:
    """Collect the metadata-related attributes of ``args`` into a dict.

    Values are copied as-is, so attributes that are ``None`` stay ``None``.
    """
    metadata_keys = (
        "model_name",
        "track",
        "base_model_type",
        "base_model_name",
        "dataset",
        "additional_info",
    )
    return {key: getattr(args, key) for key in metadata_keys}
+ ), + "reasoning": "Bundled baseline predictor.", + "trajectory": [], + } diff --git a/competition/examples/public_scenarios.jsonl b/competition/examples/public_scenarios.jsonl new file mode 100644 index 000000000..6c119edc3 --- /dev/null +++ b/competition/examples/public_scenarios.jsonl @@ -0,0 +1 @@ +{"id":"301","text":"What vibration analysis capabilities are available?","type":"Vibration","category":"Knowledge Query"} diff --git a/competition/metadata_config_phase1.json b/competition/metadata_config_phase1.json new file mode 100644 index 000000000..21dfd5efb --- /dev/null +++ b/competition/metadata_config_phase1.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "model_name": "assetopsbench-agent", + "track": "agentic_reasoning", + "base_model_type": "API", + "base_model_name": "assetopsbench-agent-base", + "dataset": "assetopsbench_phase1", + "additional_info": "Submission generated with the AssetOpsBench public competition starter kit." + }, + "dataset": { + "dataset_name": "assetopsbench_phase1", + "dataset_path": "competition/examples/public_scenarios.jsonl", + "description": "AssetOpsBench phase 1 public competition scenarios." + }, + "predictor": { + "path": "competition/examples/baseline_predictor.py:predict" + }, + "output_dir": "competition_results", + "output_file": "submission.csv" +} diff --git a/competition/metadata_config_phase2.json b/competition/metadata_config_phase2.json new file mode 100644 index 000000000..04c53f38d --- /dev/null +++ b/competition/metadata_config_phase2.json @@ -0,0 +1,20 @@ +{ + "metadata": { + "model_name": "assetopsbench-agent", + "track": "agentic_reasoning", + "base_model_type": "API", + "base_model_name": "assetopsbench-agent-base", + "dataset": "assetopsbench_phase2", + "additional_info": "Final phase submission generated with the AssetOpsBench public competition starter kit." 
+ }, + "dataset": { + "dataset_name": "assetopsbench_phase2", + "dataset_path": "competition/examples/public_scenarios.jsonl", + "description": "AssetOpsBench final private phase 2 competition scenarios." + }, + "predictor": { + "path": "competition/examples/baseline_predictor.py:predict" + }, + "output_dir": "competition_results", + "output_file": "submission.csv" +} diff --git a/competition/readme.md b/competition/readme.md index e69de29bb..984cd96f7 100644 --- a/competition/readme.md +++ b/competition/readme.md @@ -0,0 +1,75 @@ +# AssetOpsBench Competition Starter Kit + +This folder is the public submission kit for AssetOpsBench competitions. It is +designed for AssetOpsBench agent scenarios: + +```text +competition/ +├── run.py # command-line submission generator +├── eval_framework.py # public packaging framework, no scoring +├── dataset_utils.py # public dataset loader and ground-truth guard +├── metadata_config_phase1.json # editable phase 1 config +├── metadata_config_phase2.json # editable final phase 2 config +└── examples/baseline_predictor.py +``` + +## Public data rule + +Do not publish or upload ground truth. Public Kaggle data should include only +fields such as: + +```json +{"id": "301", "text": "What vibration analysis capabilities are available?", "type": "Vibration"} +``` + +The runner rejects records containing private evaluation fields such as +`expected_answer`, `correct_answer`, `answer`, `ground_truth`, +`characteristic_form`, or `scoring_method`. + +## Quick start + +Set `dataset.dataset_path` in `metadata_config_phase1.json` to the public +Kaggle JSONL file, and set `predictor.path` to the submission predictor: + +```bash +python competition/run.py --config competition/metadata_config_phase1.json +``` + +The predictor is a `module:function` reference. The function receives an +`AssetOpsScenario` with `id`, `text`, and optional metadata. 
It can return a +string, or a dictionary: + +```python +def predict(scenario): + return { + "prediction": "final answer text", + "reasoning": "short optional rationale", + "trajectory": [{"tool": "get_assets", "status": "ok"}], + } +``` + +The generated zip contains: + +- `submission.csv` with `id`, `prediction`, `reasoning`, and `trajectory` +- `meta_data.json` with model and track metadata + +## Existing agent command + +You can also wrap an existing CLI instead of a Python function: + +```bash +python competition/run.py \ + --config competition/metadata_config_phase1.json \ + --agent-command 'uv run plan-execute --json {question_json}' +``` + +If the command prints JSON with an `answer` or `prediction` field, that value +is used. Otherwise stdout is used as the prediction. + +## Final phase + +One week before the competition ends, organizers should release the final phase +2 public dataset with the same public schema and no ground truth. Participants +should switch to `metadata_config_phase2.json`, generate the zip, and submit it +before the deadline. Final scoring can then be run offline by organizers against +the private ground-truth scenarios. 
def main() -> int:
    """Generate a submission package from command-line arguments.

    Parses the CLI (merged with the optional config file), runs the selected
    predictor over the dataset, writes the submission zip and prints a short
    summary. Returns ``0`` on success.
    """
    cli_args = load_and_merge_config(create_metadata_parser().parse_args())

    log_level = logging.INFO if cli_args.verbose else logging.WARNING
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    kit = CompetitionKit(config_path=cli_args.config)
    if cli_args.output_dir:
        # Rebuild the path with the same class the kit already uses.
        path_type = type(kit.output_dir)
        kit.output_dir = path_type(cli_args.output_dir)
        kit.output_dir.mkdir(parents=True, exist_ok=True)

    predictor = build_predictor_from_args(cli_args)
    print("AssetOpsBench Competition - Submission Generation")
    kit.list_datasets()

    result = kit.run_predictions(
        predictor,
        subset_size=cli_args.subset_size,
        dataset_path=cli_args.dataset_path,
    )
    package_path = kit.save_submission(
        result,
        filename=cli_args.output_file,
        metadata=metadata_from_args(cli_args),
    )

    processed = len(result.predictions)
    print(f"Processed scenarios: {processed}")
    print(f"Submission package: {package_path}")
    return 0
def test_load_public_scenarios_rejects_private_fields(tmp_path: Path):
    """A record that carries a private rubric field must be refused."""
    record = {
        "id": 1,
        "text": "Q",
        "characteristic_form": "private rubric",
    }
    dataset = tmp_path / "private.jsonl"
    dataset.write_text(json.dumps(record) + "\n", encoding="utf-8")

    with pytest.raises(ValueError, match="private evaluation field"):
        load_public_scenarios(dataset)
zf: + assert sorted(zf.namelist()) == ["meta_data.json", "submission.csv"] + with zf.open("submission.csv") as f: + rows = list(csv.DictReader(line.decode("utf-8") for line in f)) + metadata = json.loads(zf.read("meta_data.json").decode("utf-8")) + + assert rows == [ + { + "id": "301", + "prediction": "answer for 301", + "reasoning": '{"used": "unit"}', + "trajectory": '[{"tool": "none"}]', + } + ] + assert metadata["meta_data"]["dataset"] == "unit_dataset"