From 672c66886f3acf98ce9ccc7f9bc3a53d664d83df Mon Sep 17 00:00:00 2001
From: Vedant Khandelwal <kh.vedant02@gmail.com>
Date: Tue, 9 Jun 2026 13:40:30 -0400
Subject: [PATCH] Add public AssetOpsBench competition starter kit

Signed-off-by: Vedant Khandelwal <kh.vedant02@gmail.com>
---
 .gitignore                                    |   4 +
 competition/__init__.py                       |   2 +
 competition/dataset_utils.py                  | 133 +++++++
 competition/eval_framework.py                 | 344 ++++++++++++++++++
 competition/examples/baseline_predictor.py    |  14 +
 competition/examples/public_scenarios.jsonl   |   1 +
 competition/metadata_config_phase1.json       |  20 +
 competition/metadata_config_phase2.json       |  20 +
 competition/readme.md                         |  75 ++++
 competition/run.py                            |  53 +++
 competition/tests/test_competition_starter.py | 114 ++++++
 11 files changed, 780 insertions(+)
 create mode 100644 competition/__init__.py
 create mode 100644 competition/dataset_utils.py
 create mode 100644 competition/eval_framework.py
 create mode 100644 competition/examples/baseline_predictor.py
 create mode 100644 competition/examples/public_scenarios.jsonl
 create mode 100644 competition/metadata_config_phase1.json
 create mode 100644 competition/metadata_config_phase2.json
 create mode 100644 competition/run.py
 create mode 100644 competition/tests/test_competition_starter.py

diff --git a/.gitignore b/.gitignore
index a775beb4f..98a322fa4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -204,3 +204,7 @@ src/tmp/
 
 # Observability artifacts (OTLP-JSON traces + per-run trajectory JSON).
 traces/
+
+# Public competition submission artifacts.
+competition_results/
+competition/competition_results/
diff --git a/competition/__init__.py b/competition/__init__.py
new file mode 100644
index 000000000..bd9b57365
--- /dev/null
+++ b/competition/__init__.py
@@ -0,0 +1,2 @@
+"""AssetOpsBench public competition starter kit."""
+
diff --git a/competition/dataset_utils.py b/competition/dataset_utils.py
new file mode 100644
index 000000000..e77fa1ac2
--- /dev/null
+++ b/competition/dataset_utils.py
@@ -0,0 +1,133 @@
+"""Public dataset loading helpers for AssetOpsBench competition submissions.
+
+The public competition dataset must not contain ground truth or rubric fields.
+These helpers intentionally reject private/evaluation fields by default.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Iterable
+
+
+PRIVATE_FIELD_NAMES = {  # top-level record keys treated as ground truth / rubric data
+    "answer",
+    "answers",
+    "correct_answer",
+    "expected_answer",
+    "ground_truth",
+    "label",
+    "labels",
+    "reference_answer",
+    "rubric",
+    "scoring_method",
+    "target",
+    "characteristic_form",
+}
+
+PUBLIC_EXTRA_FIELDS = {  # optional record keys copied into AssetOpsScenario.metadata
+    "type",
+    "category",
+    "asset_class",
+    "domain",
+    "phase",
+    "difficulty",
+}
+
+
+@dataclass(frozen=True)
+class AssetOpsScenario:
+    """One public AssetOpsBench competition scenario (immutable)."""
+
+    id: str  # scenario identifier; the loader always stores it as a string
+    text: str  # prompt text handed to the predictor
+    metadata: dict[str, Any] = field(default_factory=dict)  # optional public extras
+
+    def to_dict(self) -> dict[str, Any]:
+        return {"id": self.id, "text": self.text, **self.metadata}  # metadata is merged last
+
+
+def read_json_records(path: str | Path) -> list[dict[str, Any]]:
+    """Read records from a JSON list, a JSON object, or a JSONL file.
+
+    A dict with a list under "data" yields that list; empty files yield ``[]``.
+    """
+    p = Path(path)
+    # utf-8-sig drops a leading BOM, which json.loads would otherwise reject.
+    text = p.read_text(encoding="utf-8-sig").strip()
+    if not text:
+        return []
+    if p.suffix.lower() == ".jsonl":
+        # Not splitlines(): it also breaks on U+2028/U+2029, legal inside JSON strings.
+        return [json.loads(line) for line in text.split("\n") if line.strip()]
+    raw = json.loads(text)
+    if isinstance(raw, list):
+        return raw
+    if isinstance(raw, dict):
+        return raw["data"] if isinstance(raw.get("data"), list) else [raw]
+    raise ValueError(f"Unsupported JSON shape in {p}: {type(raw).__name__}")
+
+
+def load_public_scenarios(
+    path: str | Path,
+    *,
+    allow_private_fields: bool = False,
+) -> list[AssetOpsScenario]:
+    """Parse a public dataset file into ``AssetOpsScenario`` objects.
+
+    Each record needs an id (``id`` or ``scenario_id``) and non-empty text
+    (``text``, ``question`` or ``prompt``); only keys listed in
+    ``PUBLIC_EXTRA_FIELDS`` are kept as metadata. A ``ValueError`` is raised
+    for non-object records, missing fields, or, unless
+    ``allow_private_fields`` is true, any key in ``PRIVATE_FIELD_NAMES``.
+    """
+
+    loaded: list[AssetOpsScenario] = []
+    for index, record in enumerate(read_json_records(path)):
+        if not isinstance(record, dict):
+            kind = type(record).__name__
+            raise ValueError(f"Record {index} must be an object, got {kind}")
+
+        leaked = sorted(PRIVATE_FIELD_NAMES.intersection(record))
+        if leaked and not allow_private_fields:
+            raise ValueError(
+                f"Record {index} contains private evaluation field(s): "
+                f"{', '.join(leaked)}. "
+                "Remove ground truth before publishing or submitting."
+            )
+
+        ident = record.get("id", record.get("scenario_id"))
+        if ident is None:
+            raise ValueError(f"Record {index} is missing required field 'id'.")
+        prompt = record.get("text", record.get("question", record.get("prompt")))
+        if not prompt:
+            raise ValueError(f"Record {index} is missing required field 'text'.")
+
+        extras = {key: record[key] for key in PUBLIC_EXTRA_FIELDS if key in record}
+        loaded.append(AssetOpsScenario(id=str(ident), text=str(prompt), metadata=extras))
+    return loaded
+
+
+def strip_private_fields(records: Iterable[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Copy each record without the keys listed in ``PRIVATE_FIELD_NAMES``."""
+
+    # Shallow, top-level filter: nested values are shared with the input records.
+    return [
+        {key: value for key, value in record.items() if key not in PRIVATE_FIELD_NAMES}
+        for record in records
+    ]
+
+
+def write_public_dataset(source_path: str | Path, output_path: str | Path) -> Path:
+    """Strip private fields from ``source_path`` and write the rest as JSONL."""
+
+    public = strip_private_fields(read_json_records(source_path))
+    destination = Path(output_path)
+    # Create missing parent directories so callers can target fresh folders.
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    with destination.open("w", encoding="utf-8") as handle:
+        handle.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in public)
+    return destination
+
diff --git a/competition/eval_framework.py b/competition/eval_framework.py
new file mode 100644
index 000000000..26b2d5605
--- /dev/null
+++ b/competition/eval_framework.py
@@ -0,0 +1,344 @@
+"""AssetOpsBench public competition submission framework.
+
+It runs participant prediction code over public scenarios and packages a
+Kaggle/offline submission zip. AssetOpsBench scoring remains organizer-side.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import importlib
+import importlib.util
+import json
+import logging
+import os
+import subprocess
+import sys
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable
+
+try:
+    from .dataset_utils import AssetOpsScenario, load_public_scenarios
+except ImportError:
+    from dataset_utils import AssetOpsScenario, load_public_scenarios
+
+
+logger = logging.getLogger(__name__)
+
+PredictionFunc = Callable[[AssetOpsScenario], Any]
+
+
+REQUIRED_METADATA_FIELDS = (  # each must be present and truthy in submission metadata
+    "model_name",
+    "track",
+    "base_model_type",
+    "base_model_name",
+    "dataset",
+)
+
+
+@dataclass
+class SubmissionResult:
+    dataset_name: str  # default for the "dataset" metadata field
+    predictions: list[dict[str, str]]  # rows: id, prediction, reasoning, trajectory
+
+
+def _clean_cell(value: Any, fallback: str = "NOTAVALUE") -> str:
+    """Stringify ``value`` for a CSV cell; ``None`` or blank text yields ``fallback``."""
+    # Carriage returns become spaces before surrounding whitespace is trimmed.
+    cleaned = "" if value is None else str(value).replace("\r", " ").strip()
+    return cleaned or fallback
+
+
+def _normalize_prediction(raw: Any) -> dict[str, str]:
+    """Map a predictor's return value onto the three submission text columns.
+
+    Non-dict values become the prediction; dict/list reasoning or trajectory
+    values are JSON-encoded; empty cells receive a placeholder string.
+    """
+    def as_text(value: Any) -> Any:
+        if isinstance(value, (dict, list)):
+            return json.dumps(value, ensure_ascii=False)
+        return value
+
+    if isinstance(raw, dict):
+        answer = raw.get("prediction", raw.get("answer", raw.get("response", "")))
+        why = as_text(raw.get("reasoning", raw.get("rationale", "")))
+        steps = as_text(raw.get("trajectory", raw.get("trace", "")))
+    else:
+        answer, why, steps = raw, "", ""
+    return {
+        "prediction": _clean_cell(answer, "No prediction available"),
+        "reasoning": _clean_cell(why, "No reasoning provided"),
+        "trajectory": _clean_cell(steps, "No trajectory provided"),
+    }
+
+
+def _load_module_from_file(path: Path):
+    """Import ``path`` as a module named after its stem and register it in ``sys.modules``."""
+    spec = importlib.util.spec_from_file_location(path.stem, path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not load module from {path}")
+    module = sys.modules[path.stem] = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def load_predictor(spec: str) -> PredictionFunc:
+    """Resolve ``module:function`` or ``path/to/file.py:function`` to a callable."""
+    if ":" not in spec:
+        raise ValueError("Predictor must be formatted as 'module:function'.")
+    # Split on the last colon so Windows drive letters ("C:\...") stay in the path.
+    module_name, function_name = spec.rsplit(":", 1)
+    module_path = Path(module_name)
+    # is_file(), not exists(): a package directory named like the module (e.g.
+    # "./mypkg" for "mypkg:predict") must still go through the normal import.
+    if module_path.suffix == ".py" or module_path.is_file():
+        module = _load_module_from_file(module_path.resolve())
+    else:
+        module = importlib.import_module(module_name)
+    func = getattr(module, function_name)
+    if not callable(func):
+        raise TypeError(f"Predictor target is not callable: {spec}")
+    return func
+
+
+def command_predictor(command_template: str) -> PredictionFunc:
+    """Build a predictor that shells out to a participant-supplied command.
+
+    The template is expanded with ``str.format`` using these fields:
+    - ``{id}``: scenario id
+    - ``{question}``: raw scenario text
+    - ``{question_json}``: scenario text as a JSON string literal
+    - ``{scenario_json}``: the whole scenario as a JSON object
+
+    A JSON object on stdout is returned unchanged, so its ``answer`` or
+    ``prediction`` key is picked up downstream; other output becomes the
+    prediction text. A non-zero exit code yields an error row with stderr.
+
+    NOTE(review): the command runs with ``shell=True`` and the fields are not
+    shell-escaped, so scenario text reaches the shell verbatim. Use trusted
+    datasets only.
+    """
+    def text_row(prediction: str, reasoning: str = "") -> dict[str, str]:
+        return {"prediction": prediction, "reasoning": reasoning, "trajectory": ""}
+
+    def predict(scenario: AssetOpsScenario) -> dict[str, str]:
+        fields = {
+            "id": scenario.id,
+            "question": scenario.text,
+            "question_json": json.dumps(scenario.text),
+            "scenario_json": json.dumps(scenario.to_dict(), ensure_ascii=False),
+        }
+        completed = subprocess.run(
+            command_template.format(**fields),
+            shell=True,
+            check=False,
+            capture_output=True,
+            text=True,
+        )
+        if completed.returncode != 0:
+            return text_row("Error occurred", completed.stderr.strip())
+        stdout = completed.stdout.strip()
+        try:
+            parsed = json.loads(stdout)
+        except json.JSONDecodeError:
+            return text_row(stdout)
+        return parsed if isinstance(parsed, dict) else text_row(stdout)
+
+    return predict
+
+
+class CompetitionKit:
+    """Runs a predictor over public scenarios and packages the submission zip."""
+
+    def __init__(self, config_path: str | None = None):
+        """Load the optional JSON config and create the output directory."""
+        self.config_path = config_path
+        self.config = load_config_file(config_path) if config_path else {}
+        self.output_dir = Path(self.config.get("output_dir", "competition_results"))
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.dataset_config = self.config.get("dataset", {})
+        self.metadata_config = self.config.get("metadata", {})
+
+    def list_datasets(self) -> None:
+        """Print the configured dataset name and description."""
+        name = self.dataset_config.get("dataset_name", "assetopsbench")
+        description = self.dataset_config.get("description", "")
+        print(f"{name}: {description}")
+
+    def run_predictions(
+        self,
+        predictor: PredictionFunc,
+        *,
+        subset_size: int | None = None,
+        dataset_path: str | None = None,
+    ) -> SubmissionResult:
+        """Call ``predictor`` on each scenario; a failing call becomes an error row."""
+        dataset_path = dataset_path or self.dataset_config.get("dataset_path")
+        if not dataset_path:
+            raise ValueError("Dataset path is required. Set dataset.dataset_path or pass --dataset-path.")
+        dataset_name = self.dataset_config.get("dataset_name", Path(dataset_path).stem)
+        scenarios = load_public_scenarios(dataset_path)
+        if subset_size is not None and subset_size > 0:
+            scenarios = scenarios[:subset_size]
+        predictions: list[dict[str, str]] = []
+        for index, scenario in enumerate(scenarios, start=1):
+            logger.info("Predicting %s/%s: %s", index, len(scenarios), scenario.id)
+            try:
+                normalized = _normalize_prediction(predictor(scenario))
+            except Exception as exc:
+                # Keep going: the failure becomes a row cleaned like any other.
+                logger.exception("Predictor failed for scenario %s", scenario.id)
+                normalized = _normalize_prediction(
+                    {"prediction": "Error occurred", "reasoning": str(exc)}
+                )
+            predictions.append({"id": scenario.id, **normalized})
+        return SubmissionResult(dataset_name=dataset_name, predictions=predictions)
+
+    def save_submission(
+        self,
+        result: SubmissionResult,
+        *,
+        filename: str = "submission.csv",
+        metadata: dict[str, Any] | None = None,
+    ) -> Path:
+        """Write the CSV and ``meta_data.json``, zip both, and return the zip path."""
+        metadata = self.get_metadata(result.dataset_name, metadata)
+        self._validate_metadata(metadata)
+        csv_path = self.output_dir / filename
+        with csv_path.open("w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(
+                f,
+                fieldnames=["id", "prediction", "reasoning", "trajectory"],
+                quoting=csv.QUOTE_ALL,
+            )
+            writer.writeheader()
+            writer.writerows(result.predictions)
+        metadata_path = self.output_dir / "meta_data.json"
+        metadata_path.write_text(
+            json.dumps({"meta_data": metadata}, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+        # Swap only a trailing ".csv"; else append ".zip" so the zip never overwrites the CSV.
+        stem = filename[:-4] if filename.lower().endswith(".csv") else filename
+        zip_path = self.output_dir / f"{stem}.zip"
+        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+            zipf.write(csv_path, filename)
+            zipf.write(metadata_path, "meta_data.json")
+        return zip_path
+
+    def get_metadata(
+        self,
+        dataset_name: str,
+        fallback_metadata: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """Merge defaults, config metadata, then non-None ``fallback_metadata`` (last wins)."""
+        metadata = {
+            "model_name": "unknown",
+            "track": "agentic_reasoning",
+            "base_model_type": "API",
+            "base_model_name": "unknown",
+            "dataset": dataset_name,
+            "additional_info": "",
+        }
+        metadata.update(self.metadata_config)
+        if fallback_metadata:
+            metadata.update({k: v for k, v in fallback_metadata.items() if v is not None})
+        return metadata
+
+    @staticmethod
+    def _validate_metadata(metadata: dict[str, Any]) -> None:
+        """Raise ValueError naming every required field that is missing or falsy."""
+        missing = [field for field in REQUIRED_METADATA_FIELDS if not metadata.get(field)]
+        if missing:
+            raise ValueError(f"Missing required metadata field(s): {', '.join(missing)}")
+
+
+def create_metadata_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser: config, dataset, predictor, output and metadata flags."""
+    parser = argparse.ArgumentParser(description="AssetOpsBench submission starter kit")
+    add = parser.add_argument
+    add("--config", type=str, help="Path to metadata/dataset JSON config.")
+    add("--dataset-path", type=str, help="Override dataset path from config.")
+    add("--predictor", type=str, help="Python predictor as module:function.")
+    agent_help = "Shell command template for an existing agent. Use {question_json} for the prompt."
+    add("--agent-command", type=str, help=agent_help)
+    add("--output-dir", type=str, default=None)
+    add("--output-file", type=str, default="submission.csv")
+    add("--subset-size", type=int, default=None)
+    # Submission metadata flags; they default to None when not given.
+    add("--model-name", type=str)
+    add("--track", type=str, choices=["internal_reasoning", "agentic_reasoning"])
+    add("--base-model-type", type=str, choices=["API", "OpenWeighted", "Hybrid"])
+    add("--base-model-name", type=str)
+    add("--dataset", type=str)
+    add("--additional-info", type=str)
+    add("-v", "--verbose", action="store_true")
+    return parser
+
+
+def load_config_file(config_path: str | None) -> dict[str, Any]:
+    """Return the parsed JSON config, or ``{}`` when ``config_path`` is falsy."""
+    path = Path(config_path) if config_path else None
+    # A missing file is reported explicitly rather than treated as "no config".
+    if path is not None and not path.exists():
+        raise FileNotFoundError(f"Configuration file not found: {path}")
+    return json.loads(path.read_text(encoding="utf-8")) if path is not None else {}
+
+
+def load_and_merge_config(args: argparse.Namespace) -> argparse.Namespace:
+    """Fill unset CLI options on ``args`` from the JSON config; CLI values win.
+
+    ``args`` is mutated in place and returned. ``--output-file`` counts as
+    unset while it still equals its parser default, ``"submission.csv"``.
+    """
+    config = load_config_file(args.config) if args.config else {}
+    predictor = config.get("predictor", {})
+    dataset = config.get("dataset", {})
+    metadata = config.get("metadata", {})
+
+    if args.output_dir is None and config.get("output_dir"):
+        args.output_dir = config["output_dir"]
+    if args.output_file == "submission.csv" and config.get("output_file"):
+        args.output_file = config["output_file"]
+
+    # (argparse attribute, config section, config key); only truthy config values apply.
+    optional_sources = (
+        ("predictor", predictor, "path"),
+        ("agent_command", predictor, "agent_command"),
+        ("dataset_path", dataset, "dataset_path"),
+        ("dataset", dataset, "dataset_name"),
+    )
+    for attr, section, key in optional_sources:
+        if getattr(args, attr) is None and section.get(key):
+            setattr(args, attr, section[key])
+
+    # Metadata keys are copied whenever present, even if their value is falsy.
+    for name in ("model_name", "track", "base_model_type", "base_model_name", "additional_info"):
+        if getattr(args, name, None) is None and name in metadata:
+            setattr(args, name, metadata[name])
+
+    return args
+
+
+def metadata_from_args(args: argparse.Namespace) -> dict[str, Any]:
+    """Collect the submission metadata options from ``args`` into a dict.
+
+    Unset options stay ``None``; nothing is validated here.
+    """
+    keys = (
+        "model_name", "track", "base_model_type", "base_model_name", "dataset", "additional_info",
+    )
+    return {key: getattr(args, key) for key in keys}
+
+
+def build_predictor_from_args(args: argparse.Namespace) -> PredictionFunc:
+    """Return the ``--predictor`` callable if set, else wrap ``--agent-command``."""
+    if not (args.predictor or args.agent_command):
+        raise ValueError("Provide either --predictor module:function or --agent-command.")
+    chosen = load_predictor if args.predictor else command_predictor
+    return chosen(args.predictor or args.agent_command)
diff --git a/competition/examples/baseline_predictor.py b/competition/examples/baseline_predictor.py
new file mode 100644
index 000000000..66cc754e9
--- /dev/null
+++ b/competition/examples/baseline_predictor.py
@@ -0,0 +1,14 @@
+"""Minimal AssetOpsBench predictor entry point."""
+
+from __future__ import annotations
+
+
+def predict(scenario):
+    """Return a placeholder answer that names ``scenario.id``; no model is called."""
+    message = (
+        "No automated answer was generated by the bundled baseline for "
+        f"scenario {scenario.id}."
+    )
+    reasoning = "Bundled baseline predictor."
+    # The empty trajectory list is JSON-encoded ("[]") by the submission framework.
+    return {"prediction": message, "reasoning": reasoning, "trajectory": []}
diff --git a/competition/examples/public_scenarios.jsonl b/competition/examples/public_scenarios.jsonl
new file mode 100644
index 000000000..6c119edc3
--- /dev/null
+++ b/competition/examples/public_scenarios.jsonl
@@ -0,0 +1 @@
+{"id":"301","text":"What vibration analysis capabilities are available?","type":"Vibration","category":"Knowledge Query"}
diff --git a/competition/metadata_config_phase1.json b/competition/metadata_config_phase1.json
new file mode 100644
index 000000000..21dfd5efb
--- /dev/null
+++ b/competition/metadata_config_phase1.json
@@ -0,0 +1,20 @@
+{
+  "metadata": {
+    "model_name": "assetopsbench-agent",
+    "track": "agentic_reasoning",
+    "base_model_type": "API",
+    "base_model_name": "assetopsbench-agent-base",
+    "dataset": "assetopsbench_phase1",
+    "additional_info": "Submission generated with the AssetOpsBench public competition starter kit."
+  },
+  "dataset": {
+    "dataset_name": "assetopsbench_phase1",
+    "dataset_path": "competition/examples/public_scenarios.jsonl",
+    "description": "AssetOpsBench phase 1 public competition scenarios."
+  },
+  "predictor": {
+    "path": "competition/examples/baseline_predictor.py:predict"
+  },
+  "output_dir": "competition_results",
+  "output_file": "submission.csv"
+}
diff --git a/competition/metadata_config_phase2.json b/competition/metadata_config_phase2.json
new file mode 100644
index 000000000..04c53f38d
--- /dev/null
+++ b/competition/metadata_config_phase2.json
@@ -0,0 +1,20 @@
+{
+  "metadata": {
+    "model_name": "assetopsbench-agent",
+    "track": "agentic_reasoning",
+    "base_model_type": "API",
+    "base_model_name": "assetopsbench-agent-base",
+    "dataset": "assetopsbench_phase2",
+    "additional_info": "Final phase submission generated with the AssetOpsBench public competition starter kit."
+  },
+  "dataset": {
+    "dataset_name": "assetopsbench_phase2",
+    "dataset_path": "competition/examples/public_scenarios.jsonl",
+    "description": "AssetOpsBench final phase 2 public competition scenarios."
+  },
+  "predictor": {
+    "path": "competition/examples/baseline_predictor.py:predict"
+  },
+  "output_dir": "competition_results",
+  "output_file": "submission.csv"
+}
diff --git a/competition/readme.md b/competition/readme.md
index e69de29bb..984cd96f7 100644
--- a/competition/readme.md
+++ b/competition/readme.md
@@ -0,0 +1,75 @@
+# AssetOpsBench Competition Starter Kit
+
+This folder is the public submission kit for AssetOpsBench competitions, built
+around AssetOpsBench agent scenarios. It contains:
+
+```text
+competition/
+├── run.py                       # command-line submission generator
+├── eval_framework.py            # public packaging framework, no scoring
+├── dataset_utils.py             # public dataset loader and ground-truth guard
+├── metadata_config_phase1.json  # editable phase 1 config
+├── metadata_config_phase2.json  # editable final phase 2 config
+└── examples/baseline_predictor.py
+```
+
+## Public data rule
+
+Do not publish or upload ground truth. Public Kaggle data should include only
+fields such as:
+
+```json
+{"id": "301", "text": "What vibration analysis capabilities are available?", "type": "Vibration"}
+```
+
+The runner rejects records containing private evaluation fields such as
+`expected_answer`, `correct_answer`, `answer`, `ground_truth`,
+`characteristic_form`, or `scoring_method`.
+
+## Quick start
+
+Set `dataset.dataset_path` in `metadata_config_phase1.json` to the public
+Kaggle JSONL file, and set `predictor.path` to the submission predictor:
+
+```bash
+python competition/run.py --config competition/metadata_config_phase1.json
+```
+
+The predictor is a `module:function` or `path/to/file.py:function` reference.
+The function receives an `AssetOpsScenario` with `id`, `text`, and optional
+`metadata`. It can return a string, or a dictionary:
+
+```python
+def predict(scenario):
+    return {
+        "prediction": "final answer text",
+        "reasoning": "short optional rationale",
+        "trajectory": [{"tool": "get_assets", "status": "ok"}],
+    }
+```
+
+The generated zip contains:
+
+- `submission.csv` with `id`, `prediction`, `reasoning`, and `trajectory`
+- `meta_data.json` with model and track metadata
+
+## Existing agent command
+
+You can also wrap an existing CLI instead of a Python function:
+
+```bash
+python competition/run.py \
+  --config competition/metadata_config_phase1.json \
+  --agent-command 'uv run plan-execute --json {question_json}'
+```
+
+If the command prints a JSON object, its `prediction` (or `answer`) field is
+used; otherwise the whole stdout becomes the prediction.
+
+## Final phase
+
+One week before the competition ends, organizers should release the final phase
+2 public dataset with the same public schema and no ground truth. Participants
+should switch to `metadata_config_phase2.json`, generate the zip, and submit it
+before the deadline. Final scoring can then be run offline by organizers against
+the private ground-truth scenarios.
diff --git a/competition/run.py b/competition/run.py
new file mode 100644
index 000000000..6840aaff1
--- /dev/null
+++ b/competition/run.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""AssetOpsBench public competition submission script."""
+
+from __future__ import annotations
+
+import logging
+
+from eval_framework import (
+    CompetitionKit,
+    build_predictor_from_args,
+    create_metadata_parser,
+    load_and_merge_config,
+    metadata_from_args,
+)
+
+
+def main() -> int:
+    """Generate a submission package from CLI/config options; return exit code 0."""
+    args = load_and_merge_config(create_metadata_parser().parse_args())
+    log_level = logging.INFO if args.verbose else logging.WARNING
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    )
+
+    kit = CompetitionKit(config_path=args.config)
+    if args.output_dir:
+        # Rebuild with the kit's own path type so this script needs no pathlib import.
+        kit.output_dir = type(kit.output_dir)(args.output_dir)
+        kit.output_dir.mkdir(parents=True, exist_ok=True)
+
+    predictor = build_predictor_from_args(args)
+    print("AssetOpsBench Competition - Submission Generation")
+    kit.list_datasets()
+
+    result = kit.run_predictions(
+        predictor,
+        subset_size=args.subset_size,
+        dataset_path=args.dataset_path,
+    )
+    submission_metadata = metadata_from_args(args)
+    submission_path = kit.save_submission(
+        result, filename=args.output_file, metadata=submission_metadata
+    )
+
+    print(f"Processed scenarios: {len(result.predictions)}")
+    print(f"Submission package: {submission_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/competition/tests/test_competition_starter.py b/competition/tests/test_competition_starter.py
new file mode 100644
index 000000000..d13b4af4f
--- /dev/null
+++ b/competition/tests/test_competition_starter.py
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+import csv
+import json
+import sys
+import zipfile
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+
+from competition.dataset_utils import load_public_scenarios, write_public_dataset
+from competition.eval_framework import CompetitionKit
+
+
+def test_load_public_scenarios_rejects_private_fields(tmp_path: Path):
+    """A record carrying a rubric-like key must be refused by the public loader.
+
+    ``characteristic_form`` is one of the names in ``PRIVATE_FIELD_NAMES``.
+    """
+
+    leaky_record = {
+        "id": 1,
+        "text": "Q",
+        "characteristic_form": "private rubric",
+    }
+    dataset = tmp_path / "private.jsonl"
+    dataset.write_text(json.dumps(leaky_record) + "\n", encoding="utf-8")
+
+    with pytest.raises(ValueError, match="private evaluation field"):
+        load_public_scenarios(dataset)
+
+
+def test_write_public_dataset_strips_private_fields(tmp_path: Path):
+    """Round-trip: a private JSON list becomes a JSONL file the public loader accepts.
+
+    The loader raises on private keys, so a successful load proves they were stripped.
+    """
+
+    organizer_record = {
+        "id": 1,
+        "text": "Q",
+        "type": "Vibration",
+        "expected_answer": "A",
+        "characteristic_form": "rubric",
+    }
+    source = tmp_path / "private.json"
+    target = tmp_path / "public.jsonl"
+    source.write_text(json.dumps([organizer_record]), encoding="utf-8")
+
+    write_public_dataset(source, target)
+    loaded = load_public_scenarios(target)
+
+    assert len(loaded) == 1
+    only = loaded[0]
+    assert only.id == "1"
+    assert only.metadata["type"] == "Vibration"
+
+
+def test_competition_kit_packages_submission(tmp_path: Path):
+    """End to end: run a stub predictor and inspect the files inside the zip.
+
+    Checks the archive members, the CSV row, and the dataset name in the metadata.
+    """
+    dataset = tmp_path / "public.jsonl"
+    dataset.write_text(
+        '{"id": "301", "text": "What vibration tools are available?", "type": "Vibration"}\n',
+        encoding="utf-8",
+    )
+    settings = {
+        "metadata": {
+            "model_name": "unit-model",
+            "track": "agentic_reasoning",
+            "base_model_type": "API",
+            "base_model_name": "unit-base",
+            "dataset": "unit_dataset",
+        },
+        "dataset": {
+            "dataset_name": "unit_dataset",
+            "dataset_path": str(dataset),
+            "description": "test",
+        },
+        "output_dir": str(tmp_path / "out"),
+    }
+    config = tmp_path / "config.json"
+    config.write_text(json.dumps(settings), encoding="utf-8")
+
+    def stub_predictor(scenario):
+        # Dict/list reasoning and trajectory exercise the JSON encoding path.
+        return {
+            "prediction": f"answer for {scenario.id}",
+            "reasoning": {"used": "unit"},
+            "trajectory": [{"tool": "none"}],
+        }
+
+    kit = CompetitionKit(str(config))
+    package = kit.save_submission(kit.run_predictions(stub_predictor))
+
+    assert package.exists()
+    with zipfile.ZipFile(package) as zf:
+        assert sorted(zf.namelist()) == ["meta_data.json", "submission.csv"]
+        csv_text = zf.read("submission.csv").decode("utf-8")
+        metadata = json.loads(zf.read("meta_data.json").decode("utf-8"))
+    rows = list(csv.DictReader(csv_text.splitlines()))
+
+    expected_row = {
+        "id": "301",
+        "prediction": "answer for 301",
+        "reasoning": '{"used": "unit"}',
+        "trajectory": '[{"tool": "none"}]',
+    }
+    assert rows == [expected_row]
+    assert metadata["meta_data"]["dataset"] == "unit_dataset"