gooddata · zdenekmusil-gd · Jun 3, 2026 · Jun 3, 2026
@@ -15,7 +15,7 @@ on:
       - v*.*.*
 
 env:
-  COMPONENTS: '["gooddata-api-client","gooddata-pandas","gooddata-fdw","gooddata-sdk","gooddata-dbt","gooddata-flight-server","gooddata-flexconnect","gooddata-pipelines"]'
+  COMPONENTS: '["gooddata-api-client","gooddata-pandas","gooddata-fdw","gooddata-sdk","gooddata-dbt","gooddata-flight-server","gooddata-flexconnect","gooddata-pipelines","gooddata-eval"]'
 
 jobs:
   matrix-components:

@@ -9,7 +9,7 @@ on:
         default: "master"
 
 env:
-  COMPONENTS: '["gooddata-api-client","gooddata-pandas","gooddata-fdw","gooddata-sdk","gooddata-dbt","gooddata-flight-server","gooddata-flexconnect","gooddata-pipelines"]'
+  COMPONENTS: '["gooddata-api-client","gooddata-pandas","gooddata-fdw","gooddata-sdk","gooddata-dbt","gooddata-flight-server","gooddata-flexconnect","gooddata-pipelines","gooddata-eval"]'
 
 jobs:
   matrix-components:

@@ -36,6 +36,7 @@ jobs:
           - { path: packages/gooddata-flight-server, project: gooddata-flight-server }
           - { path: packages/gooddata-flexconnect,   project: gooddata-flexconnect }
           - { path: packages/gooddata-pipelines,     project: gooddata-pipelines }
+          - { path: packages/gooddata-eval,          project: gooddata-eval }
           - { path: gooddata-api-client,             project: gooddata-api-client }
     steps:
       - name: Checkout the code

@@ -24,7 +24,7 @@ jobs:
         if: ${{ matrix.python_version == 'py314' }}
         uses: codecov/codecov-action@v5
         with:
-          files: ./packages/gooddata-sdk/coverage.xml,./packages/gooddata-pandas/coverage.xml,./packages/gooddata-fdw/coverage.xml,./packages/gooddata-flight-server/coverage.xml,./packages/gooddata-flexconnect/coverage.xml,./packages/gooddata-dbt/coverage.xml,./packages/gooddata-pipelines/coverage.xml
+          files: ./packages/gooddata-sdk/coverage.xml,./packages/gooddata-pandas/coverage.xml,./packages/gooddata-fdw/coverage.xml,./packages/gooddata-flight-server/coverage.xml,./packages/gooddata-flexconnect/coverage.xml,./packages/gooddata-dbt/coverage.xml,./packages/gooddata-pipelines/coverage.xml,./packages/gooddata-eval/coverage.xml
           token: ${{ secrets.CODECOV_TOKEN }}
           override_commit: ${{ github.event.pull_request.head.sha || github.sha }}
           override_pr: ${{ github.event.number }}

@@ -0,0 +1,2 @@
+# (C) 2026 GoodData Corporation
+include ../../project_common.mk
@@ -0,0 +1,141 @@
+# gooddata-eval
+
+`gd-eval` is a command-line tool that evaluates the GoodData AI agent against a
+dataset of natural-language questions, using a workspace and LLM model you choose.
+
+## Install
+
+    uv add gooddata-eval
+
+Or install `gd-eval` as a standalone tool:
+
+    uv tool install gooddata-eval
+
+## Quick start
+
+```bash
+export GOODDATA_TOKEN='your-api-token'
+
+gd-eval run \
+  --host  https://your.gooddata.cloud \
+  --workspace  demo \
+  --dataset  ./my-dataset \
+  --model  gpt-5.2 \
+  --runs  2 \
+  --json  results.json
+```
+
+## All flags
+
+### Connection
+
+| Flag | Env var | Description |
+|---|---|---|
+| `--host HOST` | — | GoodData host URL (e.g. `https://your.gooddata.cloud`). |
+| `--token TOKEN` | `GOODDATA_TOKEN` | API token. Pass via flag or env var. |
+| `--profile NAME` | — | Profile name in `~/.gooddata/profiles.yaml` (same file as the `gdc` CLI). Provides host + token when both flags are omitted. |
+| `--workspace ID` | — | **Required.** Workspace id to evaluate against. |
+
+### Dataset source (pick one)
+
+| Flag | Description |
+|---|---|
+| `--dataset PATH` | Path to a flat folder of JSON files — one question per file. |
+| `--langfuse-dataset NAME` | Pull dataset items by name from Langfuse. Requires `LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, `LANGFUSE_HOST` env vars. |
+
+### Model selection
+
+| Flag | Description |
+|---|---|
+| `--model ID` | LLM model id to evaluate (e.g. `gpt-5.2`). Defaults to the workspace's currently active model. If the model is offered by a different provider than the active one, the workspace's active provider is switched automatically. |
+| `--provider NAME_OR_ID` | LLM provider name or id. Use when `--model` is offered by multiple providers and you need to pick one. Accepts either the human-readable provider name or its UUID id. |
+
+### Evaluation
+
+| Flag | Default | Description |
+|---|---|---|
+| `--runs K` | `2` | Number of independent conversation runs per item (pass@K). An item passes if any run passes. |
+
+### Output
+
+| Flag | Description |
+|---|---|
+| `--json PATH` | Write a machine-readable JSON report (keyed by item id, with per-item scores) to this path. Console summary is always printed. |
+| `--quiet` | Suppress per-item progress output. Only the final table and summary are printed. |
+
+### Langfuse sink
+
+| Flag | Description |
+|---|---|
+| `--langfuse` | Log evaluation results to Langfuse after each item. Requires `--langfuse-dataset` (so item ids can be linked to Langfuse dataset items). Creates a named experiment run (`gd-eval-{timestamp}-{model}`) in the Langfuse dataset. Requires `LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`, `LANGFUSE_HOST`. |
+
+## Dataset format
+
+A dataset is a folder of `.json` files, one per question. Each file must
+contain a common envelope:
+
+```json
+{
+  "id":           "stable-unique-id",
+  "dataset_name": "my_dataset",
+  "test_kind":    "visualization",
+  "question":     "Show revenue by quarter",
+  "expected_output": { }
+}
+```
+
+Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
+`search_tool`, `general_question`, `guardrail`.
+
+See the full dataset specification for `expected_output` shapes per test kind.
+
+## Supported test kinds
+
+| `test_kind` | What the agent must produce | Required extra |
+|---|---|---|
+| `visualization` | Correct AAC visualization (metrics, dimensions, filters, type) | — |
+| `metric_skill` | `create_metric` tool call with correct MAQL and format | — |
+| `alert_skill` | `create_metric_alert` tool call with correct operator, threshold, trigger, filters, metric, recipients | — |
+| `search_tool` | `search_objects` tool call (calling the correct function counts as a pass; the correctness of its arguments determines the quality score) | — |
+| `general_question` | Text answer judged by LLM | `[llm-judge]` |
+| `guardrail` | Refusal/redirect (visualization response auto-fails) | `[llm-judge]` |
+
+## Optional extras
+
+### `[llm-judge]` — LLM-as-judge evaluators
+
+`general_question` and `guardrail` items are scored by an LLM judge (GPT-4o)
+that compares the agent's text response against your expected-output description.
+This requires the OpenAI Python package and an API key:
+
+```bash
+uv add 'gooddata-eval[llm-judge]'        # project dependency
+# or, for the standalone gd-eval tool:
+uv tool install 'gooddata-eval[llm-judge]'
+```
+
+Set your OpenAI key before running:
+
+```bash
+export OPENAI_API_KEY='sk-...'
+```
+
+Without `[llm-judge]`, items with `test_kind: general_question` or `guardrail`
+are reported as **skipped**.
+
+
+## Exit codes
+
+| Code | Meaning |
+|---|---|
+| `0` | Run completed. Evaluation failures do **not** cause a non-zero exit — check the report. |
+| `2` | Operational error: bad connection, missing model, unreadable dataset, missing credentials. |
+
+## Scores (in JSON report and Langfuse)
+
+| Score | Description |
+|---|---|
+| `pass_at_k` | 1 if any of the K runs passed strict checks, else 0. |
+| `quality_score` | Fraction of strict check flags that are `True` (0.0–1.0). Shown in CLI as a percentage. |
+| `value_score` | Weighted blend: 0.6 × quality + 0.2 × speed (where speed = max(0, 1 − latency/60s)). |
+| `latency_s` | Average per-run latency in seconds. |
@@ -0,0 +1,69 @@
+# (C) 2026 GoodData Corporation
+# Packaging metadata for the gooddata-eval distribution (provides the `gd-eval` CLI).
+[project]
+name = "gooddata-eval"
+version = "1.67.0"
+description = "Evaluate the GoodData AI agent against your own questions and models."
+readme = "README.md"
+license = "MIT"
+authors = [
+    {name = "GoodData", email = "support@gooddata.com"}
+]
+keywords = ["gooddata", "ai", "evaluation", "llm", "analytics", "cli"]
+requires-python = ">=3.10"
+dependencies = [
+    "gooddata-sdk~=1.67.0",
+    "httpx>=0.27,<1.0",
+    "orjson>=3.9.15,<4.0.0",
+    "pydantic>=2.6,<3.0",
+    "rich>=13.0,<15.0",
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Software Development",
+    "Typing :: Typed",
+]
+
+# Optional extra that pulls in the OpenAI client (see "[llm-judge]" in README.md).
+[project.optional-dependencies]
+llm-judge = ["openai>=1.40,<2.0"]
+
+# Console entry point: installs the `gd-eval` command.
+[project.scripts]
+gd-eval = "gooddata_eval.cli.main:main"
+
+[project.urls]
+Source = "https://github.com/gooddata/gooddata-python-sdk"
+
+[dependency-groups]
+test = [
+    "pytest~=8.3.4",
+    "pytest-cov~=6.0.0",
+    "pytest-mock>=3.14.0",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/gooddata_eval"]
+
+[tool.coverage.run]
+source = ["gooddata_eval"]
+
+[tool.coverage.paths]
+source = [
+    "src/gooddata_eval",
+    "**/site-packages/gooddata_eval",
+]
+
+[tool.ty.analysis]
+allowed-unresolved-imports = ["openai.**", "gooddata_api_client.**"]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
@@ -0,0 +1,6 @@
+# (C) 2026 GoodData Corporation
+"""gooddata-eval: evaluate the GoodData AI agent against your own datasets."""
+
+from gooddata_eval._version import __version__  # re-exported as the package's version string
+
+__all__ = ["__version__"]  # the only name exported by `from gooddata_eval import *`
@@ -0,0 +1,7 @@
+# (C) 2026 GoodData Corporation
+from importlib import metadata
+
+try:
+    __version__ = metadata.version("gooddata-eval")  # version recorded in the installed distribution's metadata
+except metadata.PackageNotFoundError:  # no installed distribution named "gooddata-eval" was found
+    __version__ = "unknown-version"  # placeholder so importing the package still works
@@ -0,0 +1 @@
+# (C) 2026 GoodData Corporation
-Original file line number
+Diff line change
@@ Expand Up / @@ -15,7 +15,7 @@ on: @@
           - v*.*.*
     env:
-      COMPONENTS: '["gooddata-api-client","gooddata-pandas","gooddata-fdw","gooddata-sdk","gooddata-dbt","gooddata-flight-server","gooddata-flexconnect","gooddata-pipelines"]'
+      COMPONENTS: '["gooddata-api-client","gooddata-pandas","gooddata-fdw","gooddata-sdk","gooddata-dbt","gooddata-flight-server","gooddata-flexconnect","gooddata-pipelines","gooddata-eval"]'
     jobs:
       matrix-components:
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# (C) 2026 GoodData Corporation
		include ../../project_common.mk