Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -229,3 +229,4 @@ local_settings.py
Dockerfile
CLAUDE.md
.omc/
.deepeval/
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,9 @@ simulation = [
datasets = [
"requests>=2.31.0",
]
deepeval = [
"deepeval>=2.0.0",
]
autoevals = [
"autoevals>=0.0.50",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Third-party evaluation adapters for AgentCore code-based evaluators."""

from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter

__all__ = ["BaseAdapter"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Autoevals adapter for AgentCore code-based evaluators."""

from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter

__all__ = ["AutoevalsAdapter"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Autoevals adapter for AgentCore code-based evaluators."""

import logging
from typing import Any, Callable, Dict, Optional

from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter

logger = logging.getLogger(__name__)


class AutoevalsAdapter(BaseAdapter):
    """Adapter that runs an Autoevals scorer against AgentCore evaluation events.

    The scorer is invoked through its ``eval(input=..., output=..., expected=...)``
    method and its numeric score is compared against ``threshold`` to derive a
    Pass/Fail label.

    Example::

        from autoevals import Factuality
        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter

        scorer = Factuality()
        adapter = AutoevalsAdapter(scorer=scorer)
    """

    def __init__(
        self,
        scorer: Any,
        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
        threshold: float = 0.5,
    ):
        """Initialize the adapter.

        Args:
            scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()).
            field_mapper: Optional callable that receives the EvaluatorInput and
                returns a dict of field values. Bypasses default span parsing.
            threshold: Score threshold for Pass/Fail determination. Defaults to 0.5.
        """
        super().__init__(field_mapper=field_mapper)
        self.scorer = scorer
        self.threshold = threshold

    def validate_fields(self, fields: Dict[str, Any]) -> None:
        """Validate minimum required fields; scorer raises on additional missing params.

        Args:
            fields: Extracted field dict.

        Raises:
            ValueError: If "input" or "actual_output" is missing or empty.
        """
        missing = [name for name in ("input", "actual_output") if not fields.get(name)]
        if missing:
            scorer_name = type(self.scorer).__name__
            raise ValueError(
                f"Field(s) {missing} required by {scorer_name} but not found in evaluation event. "
                f"Provide a field_mapper or ensure spans contain the necessary data."
            )

    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
        """Run the Autoevals scorer and return formatted results.

        Args:
            fields: Extracted field dict; "input" and "actual_output" are passed
                to the scorer, "expected_output" only when it is non-empty.

        Returns:
            EvaluatorOutput whose value is the scorer's score, whose label is
            "Pass" when the score is not None and reaches the threshold
            (otherwise "Fail"), and whose explanation is the "rationale" entry
            of the result metadata when available.
        """
        kwargs: Dict[str, Any] = {
            "input": fields.get("input", ""),
            "output": fields.get("actual_output", ""),
        }
        expected = fields.get("expected_output")
        if expected:
            kwargs["expected"] = expected

        result = self.scorer.eval(**kwargs)

        score = result.score
        label = "Pass" if score is not None and score >= self.threshold else "Fail"

        # metadata may be absent or explicitly None; neither case should turn a
        # successfully computed score into an execution error.
        metadata = getattr(result, "metadata", None) or {}
        explanation = metadata.get("rationale") or ""

        return EvaluatorOutput(value=score, label=label, explanation=explanation)
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""Base adapter for third-party evaluation framework integrations."""

import abc
import logging
from typing import Any, Callable, Dict, Optional

from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
parse_spans,
)

logger = logging.getLogger(__name__)


class BaseAdapter(abc.ABC):
    """Base adapter for third-party evaluation framework integrations.

    Accepts an EvaluatorInput (from the code_based_evaluators flow),
    extracts fields from spans using the built-in parser layer (or a
    caller-supplied field_mapper), validates them, runs the evaluation via
    execute(), and returns an EvaluatorOutput.

    Calling the adapter never propagates an ``Exception``: a failure in any
    stage is logged and reported as an EvaluatorOutput with label "Error".
    """

    def __init__(
        self,
        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
    ):
        """Initialize the adapter.

        Args:
            field_mapper: Optional callable that receives the EvaluatorInput and
                returns a dict with keys: 'input', 'actual_output', and optionally
                'expected_output', 'context', 'retrieval_context'. Bypasses default
                span parsing when provided.
        """
        self.field_mapper = field_mapper

    def __call__(self, evaluator_input: EvaluatorInput, context: Any = None) -> EvaluatorOutput:
        """Handle an evaluation invocation.

        Args:
            evaluator_input: Parsed EvaluatorInput from the code-based evaluator flow.
            context: Lambda context object (unused).

        Returns:
            EvaluatorOutput with score, label, and explanation or error fields.
            Error outputs use errorCode "FIELD_EXTRACTION_ERROR",
            "MISSING_REQUIRED_FIELD", or "METRIC_ERROR" depending on the stage
            that failed.
        """
        try:
            fields = self._extract_fields(evaluator_input)
        except ValueError as e:
            logger.error("Field extraction failed: %s", e)
            return EvaluatorOutput(
                label="Error",
                errorCode="FIELD_EXTRACTION_ERROR",
                errorMessage=str(e),
            )
        except Exception as e:
            # field_mapper is caller-supplied code and may raise anything
            # (KeyError, AttributeError, ...); honour the no-raise contract.
            logger.error("Field extraction failed: %s", e, exc_info=True)
            return EvaluatorOutput(
                label="Error",
                errorCode="FIELD_EXTRACTION_ERROR",
                errorMessage=f"{type(e).__name__}: {e}",
            )

        try:
            self.validate_fields(fields)
        except ValueError as e:
            logger.error("Validation failed: %s", e)
            return EvaluatorOutput(
                label="Error",
                errorCode="MISSING_REQUIRED_FIELD",
                errorMessage=str(e),
            )
        except Exception as e:
            # Reached when validation itself breaks, e.g. a field_mapper that
            # returned something other than a dict.
            logger.error("Validation failed: %s", e, exc_info=True)
            return EvaluatorOutput(
                label="Error",
                errorCode="METRIC_ERROR",
                errorMessage=f"{type(self).__name__} failed: {e}",
            )

        try:
            return self.execute(fields)
        except Exception as e:
            logger.error("Execution failed: %s", e, exc_info=True)
            return EvaluatorOutput(
                label="Error",
                errorCode="METRIC_ERROR",
                errorMessage=f"{type(self).__name__} failed: {e}",
            )

    def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]:
        """Extract fields from the EvaluatorInput.

        Uses field_mapper when one was supplied; otherwise parses
        evaluator_input.session_spans (with evaluator_input.reference_inputs)
        through parse_spans and converts the result to a dict.
        """
        if self.field_mapper is not None:
            return self.field_mapper(evaluator_input)

        result = parse_spans(evaluator_input.session_spans, evaluator_input.reference_inputs)
        return result.to_dict()

    @abc.abstractmethod
    def validate_fields(self, fields: Dict[str, Any]) -> None:
        """Validate that required fields are present.

        Each adapter must explicitly declare its validation behavior.

        Args:
            fields: Extracted field dict.

        Raises:
            ValueError: If required fields are missing.
        """

    @abc.abstractmethod
    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
        """Run the evaluation and return an EvaluatorOutput.

        Args:
            fields: Extracted field dict with keys like "input", "actual_output", etc.

        Returns:
            EvaluatorOutput with evaluation results.
        """
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""DeepEval adapter for AgentCore code-based evaluators."""

from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter

__all__ = ["DeepEvalAdapter"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""DeepEval adapter for AgentCore code-based evaluators."""

import logging
from typing import Any, Callable, Dict, Optional

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase

from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter

logger = logging.getLogger(__name__)


class DeepEvalAdapter(BaseAdapter):
    """Adapter that runs a DeepEval metric against AgentCore evaluation events.

    Builds an LLMTestCase from the extracted fields, calls the metric's
    ``measure`` method, and maps the metric's score/reason/success onto an
    EvaluatorOutput.

    Example::

        from deepeval.metrics import AnswerRelevancyMetric
        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter

        metric = AnswerRelevancyMetric(threshold=0.7)
        adapter = DeepEvalAdapter(metric=metric)
    """

    def __init__(
        self,
        metric: BaseMetric,
        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
        model: Optional[Any] = None,
    ):
        """Initialize the adapter.

        Args:
            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
            field_mapper: Optional callable that receives the EvaluatorInput and
                returns a dict of LLMTestCase field values. Bypasses default span
                parsing when provided.
            model: Optional model override for the metric's LLM. When given, it
                is assigned to ``metric.model`` on the passed-in metric object.
        """
        super().__init__(field_mapper=field_mapper)
        self.metric = metric
        if model is not None:
            # NOTE(review): this mutates the caller's metric in place; whether
            # DeepEval needs further re-initialization after a model swap is
            # not visible here -- confirm against the DeepEval docs.
            self.metric.model = model

    def validate_fields(self, fields: Dict[str, Any]) -> None:
        """Validate that input and actual_output are present.

        Args:
            fields: Extracted field dict.

        Raises:
            ValueError: If "input" or "actual_output" is missing or empty.
        """
        missing = [name for name in ("input", "actual_output") if not fields.get(name)]
        if missing:
            metric_name = type(self.metric).__name__
            raise ValueError(
                f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
                f"Provide a field_mapper or ensure spans contain the necessary data."
            )

    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
        """Run the DeepEval metric and return formatted results.

        Args:
            fields: Extracted field dict; "input", "actual_output",
                "expected_output", "context" and "retrieval_context" are
                forwarded to LLMTestCase.

        Returns:
            EvaluatorOutput with the metric's score and reason and a
            "Pass"/"Fail" label, or an "Error" output with errorCode
            "MISSING_REQUIRED_FIELD" when the metric reports missing test-case
            parameters.

        Raises:
            Exception: Any other exception raised by ``metric.measure`` is
                re-raised unchanged.
        """
        test_case = LLMTestCase(
            input=fields.get("input", ""),
            actual_output=fields.get("actual_output", ""),
            expected_output=fields.get("expected_output"),
            context=fields.get("context"),
            retrieval_context=fields.get("retrieval_context"),
        )

        try:
            self.metric.measure(test_case)
        except Exception as e:
            # Matched by class name so this module does not have to import the
            # exception type from deepeval.
            if type(e).__name__ == "MissingTestCaseParamsError":
                return EvaluatorOutput(
                    label="Error",
                    errorCode="MISSING_REQUIRED_FIELD",
                    errorMessage=f"{type(self.metric).__name__} requires fields not extracted from spans: {e}. "
                    f"Provide a field_mapper to supply custom fields from your trace data.",
                )
            raise

        score = self.metric.score
        reason = getattr(self.metric, "reason", None) or ""

        # Prefer the metric's own verdict. Fall back to a threshold comparison
        # only when the metric did not set one; computing the fallback lazily
        # avoids comparing the score against a None threshold when it is not
        # needed.
        success = getattr(self.metric, "success", None)
        if success is None:
            threshold = getattr(self.metric, "threshold", None)
            if threshold is None:
                threshold = 0.5
            success = score is not None and score >= threshold
        label = "Pass" if success else "Fail"

        return EvaluatorOutput(value=score, label=label, explanation=reason)
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Span parsers for extracting evaluation fields from Agent SDK trace formats."""

from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.base import (
SpanParseResult,
parse_spans,
)

__all__ = ["SpanParseResult", "parse_spans"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Base span parsing logic and orchestration across format-specific parsers."""

import logging
from typing import Any, Dict, List, Optional

from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
SpanParseResult,
)
from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.strands import (
parse_strands_spans,
)
from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.otel_langchain import (
parse_otel_langchain_spans,
)
from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.openinference import (
parse_openinference_spans,
)

logger = logging.getLogger(__name__)


# Format-specific span parsers, listed in the order parse_spans() tries them.
# parse_spans() returns the result of the first parser that yields a non-None
# value, so earlier entries take precedence.
_PARSERS = [
parse_strands_spans,
parse_otel_langchain_spans,
parse_openinference_spans,
]


def parse_spans(
    session_spans: List[Dict[str, Any]],
    reference_inputs: Optional[List[Any]] = None,
) -> SpanParseResult:
    """Extract evaluation fields from session spans.

    Each parser in ``_PARSERS`` is tried in order and the first non-None
    result is used; later parsers are not called once one has matched.
    When reference inputs are supplied, a truthy ``expected_response_text``
    on the first of them overrides the result's ``expected_output``.

    Args:
        session_spans: Raw ADOT span dicts from the evaluation service.
        reference_inputs: Optional ReferenceInput list for expected_output.

    Returns:
        SpanParseResult with extracted fields.

    Raises:
        ValueError: If every parser returns None for the given spans.
    """
    # The inner generator is lazy, so parsers run one at a time and stop at
    # the first match.
    attempts = (parser(session_spans) for parser in _PARSERS)
    parsed = next((outcome for outcome in attempts if outcome is not None), None)

    if parsed is None:
        raise ValueError(
            "Could not extract evaluation fields from spans. "
            "No agent-level span with gen_ai.operation.name=='invoke_agent' and "
            "valid span_events found. Provide a field_mapper for custom formats."
        )

    if reference_inputs:
        expected_text = getattr(reference_inputs[0], "expected_response_text", None)
        if expected_text:
            parsed.expected_output = expected_text

    return parsed
Loading