aws · stone-coding · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/.gitignore b/.gitignore
@@ -229,3 +229,4 @@ local_settings.py
 Dockerfile
 CLAUDE.md
 .omc/
+.deepeval/
diff --git a/pyproject.toml b/pyproject.toml
@@ -173,3 +173,9 @@ simulation = [
 datasets = [
     "requests>=2.31.0",
 ]
+deepeval = [
+    "deepeval>=2.0.0",
+]
+autoevals = [
+    "autoevals>=0.0.50",
+]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
@@ -0,0 +1,5 @@
+"""Third-party evaluation adapters for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+__all__ = ["BaseAdapter"]
diff --git a/...drock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py b/...drock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
@@ -0,0 +1,5 @@
+"""Autoevals adapter for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter
+
+__all__ = ["AutoevalsAdapter"]
diff --git a/...edrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py b/...edrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
@@ -0,0 +1,71 @@
+"""Autoevals adapter for AgentCore code-based evaluators."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class AutoevalsAdapter(BaseAdapter):
+    """Adapter that runs an Autoevals scorer against AgentCore evaluation events.
+
+    Example::
+
+        from autoevals import Factuality
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(scorer=scorer)
+    """
+
+    def __init__(
+        self,
+        scorer: Any,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+        threshold: float = 0.5,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()).
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict of field values. Bypasses default span parsing.
+            threshold: Score threshold for Pass/Fail determination. Defaults to 0.5.
+        """
+        super().__init__(field_mapper=field_mapper)
+        self.scorer = scorer
+        self.threshold = threshold
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Validate minimum required fields; scorer raises on additional missing params."""
+        missing = [name for name in ("input", "actual_output") if not fields.get(name)]
+        if missing:
+            scorer_name = type(self.scorer).__name__
+            raise ValueError(
+                f"Field(s) {missing} required by {scorer_name} but not found in evaluation event. "
+                f"Provide a field_mapper or ensure spans contain the necessary data."
+            )
+
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Run the Autoevals scorer and return formatted results.
+
+        The explanation is the metadata "rationale", or "" when unavailable.
+        """
+        kwargs: Dict[str, Any] = {
+            "input": fields.get("input", ""),
+            "output": fields.get("actual_output", ""),
+        }
+        if fields.get("expected_output"):
+            kwargs["expected"] = fields["expected_output"]
+
+        result = self.scorer.eval(**kwargs)
+        score = result.score
+        label = "Pass" if score is not None and score >= self.threshold else "Fail"
+        # metadata may be missing or None, and "rationale" may itself be None.
+        metadata = getattr(result, "metadata", None) or {}
+        explanation = metadata.get("rationale") or ""
+
+        return EvaluatorOutput(value=score, label=label, explanation=explanation)
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
@@ -0,0 +1,109 @@
+"""Base adapter for third-party evaluation framework integrations."""
+
+import abc
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
+    parse_spans,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class BaseAdapter(abc.ABC):
+    """Base adapter for third-party evaluation framework integrations.
+
+    Accepts an EvaluatorInput (from the code_based_evaluators flow),
+    extracts fields from spans using the built-in parser layer, runs the
+    evaluation via execute(), and returns an EvaluatorOutput.
+
+    Never raises unhandled exceptions — always returns a valid EvaluatorOutput.
+    """
+
+    def __init__(
+        self,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict with keys: 'input', 'actual_output', and optionally
+                'expected_output', 'context', 'retrieval_context'. Bypasses default
+                span parsing when provided.
+        """
+        self.field_mapper = field_mapper
+
+    def __call__(self, evaluator_input: EvaluatorInput, context: Any = None) -> EvaluatorOutput:
+        """Handle an evaluation invocation.
+
+        Args:
+            evaluator_input: Parsed EvaluatorInput from the code-based evaluator flow.
+            context: Lambda context object (unused).
+
+        Returns:
+            EvaluatorOutput with score, label, and explanation or error fields.
+        """
+        # Catch broadly: field_mapper and subclass hooks are user code and must not propagate.
+        try:
+            fields = self._extract_fields(evaluator_input)
+        except Exception as e:
+            logger.error("Field extraction failed: %s", e)
+            return self._error_output("FIELD_EXTRACTION_ERROR", str(e))
+
+        try:
+            self.validate_fields(fields)
+        except ValueError as e:
+            logger.error("Validation failed: %s", e)
+            return self._error_output("MISSING_REQUIRED_FIELD", str(e))
+        except Exception as e:
+            logger.error("Validation failed: %s", e, exc_info=True)
+            return self._error_output("METRIC_ERROR", f"{type(self).__name__} failed: {e}")
+
+        try:
+            return self.execute(fields)
+        except Exception as e:
+            logger.error("Execution failed: %s", e, exc_info=True)
+            return self._error_output("METRIC_ERROR", f"{type(self).__name__} failed: {e}")
+
+    @staticmethod
+    def _error_output(error_code: str, message: str) -> EvaluatorOutput:
+        """Build the EvaluatorOutput returned when a stage fails."""
+        return EvaluatorOutput(label="Error", errorCode=error_code, errorMessage=message)
+
+    def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]:
+        """Extract fields from the EvaluatorInput.
+
+        Uses field_mapper when one was supplied; otherwise parses the session spans.
+        """
+        if self.field_mapper is not None:
+            return self.field_mapper(evaluator_input)
+
+        result = parse_spans(evaluator_input.session_spans, evaluator_input.reference_inputs)
+        return result.to_dict()
+
+    @abc.abstractmethod
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Validate that required fields are present.
+
+        Each adapter must explicitly declare its validation behavior.
+
+        Args:
+            fields: Extracted field dict.
+
+        Raises:
+            ValueError: If required fields are missing.
+        """
+
+    @abc.abstractmethod
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Run the evaluation and return an EvaluatorOutput.
+
+        Args:
+            fields: Extracted field dict with keys like "input", "actual_output", etc.
+
+        Returns:
+            EvaluatorOutput with evaluation results.
+        """
diff --git a/...edrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py b/...edrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
@@ -0,0 +1,5 @@
+"""DeepEval adapter for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter
+
+__all__ = ["DeepEvalAdapter"]
diff --git a/...bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py b/...bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
@@ -0,0 +1,89 @@
+"""DeepEval adapter for AgentCore code-based evaluators."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class DeepEvalAdapter(BaseAdapter):
+    """Adapter that runs a DeepEval metric against AgentCore evaluation events.
+
+    Example::
+
+        from deepeval.metrics import AnswerRelevancyMetric
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = AnswerRelevancyMetric(threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+    """
+
+    def __init__(
+        self,
+        metric: BaseMetric,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+        model: Optional[Any] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict of LLMTestCase field values. Bypasses default span
+                parsing when provided.
+            model: Optional model override for the metric's LLM.
+        """
+        super().__init__(field_mapper=field_mapper)
+        self.metric = metric
+        if model is not None:
+            self.metric.model = model
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Validate that input and actual_output are present."""
+        missing = [name for name in ("input", "actual_output") if not fields.get(name)]
+        if missing:
+            metric_name = type(self.metric).__name__
+            raise ValueError(
+                f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
+                f"Provide a field_mapper or ensure spans contain the necessary data."
+            )
+
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Run the DeepEval metric and return formatted results."""
+        test_case = LLMTestCase(
+            input=fields.get("input", ""),
+            actual_output=fields.get("actual_output", ""),
+            expected_output=fields.get("expected_output"),
+            context=fields.get("context"),
+            retrieval_context=fields.get("retrieval_context"),
+        )
+
+        try:
+            self.metric.measure(test_case)
+        except Exception as e:
+            # Matched by class name so the DeepEval error type does not have to be imported.
+            if type(e).__name__ == "MissingTestCaseParamsError":
+                return EvaluatorOutput(
+                    label="Error",
+                    errorCode="MISSING_REQUIRED_FIELD",
+                    errorMessage=f"{type(self.metric).__name__} requires fields not extracted from spans: {e}. "
+                    f"Provide a field_mapper to supply custom fields from your trace data.",
+                )
+            raise
+
+        score = self.metric.score
+        reason = getattr(self.metric, "reason", None) or ""
+        success = getattr(self.metric, "success", None)
+        if success is None:
+            # The metric reported no verdict; derive one from the score and threshold.
+            threshold = getattr(self.metric, "threshold", 0.5)
+            success = score is not None and score >= threshold
+        label = "Pass" if success else "Fail"
+
+        return EvaluatorOutput(value=score, label=label, explanation=reason)
diff --git a/...ck_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/...ck_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
@@ -0,0 +1,8 @@
+"""Span parsers for extracting evaluation fields from Agent SDK trace formats."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.base import (
+    SpanParseResult,
+    parse_spans,
+)
+
+__all__ = ["SpanParseResult", "parse_spans"]
diff --git a/...edrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py b/...edrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
@@ -0,0 +1,63 @@
+"""Base span parsing logic and orchestration across format-specific parsers."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.strands import (
+    parse_strands_spans,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.otel_langchain import (
+    parse_otel_langchain_spans,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.openinference import (
+    parse_openinference_spans,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_PARSERS = [  # parse_spans tries these in list order; the first non-None result is returned
+    parse_strands_spans,
+    parse_otel_langchain_spans,
+    parse_openinference_spans,
+]
+
+
+def parse_spans(
+    session_spans: List[Dict[str, Any]],
+    reference_inputs: Optional[List[Any]] = None,
+) -> SpanParseResult:
+    """Extract evaluation fields from session spans.
+
+    The first format-specific parser (Strands, OTel LangChain, OpenInference) to
+    return a result wins; the first reference input may supply expected_output.
+
+    Args:
+        session_spans: Raw ADOT span dicts from the evaluation service.
+        reference_inputs: Optional ReferenceInput list for expected_output.
+
+    Returns:
+        SpanParseResult with extracted fields.
+
+    Raises:
+        ValueError: If no parser can extract data from the spans.
+    """
+    for extract in _PARSERS:
+        parsed = extract(session_spans)
+        if parsed is not None:
+            break
+    else:
+        raise ValueError(
+            "Could not extract evaluation fields from spans. "
+            "No agent-level span with gen_ai.operation.name=='invoke_agent' and "
+            "valid span_events found. Provide a field_mapper for custom formats."
+        )
+
+    if reference_inputs:
+        expected_text = getattr(reference_inputs[0], "expected_response_text", None)
+        if expected_text:
+            parsed.expected_output = expected_text
+    return parsed