From 77d16a90a2bc14adb6f46c0635a5f82254c95d21 Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 10:39:26 -0700
Subject: [PATCH 01/13] Add DeepEvalHandler integration with unit tests

Introduces a new integrations/deepeval/ module that adapts AgentCore
Lambda evaluation events into DeepEval LLMTestCase objects, runs any
BaseMetric, and returns structured score/label/explanation responses.
---
 .../integrations/deepeval/__init__.py         |   5 +
 .../integrations/deepeval/handler.py          |  88 +++++
 .../integrations/deepeval/input_mapper.py     | 191 ++++++++++
 .../integrations/deepeval/__init__.py         |   0
 .../integrations/deepeval/test_handler.py     | 230 ++++++++++++
 .../deepeval/test_input_mapper.py             | 331 ++++++++++++++++++
 6 files changed, 845 insertions(+)
 create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
 create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
 create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
 create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
 create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
 create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py

diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
new file mode 100644
index 00000000..76f6461f
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
@@ -0,0 +1,5 @@
+"""DeepEval integration for AgentCore Evaluation."""
+
+from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler
+
+__all__ = ["DeepEvalHandler"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
new file mode 100644
index 00000000..b339b883
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
@@ -0,0 +1,88 @@
+"""DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from deepeval.metrics import BaseMetric
+
+from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import (
+    ParsedEvaluationEvent,
+    build_test_case,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DeepEvalHandler:
+    """Lambda handler that runs a DeepEval metric against AgentCore evaluation events.
+
+    Never raises unhandled exceptions — always returns a valid response dict.
+
+    Example::
+
+        from deepeval.metrics import AnswerRelevancyMetric
+
+        metric = AnswerRelevancyMetric(threshold=0.7)
+        handler = DeepEvalHandler(metric=metric)
+
+        # Use as Lambda handler
+        def lambda_handler(event, context):
+            return handler(event, context)
+    """
+
+    def __init__(
+        self,
+        metric: BaseMetric,
+        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+    ):
+        """Initialize the handler.
+
+        Args:
+            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
+            field_mapper: Optional callable that receives an event dict rebuilt from
+                the parsed fields (other payload keys are not forwarded) and returns
+                LLMTestCase field values. Bypasses default span extraction.
+        """
+        self.metric = metric
+        self.field_mapper = field_mapper
+
+    def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]:
+        """Handle a Lambda invocation.
+
+        Args:
+            event: Raw Lambda event dict from the evaluation service.
+            context: Lambda context object (unused).
+
+        Returns:
+            Success: {"value": float, "label": "Pass" | "Fail", "explanation": str}
+            Error: {"errorCode": str, "errorMessage": str}
+        """
+        try:
+            parsed = ParsedEvaluationEvent.from_lambda_event(event)
+            test_case = build_test_case(parsed, self.metric, self.field_mapper)
+        except ValueError as e:  # raised by build_test_case for unpopulated fields
+            logger.error("Missing required fields: %s", e)
+            return _error_response("MISSING_REQUIRED_FIELD", str(e))
+        except Exception as e:  # malformed payloads and field_mapper failures end up here
+            logger.error("Failed to parse evaluation event: %s", e, exc_info=True)
+            return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}")
+
+        try:
+            self.metric.measure(test_case)
+        except Exception as e:
+            logger.error("Metric measurement failed: %s", e, exc_info=True)
+            return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}")
+
+        score = self.metric.score
+        reason = getattr(self.metric, "reason", None) or ""
+        passed = getattr(self.metric, "success", None)  # the metric's own verdict, if it set one
+        if not isinstance(passed, bool):  # otherwise treat threshold as a minimum score
+            passed = score is not None and score >= getattr(self.metric, "threshold", 0.5)
+        label = "Pass" if passed else "Fail"
+
+        return {"value": score, "label": label, "explanation": reason}
+
+
+def _error_response(code: str, message: str) -> Dict[str, str]:
+    """Return the error payload: ``errorCode`` paired with ``errorMessage``."""
+    return dict(errorCode=code, errorMessage=message)
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
new file mode 100644
index 00000000..50873cf5
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
@@ -0,0 +1,191 @@
+"""Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects."""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+logger = logging.getLogger(__name__)
+
+_PARAM_TO_FIELD: Dict[LLMTestCaseParams, str] = {  # LLMTestCaseParams member -> LLMTestCase field name
+    LLMTestCaseParams.INPUT: "input",
+    LLMTestCaseParams.ACTUAL_OUTPUT: "actual_output",
+    LLMTestCaseParams.EXPECTED_OUTPUT: "expected_output",
+    LLMTestCaseParams.CONTEXT: "context",
+    LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrieval_context",
+}
+
+_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = {  # metric class name -> required field names
+    "AnswerRelevancyMetric": ["input", "actual_output"],
+    "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"],
+    "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"],
+    "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
+    "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
+    "HallucinationMetric": ["input", "actual_output", "context"],
+    "BiasMetric": ["input", "actual_output"],
+    "ToxicityMetric": ["input", "actual_output"],
+    "GEval": ["input", "actual_output"],
+    "SummarizationMetric": ["input", "actual_output"],
+}
+
+
+@dataclass
+class ParsedEvaluationEvent:
+    """Structured view of the fields read from an AgentCore Lambda evaluation event."""
+
+    evaluation_level: str  # value of the event's "evaluationLevel" key
+    session_spans: List[Dict[str, Any]]  # evaluationInput["sessionSpans"], passed through as-is
+    target_trace_id: Optional[str] = None  # first entry of evaluationTarget["traceIds"], if any
+    target_span_id: Optional[str] = None  # first entry of evaluationTarget["spanIds"], if any
+    reference_inputs: List[Dict[str, Any]] = field(default_factory=list)
+
+    @classmethod
+    def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent":
+        """Build a ParsedEvaluationEvent from a raw Lambda payload.
+
+        Args:
+            event: Event dict as delivered to the Lambda by the evaluation service.
+
+        Returns:
+            A ParsedEvaluationEvent keeping only the first target trace id and span id.
+
+        Raises:
+            KeyError: If evaluationInput, evaluationLevel or sessionSpans is absent.
+        """
+        span_container = event["evaluationInput"]
+        target_spec = event.get("evaluationTarget") or {}
+        trace_id_list = target_spec.get("traceIds") or []
+        span_id_list = target_spec.get("spanIds") or []
+
+        init_kwargs: Dict[str, Any] = {"evaluation_level": event["evaluationLevel"]}
+        init_kwargs["session_spans"] = span_container["sessionSpans"]
+        init_kwargs["target_trace_id"] = trace_id_list[0] if trace_id_list else None
+        init_kwargs["target_span_id"] = span_id_list[0] if span_id_list else None
+        init_kwargs["reference_inputs"] = event.get("evaluationReferenceInputs") or []
+
+        return cls(**init_kwargs)
+
+
+def _get_required_params(metric: BaseMetric) -> List[str]:
+    """Return the LLMTestCase field names that ``metric`` requires.
+
+    Fallback chain:
+        1. metric._required_params (unmapped enum members use their ``.value``)
+        2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name (returned as a copy)
+        3. metric.evaluation_params (GEval special case), mapped like step 1
+        4. Default: ["input", "actual_output"]
+    """
+    if hasattr(metric, "_required_params") and metric._required_params:
+        params = metric._required_params
+        return [_PARAM_TO_FIELD.get(p) or str(getattr(p, "value", p)).lower() for p in params]
+
+    class_name = type(metric).__name__
+    if class_name in _METRIC_REQUIRED_PARAMS:
+        return list(_METRIC_REQUIRED_PARAMS[class_name])  # copy: callers must not mutate the registry
+
+    if hasattr(metric, "evaluation_params") and metric.evaluation_params:
+        params = metric.evaluation_params
+        return [_PARAM_TO_FIELD.get(p) or str(getattr(p, "value", p)).lower() for p in params]
+
+    return ["input", "actual_output"]
+
+
+def _extract_fields_from_spans(
+    parsed: ParsedEvaluationEvent,
+) -> Dict[str, Any]:
+    """Extract LLMTestCase fields from ADOT session spans.
+
+    Bridges Session → LLMTestCase fields:
+        - input ← user messages (role=="user")
+        - actual_output ← assistant messages (role=="assistant")
+        - retrieval_context ← tool messages (role=="tool")
+        - expected_output ← evaluationReferenceInputs[0].expectedResponse
+    """
+    user_messages: List[str] = []
+    assistant_messages: List[str] = []
+    tool_messages: List[str] = []
+
+    for span in parsed.session_spans:
+        attributes = span.get("attributes", {})
+        role = attributes.get("gen_ai.message.role", "")
+        content = attributes.get("gen_ai.message.content", "")
+
+        if not content:
+            content = attributes.get("gen_ai.completion", "")
+
+        if role == "user" and content:
+            user_messages.append(content)
+        elif role == "assistant" and content:
+            assistant_messages.append(content)
+        elif role == "tool" and content:
+            tool_messages.append(content)
+
+    fields: Dict[str, Any] = {}
+
+    if user_messages:
+        fields["input"] = "\n".join(user_messages)
+    if assistant_messages:
+        fields["actual_output"] = "\n".join(assistant_messages)
+    if tool_messages:
+        fields["retrieval_context"] = tool_messages
+
+    if parsed.reference_inputs:
+        expected = parsed.reference_inputs[0].get("expectedResponse")
+        if expected:
+            fields["expected_output"] = expected
+
+    return fields
+
+
+def build_test_case(
+    parsed: ParsedEvaluationEvent,
+    metric: BaseMetric,
+    field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+) -> LLMTestCase:
+    """Build a DeepEval LLMTestCase from a parsed evaluation event.
+
+    Args:
+        parsed: The parsed Lambda event.
+        metric: The DeepEval metric (used to determine required fields).
+        field_mapper: Optional callable that receives an event dict rebuilt from
+            ``parsed`` (not the original payload) and returns a dict of LLMTestCase
+            field values. Bypasses default span extraction when provided.
+
+    Returns:
+        An LLMTestCase ready for metric.measure().
+
+    Raises:
+        ValueError: If field_mapper returns a non-dict or a required field is empty.
+    """
+    if field_mapper is not None:
+        raw_event = {
+            "evaluationLevel": parsed.evaluation_level,
+            "evaluationInput": {"sessionSpans": parsed.session_spans},
+            "evaluationTarget": {
+                "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [],
+                "spanIds": [parsed.target_span_id] if parsed.target_span_id else [],
+            },
+            "evaluationReferenceInputs": parsed.reference_inputs,
+        }
+        fields = field_mapper(raw_event)
+        if not isinstance(fields, dict):  # keep mapper mistakes inside the ValueError contract
+            raise ValueError(f"field_mapper must return a dict, got {type(fields).__name__}")
+    else:
+        fields = _extract_fields_from_spans(parsed)
+
+    missing = [f for f in _get_required_params(metric) if not fields.get(f)]
+    if missing:
+        raise ValueError(
+            f"Field(s) {missing} required by {type(metric).__name__} but not found in evaluation event. "
+            "Provide a field_mapper or ensure spans contain the necessary data."
+        )
+
+    return LLMTestCase(
+        input=fields.get("input", ""),
+        actual_output=fields.get("actual_output", ""),
+        expected_output=fields.get("expected_output"),
+        context=fields.get("context"),
+        retrieval_context=fields.get("retrieval_context"),
+    )
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
new file mode 100644
index 00000000..77988ab7
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
@@ -0,0 +1,230 @@
+"""Tests for DeepEvalHandler."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler
+
+
+def _make_event(
+    level="TRACE",
+    trace_ids=None,
+    spans=None,
+    reference_inputs=None,
+):
+    """Build a raw Lambda event dict for testing."""
+    event = {
+        "schemaVersion": "1.0",
+        "evaluationLevel": level,
+        "evaluationInput": {
+            "sessionSpans": spans
+            or [
+                {
+                    "traceId": "abc123",
+                    "spanId": "span1",
+                    "attributes": {
+                        "gen_ai.message.role": "user",
+                        "gen_ai.message.content": "What is AI?",
+                    },
+                },
+                {
+                    "traceId": "abc123",
+                    "spanId": "span2",
+                    "attributes": {
+                        "gen_ai.message.role": "assistant",
+                        "gen_ai.message.content": "AI is artificial intelligence.",
+                    },
+                },
+            ]
+        },
+        "evaluationTarget": {},
+    }
+    if trace_ids is not None:
+        event["evaluationTarget"]["traceIds"] = trace_ids
+    if reference_inputs is not None:
+        event["evaluationReferenceInputs"] = reference_inputs
+    return event
+
+
+def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"):
+    """Create a mock metric that returns a fixed score on measure()."""
+    metric = MagicMock()
+    type(metric).__name__ = name
+    metric.threshold = threshold
+    metric.score = score
+    metric.reason = reason
+    metric._required_params = None
+    del metric._required_params
+    del metric.evaluation_params
+
+    def measure_side_effect(test_case):
+        metric.score = score
+        metric.reason = reason
+
+    metric.measure = MagicMock(side_effect=measure_side_effect)
+    return metric
+
+
+class TestDeepEvalHandlerSuccess:
+    def test_returns_pass_when_score_above_threshold(self):
+        metric = _mock_metric(score=0.9, threshold=0.7)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.9
+        assert result["label"] == "Pass"
+        assert result["explanation"] == "Looks good"
+
+    def test_returns_fail_when_score_below_threshold(self):
+        metric = _mock_metric(score=0.3, threshold=0.7)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.3
+        assert result["label"] == "Fail"
+
+    def test_returns_pass_at_exact_threshold(self):
+        metric = _mock_metric(score=0.7, threshold=0.7)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["label"] == "Pass"
+
+    def test_metric_measure_called_with_test_case(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+
+        handler(_make_event())
+
+        metric.measure.assert_called_once()
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.input == "What is AI?"
+        assert test_case.actual_output == "AI is artificial intelligence."
+
+    def test_context_parameter_ignored(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+        mock_context = {"function_name": "my-lambda"}
+
+        result = handler(_make_event(), mock_context)
+
+        assert result["value"] == 0.85
+
+    def test_custom_field_mapper(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(
+            metric=metric,
+            field_mapper=lambda event: {
+                "input": "mapped input",
+                "actual_output": "mapped output",
+            },
+        )
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.85
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.input == "mapped input"
+        assert test_case.actual_output == "mapped output"
+
+
+class TestDeepEvalHandlerErrors:
+    def test_invalid_event_returns_error(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler({})
+
+        assert result["errorCode"] == "INVALID_EVENT"
+        assert "errorMessage" in result
+        assert "value" not in result
+
+    def test_missing_evaluation_input_returns_error(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+
+        event = {"evaluationLevel": "TRACE", "evaluationTarget": {}}
+        result = handler(event)
+
+        assert result["errorCode"] == "INVALID_EVENT"
+
+    def test_missing_required_field_returns_error(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "q"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s2",
+                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "a"},
+            },
+        ]
+        metric = _mock_metric(name="FaithfulnessMetric")
+        handler = DeepEvalHandler(metric=metric)
+
+        event = _make_event(spans=spans)
+        result = handler(event)
+
+        assert result["errorCode"] == "MISSING_REQUIRED_FIELD"
+        assert "retrieval_context" in result["errorMessage"]
+
+    def test_metric_measure_exception_returns_error(self):
+        metric = _mock_metric()
+        metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout"))
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["errorCode"] == "METRIC_ERROR"
+        assert "LLM timeout" in result["errorMessage"]
+
+    def test_never_raises_on_any_input(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+
+        for bad_input in [None, [], "string", 42, {"random": "keys"}]:
+            result = handler(bad_input)
+            assert "errorCode" in result or "value" in result
+
+
+class TestDeepEvalHandlerEdgeCases:
+    def test_metric_with_no_reason(self):
+        metric = _mock_metric(score=0.8, reason=None)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["explanation"] == ""
+
+    def test_metric_score_zero(self):
+        metric = _mock_metric(score=0.0, threshold=0.5)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.0
+        assert result["label"] == "Fail"
+
+    def test_metric_score_one(self):
+        metric = _mock_metric(score=1.0, threshold=0.5)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 1.0
+        assert result["label"] == "Pass"
+
+    def test_default_threshold_when_missing(self):
+        metric = _mock_metric(score=0.6)
+        del metric.threshold
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["label"] == "Pass"
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
new file mode 100644
index 00000000..efab5459
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
@@ -0,0 +1,331 @@
+"""Tests for deepeval input_mapper module."""
+
+from unittest.mock import MagicMock
+
+import pytest
+from deepeval.test_case import LLMTestCaseParams
+
+from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import (
+    ParsedEvaluationEvent,
+    _get_required_params,
+    build_test_case,
+)
+
+
+def _make_event(
+    level="TRACE",
+    trace_ids=None,
+    span_ids=None,
+    spans=None,
+    reference_inputs=None,
+):
+    """Build a raw Lambda event dict for testing."""
+    event = {
+        "schemaVersion": "1.0",
+        "evaluationLevel": level,
+        "evaluationInput": {
+            "sessionSpans": spans
+            or [
+                {
+                    "traceId": "abc123",
+                    "spanId": "span1",
+                    "attributes": {
+                        "gen_ai.message.role": "user",
+                        "gen_ai.message.content": "What is the capital of France?",
+                    },
+                },
+                {
+                    "traceId": "abc123",
+                    "spanId": "span2",
+                    "attributes": {
+                        "gen_ai.message.role": "assistant",
+                        "gen_ai.message.content": "The capital of France is Paris.",
+                    },
+                },
+            ]
+        },
+        "evaluationTarget": {},
+    }
+    if trace_ids is not None:
+        event["evaluationTarget"]["traceIds"] = trace_ids
+    if span_ids is not None:
+        event["evaluationTarget"]["spanIds"] = span_ids
+    if reference_inputs is not None:
+        event["evaluationReferenceInputs"] = reference_inputs
+    return event
+
+
+def _mock_metric(name="MockMetric", required_params=None, evaluation_params=None, threshold=0.5):
+    """Create a mock DeepEval metric."""
+    metric = MagicMock()
+    type(metric).__name__ = name
+    metric.threshold = threshold
+
+    if required_params is not None:
+        metric._required_params = required_params
+    else:
+        del metric._required_params
+
+    if evaluation_params is not None:
+        metric.evaluation_params = evaluation_params
+    else:
+        del metric.evaluation_params
+
+    return metric
+
+
+class TestParsedEvaluationEvent:
+    def test_from_lambda_event_trace_level(self):
+        event = _make_event(level="TRACE", trace_ids=["trace-1"])
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+
+        assert parsed.evaluation_level == "TRACE"
+        assert parsed.target_trace_id == "trace-1"
+        assert parsed.target_span_id is None
+        assert len(parsed.session_spans) == 2
+
+    def test_from_lambda_event_tool_call_level(self):
+        event = _make_event(level="TOOL_CALL", span_ids=["span-42"])
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+
+        assert parsed.evaluation_level == "TOOL_CALL"
+        assert parsed.target_span_id == "span-42"
+        assert parsed.target_trace_id is None
+
+    def test_from_lambda_event_session_level(self):
+        event = _make_event(level="SESSION")
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+
+        assert parsed.evaluation_level == "SESSION"
+        assert parsed.target_trace_id is None
+        assert parsed.target_span_id is None
+
+    def test_from_lambda_event_with_reference_inputs(self):
+        refs = [{"expectedResponse": "Paris is the capital of France."}]
+        event = _make_event(reference_inputs=refs)
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+
+        assert parsed.reference_inputs == refs
+
+    def test_from_lambda_event_missing_reference_inputs(self):
+        event = _make_event()
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+
+        assert parsed.reference_inputs == []
+
+    def test_from_lambda_event_missing_evaluation_level_raises(self):
+        event = _make_event()
+        del event["evaluationLevel"]
+
+        with pytest.raises(KeyError):
+            ParsedEvaluationEvent.from_lambda_event(event)
+
+    def test_from_lambda_event_missing_evaluation_input_raises(self):
+        event = _make_event()
+        del event["evaluationInput"]
+
+        with pytest.raises(KeyError):
+            ParsedEvaluationEvent.from_lambda_event(event)
+
+    def test_from_lambda_event_missing_target_key_defaults(self):
+        event = _make_event()
+        del event["evaluationTarget"]
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+
+        assert parsed.target_trace_id is None
+        assert parsed.target_span_id is None
+
+
+class TestGetRequiredParams:
+    def test_uses_required_params_attribute(self):
+        metric = _mock_metric(
+            required_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
+        )
+        result = _get_required_params(metric)
+
+        assert result == ["input", "actual_output"]
+
+    def test_falls_back_to_static_registry(self):
+        metric = _mock_metric(name="FaithfulnessMetric")
+        result = _get_required_params(metric)
+
+        assert result == ["input", "actual_output", "retrieval_context"]
+
+    def test_falls_back_to_evaluation_params(self):
+        metric = _mock_metric(
+            name="UnknownMetric",
+            evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
+        )
+        result = _get_required_params(metric)
+
+        assert result == ["input", "retrieval_context"]
+
+    def test_defaults_to_input_and_actual_output(self):
+        metric = _mock_metric(name="UnknownMetric")
+        result = _get_required_params(metric)
+
+        assert result == ["input", "actual_output"]
+
+    def test_empty_required_params_falls_through(self):
+        metric = _mock_metric(name="UnknownMetric", required_params=[])
+        result = _get_required_params(metric)
+
+        assert result == ["input", "actual_output"]
+
+
+class TestBuildTestCase:
+    def test_basic_span_extraction(self):
+        event = _make_event()
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="AnswerRelevancyMetric")
+
+        test_case = build_test_case(parsed, metric)
+
+        assert test_case.input == "What is the capital of France?"
+        assert test_case.actual_output == "The capital of France is Paris."
+
+    def test_retrieval_context_from_tool_spans(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s2",
+                "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 1"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s3",
+                "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 2"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s4",
+                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"},
+            },
+        ]
+        event = _make_event(spans=spans)
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="FaithfulnessMetric")
+
+        test_case = build_test_case(parsed, metric)
+
+        assert test_case.input == "query"
+        assert test_case.actual_output == "answer"
+        assert test_case.retrieval_context == ["doc chunk 1", "doc chunk 2"]
+
+    def test_expected_output_from_reference_inputs(self):
+        refs = [{"expectedResponse": "Paris"}]
+        event = _make_event(reference_inputs=refs)
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="AnswerRelevancyMetric")
+
+        test_case = build_test_case(parsed, metric)
+
+        assert test_case.expected_output == "Paris"
+
+    def test_missing_required_field_raises_value_error(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s2",
+                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"},
+            },
+        ]
+        event = _make_event(spans=spans)
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="FaithfulnessMetric")
+
+        with pytest.raises(ValueError, match="retrieval_context"):
+            build_test_case(parsed, metric)
+
+    def test_custom_field_mapper_bypasses_extraction(self):
+        event = _make_event()
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="AnswerRelevancyMetric")
+
+        def custom_mapper(raw_event):
+            return {
+                "input": "custom input",
+                "actual_output": "custom output",
+            }
+
+        test_case = build_test_case(parsed, metric, field_mapper=custom_mapper)
+
+        assert test_case.input == "custom input"
+        assert test_case.actual_output == "custom output"
+
+    def test_field_mapper_receives_reconstructed_event(self):
+        refs = [{"expectedResponse": "expected"}]
+        event = _make_event(level="TRACE", trace_ids=["t1"], reference_inputs=refs)
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="AnswerRelevancyMetric")
+
+        received_events = []
+
+        def capture_mapper(raw_event):
+            received_events.append(raw_event)
+            return {"input": "x", "actual_output": "y"}
+
+        build_test_case(parsed, metric, field_mapper=capture_mapper)
+
+        raw = received_events[0]
+        assert raw["evaluationLevel"] == "TRACE"
+        assert raw["evaluationTarget"]["traceIds"] == ["t1"]
+        assert raw["evaluationReferenceInputs"] == refs
+
+    def test_multiple_user_messages_concatenated(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "hello"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s2",
+                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "world"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s3",
+                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "hi"},
+            },
+        ]
+        event = _make_event(spans=spans)
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="AnswerRelevancyMetric")
+
+        test_case = build_test_case(parsed, metric)
+
+        assert test_case.input == "hello\nworld"
+
+    def test_gen_ai_completion_fallback(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.message.role": "user", "gen_ai.completion": "fallback input"},
+            },
+            {
+                "traceId": "t1",
+                "spanId": "s2",
+                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.completion": "fallback output"},
+            },
+        ]
+        event = _make_event(spans=spans)
+        parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        metric = _mock_metric(name="AnswerRelevancyMetric")
+
+        test_case = build_test_case(parsed, metric)
+
+        assert test_case.input == "fallback input"
+        assert test_case.actual_output == "fallback output"

From 402ea7891e0175a0d00255ee69fe62888ba2a631 Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 11:38:57 -0700
Subject: [PATCH 02/13] Fix span extraction to use real AgentCore
 _eval_log_records structure

---
 .../integrations/deepeval/input_mapper.py     |  94 +++-
 .../integrations/deepeval/test_handler.py     |  57 +--
 .../deepeval/test_input_mapper.py             | 402 ++++++++++++++----
 3 files changed, 415 insertions(+), 138 deletions(-)

diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
index 50873cf5..cd67845f 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
@@ -1,5 +1,6 @@
 """Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects."""
 
+import json
 import logging
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional
@@ -92,15 +93,36 @@ def _get_required_params(metric: BaseMetric) -> List[str]:
     return ["input", "actual_output"]
 
 
+def _get_message_content(message: Any) -> str:
+    """Extract text content from a message object.
+
+    Accepts a plain string or a dict with a "content" or "message" key. Nested dicts
+    are unwrapped recursively; a null value is skipped rather than returned as "None".
+    """
+    if isinstance(message, str):
+        return message
+    if isinstance(message, dict):
+        for key in ("content", "message"):
+            val = message.get(key)
+            if isinstance(val, str):
+                return val
+            if isinstance(val, dict):
+                return _get_message_content(val)
+            if val is not None:  # null/absent: try the next key, never return "None"
+                return str(val)
+    return ""
+
+
 def _extract_fields_from_spans(
     parsed: ParsedEvaluationEvent,
 ) -> Dict[str, Any]:
-    """Extract LLMTestCase fields from ADOT session spans.
+    """Extract LLMTestCase fields from AgentCore session spans.
 
-    Bridges Session → LLMTestCase fields:
-        - input ← user messages (role=="user")
-        - actual_output ← assistant messages (role=="assistant")
-        - retrieval_context ← tool messages (role=="tool")
+    Parses _eval_log_records from span attributes, filters by target_trace_id,
+    and extracts messages by role:
+        - input ← input messages where role=="user"
+        - actual_output ← output messages where role=="assistant"
+        - retrieval_context ← output messages where role=="tool"
         - expected_output ← evaluationReferenceInputs[0].expectedResponse
     """
     user_messages: List[str] = []
@@ -109,18 +131,56 @@ def _extract_fields_from_spans(
 
     for span in parsed.session_spans:
         attributes = span.get("attributes", {})
-        role = attributes.get("gen_ai.message.role", "")
-        content = attributes.get("gen_ai.message.content", "")
-
-        if not content:
-            content = attributes.get("gen_ai.completion", "")
-
-        if role == "user" and content:
-            user_messages.append(content)
-        elif role == "assistant" and content:
-            assistant_messages.append(content)
-        elif role == "tool" and content:
-            tool_messages.append(content)
+        log_records_raw = attributes.get("_eval_log_records")
+        if not log_records_raw:
+            continue
+
+        if isinstance(log_records_raw, str):
+            try:
+                log_records = json.loads(log_records_raw)
+            except (json.JSONDecodeError, TypeError):
+                logger.debug("Failed to parse _eval_log_records as JSON")
+                continue
+        else:
+            log_records = log_records_raw
+
+        if not isinstance(log_records, list):
+            continue
+
+        for record in log_records:
+            if not isinstance(record, dict):
+                continue
+
+            if parsed.target_trace_id:
+                record_trace_id = record.get("traceId") or record.get("trace_id")
+                if record_trace_id and record_trace_id != parsed.target_trace_id:
+                    continue
+
+            body = record.get("body", {})
+            if not isinstance(body, dict):
+                continue
+
+            input_data = body.get("input", {})
+            if isinstance(input_data, dict):
+                for msg in input_data.get("messages", []):
+                    if not isinstance(msg, dict):
+                        continue
+                    role = msg.get("role", "")
+                    content = _get_message_content(msg)
+                    if role == "user" and content:
+                        user_messages.append(content)
+
+            output_data = body.get("output", {})
+            if isinstance(output_data, dict):
+                for msg in output_data.get("messages", []):
+                    if not isinstance(msg, dict):
+                        continue
+                    role = msg.get("role", "")
+                    content = _get_message_content(msg)
+                    if role == "assistant" and content:
+                        assistant_messages.append(content)
+                    elif role == "tool" and content:
+                        tool_messages.append(content)
 
     fields: Dict[str, Any] = {}
 
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
index 77988ab7..c3fa98ae 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
@@ -1,5 +1,6 @@
 """Tests for DeepEvalHandler."""
 
+import json
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -14,30 +15,27 @@ def _make_event(
     reference_inputs=None,
 ):
     """Build a raw Lambda event dict for testing."""
+    if spans is None:
+        log_records = [
+            {
+                "body": {
+                    "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
+                    "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
+                }
+            }
+        ]
+        spans = [
+            {
+                "traceId": "abc123",
+                "spanId": "span1",
+                "attributes": {"_eval_log_records": json.dumps(log_records)},
+            }
+        ]
+
     event = {
         "schemaVersion": "1.0",
         "evaluationLevel": level,
-        "evaluationInput": {
-            "sessionSpans": spans
-            or [
-                {
-                    "traceId": "abc123",
-                    "spanId": "span1",
-                    "attributes": {
-                        "gen_ai.message.role": "user",
-                        "gen_ai.message.content": "What is AI?",
-                    },
-                },
-                {
-                    "traceId": "abc123",
-                    "spanId": "span2",
-                    "attributes": {
-                        "gen_ai.message.role": "assistant",
-                        "gen_ai.message.content": "AI is artificial intelligence.",
-                    },
-                },
-            ]
-        },
+        "evaluationInput": {"sessionSpans": spans},
         "evaluationTarget": {},
     }
     if trace_ids is not None:
@@ -153,17 +151,20 @@ def test_missing_evaluation_input_returns_error(self):
         assert result["errorCode"] == "INVALID_EVENT"
 
     def test_missing_required_field_returns_error(self):
+        log_records = [
+            {
+                "body": {
+                    "input": {"messages": [{"role": "user", "content": "q"}]},
+                    "output": {"messages": [{"role": "assistant", "content": "a"}]},
+                }
+            }
+        ]
         spans = [
             {
                 "traceId": "t1",
                 "spanId": "s1",
-                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "q"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s2",
-                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "a"},
-            },
+                "attributes": {"_eval_log_records": json.dumps(log_records)},
+            }
         ]
         metric = _mock_metric(name="FaithfulnessMetric")
         handler = DeepEvalHandler(metric=metric)
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
index efab5459..67447f48 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
@@ -1,5 +1,6 @@
 """Tests for deepeval input_mapper module."""
 
+import json
 from unittest.mock import MagicMock
 
 import pytest
@@ -7,11 +8,38 @@
 
 from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import (
     ParsedEvaluationEvent,
+    _extract_fields_from_spans,
     _get_required_params,
     build_test_case,
 )
 
 
+def _make_log_record(
+    input_messages=None,
+    output_messages=None,
+    trace_id=None,
+):
+    """Build a single log record dict."""
+    record = {"body": {}}
+    if input_messages is not None:
+        record["body"]["input"] = {"messages": input_messages}
+    if output_messages is not None:
+        record["body"]["output"] = {"messages": output_messages}
+    if trace_id is not None:
+        record["traceId"] = trace_id
+    return record
+
+
+def _make_span_with_log_records(log_records, span_id="span1", as_json_string=True):
+    """Build a span dict with _eval_log_records in attributes."""
+    value = json.dumps(log_records) if as_json_string else log_records
+    return {
+        "traceId": "abc123",
+        "spanId": span_id,
+        "attributes": {"_eval_log_records": value},
+    }
+
+
 def _make_event(
     level="TRACE",
     trace_ids=None,
@@ -20,30 +48,19 @@ def _make_event(
     reference_inputs=None,
 ):
     """Build a raw Lambda event dict for testing."""
+    if spans is None:
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "What is the capital of France?"}],
+                output_messages=[{"role": "assistant", "content": "The capital of France is Paris."}],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+
     event = {
         "schemaVersion": "1.0",
         "evaluationLevel": level,
-        "evaluationInput": {
-            "sessionSpans": spans
-            or [
-                {
-                    "traceId": "abc123",
-                    "spanId": "span1",
-                    "attributes": {
-                        "gen_ai.message.role": "user",
-                        "gen_ai.message.content": "What is the capital of France?",
-                    },
-                },
-                {
-                    "traceId": "abc123",
-                    "spanId": "span2",
-                    "attributes": {
-                        "gen_ai.message.role": "assistant",
-                        "gen_ai.message.content": "The capital of France is Paris.",
-                    },
-                },
-            ]
-        },
+        "evaluationInput": {"sessionSpans": spans},
         "evaluationTarget": {},
     }
     if trace_ids is not None:
@@ -82,7 +99,7 @@ def test_from_lambda_event_trace_level(self):
         assert parsed.evaluation_level == "TRACE"
         assert parsed.target_trace_id == "trace-1"
         assert parsed.target_span_id is None
-        assert len(parsed.session_spans) == 2
+        assert len(parsed.session_spans) == 1
 
     def test_from_lambda_event_tool_call_level(self):
         event = _make_event(level="TOOL_CALL", span_ids=["span-42"])
@@ -173,6 +190,250 @@ def test_empty_required_params_falls_through(self):
         assert result == ["input", "actual_output"]
 
 
+class TestExtractFieldsFromSpans:
+    def test_basic_extraction(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "hello"}],
+                output_messages=[{"role": "assistant", "content": "world"}],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "hello"
+        assert fields["actual_output"] == "world"
+
+    def test_tool_messages_become_retrieval_context(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "query"}],
+                output_messages=[
+                    {"role": "tool", "content": "doc chunk 1"},
+                    {"role": "tool", "content": "doc chunk 2"},
+                    {"role": "assistant", "content": "answer"},
+                ],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"]
+        assert fields["actual_output"] == "answer"
+
+    def test_message_content_as_dict_with_content_key(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": {"content": "nested content"}}],
+                output_messages=[{"role": "assistant", "content": {"content": "nested output"}}],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "nested content"
+        assert fields["actual_output"] == "nested output"
+
+    def test_message_content_as_dict_with_message_key(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "message": "msg key input"}],
+                output_messages=[{"role": "assistant", "message": "msg key output"}],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "msg key input"
+        assert fields["actual_output"] == "msg key output"
+
+    def test_message_content_as_plain_string_in_content_field(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "plain string"}],
+                output_messages=[{"role": "assistant", "content": "plain response"}],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "plain string"
+        assert fields["actual_output"] == "plain response"
+
+    def test_target_trace_id_filters_records(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "relevant"}],
+                output_messages=[{"role": "assistant", "content": "relevant answer"}],
+                trace_id="target-trace",
+            ),
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "irrelevant"}],
+                output_messages=[{"role": "assistant", "content": "irrelevant answer"}],
+                trace_id="other-trace",
+            ),
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE",
+            session_spans=spans,
+            target_trace_id="target-trace",
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "relevant"
+        assert fields["actual_output"] == "relevant answer"
+
+    def test_no_target_trace_id_includes_all_records(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "first"}],
+                output_messages=[{"role": "assistant", "content": "first answer"}],
+                trace_id="trace-1",
+            ),
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "second"}],
+                output_messages=[{"role": "assistant", "content": "second answer"}],
+                trace_id="trace-2",
+            ),
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="SESSION", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "first\nsecond"
+        assert fields["actual_output"] == "first answer\nsecond answer"
+
+    def test_log_records_as_parsed_list(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "from list"}],
+                output_messages=[{"role": "assistant", "content": "from list answer"}],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records, as_json_string=False)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "from list"
+        assert fields["actual_output"] == "from list answer"
+
+    def test_invalid_json_log_records_skipped(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"_eval_log_records": "not valid json{{{"},
+            }
+        ]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields == {}
+
+    def test_span_without_log_records_skipped(self):
+        spans = [{"traceId": "t1", "spanId": "s1", "attributes": {}}]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields == {}
+
+    def test_multiple_spans_aggregated(self):
+        log_records_1 = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "q1"}],
+                output_messages=[{"role": "assistant", "content": "a1"}],
+            )
+        ]
+        log_records_2 = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "q2"}],
+                output_messages=[{"role": "assistant", "content": "a2"}],
+            )
+        ]
+        spans = [
+            _make_span_with_log_records(log_records_1, span_id="s1"),
+            _make_span_with_log_records(log_records_2, span_id="s2"),
+        ]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="SESSION", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "q1\nq2"
+        assert fields["actual_output"] == "a1\na2"
+
+    def test_reference_inputs_expected_output(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "q"}],
+                output_messages=[{"role": "assistant", "content": "a"}],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE",
+            session_spans=spans,
+            reference_inputs=[{"expectedResponse": "expected answer"}],
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["expected_output"] == "expected answer"
+
+    def test_record_without_matching_trace_id_key_included(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "no trace id record"}],
+                output_messages=[{"role": "assistant", "content": "response"}],
+            ),
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE",
+            session_spans=spans,
+            target_trace_id="target-trace",
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["input"] == "no trace id record"
+
+
 class TestBuildTestCase:
     def test_basic_span_extraction(self):
         event = _make_event()
@@ -184,29 +445,18 @@ def test_basic_span_extraction(self):
         assert test_case.input == "What is the capital of France?"
         assert test_case.actual_output == "The capital of France is Paris."
 
-    def test_retrieval_context_from_tool_spans(self):
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s2",
-                "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 1"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s3",
-                "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 2"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s4",
-                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"},
-            },
+    def test_retrieval_context_from_tool_messages(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "query"}],
+                output_messages=[
+                    {"role": "tool", "content": "doc chunk 1"},
+                    {"role": "tool", "content": "doc chunk 2"},
+                    {"role": "assistant", "content": "answer"},
+                ],
+            )
         ]
+        spans = [_make_span_with_log_records(log_records)]
         event = _make_event(spans=spans)
         parsed = ParsedEvaluationEvent.from_lambda_event(event)
         metric = _mock_metric(name="FaithfulnessMetric")
@@ -228,18 +478,13 @@ def test_expected_output_from_reference_inputs(self):
         assert test_case.expected_output == "Paris"
 
     def test_missing_required_field_raises_value_error(self):
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s2",
-                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"},
-            },
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "query"}],
+                output_messages=[{"role": "assistant", "content": "answer"}],
+            )
         ]
+        spans = [_make_span_with_log_records(log_records)]
         event = _make_event(spans=spans)
         parsed = ParsedEvaluationEvent.from_lambda_event(event)
         metric = _mock_metric(name="FaithfulnessMetric")
@@ -283,23 +528,16 @@ def capture_mapper(raw_event):
         assert raw["evaluationReferenceInputs"] == refs
 
     def test_multiple_user_messages_concatenated(self):
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "hello"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s2",
-                "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "world"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s3",
-                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "hi"},
-            },
+        log_records = [
+            _make_log_record(
+                input_messages=[
+                    {"role": "user", "content": "hello"},
+                    {"role": "user", "content": "world"},
+                ],
+                output_messages=[{"role": "assistant", "content": "hi"}],
+            )
         ]
+        spans = [_make_span_with_log_records(log_records)]
         event = _make_event(spans=spans)
         parsed = ParsedEvaluationEvent.from_lambda_event(event)
         metric = _mock_metric(name="AnswerRelevancyMetric")
@@ -307,25 +545,3 @@ def test_multiple_user_messages_concatenated(self):
         test_case = build_test_case(parsed, metric)
 
         assert test_case.input == "hello\nworld"
-
-    def test_gen_ai_completion_fallback(self):
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"gen_ai.message.role": "user", "gen_ai.completion": "fallback input"},
-            },
-            {
-                "traceId": "t1",
-                "spanId": "s2",
-                "attributes": {"gen_ai.message.role": "assistant", "gen_ai.completion": "fallback output"},
-            },
-        ]
-        event = _make_event(spans=spans)
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="AnswerRelevancyMetric")
-
-        test_case = build_test_case(parsed, metric)
-
-        assert test_case.input == "fallback input"
-        assert test_case.actual_output == "fallback output"

From e9ef47d40b40027bdd917dc551698404a834efad Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 12:14:10 -0700
Subject: [PATCH 03/13] Set context field from tool messages for
 HallucinationMetric support

---
 .../integrations/deepeval/input_mapper.py     |  1 +
 .../deepeval/test_input_mapper.py             | 20 +++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
index cd67845f..39182636 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
@@ -190,6 +190,7 @@ def _extract_fields_from_spans(
         fields["actual_output"] = "\n".join(assistant_messages)
     if tool_messages:
         fields["retrieval_context"] = tool_messages
+        fields["context"] = tool_messages
 
     if parsed.reference_inputs:
         expected = parsed.reference_inputs[0].get("expectedResponse")
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
index 67447f48..ca661128 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
@@ -229,6 +229,26 @@ def test_tool_messages_become_retrieval_context(self):
         assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"]
         assert fields["actual_output"] == "answer"
 
+    def test_tool_messages_also_set_context_for_hallucination_metric(self):
+        log_records = [
+            _make_log_record(
+                input_messages=[{"role": "user", "content": "query"}],
+                output_messages=[
+                    {"role": "tool", "content": "context chunk"},
+                    {"role": "assistant", "content": "answer"},
+                ],
+            )
+        ]
+        spans = [_make_span_with_log_records(log_records)]
+        parsed = ParsedEvaluationEvent(
+            evaluation_level="TRACE", session_spans=spans
+        )
+
+        fields = _extract_fields_from_spans(parsed)
+
+        assert fields["context"] == ["context chunk"]
+        assert fields["context"] == fields["retrieval_context"]
+
     def test_message_content_as_dict_with_content_key(self):
         log_records = [
             _make_log_record(

From f97827e892f8c431d0e5d258bd16cc92ef05d447 Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 12:36:35 -0700
Subject: [PATCH 04/13] Use metric.success for label instead of manual
 threshold comparison

---
 .../integrations/deepeval/handler.py          |  3 +-
 .../integrations/deepeval/test_handler.py     | 29 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
index b339b883..4893889c 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
@@ -78,7 +78,8 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]
         score = self.metric.score
         reason = getattr(self.metric, "reason", None) or ""
         threshold = getattr(self.metric, "threshold", 0.5)
-        label = "Pass" if score is not None and score >= threshold else "Fail"
+        success = getattr(self.metric, "success", score is not None and score >= threshold)
+        label = "Pass" if success else "Fail"
 
         return {"value": score, "label": label, "explanation": reason}
 
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
index c3fa98ae..009f5e54 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
@@ -55,6 +55,7 @@ def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetri
     metric._required_params = None
     del metric._required_params
     del metric.evaluation_params
+    del metric.success
 
     def measure_side_effect(test_case):
         metric.score = score
@@ -229,3 +230,31 @@ def test_default_threshold_when_missing(self):
         result = handler(_make_event())
 
         assert result["label"] == "Pass"
+
+    def test_label_uses_metric_success_true(self):
+        metric = _mock_metric(score=0.3, threshold=0.7)
+        metric.success = True
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.3
+        assert result["label"] == "Pass"
+
+    def test_label_uses_metric_success_false(self):
+        metric = _mock_metric(score=0.9, threshold=0.7)
+        metric.success = False
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.9
+        assert result["label"] == "Fail"
+
+    def test_label_falls_back_to_threshold_when_no_success(self):
+        metric = _mock_metric(score=0.8, threshold=0.7)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["label"] == "Pass"

From 9d256aea85fdc1718349562691bb3055301b9410 Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 12:42:07 -0700
Subject: [PATCH 05/13] Add model override and timeout enforcement to
 DeepEvalHandler

---
 .../integrations/deepeval/handler.py          | 49 ++++++++++++++-
 .../integrations/deepeval/test_handler.py     | 61 +++++++++++++++++++
 2 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
index 4893889c..c71ed6da 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
@@ -1,6 +1,7 @@
 """DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics."""
 
 import logging
+import threading
 from typing import Any, Callable, Dict, Optional
 
 from deepeval.metrics import BaseMetric
@@ -30,10 +31,14 @@ def lambda_handler(event, context):
             return handler(event, context)
     """
 
+    DEFAULT_TIMEOUT = 290
+
     def __init__(
         self,
         metric: BaseMetric,
         field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+        model: Optional[str] = None,
+        timeout: Optional[int] = None,
     ):
         """Initialize the handler.
 
@@ -42,9 +47,15 @@ def __init__(
             field_mapper: Optional callable that receives the raw Lambda event and
                 returns a dict of LLMTestCase field values. Bypasses default span
                 extraction when provided.
+            model: Optional model identifier to override the metric's LLM
+                (e.g. a Bedrock model string instead of the default OpenAI model).
+            timeout: Maximum seconds to allow for metric.measure(). Defaults to 290
+                (slightly under Lambda's 300s max). Set to None to disable.
         """
         self.metric = metric
         self.field_mapper = field_mapper
+        self.model = model
+        self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT
 
     def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]:
         """Handle a Lambda invocation.
@@ -69,8 +80,16 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]
             logger.error("Missing required fields: %s", e)
             return _error_response("MISSING_REQUIRED_FIELD", str(e))
 
+        if self.model is not None:
+            self.metric.model = self.model
+
         try:
-            self.metric.measure(test_case)
+            self._measure_with_timeout(test_case)
+        except _MetricTimeout:
+            return _error_response(
+                "METRIC_TIMEOUT",
+                f"{type(self.metric).__name__} exceeded {self.timeout}s timeout.",
+            )
         except Exception as e:
             logger.error("Metric measurement failed: %s", e, exc_info=True)
             return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}")
@@ -83,6 +102,34 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]
 
         return {"value": score, "label": label, "explanation": reason}
 
+    def _measure_with_timeout(self, test_case: Any) -> None:
+        """Run metric.measure with a thread-based timeout."""
+        if self.timeout <= 0:
+            self.metric.measure(test_case)
+            return
+
+        exception_holder: list = []
+
+        def target():
+            try:
+                self.metric.measure(test_case)
+            except Exception as e:
+                exception_holder.append(e)
+
+        thread = threading.Thread(target=target, daemon=True)
+        thread.start()
+        thread.join(timeout=self.timeout)
+
+        if thread.is_alive():
+            raise _MetricTimeout()
+
+        if exception_holder:
+            raise exception_holder[0]
+
+
+class _MetricTimeout(Exception):
+    """Raised when metric.measure exceeds the configured timeout."""
+
 
 def _error_response(code: str, message: str) -> Dict[str, str]:
     """Build a standardized error response dict."""
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
index 009f5e54..9867969b 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
@@ -1,6 +1,7 @@
 """Tests for DeepEvalHandler."""
 
 import json
+import time
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -258,3 +259,63 @@ def test_label_falls_back_to_threshold_when_no_success(self):
         result = handler(_make_event())
 
         assert result["label"] == "Pass"
+
+    def test_model_override_sets_metric_model(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3")
+
+        handler(_make_event())
+
+        assert metric.model == "bedrock/anthropic.claude-3"
+
+    def test_no_model_override_leaves_metric_unchanged(self):
+        metric = _mock_metric()
+        metric.model = "original-model"
+        handler = DeepEvalHandler(metric=metric)
+
+        handler(_make_event())
+
+        assert metric.model == "original-model"
+
+
+class TestDeepEvalHandlerTimeout:
+    def test_timeout_returns_error(self):
+        metric = _mock_metric()
+        metric.measure = MagicMock(side_effect=lambda tc: time.sleep(5))
+        handler = DeepEvalHandler(metric=metric, timeout=1)
+
+        result = handler(_make_event())
+
+        assert result["errorCode"] == "METRIC_TIMEOUT"
+        assert "1s timeout" in result["errorMessage"]
+
+    def test_no_timeout_when_measure_completes_in_time(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric, timeout=10)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.85
+        assert "errorCode" not in result
+
+    def test_default_timeout_is_290(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+
+        assert handler.timeout == 290
+
+    def test_custom_timeout_value(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric, timeout=60)
+
+        assert handler.timeout == 60
+
+    def test_metric_exception_still_propagates_with_timeout(self):
+        metric = _mock_metric()
+        metric.measure = MagicMock(side_effect=RuntimeError("LLM error"))
+        handler = DeepEvalHandler(metric=metric, timeout=10)
+
+        result = handler(_make_event())
+
+        assert result["errorCode"] == "METRIC_ERROR"
+        assert "LLM error" in result["errorMessage"]

From c142d50cd4b367b49df45b2a04e332f363d4fccc Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 12:56:33 -0700
Subject: [PATCH 06/13] Add model override, timeout enforcement, use
 metric.success, fix SingleTurnParams deprecation

---
 .../evaluation/integrations/deepeval/handler.py    |  7 ++++---
 .../integrations/deepeval/input_mapper.py          | 14 +++++++-------
 .../integrations/deepeval/test_input_mapper.py     |  6 +++---
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
index c71ed6da..ed261727 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
@@ -37,7 +37,7 @@ def __init__(
         self,
         metric: BaseMetric,
         field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
-        model: Optional[str] = None,
+        model: Optional[Any] = None,
         timeout: Optional[int] = None,
     ):
         """Initialize the handler.
@@ -47,8 +47,9 @@ def __init__(
             field_mapper: Optional callable that receives the raw Lambda event and
                 returns a dict of LLMTestCase field values. Bypasses default span
                 extraction when provided.
-            model: Optional model identifier to override the metric's LLM
-                (e.g. a Bedrock model string instead of the default OpenAI model).
+            model: Optional model override for the metric's LLM. Can be a string
+                model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM
+                subclass instance.
             timeout: Maximum seconds to allow for metric.measure(). Defaults to 290
                 (slightly under Lambda's 300s max). Set to None to disable.
         """
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
index 39182636..47e75c0c 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
@@ -6,16 +6,16 @@
 from typing import Any, Callable, Dict, List, Optional
 
 from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
 
 logger = logging.getLogger(__name__)
 
-_PARAM_TO_FIELD: Dict[LLMTestCaseParams, str] = {
-    LLMTestCaseParams.INPUT: "input",
-    LLMTestCaseParams.ACTUAL_OUTPUT: "actual_output",
-    LLMTestCaseParams.EXPECTED_OUTPUT: "expected_output",
-    LLMTestCaseParams.CONTEXT: "context",
-    LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrieval_context",
+_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = {
+    SingleTurnParams.INPUT: "input",
+    SingleTurnParams.ACTUAL_OUTPUT: "actual_output",
+    SingleTurnParams.EXPECTED_OUTPUT: "expected_output",
+    SingleTurnParams.CONTEXT: "context",
+    SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context",
 }
 
 _METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = {
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
index ca661128..6d2a5420 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
@@ -4,7 +4,7 @@
 from unittest.mock import MagicMock
 
 import pytest
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
 
 from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import (
     ParsedEvaluationEvent,
@@ -156,7 +156,7 @@ def test_from_lambda_event_missing_target_key_defaults(self):
 class TestGetRequiredParams:
     def test_uses_required_params_attribute(self):
         metric = _mock_metric(
-            required_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
+            required_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT]
         )
         result = _get_required_params(metric)
 
@@ -171,7 +171,7 @@ def test_falls_back_to_static_registry(self):
     def test_falls_back_to_evaluation_params(self):
         metric = _mock_metric(
             name="UnknownMetric",
-            evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
+            evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.RETRIEVAL_CONTEXT],
         )
         result = _get_required_params(metric)
 

From 3ccc98cf9ae0939dfbb7ca07d6290f82752d1a46 Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 16:42:01 -0700
Subject: [PATCH 07/13] Fix _get_required_params to handle GEval unmappable
 typing params

---
 .deepeval/.deepeval_telemetry.txt                    |  2 ++
 .../evaluation/integrations/deepeval/input_mapper.py |  3 ++-
 .../integrations/deepeval/test_input_mapper.py       | 12 ++++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 .deepeval/.deepeval_telemetry.txt

diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt
new file mode 100644
index 00000000..916744ae
--- /dev/null
+++ b/.deepeval/.deepeval_telemetry.txt
@@ -0,0 +1,2 @@
+DEEPEVAL_ID=f26d66a4-b0b0-4096-859f-89f1ddf7ceee
+DEEPEVAL_STATUS=old
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
index 47e75c0c..941afce2 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
@@ -80,7 +80,8 @@ def _get_required_params(metric: BaseMetric) -> List[str]:
     """
     if hasattr(metric, "_required_params") and metric._required_params:
         params = metric._required_params
-        return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params]
+        if all(p in _PARAM_TO_FIELD for p in params):
+            return [_PARAM_TO_FIELD[p] for p in params]
 
     class_name = type(metric).__name__
     if class_name in _METRIC_REQUIRED_PARAMS:
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
index 6d2a5420..1d90a689 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
@@ -183,6 +183,18 @@ def test_defaults_to_input_and_actual_output(self):
 
         assert result == ["input", "actual_output"]
 
+    def test_unmappable_required_params_skips_to_static_registry(self):
+        metric = _mock_metric(name="GEval", required_params=["SomeTypingObject", "AnotherType"])
+        result = _get_required_params(metric)
+
+        assert result == ["input", "actual_output"]
+
+    def test_unmappable_required_params_falls_to_default(self):
+        metric = _mock_metric(name="UnknownMetric", required_params=["SomeTypingObject"])
+        result = _get_required_params(metric)
+
+        assert result == ["input", "actual_output"]
+
     def test_empty_required_params_falls_through(self):
         metric = _mock_metric(name="UnknownMetric", required_params=[])
         result = _get_required_params(metric)

From a884f912d4f3faa18fb3c978ed1ad1a41db1f19b Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Mon, 15 Jun 2026 16:50:12 -0700
Subject: [PATCH 08/13] Add .deepeval/ to gitignore

---
 .deepeval/.deepeval_telemetry.txt | 2 --
 .gitignore                        | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)
 delete mode 100644 .deepeval/.deepeval_telemetry.txt

diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt
deleted file mode 100644
index 916744ae..00000000
--- a/.deepeval/.deepeval_telemetry.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-DEEPEVAL_ID=f26d66a4-b0b0-4096-859f-89f1ddf7ceee
-DEEPEVAL_STATUS=old
diff --git a/.gitignore b/.gitignore
index 01fe8e22..161403e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -229,3 +229,4 @@ local_settings.py
 Dockerfile
 CLAUDE.md
 .omc/
+.deepeval/

From 6ac198cc5ece549997dd9589695277225944fd1e Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Tue, 16 Jun 2026 15:26:05 -0700
Subject: [PATCH 09/13] Move model override to init to avoid per-call mutation

---
 .../evaluation/integrations/deepeval/handler.py             | 6 ++----
 .../evaluation/integrations/deepeval/test_handler.py        | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
index ed261727..0e91bafe 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
@@ -55,8 +55,9 @@ def __init__(
         """
         self.metric = metric
         self.field_mapper = field_mapper
-        self.model = model
         self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT
+        if model is not None:
+            self.metric.model = model
 
     def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]:
         """Handle a Lambda invocation.
@@ -81,9 +82,6 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]
             logger.error("Missing required fields: %s", e)
             return _error_response("MISSING_REQUIRED_FIELD", str(e))
 
-        if self.model is not None:
-            self.metric.model = self.model
-
         try:
             self._measure_with_timeout(test_case)
         except _MetricTimeout:
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
index 9867969b..77961f14 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
@@ -264,8 +264,6 @@ def test_model_override_sets_metric_model(self):
         metric = _mock_metric()
         handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3")
 
-        handler(_make_event())
-
         assert metric.model == "bedrock/anthropic.claude-3"
 
     def test_no_model_override_leaves_metric_unchanged(self):

From 8d415e5791ba260a7253c196a92c861c7e44e34f Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Wed, 24 Jun 2026 16:25:17 -0700
Subject: [PATCH 10/13] Refactor to BaseAdapter framework with
 DeepEval/Autoevals adapters and EvaluatorInput support

---
 .../evaluation/integrations/__init__.py       |   4 +
 .../integrations/autoevals/__init__.py        |   5 +
 .../integrations/autoevals/adapter.py         |  72 +++++
 .../evaluation/integrations/base.py           | 302 ++++++++++++++++++
 .../integrations/deepeval/__init__.py         |   4 +-
 .../integrations/deepeval/adapter.py          | 189 +++++++++++
 .../integrations/deepeval/handler.py          | 135 --------
 .../integrations/deepeval/input_mapper.py     | 253 ---------------
 .../integrations/autoevals/__init__.py        |   0
 .../integrations/autoevals/test_adapter.py    | 217 +++++++++++++
 .../integrations/deepeval/test_handler.py     | 112 ++++++-
 .../deepeval/test_input_mapper.py             |   8 +-
 12 files changed, 906 insertions(+), 395 deletions(-)
 create mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
 create mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py
 create mode 100644 src/bedrock_agentcore/evaluation/integrations/base.py
 create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py
 delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
 delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
 create mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
 create mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py

diff --git a/src/bedrock_agentcore/evaluation/integrations/__init__.py b/src/bedrock_agentcore/evaluation/integrations/__init__.py
index 33048d5d..a1ff7691 100644
--- a/src/bedrock_agentcore/evaluation/integrations/__init__.py
+++ b/src/bedrock_agentcore/evaluation/integrations/__init__.py
@@ -1 +1,5 @@
 """AgentCore Evaluation integrations."""
+
+from bedrock_agentcore.evaluation.integrations.base import BaseAdapter, ParsedEvaluationEvent
+
+__all__ = ["BaseAdapter", "ParsedEvaluationEvent"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
new file mode 100644
index 00000000..0bc3b4ff
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
@@ -0,0 +1,5 @@
+"""Autoevals integration for AgentCore Evaluation."""
+
+from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter
+
+__all__ = ["AutoevalsAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py
new file mode 100644
index 00000000..fe89435e
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py
@@ -0,0 +1,72 @@
+"""Autoevals adapter for AgentCore evaluation integrations."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from bedrock_agentcore.evaluation.integrations.base import BaseAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class AutoevalsAdapter(BaseAdapter):
+    """Adapter that runs an Autoevals scorer against AgentCore evaluation events.
+
+    Example::
+
+        from autoevals import Factuality
+
+        scorer = Factuality()
+        handler = AutoevalsAdapter(scorer=scorer)
+
+        # Use as Lambda handler
+        def lambda_handler(event, context):
+            return handler(event, context)
+    """
+
+    def __init__(
+        self,
+        scorer: Any,
+        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+        timeout: Optional[int] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()).
+            field_mapper: Optional callable that receives the Lambda event and
+                returns a dict of field values. Bypasses default span extraction.
+            timeout: Maximum seconds to allow for scorer.eval(). Defaults to 290.
+        """
+        super().__init__(field_mapper=field_mapper, timeout=timeout)
+        self.scorer = scorer
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Raise ValueError naming whichever of input / actual_output is missing or empty."""
+        missing = []
+        if not fields.get("input"):
+            missing.append("input")
+        if not fields.get("actual_output"):
+            missing.append("actual_output")
+        if missing:
+            scorer_name = type(self.scorer).__name__
+            raise ValueError(
+                f"Field(s) {missing} required by {scorer_name} but not found in evaluation event. "
+                f"Provide a field_mapper or ensure spans contain the necessary data."
+            )
+
+    def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]:
+        """Run scorer.eval() and return value/label/explanation; label is "Pass" when score >= 0.5."""
+        kwargs: Dict[str, Any] = {
+            "input": fields.get("input", ""),
+            "output": fields.get("actual_output", ""),
+        }
+        if fields.get("expected_output"):
+            kwargs["expected"] = fields["expected_output"]
+
+        result = self.scorer.eval(**kwargs)
+
+        score = result.score
+        label = "Pass" if score is not None and score >= 0.5 else "Fail"
+        explanation = (getattr(result, "metadata", None) or {}).get("rationale") or ""  # tolerate None
+
+        return {"value": score, "label": label, "explanation": explanation}
diff --git a/src/bedrock_agentcore/evaluation/integrations/base.py b/src/bedrock_agentcore/evaluation/integrations/base.py
new file mode 100644
index 00000000..a10f6606
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/integrations/base.py
@@ -0,0 +1,302 @@
+"""Base adapter for AgentCore evaluation integrations."""
+
+import abc
+import json
+import logging
+import threading
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ParsedEvaluationEvent:
+    """Parsed representation of the AgentCore Lambda evaluation event."""
+
+    evaluation_level: str  # event["evaluationLevel"]
+    session_spans: List[Dict[str, Any]]  # event["evaluationInput"]["sessionSpans"]
+    target_trace_id: Optional[str] = None  # first entry of evaluationTarget.traceIds, if any
+    target_span_id: Optional[str] = None  # first entry of evaluationTarget.spanIds, if any
+    reference_inputs: List[Dict[str, Any]] = field(default_factory=list)  # evaluationReferenceInputs
+
+    @classmethod
+    def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent":
+        """Parse a raw Lambda event dict into a structured object.
+
+        Args:
+            event: Raw Lambda event payload from the evaluation service.
+
+        Returns:
+            ParsedEvaluationEvent with extracted fields.
+
+        Raises:
+            KeyError: If evaluationInput, its sessionSpans, or evaluationLevel is missing.
+        """
+        evaluation_input = event["evaluationInput"]
+        target = event.get("evaluationTarget") or {}  # a missing or null target is treated as empty
+        trace_ids = target.get("traceIds") or []
+        span_ids = target.get("spanIds") or []
+
+        return cls(
+            evaluation_level=event["evaluationLevel"],
+            session_spans=evaluation_input["sessionSpans"],
+            target_trace_id=trace_ids[0] if trace_ids else None,  # only the first id is kept
+            target_span_id=span_ids[0] if span_ids else None,
+            reference_inputs=event.get("evaluationReferenceInputs") or [],
+        )
+
+
+def _get_message_content(message: Any) -> str:
+    """Extract text content from a message object.
+
+    Accepts a plain string or a dict with a "content" or "message" key; nested dicts
+    are unwrapped recursively. A null value yields "" rather than the string "None".
+    """
+    if isinstance(message, str):
+        return message
+    if isinstance(message, dict):
+        for key in ("content", "message"):
+            if key in message:
+                val = message[key]
+                if val is None or isinstance(val, str):
+                    return val or ""
+                if isinstance(val, dict):
+                    return _get_message_content(val)
+                return str(val)
+    return ""
+
+
+def extract_fields_from_spans(
+    parsed: ParsedEvaluationEvent,
+) -> Dict[str, Any]:
+    """Extract evaluation fields from AgentCore session spans.
+
+    Parses _eval_log_records from span attributes, filters by target_trace_id,
+    and extracts messages by role (malformed or null entries are skipped):
+        - input ← input messages where role=="user"
+        - actual_output ← output messages where role=="assistant"
+        - retrieval_context ← output messages where role=="tool"
+        - context ← same as retrieval_context
+        - expected_output ← evaluationReferenceInputs[0].expectedResponse
+    """
+    user_messages: List[str] = []
+    assistant_messages: List[str] = []
+    tool_messages: List[str] = []
+
+    for span in parsed.session_spans or []:  # a null span list yields no fields
+        attributes = span.get("attributes") if isinstance(span, dict) else None
+        log_records_raw = attributes.get("_eval_log_records") if isinstance(attributes, dict) else None
+        if not log_records_raw:
+            continue
+
+        if isinstance(log_records_raw, str):
+            try:
+                log_records = json.loads(log_records_raw)
+            except (json.JSONDecodeError, TypeError):
+                logger.debug("Failed to parse _eval_log_records as JSON")
+                continue
+        else:
+            log_records = log_records_raw
+
+        if not isinstance(log_records, list):
+            continue
+
+        for record in log_records:
+            if not isinstance(record, dict):
+                continue
+
+            if parsed.target_trace_id:
+                record_trace_id = record.get("traceId") or record.get("trace_id")
+                if record_trace_id and record_trace_id != parsed.target_trace_id:
+                    continue
+
+            body = record.get("body", {})
+            if not isinstance(body, dict):
+                continue
+
+            input_data = body.get("input", {})
+            if isinstance(input_data, dict):
+                for msg in input_data.get("messages") or []:  # "messages": null is treated as empty
+                    if not isinstance(msg, dict):
+                        continue
+                    role = msg.get("role", "")
+                    content = _get_message_content(msg)
+                    if role == "user" and content:
+                        user_messages.append(content)
+
+            output_data = body.get("output", {})
+            if isinstance(output_data, dict):
+                for msg in output_data.get("messages") or []:  # "messages": null is treated as empty
+                    if not isinstance(msg, dict):
+                        continue
+                    role = msg.get("role", "")
+                    content = _get_message_content(msg)
+                    if role == "assistant" and content:
+                        assistant_messages.append(content)
+                    elif role == "tool" and content:
+                        tool_messages.append(content)
+
+    fields: Dict[str, Any] = {}
+
+    if user_messages:
+        fields["input"] = "\n".join(user_messages)
+    if assistant_messages:
+        fields["actual_output"] = "\n".join(assistant_messages)
+    if tool_messages:
+        fields["retrieval_context"] = tool_messages
+        fields["context"] = tool_messages
+
+    if parsed.reference_inputs and isinstance(parsed.reference_inputs[0], dict):
+        expected = parsed.reference_inputs[0].get("expectedResponse")
+        if expected:
+            fields["expected_output"] = expected
+
+    return fields
+
+
+class _ExecutionTimeout(Exception):
+    """Internal signal raised when execute() is still running after the configured timeout."""
+
+
+def _error_response(code: str, message: str) -> Dict[str, str]:
+    """Build the standardized error payload with "errorCode" and "errorMessage" keys."""
+    return {"errorCode": code, "errorMessage": message}
+
+
+class BaseAdapter(abc.ABC):
+    """Base adapter for evaluation framework integrations.
+
+    Subclasses only need to implement execute(fields), which runs the actual
+    evaluation logic and returns the value/label/explanation response dict.
+
+    Never raises unhandled exceptions — always returns a valid response dict.
+    """
+
+    DEFAULT_TIMEOUT = 290
+
+    def __init__(
+        self,
+        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+        timeout: Optional[int] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            field_mapper: Optional callable that receives the event (rebuilt from the parsed
+                fields) and returns a dict of field values. Bypasses default span extraction.
+            timeout: Maximum seconds to allow for execute(). Defaults to 290 (slightly
+                under Lambda's 300s max). A value <= 0 runs execute() without a timeout.
+        """
+        self.field_mapper = field_mapper
+        self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT
+
+    def __call__(self, event: Union[Dict[str, Any], EvaluatorInput], context: Any = None) -> Dict[str, Any]:
+        """Handle a Lambda invocation.
+
+        Args:
+            event: Either a raw Lambda event dict or an EvaluatorInput instance
+                from bedrock_agentcore.evaluation.custom_code_based_evaluators.models.
+            context: Lambda context object (unused).
+
+        Returns:
+            Success: {"value": float, "label": str, "explanation": str}
+            Error: {"errorCode": str, "errorMessage": str}
+        """
+        try:
+            if isinstance(event, EvaluatorInput):
+                parsed = ParsedEvaluationEvent(
+                    evaluation_level=event.evaluation_level,
+                    session_spans=event.session_spans,
+                    target_trace_id=event.target_trace_id,
+                    target_span_id=event.target_span_id,
+                    reference_inputs=getattr(event, "reference_inputs", []) or [],
+                )
+            else:
+                parsed = ParsedEvaluationEvent.from_lambda_event(event)
+        except (KeyError, IndexError, TypeError, AttributeError) as e:
+            logger.error("Failed to parse evaluation event: %s", e)
+            return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}")
+
+        try:
+            fields = self._extract_fields(parsed)
+        except ValueError as e:
+            logger.error("Missing required fields: %s", e)
+            return _error_response("MISSING_REQUIRED_FIELD", str(e))
+        except Exception as e:  # field_mapper or span extraction failed; keep the never-raises contract
+            logger.error("Field extraction failed: %s", e, exc_info=True)
+            return _error_response("INVALID_EVENT", f"Failed to extract fields: {e}")
+
+        try:
+            result = self._execute_with_timeout(fields)
+        except _ExecutionTimeout:
+            return _error_response("METRIC_TIMEOUT", f"{type(self).__name__} exceeded {self.timeout}s timeout.")
+        except Exception as e:
+            logger.error("Execution failed: %s", e, exc_info=True)
+            return _error_response("METRIC_ERROR", f"{type(self).__name__} failed: {e}")
+
+        return result
+
+    def _extract_fields(self, parsed: ParsedEvaluationEvent) -> Dict[str, Any]:
+        """Return field_mapper's output (not validated) if a mapper is set, else validated span fields."""
+        if self.field_mapper is not None:
+            raw_event = {
+                "evaluationLevel": parsed.evaluation_level,
+                "evaluationInput": {"sessionSpans": parsed.session_spans},
+                "evaluationTarget": {
+                    "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [],
+                    "spanIds": [parsed.target_span_id] if parsed.target_span_id else [],
+                },
+                "evaluationReferenceInputs": parsed.reference_inputs,
+            }
+            return self.field_mapper(raw_event)
+
+        fields = extract_fields_from_spans(parsed)
+        self.validate_fields(fields)
+        return fields
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Validate that required fields are present.
+
+        Override in subclasses to enforce field requirements (raise ValueError).
+        Default implementation does nothing.
+        """
+
+    @abc.abstractmethod
+    def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the evaluation and return the response dict.
+
+        Args:
+            fields: Extracted field dict with keys like "input", "actual_output", etc.
+
+        Returns:
+            {"value": float, "label": str, "explanation": str}
+        """
+
+    def _execute_with_timeout(self, fields: Dict[str, Any]) -> Dict[str, Any]:
+        """Run execute() in a daemon thread; on timeout the thread is abandoned, not stopped."""
+        if self.timeout <= 0:
+            return self.execute(fields)
+
+        result_holder: list = []
+        exception_holder: list = []
+
+        def target():
+            try:
+                result_holder.append(self.execute(fields))
+            except Exception as e:
+                exception_holder.append(e)
+
+        thread = threading.Thread(target=target, daemon=True)
+        thread.start()
+        thread.join(timeout=self.timeout)
+
+        if thread.is_alive():
+            raise _ExecutionTimeout()
+
+        if exception_holder:
+            raise exception_holder[0]
+
+        return result_holder[0]
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
index 76f6461f..adb6ba44 100644
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
@@ -1,5 +1,5 @@
 """DeepEval integration for AgentCore Evaluation."""
 
-from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler
+from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler
 
-__all__ = ["DeepEvalHandler"]
+__all__ = ["DeepEvalAdapter", "DeepEvalHandler"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py
new file mode 100644
index 00000000..e8748782
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py
@@ -0,0 +1,189 @@
+"""DeepEval adapter for AgentCore evaluation integrations."""
+
+import logging
+from typing import Any, Callable, Dict, List, Optional
+
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, SingleTurnParams
+
+from bedrock_agentcore.evaluation.integrations.base import (
+    BaseAdapter,
+    ParsedEvaluationEvent,
+    extract_fields_from_spans,
+)
+
+logger = logging.getLogger(__name__)
+
+_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = {  # DeepEval param -> key of the extracted fields dict
+    SingleTurnParams.INPUT: "input",
+    SingleTurnParams.ACTUAL_OUTPUT: "actual_output",
+    SingleTurnParams.EXPECTED_OUTPUT: "expected_output",
+    SingleTurnParams.CONTEXT: "context",
+    SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context",
+}
+
+_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = {  # metric class name -> field keys that metric needs
+    "AnswerRelevancyMetric": ["input", "actual_output"],
+    "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"],
+    "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"],
+    "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
+    "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
+    "HallucinationMetric": ["input", "actual_output", "context"],
+    "BiasMetric": ["input", "actual_output"],
+    "ToxicityMetric": ["input", "actual_output"],
+    "GEval": ["input", "actual_output"],  # matched by class name before metric.evaluation_params is consulted
+    "SummarizationMetric": ["input", "actual_output"],
+}
+
+
+def _get_required_params(metric: BaseMetric) -> List[str]:
+    """Return the field names ``metric`` needs, as a list the caller is free to mutate.
+
+    Sources are tried in order until one applies:
+        1. ``metric._required_params``, if set and every entry is a key of ``_PARAM_TO_FIELD``
+        2. the ``_METRIC_REQUIRED_PARAMS`` entry for the metric's class name
+        3. ``metric.evaluation_params`` (unmapped entries use their lower-cased enum value)
+        4. the default ``["input", "actual_output"]``
+    """
+    internal = getattr(metric, "_required_params", None)
+    if internal and all(p in _PARAM_TO_FIELD for p in internal):
+        return [_PARAM_TO_FIELD[p] for p in internal]
+
+    registered = _METRIC_REQUIRED_PARAMS.get(type(metric).__name__)
+    if registered is not None:
+        return list(registered)  # copy: handing out the registry's own list would let callers corrupt it
+
+    declared = getattr(metric, "evaluation_params", None)
+    if declared:
+        # str(Enum member) is "ClassName.MEMBER", never a field key, so use .value (assumed to be the field name).
+        return [_PARAM_TO_FIELD.get(p, str(getattr(p, "value", p)).lower()) for p in declared]
+
+    return ["input", "actual_output"]
+
+
+class DeepEvalAdapter(BaseAdapter):
+    """Adapter that scores AgentCore evaluation events with a single DeepEval metric.
+
+    Example::
+
+        from deepeval.metrics import AnswerRelevancyMetric
+
+        metric = AnswerRelevancyMetric(threshold=0.7)
+        handler = DeepEvalAdapter(metric=metric)
+
+        # Use as Lambda handler
+        def lambda_handler(event, context):
+            return handler(event, context)
+    """
+
+    def __init__(
+        self,
+        metric: BaseMetric,
+        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+        model: Optional[Any] = None,
+        timeout: Optional[int] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
+            field_mapper: Optional callable that receives the raw Lambda event and
+                returns a dict of LLMTestCase field values. Bypasses default span
+                extraction when provided.
+            model: Optional model override for the metric's LLM. Can be a string
+                model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM
+                subclass instance. It is assigned onto ``metric`` itself, not a copy.
+            timeout: Maximum seconds to allow for metric.measure(). Defaults to 290
+                (slightly under Lambda's 300s max).
+        """
+        super().__init__(field_mapper=field_mapper, timeout=timeout)
+        self.metric = metric
+        if model is not None:
+            self.metric.model = model  # mutates the caller's metric object in place
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Raise ValueError naming every metric-required field that is absent or empty."""
+        required = _get_required_params(self.metric)
+        missing = [name for name in required if not fields.get(name)]
+        if missing:
+            metric_name = type(self.metric).__name__
+            raise ValueError(
+                f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
+                f"Provide a field_mapper or ensure spans contain the necessary data."
+            )
+
+    def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]:
+        """Measure ``fields`` with the metric and return {"value", "label", "explanation"}."""
+        test_case = LLMTestCase(
+            input=fields.get("input", ""),
+            actual_output=fields.get("actual_output", ""),
+            expected_output=fields.get("expected_output"),
+            context=fields.get("context"),
+            retrieval_context=fields.get("retrieval_context"),
+        )
+        self.metric.measure(test_case)  # results are read back from the metric's attributes
+
+        score = self.metric.score
+        reason = getattr(self.metric, "reason", None) or ""
+        if hasattr(self.metric, "success"):
+            passed = self.metric.success  # the metric's own verdict wins when it has one
+        else:  # fallback is computed lazily so a None threshold cannot raise when a verdict exists
+            threshold = getattr(self.metric, "threshold", 0.5)
+            passed = score is not None and threshold is not None and score >= threshold
+        return {"value": score, "label": "Pass" if passed else "Fail", "explanation": reason}
+
+
+def build_test_case(
+    parsed: ParsedEvaluationEvent,
+    metric: BaseMetric,
+    field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
+) -> LLMTestCase:
+    """Turn a parsed evaluation event into the LLMTestCase handed to ``metric.measure()``.
+
+    Args:
+        parsed: The parsed Lambda event.
+        metric: The DeepEval metric; consulted only for the fields it requires.
+        field_mapper: Optional callable. When given, it receives an event-shaped dict
+            rebuilt from ``parsed`` and its return value is used as the field dict,
+            so the default span extraction is skipped.
+
+    Returns:
+        An LLMTestCase filled from the field dict.
+
+    Raises:
+        ValueError: If a field the metric requires is missing or empty.
+    """
+    if field_mapper is None:
+        fields = extract_fields_from_spans(parsed)
+    else:
+        # Re-wrap the single target ids as lists (empty when unset) under the traceIds/spanIds keys.
+        trace_ids = [parsed.target_trace_id] if parsed.target_trace_id else []
+        span_ids = [parsed.target_span_id] if parsed.target_span_id else []
+        fields = field_mapper(
+            {
+                "evaluationLevel": parsed.evaluation_level,
+                "evaluationInput": {"sessionSpans": parsed.session_spans},
+                "evaluationTarget": {"traceIds": trace_ids, "spanIds": span_ids},
+                "evaluationReferenceInputs": parsed.reference_inputs,
+            }
+        )
+
+    missing = [name for name in _get_required_params(metric) if not fields.get(name)]
+    if missing:
+        metric_name = type(metric).__name__
+        raise ValueError(
+            f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
+            f"Provide a field_mapper or ensure spans contain the necessary data."
+        )
+
+    return LLMTestCase(
+        input=fields.get("input", ""),
+        actual_output=fields.get("actual_output", ""),
+        expected_output=fields.get("expected_output"),
+        context=fields.get("context"),
+        retrieval_context=fields.get("retrieval_context"),
+    )
+
+
+# Backward-compatible alias: keeps the class's earlier public name, DeepEvalHandler, importable.
+DeepEvalHandler = DeepEvalAdapter
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
deleted file mode 100644
index 0e91bafe..00000000
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py
+++ /dev/null
@@ -1,135 +0,0 @@
-"""DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics."""
-
-import logging
-import threading
-from typing import Any, Callable, Dict, Optional
-
-from deepeval.metrics import BaseMetric
-
-from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import (
-    ParsedEvaluationEvent,
-    build_test_case,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class DeepEvalHandler:
-    """Lambda handler that runs a DeepEval metric against AgentCore evaluation events.
-
-    Never raises unhandled exceptions — always returns a valid response dict.
-
-    Example::
-
-        from deepeval.metrics import AnswerRelevancyMetric
-
-        metric = AnswerRelevancyMetric(threshold=0.7)
-        handler = DeepEvalHandler(metric=metric)
-
-        # Use as Lambda handler
-        def lambda_handler(event, context):
-            return handler(event, context)
-    """
-
-    DEFAULT_TIMEOUT = 290
-
-    def __init__(
-        self,
-        metric: BaseMetric,
-        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
-        model: Optional[Any] = None,
-        timeout: Optional[int] = None,
-    ):
-        """Initialize the handler.
-
-        Args:
-            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
-            field_mapper: Optional callable that receives the raw Lambda event and
-                returns a dict of LLMTestCase field values. Bypasses default span
-                extraction when provided.
-            model: Optional model override for the metric's LLM. Can be a string
-                model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM
-                subclass instance.
-            timeout: Maximum seconds to allow for metric.measure(). Defaults to 290
-                (slightly under Lambda's 300s max). Set to None to disable.
-        """
-        self.metric = metric
-        self.field_mapper = field_mapper
-        self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT
-        if model is not None:
-            self.metric.model = model
-
-    def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]:
-        """Handle a Lambda invocation.
-
-        Args:
-            event: Raw Lambda event dict from the evaluation service.
-            context: Lambda context object (unused).
-
-        Returns:
-            Success: {"value": float, "label": str, "explanation": str}
-            Error: {"errorCode": str, "errorMessage": str}
-        """
-        try:
-            parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        except (KeyError, IndexError, TypeError) as e:
-            logger.error("Failed to parse evaluation event: %s", e)
-            return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}")
-
-        try:
-            test_case = build_test_case(parsed, self.metric, self.field_mapper)
-        except ValueError as e:
-            logger.error("Missing required fields: %s", e)
-            return _error_response("MISSING_REQUIRED_FIELD", str(e))
-
-        try:
-            self._measure_with_timeout(test_case)
-        except _MetricTimeout:
-            return _error_response(
-                "METRIC_TIMEOUT",
-                f"{type(self.metric).__name__} exceeded {self.timeout}s timeout.",
-            )
-        except Exception as e:
-            logger.error("Metric measurement failed: %s", e, exc_info=True)
-            return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}")
-
-        score = self.metric.score
-        reason = getattr(self.metric, "reason", None) or ""
-        threshold = getattr(self.metric, "threshold", 0.5)
-        success = getattr(self.metric, "success", score is not None and score >= threshold)
-        label = "Pass" if success else "Fail"
-
-        return {"value": score, "label": label, "explanation": reason}
-
-    def _measure_with_timeout(self, test_case: Any) -> None:
-        """Run metric.measure with a thread-based timeout."""
-        if self.timeout <= 0:
-            self.metric.measure(test_case)
-            return
-
-        exception_holder: list = []
-
-        def target():
-            try:
-                self.metric.measure(test_case)
-            except Exception as e:
-                exception_holder.append(e)
-
-        thread = threading.Thread(target=target, daemon=True)
-        thread.start()
-        thread.join(timeout=self.timeout)
-
-        if thread.is_alive():
-            raise _MetricTimeout()
-
-        if exception_holder:
-            raise exception_holder[0]
-
-
-class _MetricTimeout(Exception):
-    """Raised when metric.measure exceeds the configured timeout."""
-
-
-def _error_response(code: str, message: str) -> Dict[str, str]:
-    """Build a standardized error response dict."""
-    return {"errorCode": code, "errorMessage": message}
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
deleted file mode 100644
index 941afce2..00000000
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py
+++ /dev/null
@@ -1,253 +0,0 @@
-"""Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects."""
-
-import json
-import logging
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional
-
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, SingleTurnParams
-
-logger = logging.getLogger(__name__)
-
-_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = {
-    SingleTurnParams.INPUT: "input",
-    SingleTurnParams.ACTUAL_OUTPUT: "actual_output",
-    SingleTurnParams.EXPECTED_OUTPUT: "expected_output",
-    SingleTurnParams.CONTEXT: "context",
-    SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context",
-}
-
-_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = {
-    "AnswerRelevancyMetric": ["input", "actual_output"],
-    "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"],
-    "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"],
-    "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
-    "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
-    "HallucinationMetric": ["input", "actual_output", "context"],
-    "BiasMetric": ["input", "actual_output"],
-    "ToxicityMetric": ["input", "actual_output"],
-    "GEval": ["input", "actual_output"],
-    "SummarizationMetric": ["input", "actual_output"],
-}
-
-
-@dataclass
-class ParsedEvaluationEvent:
-    """Parsed representation of the AgentCore Lambda evaluation event."""
-
-    evaluation_level: str
-    session_spans: List[Dict[str, Any]]
-    target_trace_id: Optional[str] = None
-    target_span_id: Optional[str] = None
-    reference_inputs: List[Dict[str, Any]] = field(default_factory=list)
-
-    @classmethod
-    def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent":
-        """Parse a raw Lambda event dict into a structured object.
-
-        Args:
-            event: Raw Lambda event payload from the evaluation service.
-
-        Returns:
-            ParsedEvaluationEvent with extracted fields.
-
-        Raises:
-            KeyError: If required top-level fields are missing.
-        """
-        evaluation_input = event["evaluationInput"]
-        target = event.get("evaluationTarget") or {}
-        trace_ids = target.get("traceIds") or []
-        span_ids = target.get("spanIds") or []
-
-        return cls(
-            evaluation_level=event["evaluationLevel"],
-            session_spans=evaluation_input["sessionSpans"],
-            target_trace_id=trace_ids[0] if trace_ids else None,
-            target_span_id=span_ids[0] if span_ids else None,
-            reference_inputs=event.get("evaluationReferenceInputs") or [],
-        )
-
-
-def _get_required_params(metric: BaseMetric) -> List[str]:
-    """Determine which LLMTestCase fields a metric requires.
-
-    Fallback chain:
-        1. metric._required_params (DeepEval internal attribute)
-        2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name
-        3. metric.evaluation_params (GEval special case)
-        4. Default: ["input", "actual_output"]
-    """
-    if hasattr(metric, "_required_params") and metric._required_params:
-        params = metric._required_params
-        if all(p in _PARAM_TO_FIELD for p in params):
-            return [_PARAM_TO_FIELD[p] for p in params]
-
-    class_name = type(metric).__name__
-    if class_name in _METRIC_REQUIRED_PARAMS:
-        return _METRIC_REQUIRED_PARAMS[class_name]
-
-    if hasattr(metric, "evaluation_params") and metric.evaluation_params:
-        params = metric.evaluation_params
-        return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params]
-
-    return ["input", "actual_output"]
-
-
-def _get_message_content(message: Any) -> str:
-    """Extract text content from a message object.
-
-    Message content can be a dict with a "content" or "message" key, or a plain string.
-    Handles one level of nesting (e.g. {"content": {"content": "text"}}).
-    """
-    if isinstance(message, str):
-        return message
-    if isinstance(message, dict):
-        for key in ("content", "message"):
-            if key in message:
-                val = message[key]
-                if isinstance(val, str):
-                    return val
-                if isinstance(val, dict):
-                    return _get_message_content(val)
-                return str(val)
-    return ""
-
-
-def _extract_fields_from_spans(
-    parsed: ParsedEvaluationEvent,
-) -> Dict[str, Any]:
-    """Extract LLMTestCase fields from AgentCore session spans.
-
-    Parses _eval_log_records from span attributes, filters by target_trace_id,
-    and extracts messages by role:
-        - input ← input messages where role=="user"
-        - actual_output ← output messages where role=="assistant"
-        - retrieval_context ← output messages where role=="tool"
-        - expected_output ← evaluationReferenceInputs[0].expectedResponse
-    """
-    user_messages: List[str] = []
-    assistant_messages: List[str] = []
-    tool_messages: List[str] = []
-
-    for span in parsed.session_spans:
-        attributes = span.get("attributes", {})
-        log_records_raw = attributes.get("_eval_log_records")
-        if not log_records_raw:
-            continue
-
-        if isinstance(log_records_raw, str):
-            try:
-                log_records = json.loads(log_records_raw)
-            except (json.JSONDecodeError, TypeError):
-                logger.debug("Failed to parse _eval_log_records as JSON")
-                continue
-        else:
-            log_records = log_records_raw
-
-        if not isinstance(log_records, list):
-            continue
-
-        for record in log_records:
-            if not isinstance(record, dict):
-                continue
-
-            if parsed.target_trace_id:
-                record_trace_id = record.get("traceId") or record.get("trace_id")
-                if record_trace_id and record_trace_id != parsed.target_trace_id:
-                    continue
-
-            body = record.get("body", {})
-            if not isinstance(body, dict):
-                continue
-
-            input_data = body.get("input", {})
-            if isinstance(input_data, dict):
-                for msg in input_data.get("messages", []):
-                    if not isinstance(msg, dict):
-                        continue
-                    role = msg.get("role", "")
-                    content = _get_message_content(msg)
-                    if role == "user" and content:
-                        user_messages.append(content)
-
-            output_data = body.get("output", {})
-            if isinstance(output_data, dict):
-                for msg in output_data.get("messages", []):
-                    if not isinstance(msg, dict):
-                        continue
-                    role = msg.get("role", "")
-                    content = _get_message_content(msg)
-                    if role == "assistant" and content:
-                        assistant_messages.append(content)
-                    elif role == "tool" and content:
-                        tool_messages.append(content)
-
-    fields: Dict[str, Any] = {}
-
-    if user_messages:
-        fields["input"] = "\n".join(user_messages)
-    if assistant_messages:
-        fields["actual_output"] = "\n".join(assistant_messages)
-    if tool_messages:
-        fields["retrieval_context"] = tool_messages
-        fields["context"] = tool_messages
-
-    if parsed.reference_inputs:
-        expected = parsed.reference_inputs[0].get("expectedResponse")
-        if expected:
-            fields["expected_output"] = expected
-
-    return fields
-
-
-def build_test_case(
-    parsed: ParsedEvaluationEvent,
-    metric: BaseMetric,
-    field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
-) -> LLMTestCase:
-    """Build a DeepEval LLMTestCase from a parsed evaluation event.
-
-    Args:
-        parsed: The parsed Lambda event.
-        metric: The DeepEval metric (used to determine required fields).
-        field_mapper: Optional callable that receives the raw Lambda event fields
-            and returns a dict of LLMTestCase field values. Bypasses default
-            span extraction when provided.
-
-    Returns:
-        An LLMTestCase ready for metric.measure().
-
-    Raises:
-        ValueError: If required fields for the metric cannot be populated.
-    """
-    if field_mapper is not None:
-        raw_event = {
-            "evaluationLevel": parsed.evaluation_level,
-            "evaluationInput": {"sessionSpans": parsed.session_spans},
-            "evaluationTarget": {
-                "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [],
-                "spanIds": [parsed.target_span_id] if parsed.target_span_id else [],
-            },
-            "evaluationReferenceInputs": parsed.reference_inputs,
-        }
-        fields = field_mapper(raw_event)
-    else:
-        fields = _extract_fields_from_spans(parsed)
-
-    required = _get_required_params(metric)
-    missing = [f for f in required if f not in fields or not fields[f]]
-    if missing:
-        metric_name = type(metric).__name__
-        raise ValueError(
-            f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
-            f"Provide a field_mapper or ensure spans contain the necessary data."
-        )
-
-    return LLMTestCase(
-        input=fields.get("input", ""),
-        actual_output=fields.get("actual_output", ""),
-        expected_output=fields.get("expected_output"),
-        context=fields.get("context"),
-        retrieval_context=fields.get("retrieval_context"),
-    )
diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py
new file mode 100644
index 00000000..17f674bd
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py
@@ -0,0 +1,217 @@
+"""Tests for AutoevalsAdapter."""
+
+import json
+import time
+from unittest.mock import MagicMock
+
+import pytest
+
+from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter
+
+
+def _make_event(
+    level="TRACE",
+    trace_ids=None,
+    spans=None,
+    reference_inputs=None,
+):
+    """Assemble the raw Lambda event payload fed to the adapter under test.
+
+    Omitting ``spans`` yields one span holding a single user/assistant exchange.
+    """
+    if spans is None:
+        body = {
+            "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
+            "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
+        }
+        spans = [
+            {
+                "traceId": "abc123",
+                "spanId": "span1",
+                "attributes": {"_eval_log_records": json.dumps([{"body": body}])},
+            }
+        ]
+
+    target = {}
+    if trace_ids is not None:
+        target["traceIds"] = trace_ids  # left out entirely when no trace ids are requested
+    event = {
+        "schemaVersion": "1.0",
+        "evaluationLevel": level,
+        "evaluationInput": {"sessionSpans": spans},
+        "evaluationTarget": target,
+    }
+    if reference_inputs is not None:
+        event["evaluationReferenceInputs"] = reference_inputs
+    return event
+
+
+def _mock_scorer(score=0.9, rationale="Good answer"):
+    """Return a MagicMock scorer, class-named "MockScorer", whose eval() gives a canned result."""
+    outcome = MagicMock()
+    outcome.score = score
+    outcome.metadata = {"rationale": rationale}
+
+    fake = MagicMock()
+    fake.eval = MagicMock(return_value=outcome)
+    type(fake).__name__ = "MockScorer"
+
+    return fake
+
+
+class TestAutoevalsAdapterSuccess:
+    """Happy path: the response dict, and the keyword arguments forwarded to scorer.eval()."""
+
+    def test_returns_pass_when_score_above_half(self):
+        scorer = _mock_scorer(score=0.8)
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_event())
+
+        assert result["value"] == 0.8
+        assert result["label"] == "Pass"
+        assert result["explanation"] == "Good answer"
+
+    def test_returns_fail_when_score_below_half(self):
+        scorer = _mock_scorer(score=0.3)
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_event())
+
+        assert result["value"] == 0.3
+        assert result["label"] == "Fail"
+
+    def test_scorer_eval_called_with_input_and_output(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        adapter(_make_event())
+
+        scorer.eval.assert_called_once()
+        call_kwargs = scorer.eval.call_args[1]
+        assert call_kwargs["input"] == "What is AI?"
+        assert call_kwargs["output"] == "AI is artificial intelligence."
+
+    def test_expected_output_passed_as_expected(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        # Only the scorer call is inspected, so the adapter's return value is not kept.
+        adapter(_make_event(reference_inputs=[{"expectedResponse": "AI stands for artificial intelligence."}]))
+
+        call_kwargs = scorer.eval.call_args[1]
+        assert call_kwargs["expected"] == "AI stands for artificial intelligence."
+
+    def test_no_expected_output_omits_expected_kwarg(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        adapter(_make_event())
+
+        call_kwargs = scorer.eval.call_args[1]
+        assert "expected" not in call_kwargs
+
+    def test_custom_field_mapper(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(
+            scorer=scorer,
+            field_mapper=lambda event: {"input": "custom input", "actual_output": "custom output"},
+        )
+
+        # The mapper's "actual_output" must reach the scorer as its "output" keyword.
+        adapter(_make_event())
+
+        call_kwargs = scorer.eval.call_args[1]
+        assert call_kwargs["input"] == "custom input"
+        assert call_kwargs["output"] == "custom output"
+
+
+class TestAutoevalsAdapterErrors:
+    """Failure paths: each one must surface as an error-response dict, never as an exception.
+
+    The error codes asserted here are INVALID_EVENT, MISSING_REQUIRED_FIELD and METRIC_ERROR.
+    """
+
+    def test_invalid_event_returns_error(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        # An empty dict carries none of the keys that _make_event() puts into an event.
+        response = adapter({})
+
+        assert response["errorCode"] == "INVALID_EVENT"
+
+    def test_missing_input_returns_error(self):
+        # The only log record has an assistant output and no "input" section at all.
+        output_only = {"output": {"messages": [{"role": "assistant", "content": "answer"}]}}
+        span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"_eval_log_records": json.dumps([{"body": output_only}])},
+        }
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        response = adapter(_make_event(spans=[span]))
+
+        assert response["errorCode"] == "MISSING_REQUIRED_FIELD"
+        assert "input" in response["errorMessage"]
+
+    def test_scorer_exception_returns_error(self):
+        failing = _mock_scorer()
+        failing.eval = MagicMock(side_effect=RuntimeError("API error"))  # every eval() call raises
+        adapter = AutoevalsAdapter(scorer=failing)
+
+        response = adapter(_make_event())
+
+        assert response["errorCode"] == "METRIC_ERROR"
+        assert "API error" in response["errorMessage"]
+
+    def test_never_raises_on_bad_input(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        # None of these payloads is a dict; each must still produce an error response.
+        responses = [adapter(bad_event) for bad_event in (None, [], "string", 42)]
+
+        assert all("errorCode" in response for response in responses)
+
+
+class TestAutoevalsAdapterTimeout:
+    """The adapter's time limit: the error returned when it is exceeded, and its default."""
+
+    def test_timeout_returns_error(self):
+        slow = _mock_scorer()
+        slow.eval = MagicMock(side_effect=lambda **kw: time.sleep(5))  # sleeps well past the 1s limit
+        adapter = AutoevalsAdapter(scorer=slow, timeout=1)
+
+        assert adapter(_make_event())["errorCode"] == "METRIC_TIMEOUT"
+
+    def test_default_timeout_is_290(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        assert adapter.timeout == 290
+
+
+class TestAutoevalsAdapterEdgeCases:
+    """Scorer results that carry no score, or no metadata attribute."""
+
+    def test_score_none_returns_fail(self):
+        adapter = AutoevalsAdapter(scorer=_mock_scorer(score=None))
+
+        response = adapter(_make_event())
+
+        assert response["label"] == "Fail"
+
+    def test_no_metadata_returns_empty_explanation(self):
+        bare_result = MagicMock(spec=[])  # empty spec: reading any attribute not set below raises
+        bare_result.score = 0.9
+        scorer = MagicMock()
+        scorer.eval = MagicMock(return_value=bare_result)
+        type(scorer).__name__ = "MockScorer"
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        response = adapter(_make_event())
+
+        assert response["explanation"] == ""
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
index 77961f14..67bfda3d 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
@@ -1,4 +1,4 @@
-"""Tests for DeepEvalHandler."""
+"""Tests for DeepEvalHandler and DeepEvalAdapter."""
 
 import json
 import time
@@ -6,7 +6,9 @@
 
 import pytest
 
-from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler
+from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler
+from bedrock_agentcore.evaluation.integrations.base import BaseAdapter
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput
 
 
 def _make_event(
@@ -317,3 +319,109 @@ def test_metric_exception_still_propagates_with_timeout(self):
 
         assert result["errorCode"] == "METRIC_ERROR"
         assert "LLM error" in result["errorMessage"]
+
+
+class TestBackwardCompatibility:
+    def test_handler_is_alias_for_adapter(self):
+        assert DeepEvalHandler is DeepEvalAdapter
+
+    def test_adapter_is_subclass_of_base(self):
+        assert issubclass(DeepEvalAdapter, BaseAdapter)
+
+    def test_import_from_init(self):
+        from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalHandler as H
+        from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalAdapter as A
+
+        assert H is A
+
+    def test_handler_works_same_as_before(self):
+        metric = _mock_metric(score=0.9, threshold=0.7)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(_make_event())
+
+        assert result["value"] == 0.9
+        assert result["label"] == "Pass"
+
+
+class TestEvaluatorInputAcceptance:
+    def _make_evaluator_input(self):
+        log_records = [
+            {
+                "body": {
+                    "input": {"messages": [{"role": "user", "content": "Hello"}]},
+                    "output": {"messages": [{"role": "assistant", "content": "Hi there"}]},
+                }
+            }
+        ]
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"_eval_log_records": json.dumps(log_records)},
+            }
+        ]
+        return EvaluatorInput(
+            evaluation_level="TRACE",
+            session_spans=spans,
+            target_trace_id="t1",
+            target_span_id=None,
+        )
+
+    def test_accepts_evaluator_input(self):
+        metric = _mock_metric(score=0.95)
+        handler = DeepEvalHandler(metric=metric)
+
+        result = handler(self._make_evaluator_input())
+
+        assert result["value"] == 0.95
+        assert result["label"] == "Pass"
+
+    def test_evaluator_input_extracts_fields_correctly(self):
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+
+        handler(self._make_evaluator_input())
+
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.input == "Hello"
+        assert test_case.actual_output == "Hi there"
+
+    def test_evaluator_input_with_trace_id_filtering(self):
+        log_records = [
+            {
+                "traceId": "target",
+                "body": {
+                    "input": {"messages": [{"role": "user", "content": "relevant"}]},
+                    "output": {"messages": [{"role": "assistant", "content": "yes"}]},
+                },
+            },
+            {
+                "traceId": "other",
+                "body": {
+                    "input": {"messages": [{"role": "user", "content": "irrelevant"}]},
+                    "output": {"messages": [{"role": "assistant", "content": "no"}]},
+                },
+            },
+        ]
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"_eval_log_records": json.dumps(log_records)},
+            }
+        ]
+        evaluator_input = EvaluatorInput(
+            evaluation_level="TRACE",
+            session_spans=spans,
+            target_trace_id="target",
+        )
+
+        metric = _mock_metric()
+        handler = DeepEvalHandler(metric=metric)
+
+        handler(evaluator_input)
+
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.input == "relevant"
+        assert test_case.actual_output == "yes"
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
index 1d90a689..2d6fbaea 100644
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
+++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
@@ -1,4 +1,4 @@
-"""Tests for deepeval input_mapper module."""
+"""Tests for deepeval input mapping and test case building."""
 
 import json
 from unittest.mock import MagicMock
@@ -6,9 +6,11 @@
 import pytest
 from deepeval.test_case import SingleTurnParams
 
-from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import (
+from bedrock_agentcore.evaluation.integrations.base import (
     ParsedEvaluationEvent,
-    _extract_fields_from_spans,
+    extract_fields_from_spans as _extract_fields_from_spans,
+)
+from bedrock_agentcore.evaluation.integrations.deepeval.adapter import (
     _get_required_params,
     build_test_case,
 )

From 8627ab09409cf69c52fe937e1442ef75635df68e Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Sat, 27 Jun 2026 08:35:38 -0700
Subject: [PATCH 11/13] Major refactor: move to custom_code_based_evaluators,
 add span parser layer, simplify per TJ/Irene feedback

---
 pyproject.toml                                |   6 +
 .../third_party/__init__.py                   |   5 +
 .../third_party/autoevals/__init__.py         |   5 +
 .../third_party}/autoevals/adapter.py         |  31 +-
 .../third_party/base.py                       | 110 ++++
 .../third_party/deepeval/__init__.py          |   5 +
 .../third_party/deepeval/adapter.py           |  78 +++
 .../third_party/span_parsers/__init__.py      |   8 +
 .../third_party/span_parsers/base.py          |  62 ++
 .../third_party/span_parsers/common.py        | 145 +++++
 .../third_party/span_parsers/openinference.py |  27 +
 .../span_parsers/otel_langchain.py            |  27 +
 .../third_party/span_parsers/strands.py       |  26 +
 .../evaluation/integrations/__init__.py       |   4 -
 .../integrations/autoevals/__init__.py        |   5 -
 .../evaluation/integrations/base.py           | 302 ---------
 .../integrations/deepeval/__init__.py         |   5 -
 .../integrations/deepeval/adapter.py          | 189 ------
 .../third_party}/__init__.py                  |   0
 .../third_party/autoevals}/__init__.py        |   0
 .../third_party/autoevals/test_adapter.py     | 201 ++++++
 .../third_party/deepeval/__init__.py          |   0
 .../third_party/deepeval/test_adapter.py      | 218 +++++++
 .../third_party/span_parsers/__init__.py      |   0
 .../span_parsers/test_span_parsers.py         | 194 ++++++
 .../integrations/autoevals/test_adapter.py    | 217 -------
 .../integrations/deepeval/test_handler.py     | 427 -------------
 .../deepeval/test_input_mapper.py             | 581 ------------------
 .../evaluation/test_third_party_adapters.py   | 171 ++++++
 29 files changed, 1303 insertions(+), 1746 deletions(-)
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
 rename src/bedrock_agentcore/evaluation/{integrations => custom_code_based_evaluators/third_party}/autoevals/adapter.py (63%)
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py
 create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py
 delete mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
 delete mode 100644 src/bedrock_agentcore/evaluation/integrations/base.py
 delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
 delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py
 rename tests/bedrock_agentcore/evaluation/{integrations/autoevals => custom_code_based_evaluators/third_party}/__init__.py (100%)
 rename tests/bedrock_agentcore/evaluation/{integrations/deepeval => custom_code_based_evaluators/third_party/autoevals}/__init__.py (100%)
 create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py
 create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
 create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
 create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
 create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
 delete mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py
 delete mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
 delete mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
 create mode 100644 tests_integ/evaluation/test_third_party_adapters.py

diff --git a/pyproject.toml b/pyproject.toml
index 61520a5b..b1fc5e90 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -173,3 +173,9 @@ simulation = [
 datasets = [
     "requests>=2.31.0",
 ]
+deepeval = [
+    "deepeval>=2.0.0",
+]
+autoevals = [
+    "autoevals>=0.0.50",
+]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
new file mode 100644
index 00000000..06ba3d0a
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
@@ -0,0 +1,5 @@
+"""Third-party evaluation adapters for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+__all__ = ["BaseAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
new file mode 100644
index 00000000..40e25fc1
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
@@ -0,0 +1,5 @@
+"""Autoevals adapter for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter
+
+__all__ = ["AutoevalsAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
similarity index 63%
rename from src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py
rename to src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
index fe89435e..fa2acba3 100644
--- a/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
@@ -1,9 +1,10 @@
-"""Autoevals adapter for AgentCore evaluation integrations."""
+"""Autoevals adapter for AgentCore code-based evaluators."""
 
 import logging
 from typing import Any, Callable, Dict, Optional
 
-from bedrock_agentcore.evaluation.integrations.base import BaseAdapter
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
 
 logger = logging.getLogger(__name__)
 
@@ -14,31 +15,29 @@ class AutoevalsAdapter(BaseAdapter):
     Example::
 
         from autoevals import Factuality
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
 
         scorer = Factuality()
-        handler = AutoevalsAdapter(scorer=scorer)
-
-        # Use as Lambda handler
-        def lambda_handler(event, context):
-            return handler(event, context)
+        adapter = AutoevalsAdapter(scorer=scorer)
     """
 
     def __init__(
         self,
         scorer: Any,
-        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
-        timeout: Optional[int] = None,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+        threshold: float = 0.5,
     ):
         """Initialize the adapter.
 
         Args:
             scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()).
-            field_mapper: Optional callable that receives the raw Lambda event and
-                returns a dict of field values. Bypasses default span extraction.
-            timeout: Maximum seconds to allow for scorer.eval(). Defaults to 290.
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict of field values. Bypasses default span parsing.
+            threshold: Score threshold for Pass/Fail determination. Defaults to 0.5.
         """
-        super().__init__(field_mapper=field_mapper, timeout=timeout)
+        super().__init__(field_mapper=field_mapper)
         self.scorer = scorer
+        self.threshold = threshold
 
     def validate_fields(self, fields: Dict[str, Any]) -> None:
         """Validate that input and actual_output are present."""
@@ -54,7 +53,7 @@ def validate_fields(self, fields: Dict[str, Any]) -> None:
                 f"Provide a field_mapper or ensure spans contain the necessary data."
             )
 
-    def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]:
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
         """Run the Autoevals scorer and return formatted results."""
         kwargs: Dict[str, Any] = {
             "input": fields.get("input", ""),
@@ -66,7 +65,7 @@ def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]:
         result = self.scorer.eval(**kwargs)
 
         score = result.score
-        label = "Pass" if score is not None and score >= 0.5 else "Fail"
+        label = "Pass" if score is not None and score >= self.threshold else "Fail"
         explanation = getattr(result, "metadata", {}).get("rationale", "") if hasattr(result, "metadata") else ""
 
-        return {"value": score, "label": label, "explanation": explanation}
+        return EvaluatorOutput(value=score, label=label, explanation=explanation)
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
new file mode 100644
index 00000000..1f28d2a5
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
@@ -0,0 +1,110 @@
+"""Base adapter for third-party evaluation framework integrations."""
+
+import abc
+import logging
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
+    SpanParseResult,
+    parse_spans,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class BaseAdapter(abc.ABC):
+    """Base adapter for third-party evaluation framework integrations.
+
+    Accepts an EvaluatorInput (from the code_based_evaluators flow),
+    extracts fields from spans using the built-in parser layer, runs the
+    evaluation via execute(), and returns an EvaluatorOutput.
+
+    Never raises unhandled exceptions — always returns a valid EvaluatorOutput.
+    """
+
+    def __init__(
+        self,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict of field values. Bypasses default span parsing
+                when provided.
+        """
+        self.field_mapper = field_mapper
+
+    def __call__(self, evaluator_input: EvaluatorInput, context: Any = None) -> EvaluatorOutput:
+        """Handle an evaluation invocation without letting any exception escape.
+
+        Args:
+            evaluator_input: Parsed EvaluatorInput from the code-based evaluator flow.
+            context: Lambda context object (unused).
+
+        Returns:
+            EvaluatorOutput with score, label, and explanation or error fields.
+        """
+        try:
+            fields = self._extract_fields(evaluator_input)
+        except Exception as e:  # field_mapper is user code and may raise anything, not just ValueError
+            logger.error("Field extraction failed: %s", e)
+            return EvaluatorOutput(
+                label="Error",
+                errorCode="FIELD_EXTRACTION_ERROR",
+                errorMessage=str(e),
+            )
+
+        try:
+            self.validate_fields(fields)
+        except ValueError as e:
+            logger.error("Validation failed: %s", e)
+            return EvaluatorOutput(
+                label="Error",
+                errorCode="MISSING_REQUIRED_FIELD",
+                errorMessage=str(e),
+            )
+
+        try:
+            return self.execute(fields)
+        except Exception as e:
+            logger.error("Execution failed: %s", e, exc_info=True)
+            return EvaluatorOutput(
+                label="Error",
+                errorCode="METRIC_ERROR",
+                errorMessage=f"{type(self).__name__} failed: {e}",
+            )
+
+    def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]:
+        """Extract fields from the EvaluatorInput."""
+        if self.field_mapper is not None:
+            return self.field_mapper(evaluator_input)
+
+        reference_inputs = getattr(evaluator_input, "reference_inputs", None)
+        result = parse_spans(evaluator_input.session_spans, reference_inputs)
+        return result.to_dict()
+
+    @abc.abstractmethod
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Validate that required fields are present.
+
+        Each adapter must explicitly declare its validation behavior.
+
+        Args:
+            fields: Extracted field dict.
+
+        Raises:
+            ValueError: If required fields are missing.
+        """
+
+    @abc.abstractmethod
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Run the evaluation and return an EvaluatorOutput.
+
+        Args:
+            fields: Extracted field dict with keys like "input", "actual_output", etc.
+
+        Returns:
+            EvaluatorOutput with evaluation results.
+        """
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
new file mode 100644
index 00000000..99cf10d5
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
@@ -0,0 +1,5 @@
+"""DeepEval adapter for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter
+
+__all__ = ["DeepEvalAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
new file mode 100644
index 00000000..725584ef
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
@@ -0,0 +1,78 @@
+"""DeepEval adapter for AgentCore code-based evaluators."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class DeepEvalAdapter(BaseAdapter):
+    """Adapter that runs a DeepEval metric against AgentCore evaluation events.
+
+    Example::
+
+        from deepeval.metrics import AnswerRelevancyMetric
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = AnswerRelevancyMetric(threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+    """
+
+    def __init__(
+        self,
+        metric: BaseMetric,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+        model: Optional[Any] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict of LLMTestCase field values. Bypasses default span
+                parsing when provided.
+            model: Optional model override for the metric's LLM.
+        """
+        super().__init__(field_mapper=field_mapper)
+        self.metric = metric
+        if model is not None:
+            self.metric.model = model
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """No pre-validation; let DeepEval raise on missing params."""
+
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Run the DeepEval metric on the extracted fields and return formatted results."""
+        test_case = LLMTestCase(
+            input=fields.get("input", ""),
+            actual_output=fields.get("actual_output", ""),
+            expected_output=fields.get("expected_output"),
+            context=fields.get("context"),
+            retrieval_context=fields.get("retrieval_context"),
+        )
+
+        try:
+            self.metric.measure(test_case)
+        except Exception as e:
+            error_type = type(e).__name__
+            if "MissingTestCaseParams" in error_type or "missing" in str(e).lower():
+                return EvaluatorOutput(
+                    label="Error",
+                    errorCode="MISSING_REQUIRED_FIELD",
+                    errorMessage=f"{type(self.metric).__name__} requires fields not available: {e}",
+                )
+            raise
+
+        score = self.metric.score
+        reason = getattr(self.metric, "reason", None) or ""
+        success = getattr(self.metric, "success", None)
+        if success is None:
+            # No verdict reported by the metric: derive it lazily from score vs. threshold.
+            success = score is not None and score >= getattr(self.metric, "threshold", 0.5)
+        return EvaluatorOutput(value=score, label="Pass" if success else "Fail", explanation=reason)
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
new file mode 100644
index 00000000..5388df83
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
@@ -0,0 +1,8 @@
+"""Span parsers for extracting evaluation fields from Agent SDK trace formats."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.base import (
+    SpanParseResult,
+    parse_spans,
+)
+
+__all__ = ["SpanParseResult", "parse_spans"]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
new file mode 100644
index 00000000..3b88ff11
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
@@ -0,0 +1,62 @@
+"""Base span parsing logic and orchestration across format-specific parsers."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.strands import (
+    parse_strands_spans,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.otel_langchain import (
+    parse_otel_langchain_spans,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.openinference import (
+    parse_openinference_spans,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_PARSERS = [
+    parse_strands_spans,
+    parse_otel_langchain_spans,
+    parse_openinference_spans,
+]
+
+
+def parse_spans(
+    session_spans: List[Dict[str, Any]],
+    reference_inputs: Optional[List[Dict[str, Any]]] = None,
+) -> SpanParseResult:
+    """Parse session spans using the first matching agent-level parser.
+
+    Iterates through format-specific parsers (Strands, OTel LangChain,
+    OpenInference) and returns the result from the first one that
+    successfully extracts data.
+
+    Args:
+        session_spans: Raw ADOT span dicts from the evaluation service.
+        reference_inputs: Optional reference inputs for expected_output.
+
+    Returns:
+        SpanParseResult with extracted fields.
+
+    Raises:
+        ValueError: If no parser can extract data from the spans.
+    """
+    for parser in _PARSERS:
+        result = parser(session_spans)
+        if result is not None:
+            if reference_inputs:
+                expected = reference_inputs[0].get("expectedResponse")
+                if expected:
+                    result.expected_output = expected
+            return result
+
+    raise ValueError(
+        "Could not extract evaluation fields from spans. "
+        "No agent-level span with gen_ai.operation.name=='invoke_agent' and "
+        "valid span_events found. Provide a field_mapper for custom formats."
+    )
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
new file mode 100644
index 00000000..6d69dbc6
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
@@ -0,0 +1,145 @@
+"""Common span parsing utilities shared across format-specific parsers."""
+
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SpanParseResult:
+    """Result of parsing spans into evaluation fields."""
+
+    input: Optional[str] = None
+    actual_output: Optional[str] = None
+    retrieval_context: Optional[List[str]] = None
+    context: Optional[List[str]] = None
+    expected_output: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict, omitting None values."""
+        result: Dict[str, Any] = {}
+        if self.input is not None:
+            result["input"] = self.input
+        if self.actual_output is not None:
+            result["actual_output"] = self.actual_output
+        if self.retrieval_context is not None:
+            result["retrieval_context"] = self.retrieval_context
+        if self.context is not None:
+            result["context"] = self.context
+        if self.expected_output is not None:
+            result["expected_output"] = self.expected_output
+        return result
+
+
+def _get_message_content(message: Any) -> str:
+    """Extract text content from a message object ("" when it carries no text)."""
+    if isinstance(message, str):
+        return message
+    if isinstance(message, dict):
+        for key in ("content", "message"):
+            val = message.get(key)
+            if val is None or val == []:  # no text here; str() would yield "None"/"[]"
+                continue
+            if isinstance(val, str):
+                return val
+            if isinstance(val, dict):
+                return _get_message_content(val)
+            if isinstance(val, list):
+                parts = [
+                    item if isinstance(item, str) else item["text"]
+                    for item in val
+                    if isinstance(item, str) or (isinstance(item, dict) and "text" in item)
+                ]
+                if parts:
+                    return "\n".join(parts)
+            return str(val)
+    return ""
+
+
+def _parse_span_event_body(body: Any) -> Dict[str, Any]:
+    """Parse the body of a span event (dict or JSON string); return {} if unusable."""
+    if isinstance(body, str):
+        try:
+            body = json.loads(body)
+        except (json.JSONDecodeError, TypeError):
+            return {}
+    # Valid JSON may still decode to a list or scalar (e.g. '[1]', '"x"');
+    # only JSON objects are usable as event bodies.
+    return body if isinstance(body, dict) else {}
+
+
+def extract_from_agent_span_events(
+    session_spans: List[Dict[str, Any]],
+) -> Optional[SpanParseResult]:
+    """Extract evaluation fields from agent-level span events.
+
+    Looks for spans where attributes.gen_ai.operation.name == "invoke_agent",
+    then inspects span_events for input/output messages.
+
+    Args:
+        session_spans: Raw ADOT span dicts; null-valued keys are tolerated.
+
+    Returns:
+        SpanParseResult if agent span with valid events found, None otherwise.
+    """
+    user_messages: List[str] = []
+    assistant_messages: List[str] = []
+    tool_messages: List[str] = []
+
+    found_agent_span = False
+
+    for span in session_spans:
+        attributes = span.get("attributes") or {}  # key may be present but null
+        operation_name = attributes.get("gen_ai.operation.name")
+        if operation_name != "invoke_agent":
+            continue
+
+        found_agent_span = True
+        span_events = span.get("span_events") or []  # key may be present but null
+
+        for event in span_events:
+            body = _parse_span_event_body(event.get("body") if isinstance(event, dict) else None)
+            if not body:
+                continue
+
+            input_data = body.get("input", {})
+            if isinstance(input_data, dict):
+                for msg in input_data.get("messages") or []:
+                    if not isinstance(msg, dict):
+                        continue
+                    role = msg.get("role", "")
+                    content = _get_message_content(msg)
+                    if role == "user" and content:
+                        user_messages.append(content)
+
+            output_data = body.get("output", {})
+            if isinstance(output_data, dict):
+                for msg in output_data.get("messages") or []:
+                    if not isinstance(msg, dict):
+                        continue
+                    role = msg.get("role", "")
+                    content = _get_message_content(msg)
+                    if role == "assistant" and content:
+                        assistant_messages.append(content)
+                    elif role == "tool" and content:
+                        tool_messages.append(content)
+
+    if not found_agent_span:
+        return None
+
+    if not user_messages and not assistant_messages:
+        return None
+
+    result = SpanParseResult()
+    if user_messages:
+        result.input = user_messages[0]
+    if assistant_messages:
+        result.actual_output = assistant_messages[-1]
+    if tool_messages:
+        result.retrieval_context = tool_messages
+        result.context = list(tool_messages)  # separate list so the two fields do not alias
+
+    return result
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py
new file mode 100644
index 00000000..e500740e
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py
@@ -0,0 +1,27 @@
+"""OpenInference LangChain span parser."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+    extract_from_agent_span_events,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_openinference_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
+    """Parse spans from OpenInference LangChain instrumentation format.
+
+    Delegates unchanged to the shared extract_from_agent_span_events helper (agent-level
+    signal gen_ai.operation.name == "invoke_agent", then span_events extraction).
+    OpenInference-specific divergence can be added here as schemas evolve.
+
+    Args:
+        session_spans: Raw ADOT span dicts.
+
+    Returns:
+        SpanParseResult, or None when no agent span or no user/assistant message is found.
+    """
+    return extract_from_agent_span_events(session_spans)
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py
new file mode 100644
index 00000000..f1e211c5
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py
@@ -0,0 +1,27 @@
+"""OTel LangChain span parser."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+    extract_from_agent_span_events,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_otel_langchain_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
+    """Parse spans from OTel LangChain instrumentation format.
+
+    Delegates unchanged to the shared extract_from_agent_span_events helper (agent-level
+    signal gen_ai.operation.name == "invoke_agent", then span_events extraction).
+    LangChain-specific divergence can be added here as schemas evolve.
+
+    Args:
+        session_spans: Raw ADOT span dicts.
+
+    Returns:
+        SpanParseResult, or None when no agent span or no user/assistant message is found.
+    """
+    return extract_from_agent_span_events(session_spans)
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py
new file mode 100644
index 00000000..3789ad9c
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py
@@ -0,0 +1,26 @@
+"""Strands Agent SDK span parser."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+    extract_from_agent_span_events,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_strands_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
+    """Parse spans from Strands Agent SDK format.
+
+    Delegates to the shared extract_from_agent_span_events helper: looks for spans with
+    gen_ai.operation.name == "invoke_agent" and extracts input/output from span_events.
+
+    Args:
+        session_spans: Raw ADOT span dicts.
+
+    Returns:
+        SpanParseResult, or None when no agent span or no user/assistant message is found.
+    """
+    return extract_from_agent_span_events(session_spans)
diff --git a/src/bedrock_agentcore/evaluation/integrations/__init__.py b/src/bedrock_agentcore/evaluation/integrations/__init__.py
index a1ff7691..33048d5d 100644
--- a/src/bedrock_agentcore/evaluation/integrations/__init__.py
+++ b/src/bedrock_agentcore/evaluation/integrations/__init__.py
@@ -1,5 +1 @@
 """AgentCore Evaluation integrations."""
-
-from bedrock_agentcore.evaluation.integrations.base import BaseAdapter, ParsedEvaluationEvent
-
-__all__ = ["BaseAdapter", "ParsedEvaluationEvent"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
deleted file mode 100644
index 0bc3b4ff..00000000
--- a/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""Autoevals integration for AgentCore Evaluation."""
-
-from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter
-
-__all__ = ["AutoevalsAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/base.py b/src/bedrock_agentcore/evaluation/integrations/base.py
deleted file mode 100644
index a10f6606..00000000
--- a/src/bedrock_agentcore/evaluation/integrations/base.py
+++ /dev/null
@@ -1,302 +0,0 @@
-"""Base adapter for AgentCore evaluation integrations."""
-
-import abc
-import json
-import logging
-import threading
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Union
-
-from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ParsedEvaluationEvent:
-    """Parsed representation of the AgentCore Lambda evaluation event."""
-
-    evaluation_level: str
-    session_spans: List[Dict[str, Any]]
-    target_trace_id: Optional[str] = None
-    target_span_id: Optional[str] = None
-    reference_inputs: List[Dict[str, Any]] = field(default_factory=list)
-
-    @classmethod
-    def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent":
-        """Parse a raw Lambda event dict into a structured object.
-
-        Args:
-            event: Raw Lambda event payload from the evaluation service.
-
-        Returns:
-            ParsedEvaluationEvent with extracted fields.
-
-        Raises:
-            KeyError: If required top-level fields are missing.
-        """
-        evaluation_input = event["evaluationInput"]
-        target = event.get("evaluationTarget") or {}
-        trace_ids = target.get("traceIds") or []
-        span_ids = target.get("spanIds") or []
-
-        return cls(
-            evaluation_level=event["evaluationLevel"],
-            session_spans=evaluation_input["sessionSpans"],
-            target_trace_id=trace_ids[0] if trace_ids else None,
-            target_span_id=span_ids[0] if span_ids else None,
-            reference_inputs=event.get("evaluationReferenceInputs") or [],
-        )
-
-
-def _get_message_content(message: Any) -> str:
-    """Extract text content from a message object.
-
-    Message content can be a dict with a "content" or "message" key, or a plain string.
-    Handles one level of nesting (e.g. {"content": {"content": "text"}}).
-    """
-    if isinstance(message, str):
-        return message
-    if isinstance(message, dict):
-        for key in ("content", "message"):
-            if key in message:
-                val = message[key]
-                if isinstance(val, str):
-                    return val
-                if isinstance(val, dict):
-                    return _get_message_content(val)
-                return str(val)
-    return ""
-
-
-def extract_fields_from_spans(
-    parsed: ParsedEvaluationEvent,
-) -> Dict[str, Any]:
-    """Extract evaluation fields from AgentCore session spans.
-
-    Parses _eval_log_records from span attributes, filters by target_trace_id,
-    and extracts messages by role:
-        - input ← input messages where role=="user"
-        - actual_output ← output messages where role=="assistant"
-        - retrieval_context ← output messages where role=="tool"
-        - context ← same as retrieval_context
-        - expected_output ← evaluationReferenceInputs[0].expectedResponse
-    """
-    user_messages: List[str] = []
-    assistant_messages: List[str] = []
-    tool_messages: List[str] = []
-
-    for span in parsed.session_spans:
-        attributes = span.get("attributes", {})
-        log_records_raw = attributes.get("_eval_log_records")
-        if not log_records_raw:
-            continue
-
-        if isinstance(log_records_raw, str):
-            try:
-                log_records = json.loads(log_records_raw)
-            except (json.JSONDecodeError, TypeError):
-                logger.debug("Failed to parse _eval_log_records as JSON")
-                continue
-        else:
-            log_records = log_records_raw
-
-        if not isinstance(log_records, list):
-            continue
-
-        for record in log_records:
-            if not isinstance(record, dict):
-                continue
-
-            if parsed.target_trace_id:
-                record_trace_id = record.get("traceId") or record.get("trace_id")
-                if record_trace_id and record_trace_id != parsed.target_trace_id:
-                    continue
-
-            body = record.get("body", {})
-            if not isinstance(body, dict):
-                continue
-
-            input_data = body.get("input", {})
-            if isinstance(input_data, dict):
-                for msg in input_data.get("messages", []):
-                    if not isinstance(msg, dict):
-                        continue
-                    role = msg.get("role", "")
-                    content = _get_message_content(msg)
-                    if role == "user" and content:
-                        user_messages.append(content)
-
-            output_data = body.get("output", {})
-            if isinstance(output_data, dict):
-                for msg in output_data.get("messages", []):
-                    if not isinstance(msg, dict):
-                        continue
-                    role = msg.get("role", "")
-                    content = _get_message_content(msg)
-                    if role == "assistant" and content:
-                        assistant_messages.append(content)
-                    elif role == "tool" and content:
-                        tool_messages.append(content)
-
-    fields: Dict[str, Any] = {}
-
-    if user_messages:
-        fields["input"] = "\n".join(user_messages)
-    if assistant_messages:
-        fields["actual_output"] = "\n".join(assistant_messages)
-    if tool_messages:
-        fields["retrieval_context"] = tool_messages
-        fields["context"] = tool_messages
-
-    if parsed.reference_inputs:
-        expected = parsed.reference_inputs[0].get("expectedResponse")
-        if expected:
-            fields["expected_output"] = expected
-
-    return fields
-
-
-class _ExecutionTimeout(Exception):
-    """Raised when execution exceeds the configured timeout."""
-
-
-def _error_response(code: str, message: str) -> Dict[str, str]:
-    """Build a standardized error response dict."""
-    return {"errorCode": code, "errorMessage": message}
-
-
-class BaseAdapter(abc.ABC):
-    """Base adapter for evaluation framework integrations.
-
-    Subclasses only need to implement execute(fields) which runs the actual
-    evaluation logic and returns (score, label, explanation).
-
-    Never raises unhandled exceptions — always returns a valid response dict.
-    """
-
-    DEFAULT_TIMEOUT = 290
-
-    def __init__(
-        self,
-        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
-        timeout: Optional[int] = None,
-    ):
-        """Initialize the adapter.
-
-        Args:
-            field_mapper: Optional callable that receives the raw Lambda event and
-                returns a dict of field values. Bypasses default span extraction.
-            timeout: Maximum seconds to allow for execute(). Defaults to 290
-                (slightly under Lambda's 300s max).
-        """
-        self.field_mapper = field_mapper
-        self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT
-
-    def __call__(self, event: Union[Dict[str, Any], EvaluatorInput], context: Any = None) -> Dict[str, Any]:
-        """Handle a Lambda invocation.
-
-        Args:
-            event: Either a raw Lambda event dict or an EvaluatorInput instance
-                from bedrock_agentcore.evaluation.custom_code_based_evaluators.models.
-            context: Lambda context object (unused).
-
-        Returns:
-            Success: {"value": float, "label": str, "explanation": str}
-            Error: {"errorCode": str, "errorMessage": str}
-        """
-        try:
-            if isinstance(event, EvaluatorInput):
-                parsed = ParsedEvaluationEvent(
-                    evaluation_level=event.evaluation_level,
-                    session_spans=event.session_spans,
-                    target_trace_id=event.target_trace_id,
-                    target_span_id=event.target_span_id,
-                    reference_inputs=getattr(event, "reference_inputs", []) or [],
-                )
-            else:
-                parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        except (KeyError, IndexError, TypeError) as e:
-            logger.error("Failed to parse evaluation event: %s", e)
-            return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}")
-
-        try:
-            fields = self._extract_fields(parsed)
-        except ValueError as e:
-            logger.error("Missing required fields: %s", e)
-            return _error_response("MISSING_REQUIRED_FIELD", str(e))
-
-        try:
-            result = self._execute_with_timeout(fields)
-        except _ExecutionTimeout:
-            return _error_response(
-                "METRIC_TIMEOUT",
-                f"{type(self).__name__} exceeded {self.timeout}s timeout.",
-            )
-        except Exception as e:
-            logger.error("Execution failed: %s", e, exc_info=True)
-            return _error_response("METRIC_ERROR", f"{type(self).__name__} failed: {e}")
-
-        return result
-
-    def _extract_fields(self, parsed: ParsedEvaluationEvent) -> Dict[str, Any]:
-        """Extract fields from event, using field_mapper if provided."""
-        if self.field_mapper is not None:
-            raw_event = {
-                "evaluationLevel": parsed.evaluation_level,
-                "evaluationInput": {"sessionSpans": parsed.session_spans},
-                "evaluationTarget": {
-                    "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [],
-                    "spanIds": [parsed.target_span_id] if parsed.target_span_id else [],
-                },
-                "evaluationReferenceInputs": parsed.reference_inputs,
-            }
-            return self.field_mapper(raw_event)
-
-        fields = extract_fields_from_spans(parsed)
-        self.validate_fields(fields)
-        return fields
-
-    def validate_fields(self, fields: Dict[str, Any]) -> None:
-        """Validate that required fields are present.
-
-        Override in subclasses to enforce field requirements.
-        Default implementation does nothing.
-        """
-
-    @abc.abstractmethod
-    def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]:
-        """Run the evaluation and return the response dict.
-
-        Args:
-            fields: Extracted field dict with keys like "input", "actual_output", etc.
-
-        Returns:
-            {"value": float, "label": str, "explanation": str}
-        """
-
-    def _execute_with_timeout(self, fields: Dict[str, Any]) -> Dict[str, Any]:
-        """Run execute() with a thread-based timeout."""
-        if self.timeout <= 0:
-            return self.execute(fields)
-
-        result_holder: list = []
-        exception_holder: list = []
-
-        def target():
-            try:
-                result_holder.append(self.execute(fields))
-            except Exception as e:
-                exception_holder.append(e)
-
-        thread = threading.Thread(target=target, daemon=True)
-        thread.start()
-        thread.join(timeout=self.timeout)
-
-        if thread.is_alive():
-            raise _ExecutionTimeout()
-
-        if exception_holder:
-            raise exception_holder[0]
-
-        return result_holder[0]
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
deleted file mode 100644
index adb6ba44..00000000
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""DeepEval integration for AgentCore Evaluation."""
-
-from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler
-
-__all__ = ["DeepEvalAdapter", "DeepEvalHandler"]
diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py
deleted file mode 100644
index e8748782..00000000
--- a/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py
+++ /dev/null
@@ -1,189 +0,0 @@
-"""DeepEval adapter for AgentCore evaluation integrations."""
-
-import logging
-from typing import Any, Callable, Dict, List, Optional
-
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, SingleTurnParams
-
-from bedrock_agentcore.evaluation.integrations.base import (
-    BaseAdapter,
-    ParsedEvaluationEvent,
-    extract_fields_from_spans,
-)
-
-logger = logging.getLogger(__name__)
-
-_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = {
-    SingleTurnParams.INPUT: "input",
-    SingleTurnParams.ACTUAL_OUTPUT: "actual_output",
-    SingleTurnParams.EXPECTED_OUTPUT: "expected_output",
-    SingleTurnParams.CONTEXT: "context",
-    SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context",
-}
-
-_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = {
-    "AnswerRelevancyMetric": ["input", "actual_output"],
-    "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"],
-    "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"],
-    "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
-    "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"],
-    "HallucinationMetric": ["input", "actual_output", "context"],
-    "BiasMetric": ["input", "actual_output"],
-    "ToxicityMetric": ["input", "actual_output"],
-    "GEval": ["input", "actual_output"],
-    "SummarizationMetric": ["input", "actual_output"],
-}
-
-
-def _get_required_params(metric: BaseMetric) -> List[str]:
-    """Determine which LLMTestCase fields a metric requires.
-
-    Fallback chain:
-        1. metric._required_params (DeepEval internal attribute)
-        2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name
-        3. metric.evaluation_params (GEval special case)
-        4. Default: ["input", "actual_output"]
-    """
-    if hasattr(metric, "_required_params") and metric._required_params:
-        params = metric._required_params
-        if all(p in _PARAM_TO_FIELD for p in params):
-            return [_PARAM_TO_FIELD[p] for p in params]
-
-    class_name = type(metric).__name__
-    if class_name in _METRIC_REQUIRED_PARAMS:
-        return _METRIC_REQUIRED_PARAMS[class_name]
-
-    if hasattr(metric, "evaluation_params") and metric.evaluation_params:
-        params = metric.evaluation_params
-        return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params]
-
-    return ["input", "actual_output"]
-
-
-class DeepEvalAdapter(BaseAdapter):
-    """Adapter that runs a DeepEval metric against AgentCore evaluation events.
-
-    Example::
-
-        from deepeval.metrics import AnswerRelevancyMetric
-
-        metric = AnswerRelevancyMetric(threshold=0.7)
-        handler = DeepEvalAdapter(metric=metric)
-
-        # Use as Lambda handler
-        def lambda_handler(event, context):
-            return handler(event, context)
-    """
-
-    def __init__(
-        self,
-        metric: BaseMetric,
-        field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
-        model: Optional[Any] = None,
-        timeout: Optional[int] = None,
-    ):
-        """Initialize the adapter.
-
-        Args:
-            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
-            field_mapper: Optional callable that receives the raw Lambda event and
-                returns a dict of LLMTestCase field values. Bypasses default span
-                extraction when provided.
-            model: Optional model override for the metric's LLM. Can be a string
-                model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM
-                subclass instance.
-            timeout: Maximum seconds to allow for metric.measure(). Defaults to 290
-                (slightly under Lambda's 300s max).
-        """
-        super().__init__(field_mapper=field_mapper, timeout=timeout)
-        self.metric = metric
-        if model is not None:
-            self.metric.model = model
-
-    def validate_fields(self, fields: Dict[str, Any]) -> None:
-        """Validate that fields required by the metric are present."""
-        required = _get_required_params(self.metric)
-        missing = [f for f in required if f not in fields or not fields[f]]
-        if missing:
-            metric_name = type(self.metric).__name__
-            raise ValueError(
-                f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
-                f"Provide a field_mapper or ensure spans contain the necessary data."
-            )
-
-    def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]:
-        """Run the DeepEval metric and return formatted results."""
-        test_case = LLMTestCase(
-            input=fields.get("input", ""),
-            actual_output=fields.get("actual_output", ""),
-            expected_output=fields.get("expected_output"),
-            context=fields.get("context"),
-            retrieval_context=fields.get("retrieval_context"),
-        )
-
-        self.metric.measure(test_case)
-
-        score = self.metric.score
-        reason = getattr(self.metric, "reason", None) or ""
-        threshold = getattr(self.metric, "threshold", 0.5)
-        success = getattr(self.metric, "success", score is not None and score >= threshold)
-        label = "Pass" if success else "Fail"
-
-        return {"value": score, "label": label, "explanation": reason}
-
-
-def build_test_case(
-    parsed: ParsedEvaluationEvent,
-    metric: BaseMetric,
-    field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
-) -> LLMTestCase:
-    """Build a DeepEval LLMTestCase from a parsed evaluation event.
-
-    Args:
-        parsed: The parsed Lambda event.
-        metric: The DeepEval metric (used to determine required fields).
-        field_mapper: Optional callable that receives the raw Lambda event fields
-            and returns a dict of LLMTestCase field values. Bypasses default
-            span extraction when provided.
-
-    Returns:
-        An LLMTestCase ready for metric.measure().
-
-    Raises:
-        ValueError: If required fields for the metric cannot be populated.
-    """
-    if field_mapper is not None:
-        raw_event = {
-            "evaluationLevel": parsed.evaluation_level,
-            "evaluationInput": {"sessionSpans": parsed.session_spans},
-            "evaluationTarget": {
-                "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [],
-                "spanIds": [parsed.target_span_id] if parsed.target_span_id else [],
-            },
-            "evaluationReferenceInputs": parsed.reference_inputs,
-        }
-        fields = field_mapper(raw_event)
-    else:
-        fields = extract_fields_from_spans(parsed)
-
-    required = _get_required_params(metric)
-    missing = [f for f in required if f not in fields or not fields[f]]
-    if missing:
-        metric_name = type(metric).__name__
-        raise ValueError(
-            f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
-            f"Provide a field_mapper or ensure spans contain the necessary data."
-        )
-
-    return LLMTestCase(
-        input=fields.get("input", ""),
-        actual_output=fields.get("actual_output", ""),
-        expected_output=fields.get("expected_output"),
-        context=fields.get("context"),
-        retrieval_context=fields.get("retrieval_context"),
-    )
-
-
-# Backward-compatible alias
-DeepEvalHandler = DeepEvalAdapter
diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
similarity index 100%
rename from tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py
rename to tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
similarity index 100%
rename from tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py
rename to tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py
new file mode 100644
index 00000000..2f640817
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py
@@ -0,0 +1,201 @@
+"""Tests for AutoevalsAdapter."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter
+
+
+def _make_evaluator_input(spans=None):
+    """Build a TRACE-level EvaluatorInput for trace "t1"; the default is one agent span with one Q/A exchange."""
+    if spans is None:  # callers pass their own spans to exercise error paths
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},  # agent-level marker
+                "span_events": [
+                    {
+                        "body": {
+                            "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
+                            "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
+                        }
+                    }
+                ],
+            }
+        ]
+    return EvaluatorInput(
+        evaluation_level="TRACE",
+        session_spans=spans,
+        target_trace_id="t1",  # same id as the default span's traceId
+    )
+
+
+def _mock_scorer(score=0.9, rationale="Good answer"):
+    """Create a mock Autoevals scorer whose eval() returns a result with the given score and rationale."""
+    scorer = MagicMock()
+    type(scorer).__name__ = "MockScorer"  # each MagicMock has its own subclass, so only this mock is renamed
+
+    result = MagicMock()
+    result.score = score
+    result.metadata = {"rationale": rationale}  # the tests expect the explanation to equal this rationale
+
+    scorer.eval = MagicMock(return_value=result)  # explicit mock so tests can inspect call_args
+    return scorer
+
+
+class TestAutoevalsAdapterSuccess:  # happy-path behavior: labels, threshold handling, scorer arguments
+    def test_returns_pass_when_score_above_threshold(self):
+        scorer = _mock_scorer(score=0.8)
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value == 0.8
+        assert result.label == "Pass"
+        assert result.explanation == "Good answer"  # default rationale from _mock_scorer
+
+    def test_returns_fail_when_score_below_threshold(self):
+        scorer = _mock_scorer(score=0.3)
+        adapter = AutoevalsAdapter(scorer=scorer)  # no threshold given, so the 0.5 default applies
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.value == 0.3
+        assert result.label == "Fail"
+
+    def test_custom_threshold(self):
+        scorer = _mock_scorer(score=0.6)
+        adapter = AutoevalsAdapter(scorer=scorer, threshold=0.7)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.label == "Fail"  # 0.6 is below the custom 0.7 threshold
+
+    def test_custom_threshold_pass(self):
+        scorer = _mock_scorer(score=0.8)
+        adapter = AutoevalsAdapter(scorer=scorer, threshold=0.7)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.label == "Pass"  # 0.8 is above the custom 0.7 threshold
+
+    def test_default_threshold_is_half(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        assert adapter.threshold == 0.5
+
+    def test_scorer_eval_called_with_input_and_output(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        adapter(_make_evaluator_input())
+
+        scorer.eval.assert_called_once()
+        call_kwargs = scorer.eval.call_args[1]  # index 1 of call_args holds the keyword arguments
+        assert call_kwargs["input"] == "What is AI?"
+        assert call_kwargs["output"] == "AI is artificial intelligence."
+
+    def test_custom_field_mapper(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(
+            scorer=scorer,
+            field_mapper=lambda ev: {  # ignores the event and returns fixed fields
+                "input": "custom input",
+                "actual_output": "custom output",
+            },
+        )
+
+        adapter(_make_evaluator_input())  # return value not needed; the assertions inspect the scorer call
+
+        call_kwargs = scorer.eval.call_args[1]  # index 1 of call_args holds the keyword arguments
+        assert call_kwargs["input"] == "custom input"
+        assert call_kwargs["output"] == "custom output"
+
+
+class TestAutoevalsAdapterErrors:  # failure modes are reported through errorCode/errorMessage
+    def test_no_agent_spans_returns_error(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "chat"},  # not "invoke_agent": no agent span present
+                "span_events": [],
+            }
+        ]
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input(spans=spans))
+
+        assert result.errorCode == "FIELD_EXTRACTION_ERROR"
+
+    def test_missing_input_returns_error(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                "span_events": [
+                    {
+                        "body": {  # deliberately has no "input" key, only the assistant output
+                            "output": {"messages": [{"role": "assistant", "content": "answer"}]},
+                        }
+                    }
+                ],
+            }
+        ]
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input(spans=spans))
+
+        assert result.errorCode == "MISSING_REQUIRED_FIELD"
+        assert "input" in result.errorMessage  # the message must name the missing field
+
+    def test_scorer_exception_returns_error(self):
+        scorer = _mock_scorer()
+        scorer.eval = MagicMock(side_effect=RuntimeError("API error"))  # eval() raises when called
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.errorCode == "METRIC_ERROR"
+        assert "API error" in result.errorMessage  # the original exception text is surfaced
+
+    def test_never_raises(self):
+        scorer = _mock_scorer()
+        scorer.eval = MagicMock(side_effect=Exception("unexpected"))  # generic exception type
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)  # the failure comes back as a value, not an exception
+        assert result.errorCode is not None
+
+
+class TestAutoevalsAdapterEdgeCases:  # scorer results with a missing score or missing metadata
+    def test_score_none_returns_fail(self):
+        scorer = _mock_scorer(score=None)
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.label == "Fail"  # a None score is labelled Fail
+
+    def test_no_metadata_returns_empty_explanation(self):
+        scorer = MagicMock()
+        type(scorer).__name__ = "MockScorer"
+        result_obj = MagicMock(spec=[])  # empty spec: attributes that are never set raise AttributeError
+        result_obj.score = 0.9  # only score is set, so the result has no metadata attribute
+        scorer.eval = MagicMock(return_value=result_obj)
+
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.explanation == ""
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
new file mode 100644
index 00000000..3c8a3d39
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
@@ -0,0 +1,218 @@
+"""Tests for DeepEvalAdapter."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter
+
+
+def _make_evaluator_input(spans=None):
+    """Return a TRACE-level EvaluatorInput targeting trace ``t1``.
+
+    When ``spans`` is None, one ``invoke_agent`` span holding a single
+    user question and assistant answer is used.
+    """
+    if spans is None:
+        question = {"role": "user", "content": "What is AI?"}
+        answer = {"role": "assistant", "content": "AI is artificial intelligence."}
+        event_body = {
+            "input": {"messages": [question]},
+            "output": {"messages": [answer]},
+        }
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                "span_events": [{"body": event_body}],
+            }
+        ]
+
+    return EvaluatorInput(evaluation_level="TRACE", session_spans=spans, target_trace_id="t1")
+
+
+def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"):
+    """Create a mock metric whose score/reason are populated only by measure()."""
+    metric = MagicMock()
+    type(metric).__name__ = name
+    metric.threshold = threshold
+    # Unmeasured state: results read without calling measure() must not pass.
+    metric.score = metric.reason = None
+    del metric.success  # absent unless a test sets it (deleted mock attrs raise AttributeError)
+
+    def measure_side_effect(test_case):
+        metric.score = score
+        metric.reason = reason
+
+    metric.measure = MagicMock(side_effect=measure_side_effect)
+    return metric
+
+
+class TestDeepEvalAdapterSuccess:
+    """Happy-path scoring, labelling and constructor options of DeepEvalAdapter."""
+
+    def test_returns_pass_when_score_above_threshold(self):
+        """Score 0.9 against threshold 0.7 is reported as "Pass" with the reason."""
+        stub = _mock_metric(score=0.9, threshold=0.7)
+        outcome = DeepEvalAdapter(metric=stub)(_make_evaluator_input())
+        assert isinstance(outcome, EvaluatorOutput)
+        assert outcome.value == 0.9
+        assert outcome.label == "Pass"
+        assert outcome.explanation == "Looks good"
+
+    def test_returns_fail_when_score_below_threshold(self):
+        """Score 0.3 against threshold 0.7 is reported as "Fail" with its value."""
+        stub = _mock_metric(score=0.3, threshold=0.7)
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.value == 0.3
+        assert outcome.label == "Fail"
+
+    def test_returns_pass_at_exact_threshold(self):
+        """A score equal to the threshold still passes."""
+        stub = _mock_metric(score=0.7, threshold=0.7)
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.label == "Pass"
+
+    def test_metric_measure_called_with_test_case(self):
+        """measure() is called once with a test case built from the default span."""
+        stub = _mock_metric()
+        DeepEvalAdapter(metric=stub)(_make_evaluator_input())
+
+        stub.measure.assert_called_once()
+        # call_args[0] is the tuple of positional arguments; the test case is first.
+        built_case = stub.measure.call_args[0][0]
+        assert built_case.input == "What is AI?"
+        assert built_case.actual_output == "AI is artificial intelligence."
+
+    def test_custom_field_mapper(self):
+        """Fields returned by a custom field_mapper are what the metric receives."""
+        mapped_fields = {
+            "input": "mapped input",
+            "actual_output": "mapped output",
+        }
+        stub = _mock_metric()
+        evaluate = DeepEvalAdapter(metric=stub, field_mapper=lambda ev: dict(mapped_fields))
+
+        outcome = evaluate(_make_evaluator_input())
+
+        assert outcome.value == 0.85
+        # The metric must see the mapped values, not the ones held by the span.
+        built_case = stub.measure.call_args[0][0]
+        assert built_case.input == "mapped input"
+        assert built_case.actual_output == "mapped output"
+
+    def test_model_override_sets_metric_model(self):
+        """Passing model= to the constructor assigns it to metric.model."""
+        stub = _mock_metric()
+        DeepEvalAdapter(metric=stub, model="bedrock/anthropic.claude-3")
+        assert stub.model == "bedrock/anthropic.claude-3"
+
+    def test_label_uses_metric_success_true(self):
+        """metric.success=True gives "Pass" even with a score below the threshold."""
+        stub = _mock_metric(score=0.3, threshold=0.7)
+        stub.success = True
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.value == 0.3
+        assert outcome.label == "Pass"
+
+    def test_label_uses_metric_success_false(self):
+        """metric.success=False gives "Fail" even with a score above the threshold."""
+        stub = _mock_metric(score=0.9, threshold=0.7)
+        stub.success = False
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.value == 0.9
+        assert outcome.label == "Fail"
+
+
+class TestDeepEvalAdapterErrors:
+    """Failures are returned as error outputs carrying specific error codes."""
+
+    def test_no_agent_spans_returns_error(self):
+        """A session holding only a "chat" span gives FIELD_EXTRACTION_ERROR."""
+        chat_span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "chat"},
+            "span_events": [],
+        }
+        evaluate = DeepEvalAdapter(metric=_mock_metric())
+
+        outcome = evaluate(_make_evaluator_input(spans=[chat_span]))
+
+        assert isinstance(outcome, EvaluatorOutput)
+        assert outcome.errorCode == "FIELD_EXTRACTION_ERROR"
+        assert outcome.label == "Error"
+
+    def test_metric_measure_exception_returns_error(self):
+        """An exception raised by measure() is reported as METRIC_ERROR."""
+        stub = _mock_metric()
+        stub.measure = MagicMock(side_effect=RuntimeError("LLM timeout"))
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.errorCode == "METRIC_ERROR"
+        assert "LLM timeout" in outcome.errorMessage
+
+    def test_missing_params_error_caught(self):
+        """An exception class named MissingTestCaseParamsError gives MISSING_REQUIRED_FIELD."""
+
+        # Defined locally rather than imported; presumably matched by class name.
+        class MissingTestCaseParamsError(Exception):
+            pass
+
+        missing_field = MissingTestCaseParamsError("retrieval_context is required")
+        stub = _mock_metric()
+        stub.measure = MagicMock(side_effect=missing_field)
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.errorCode == "MISSING_REQUIRED_FIELD"
+        assert "retrieval_context" in outcome.errorMessage
+
+    def test_never_raises(self):
+        """A generic Exception from measure() yields an error output, not a raise."""
+        stub = _mock_metric()
+        stub.measure = MagicMock(side_effect=Exception("unexpected"))
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert isinstance(outcome, EvaluatorOutput)
+        assert outcome.errorCode is not None
+
+
+class TestDeepEvalAdapterEdgeCases:
+    """Boundary values: no reason, a zero score, and a metric without threshold."""
+
+    def test_metric_with_no_reason(self):
+        """A reason of None is returned as an empty explanation."""
+        stub = _mock_metric(score=0.8, reason=None)
+        outcome = DeepEvalAdapter(metric=stub)(_make_evaluator_input())
+        assert outcome.explanation == ""
+
+    def test_metric_score_zero(self):
+        """A score of 0.0 is kept as the value and given the "Fail" label."""
+        stub = _mock_metric(score=0.0, threshold=0.5)
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.value == 0.0
+        assert outcome.label == "Fail"
+
+    def test_default_threshold_when_missing(self):
+        """Without a threshold attribute, a score of 0.6 is expected to pass."""
+        stub = _mock_metric(score=0.6)
+        del stub.threshold  # reading the attribute now raises AttributeError
+        evaluate = DeepEvalAdapter(metric=stub)
+
+        outcome = evaluate(_make_evaluator_input())
+        assert outcome.label == "Pass"
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
new file mode 100644
index 00000000..de2a1bb5
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
@@ -0,0 +1,194 @@
+"""Tests for span parsers."""
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
+    SpanParseResult,
+    parse_spans,
+)
+
+
+def _make_agent_span(input_messages=None, output_messages=None, span_id="span1"):
+    """Return an ``invoke_agent`` span dict with at most one span event.
+
+    The event body gets an ``input``/``output`` entry only for the message
+    lists that are not None; with neither given, ``span_events`` is empty.
+    """
+    sections = (("input", input_messages), ("output", output_messages))
+    event_body = {
+        section: {"messages": messages} for section, messages in sections if messages is not None
+    }
+    return {
+        "traceId": "abc123",
+        "spanId": span_id,
+        "attributes": {"gen_ai.operation.name": "invoke_agent"},
+        # An empty body means no event at all, not an event with an empty body.
+        "span_events": [{"body": event_body}] if event_body else [],
+    }
+
+
+class TestParseSpansSuccess:
+    """parse_spans() field extraction from well-formed ``invoke_agent`` spans."""
+
+    def test_extracts_input_and_output(self):
+        """The user message becomes ``input``, the assistant message ``actual_output``."""
+        agent_span = _make_agent_span(
+            input_messages=[{"role": "user", "content": "What is AI?"}],
+            output_messages=[{"role": "assistant", "content": "Artificial intelligence."}],
+        )
+
+        parsed = parse_spans([agent_span])
+        assert parsed.input == "What is AI?"
+        assert parsed.actual_output == "Artificial intelligence."
+
+    def test_extracts_tool_messages_as_retrieval_context(self):
+        """Tool messages fill both ``retrieval_context`` and ``context``, in order."""
+        chunks = ["doc chunk 1", "doc chunk 2"]
+        agent_span = _make_agent_span(
+            input_messages=[{"role": "user", "content": "query"}],
+            output_messages=[
+                {"role": "tool", "content": "doc chunk 1"},
+                {"role": "tool", "content": "doc chunk 2"},
+                {"role": "assistant", "content": "answer"},
+            ],
+        )
+
+        parsed = parse_spans([agent_span])
+
+        assert parsed.retrieval_context == chunks
+        assert parsed.context == chunks
+        assert parsed.actual_output == "answer"
+
+    def test_uses_first_user_message_as_input(self):
+        """With two user messages, ``input`` is the first one."""
+        user_turns = [
+            {"role": "user", "content": "first"},
+            {"role": "user", "content": "second"},
+        ]
+        agent_span = _make_agent_span(
+            input_messages=user_turns,
+            output_messages=[{"role": "assistant", "content": "reply"}],
+        )
+
+        parsed = parse_spans([agent_span])
+
+        assert parsed.input == "first"
+
+    def test_uses_last_assistant_message_as_output(self):
+        """With two assistant messages, ``actual_output`` is the last one."""
+        assistant_turns = [
+            {"role": "assistant", "content": "first reply"},
+            {"role": "assistant", "content": "final reply"},
+        ]
+        agent_span = _make_agent_span(
+            input_messages=[{"role": "user", "content": "q"}],
+            output_messages=assistant_turns,
+        )
+
+        parsed = parse_spans([agent_span])
+
+        assert parsed.actual_output == "final reply"
+
+    def test_expected_output_from_reference_inputs(self):
+        """``expectedResponse`` of the reference input becomes ``expected_output``."""
+        agent_span = _make_agent_span(
+            input_messages=[{"role": "user", "content": "q"}],
+            output_messages=[{"role": "assistant", "content": "a"}],
+        )
+        references = [{"expectedResponse": "expected answer"}]
+
+        parsed = parse_spans([agent_span], reference_inputs=references)
+
+        # The expected answer is supplied by the caller, not read from the span.
+        assert parsed.expected_output == "expected answer"
+
+    def test_nested_content_dict(self):
+        """A ``content`` value of the form {"content": text} is unwrapped to text."""
+        agent_span = _make_agent_span(
+            input_messages=[{"role": "user", "content": {"content": "nested"}}],
+            output_messages=[{"role": "assistant", "content": {"content": "nested out"}}],
+        )
+
+        parsed = parse_spans([agent_span])
+
+        # Input and output messages are both expected to be unwrapped.
+        assert parsed.input == "nested"
+        assert parsed.actual_output == "nested out"
+
+    def test_body_as_json_string(self):
+        """An event body given as a JSON string is parsed like a dict body."""
+        import json
+
+        encoded_body = json.dumps(
+            {
+                "input": {"messages": [{"role": "user", "content": "hello"}]},
+                "output": {"messages": [{"role": "assistant", "content": "hi"}]},
+            }
+        )
+        agent_span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "invoke_agent"},
+            "span_events": [{"body": encoded_body}],
+        }
+        parsed = parse_spans([agent_span])
+        assert (parsed.input, parsed.actual_output) == ("hello", "hi")
+
+    def test_to_dict_omits_none(self):
+        """Fields that were never set are absent from to_dict()."""
+        as_dict = SpanParseResult(input="q", actual_output="a").to_dict()
+
+        assert as_dict == {"input": "q", "actual_output": "a"}
+        assert "retrieval_context" not in as_dict
+
+
+class TestParseSpansErrors:
+    """Inputs from which parse_spans() cannot extract fields raise ValueError."""
+
+    def test_no_agent_spans_raises(self):
+        """A lone span with operation "other_op" is rejected."""
+        other_span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "other_op"},
+            "span_events": [],
+        }
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans([other_span])
+
+    def test_empty_spans_raises(self):
+        """An empty span list is rejected with the same message."""
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans([])
+
+    def test_agent_span_without_events_raises(self):
+        """An ``invoke_agent`` span with no span events is rejected."""
+        eventless_span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "invoke_agent"},
+            "span_events": [],
+        }
+
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans([eventless_span])
+
+    def test_non_agent_spans_ignored(self):
+        """A "chat" span is not used even though its event holds valid messages."""
+        complete_body = {
+            "input": {"messages": [{"role": "user", "content": "q"}]},
+            "output": {"messages": [{"role": "assistant", "content": "a"}]},
+        }
+        chat_span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "chat"},
+            "span_events": [{"body": complete_body}],
+        }
+
+        # The event body is complete here, so the expected failure is
+        # presumably caused by the operation name alone: the list holds
+        # no invoke_agent span, and spans of other operations are not
+        # supposed to contribute fields.
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans([chat_span])
diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py
deleted file mode 100644
index 17f674bd..00000000
--- a/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py
+++ /dev/null
@@ -1,217 +0,0 @@
-"""Tests for AutoevalsAdapter."""
-
-import json
-import time
-from unittest.mock import MagicMock
-
-import pytest
-
-from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter
-
-
-def _make_event(
-    level="TRACE",
-    trace_ids=None,
-    spans=None,
-    reference_inputs=None,
-):
-    """Build a raw Lambda event dict for testing."""
-    if spans is None:
-        log_records = [
-            {
-                "body": {
-                    "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
-                    "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
-                }
-            }
-        ]
-        spans = [
-            {
-                "traceId": "abc123",
-                "spanId": "span1",
-                "attributes": {"_eval_log_records": json.dumps(log_records)},
-            }
-        ]
-
-    event = {
-        "schemaVersion": "1.0",
-        "evaluationLevel": level,
-        "evaluationInput": {"sessionSpans": spans},
-        "evaluationTarget": {},
-    }
-    if trace_ids is not None:
-        event["evaluationTarget"]["traceIds"] = trace_ids
-    if reference_inputs is not None:
-        event["evaluationReferenceInputs"] = reference_inputs
-    return event
-
-
-def _mock_scorer(score=0.9, rationale="Good answer"):
-    """Create a mock Autoevals scorer."""
-    scorer = MagicMock()
-    type(scorer).__name__ = "MockScorer"
-
-    result = MagicMock()
-    result.score = score
-    result.metadata = {"rationale": rationale}
-
-    scorer.eval = MagicMock(return_value=result)
-    return scorer
-
-
-class TestAutoevalsAdapterSuccess:
-    def test_returns_pass_when_score_above_half(self):
-        scorer = _mock_scorer(score=0.8)
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        result = adapter(_make_event())
-
-        assert result["value"] == 0.8
-        assert result["label"] == "Pass"
-        assert result["explanation"] == "Good answer"
-
-    def test_returns_fail_when_score_below_half(self):
-        scorer = _mock_scorer(score=0.3)
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        result = adapter(_make_event())
-
-        assert result["value"] == 0.3
-        assert result["label"] == "Fail"
-
-    def test_scorer_eval_called_with_input_and_output(self):
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        adapter(_make_event())
-
-        scorer.eval.assert_called_once()
-        call_kwargs = scorer.eval.call_args[1]
-        assert call_kwargs["input"] == "What is AI?"
-        assert call_kwargs["output"] == "AI is artificial intelligence."
-
-    def test_expected_output_passed_as_expected(self):
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        refs = [{"expectedResponse": "AI stands for artificial intelligence."}]
-        result = adapter(_make_event(reference_inputs=refs))
-
-        call_kwargs = scorer.eval.call_args[1]
-        assert call_kwargs["expected"] == "AI stands for artificial intelligence."
-
-    def test_no_expected_output_omits_expected_kwarg(self):
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        adapter(_make_event())
-
-        call_kwargs = scorer.eval.call_args[1]
-        assert "expected" not in call_kwargs
-
-    def test_custom_field_mapper(self):
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(
-            scorer=scorer,
-            field_mapper=lambda event: {
-                "input": "custom input",
-                "actual_output": "custom output",
-            },
-        )
-
-        result = adapter(_make_event())
-
-        call_kwargs = scorer.eval.call_args[1]
-        assert call_kwargs["input"] == "custom input"
-        assert call_kwargs["output"] == "custom output"
-
-
-class TestAutoevalsAdapterErrors:
-    def test_invalid_event_returns_error(self):
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        result = adapter({})
-
-        assert result["errorCode"] == "INVALID_EVENT"
-
-    def test_missing_input_returns_error(self):
-        log_records = [
-            {
-                "body": {
-                    "output": {"messages": [{"role": "assistant", "content": "answer"}]},
-                }
-            }
-        ]
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"_eval_log_records": json.dumps(log_records)},
-            }
-        ]
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        result = adapter(_make_event(spans=spans))
-
-        assert result["errorCode"] == "MISSING_REQUIRED_FIELD"
-        assert "input" in result["errorMessage"]
-
-    def test_scorer_exception_returns_error(self):
-        scorer = _mock_scorer()
-        scorer.eval = MagicMock(side_effect=RuntimeError("API error"))
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        result = adapter(_make_event())
-
-        assert result["errorCode"] == "METRIC_ERROR"
-        assert "API error" in result["errorMessage"]
-
-    def test_never_raises_on_bad_input(self):
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        for bad_input in [None, [], "string", 42]:
-            result = adapter(bad_input)
-            assert "errorCode" in result
-
-
-class TestAutoevalsAdapterTimeout:
-    def test_timeout_returns_error(self):
-        scorer = _mock_scorer()
-        scorer.eval = MagicMock(side_effect=lambda **kw: time.sleep(5))
-        adapter = AutoevalsAdapter(scorer=scorer, timeout=1)
-
-        result = adapter(_make_event())
-
-        assert result["errorCode"] == "METRIC_TIMEOUT"
-
-    def test_default_timeout_is_290(self):
-        scorer = _mock_scorer()
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        assert adapter.timeout == 290
-
-
-class TestAutoevalsAdapterEdgeCases:
-    def test_score_none_returns_fail(self):
-        scorer = _mock_scorer(score=None)
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        result = adapter(_make_event())
-
-        assert result["label"] == "Fail"
-
-    def test_no_metadata_returns_empty_explanation(self):
-        scorer = MagicMock()
-        type(scorer).__name__ = "MockScorer"
-        result_obj = MagicMock(spec=[])
-        result_obj.score = 0.9
-        scorer.eval = MagicMock(return_value=result_obj)
-
-        adapter = AutoevalsAdapter(scorer=scorer)
-
-        result = adapter(_make_event())
-
-        assert result["explanation"] == ""
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
deleted file mode 100644
index 67bfda3d..00000000
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py
+++ /dev/null
@@ -1,427 +0,0 @@
-"""Tests for DeepEvalHandler and DeepEvalAdapter."""
-
-import json
-import time
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler
-from bedrock_agentcore.evaluation.integrations.base import BaseAdapter
-from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput
-
-
-def _make_event(
-    level="TRACE",
-    trace_ids=None,
-    spans=None,
-    reference_inputs=None,
-):
-    """Build a raw Lambda event dict for testing."""
-    if spans is None:
-        log_records = [
-            {
-                "body": {
-                    "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
-                    "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
-                }
-            }
-        ]
-        spans = [
-            {
-                "traceId": "abc123",
-                "spanId": "span1",
-                "attributes": {"_eval_log_records": json.dumps(log_records)},
-            }
-        ]
-
-    event = {
-        "schemaVersion": "1.0",
-        "evaluationLevel": level,
-        "evaluationInput": {"sessionSpans": spans},
-        "evaluationTarget": {},
-    }
-    if trace_ids is not None:
-        event["evaluationTarget"]["traceIds"] = trace_ids
-    if reference_inputs is not None:
-        event["evaluationReferenceInputs"] = reference_inputs
-    return event
-
-
-def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"):
-    """Create a mock metric that returns a fixed score on measure()."""
-    metric = MagicMock()
-    type(metric).__name__ = name
-    metric.threshold = threshold
-    metric.score = score
-    metric.reason = reason
-    metric._required_params = None
-    del metric._required_params
-    del metric.evaluation_params
-    del metric.success
-
-    def measure_side_effect(test_case):
-        metric.score = score
-        metric.reason = reason
-
-    metric.measure = MagicMock(side_effect=measure_side_effect)
-    return metric
-
-
-class TestDeepEvalHandlerSuccess:
-    def test_returns_pass_when_score_above_threshold(self):
-        metric = _mock_metric(score=0.9, threshold=0.7)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.9
-        assert result["label"] == "Pass"
-        assert result["explanation"] == "Looks good"
-
-    def test_returns_fail_when_score_below_threshold(self):
-        metric = _mock_metric(score=0.3, threshold=0.7)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.3
-        assert result["label"] == "Fail"
-
-    def test_returns_pass_at_exact_threshold(self):
-        metric = _mock_metric(score=0.7, threshold=0.7)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["label"] == "Pass"
-
-    def test_metric_measure_called_with_test_case(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-
-        handler(_make_event())
-
-        metric.measure.assert_called_once()
-        test_case = metric.measure.call_args[0][0]
-        assert test_case.input == "What is AI?"
-        assert test_case.actual_output == "AI is artificial intelligence."
-
-    def test_context_parameter_ignored(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-        mock_context = {"function_name": "my-lambda"}
-
-        result = handler(_make_event(), mock_context)
-
-        assert result["value"] == 0.85
-
-    def test_custom_field_mapper(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(
-            metric=metric,
-            field_mapper=lambda event: {
-                "input": "mapped input",
-                "actual_output": "mapped output",
-            },
-        )
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.85
-        test_case = metric.measure.call_args[0][0]
-        assert test_case.input == "mapped input"
-        assert test_case.actual_output == "mapped output"
-
-
-class TestDeepEvalHandlerErrors:
-    def test_invalid_event_returns_error(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler({})
-
-        assert result["errorCode"] == "INVALID_EVENT"
-        assert "errorMessage" in result
-        assert "value" not in result
-
-    def test_missing_evaluation_input_returns_error(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-
-        event = {"evaluationLevel": "TRACE", "evaluationTarget": {}}
-        result = handler(event)
-
-        assert result["errorCode"] == "INVALID_EVENT"
-
-    def test_missing_required_field_returns_error(self):
-        log_records = [
-            {
-                "body": {
-                    "input": {"messages": [{"role": "user", "content": "q"}]},
-                    "output": {"messages": [{"role": "assistant", "content": "a"}]},
-                }
-            }
-        ]
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"_eval_log_records": json.dumps(log_records)},
-            }
-        ]
-        metric = _mock_metric(name="FaithfulnessMetric")
-        handler = DeepEvalHandler(metric=metric)
-
-        event = _make_event(spans=spans)
-        result = handler(event)
-
-        assert result["errorCode"] == "MISSING_REQUIRED_FIELD"
-        assert "retrieval_context" in result["errorMessage"]
-
-    def test_metric_measure_exception_returns_error(self):
-        metric = _mock_metric()
-        metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout"))
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["errorCode"] == "METRIC_ERROR"
-        assert "LLM timeout" in result["errorMessage"]
-
-    def test_never_raises_on_any_input(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-
-        for bad_input in [None, [], "string", 42, {"random": "keys"}]:
-            result = handler(bad_input)
-            assert "errorCode" in result or "value" in result
-
-
-class TestDeepEvalHandlerEdgeCases:
-    def test_metric_with_no_reason(self):
-        metric = _mock_metric(score=0.8, reason=None)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["explanation"] == ""
-
-    def test_metric_score_zero(self):
-        metric = _mock_metric(score=0.0, threshold=0.5)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.0
-        assert result["label"] == "Fail"
-
-    def test_metric_score_one(self):
-        metric = _mock_metric(score=1.0, threshold=0.5)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 1.0
-        assert result["label"] == "Pass"
-
-    def test_default_threshold_when_missing(self):
-        metric = _mock_metric(score=0.6)
-        del metric.threshold
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["label"] == "Pass"
-
-    def test_label_uses_metric_success_true(self):
-        metric = _mock_metric(score=0.3, threshold=0.7)
-        metric.success = True
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.3
-        assert result["label"] == "Pass"
-
-    def test_label_uses_metric_success_false(self):
-        metric = _mock_metric(score=0.9, threshold=0.7)
-        metric.success = False
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.9
-        assert result["label"] == "Fail"
-
-    def test_label_falls_back_to_threshold_when_no_success(self):
-        metric = _mock_metric(score=0.8, threshold=0.7)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["label"] == "Pass"
-
-    def test_model_override_sets_metric_model(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3")
-
-        assert metric.model == "bedrock/anthropic.claude-3"
-
-    def test_no_model_override_leaves_metric_unchanged(self):
-        metric = _mock_metric()
-        metric.model = "original-model"
-        handler = DeepEvalHandler(metric=metric)
-
-        handler(_make_event())
-
-        assert metric.model == "original-model"
-
-
-class TestDeepEvalHandlerTimeout:
-    def test_timeout_returns_error(self):
-        metric = _mock_metric()
-        metric.measure = MagicMock(side_effect=lambda tc: time.sleep(5))
-        handler = DeepEvalHandler(metric=metric, timeout=1)
-
-        result = handler(_make_event())
-
-        assert result["errorCode"] == "METRIC_TIMEOUT"
-        assert "1s timeout" in result["errorMessage"]
-
-    def test_no_timeout_when_measure_completes_in_time(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric, timeout=10)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.85
-        assert "errorCode" not in result
-
-    def test_default_timeout_is_290(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-
-        assert handler.timeout == 290
-
-    def test_custom_timeout_value(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric, timeout=60)
-
-        assert handler.timeout == 60
-
-    def test_metric_exception_still_propagates_with_timeout(self):
-        metric = _mock_metric()
-        metric.measure = MagicMock(side_effect=RuntimeError("LLM error"))
-        handler = DeepEvalHandler(metric=metric, timeout=10)
-
-        result = handler(_make_event())
-
-        assert result["errorCode"] == "METRIC_ERROR"
-        assert "LLM error" in result["errorMessage"]
-
-
-class TestBackwardCompatibility:
-    def test_handler_is_alias_for_adapter(self):
-        assert DeepEvalHandler is DeepEvalAdapter
-
-    def test_adapter_is_subclass_of_base(self):
-        assert issubclass(DeepEvalAdapter, BaseAdapter)
-
-    def test_import_from_init(self):
-        from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalHandler as H
-        from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalAdapter as A
-
-        assert H is A
-
-    def test_handler_works_same_as_before(self):
-        metric = _mock_metric(score=0.9, threshold=0.7)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(_make_event())
-
-        assert result["value"] == 0.9
-        assert result["label"] == "Pass"
-
-
-class TestEvaluatorInputAcceptance:
-    def _make_evaluator_input(self):
-        log_records = [
-            {
-                "body": {
-                    "input": {"messages": [{"role": "user", "content": "Hello"}]},
-                    "output": {"messages": [{"role": "assistant", "content": "Hi there"}]},
-                }
-            }
-        ]
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"_eval_log_records": json.dumps(log_records)},
-            }
-        ]
-        return EvaluatorInput(
-            evaluation_level="TRACE",
-            session_spans=spans,
-            target_trace_id="t1",
-            target_span_id=None,
-        )
-
-    def test_accepts_evaluator_input(self):
-        metric = _mock_metric(score=0.95)
-        handler = DeepEvalHandler(metric=metric)
-
-        result = handler(self._make_evaluator_input())
-
-        assert result["value"] == 0.95
-        assert result["label"] == "Pass"
-
-    def test_evaluator_input_extracts_fields_correctly(self):
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-
-        handler(self._make_evaluator_input())
-
-        test_case = metric.measure.call_args[0][0]
-        assert test_case.input == "Hello"
-        assert test_case.actual_output == "Hi there"
-
-    def test_evaluator_input_with_trace_id_filtering(self):
-        log_records = [
-            {
-                "traceId": "target",
-                "body": {
-                    "input": {"messages": [{"role": "user", "content": "relevant"}]},
-                    "output": {"messages": [{"role": "assistant", "content": "yes"}]},
-                },
-            },
-            {
-                "traceId": "other",
-                "body": {
-                    "input": {"messages": [{"role": "user", "content": "irrelevant"}]},
-                    "output": {"messages": [{"role": "assistant", "content": "no"}]},
-                },
-            },
-        ]
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"_eval_log_records": json.dumps(log_records)},
-            }
-        ]
-        evaluator_input = EvaluatorInput(
-            evaluation_level="TRACE",
-            session_spans=spans,
-            target_trace_id="target",
-        )
-
-        metric = _mock_metric()
-        handler = DeepEvalHandler(metric=metric)
-
-        handler(evaluator_input)
-
-        test_case = metric.measure.call_args[0][0]
-        assert test_case.input == "relevant"
-        assert test_case.actual_output == "yes"
diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
deleted file mode 100644
index 2d6fbaea..00000000
--- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py
+++ /dev/null
@@ -1,581 +0,0 @@
-"""Tests for deepeval input mapping and test case building."""
-
-import json
-from unittest.mock import MagicMock
-
-import pytest
-from deepeval.test_case import SingleTurnParams
-
-from bedrock_agentcore.evaluation.integrations.base import (
-    ParsedEvaluationEvent,
-    extract_fields_from_spans as _extract_fields_from_spans,
-)
-from bedrock_agentcore.evaluation.integrations.deepeval.adapter import (
-    _get_required_params,
-    build_test_case,
-)
-
-
-def _make_log_record(
-    input_messages=None,
-    output_messages=None,
-    trace_id=None,
-):
-    """Build a single log record dict."""
-    record = {"body": {}}
-    if input_messages is not None:
-        record["body"]["input"] = {"messages": input_messages}
-    if output_messages is not None:
-        record["body"]["output"] = {"messages": output_messages}
-    if trace_id is not None:
-        record["traceId"] = trace_id
-    return record
-
-
-def _make_span_with_log_records(log_records, span_id="span1", as_json_string=True):
-    """Build a span dict with _eval_log_records in attributes."""
-    value = json.dumps(log_records) if as_json_string else log_records
-    return {
-        "traceId": "abc123",
-        "spanId": span_id,
-        "attributes": {"_eval_log_records": value},
-    }
-
-
-def _make_event(
-    level="TRACE",
-    trace_ids=None,
-    span_ids=None,
-    spans=None,
-    reference_inputs=None,
-):
-    """Build a raw Lambda event dict for testing."""
-    if spans is None:
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "What is the capital of France?"}],
-                output_messages=[{"role": "assistant", "content": "The capital of France is Paris."}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-
-    event = {
-        "schemaVersion": "1.0",
-        "evaluationLevel": level,
-        "evaluationInput": {"sessionSpans": spans},
-        "evaluationTarget": {},
-    }
-    if trace_ids is not None:
-        event["evaluationTarget"]["traceIds"] = trace_ids
-    if span_ids is not None:
-        event["evaluationTarget"]["spanIds"] = span_ids
-    if reference_inputs is not None:
-        event["evaluationReferenceInputs"] = reference_inputs
-    return event
-
-
-def _mock_metric(name="MockMetric", required_params=None, evaluation_params=None, threshold=0.5):
-    """Create a mock DeepEval metric."""
-    metric = MagicMock()
-    type(metric).__name__ = name
-    metric.threshold = threshold
-
-    if required_params is not None:
-        metric._required_params = required_params
-    else:
-        del metric._required_params
-
-    if evaluation_params is not None:
-        metric.evaluation_params = evaluation_params
-    else:
-        del metric.evaluation_params
-
-    return metric
-
-
-class TestParsedEvaluationEvent:
-    def test_from_lambda_event_trace_level(self):
-        event = _make_event(level="TRACE", trace_ids=["trace-1"])
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-
-        assert parsed.evaluation_level == "TRACE"
-        assert parsed.target_trace_id == "trace-1"
-        assert parsed.target_span_id is None
-        assert len(parsed.session_spans) == 1
-
-    def test_from_lambda_event_tool_call_level(self):
-        event = _make_event(level="TOOL_CALL", span_ids=["span-42"])
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-
-        assert parsed.evaluation_level == "TOOL_CALL"
-        assert parsed.target_span_id == "span-42"
-        assert parsed.target_trace_id is None
-
-    def test_from_lambda_event_session_level(self):
-        event = _make_event(level="SESSION")
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-
-        assert parsed.evaluation_level == "SESSION"
-        assert parsed.target_trace_id is None
-        assert parsed.target_span_id is None
-
-    def test_from_lambda_event_with_reference_inputs(self):
-        refs = [{"expectedResponse": "Paris is the capital of France."}]
-        event = _make_event(reference_inputs=refs)
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-
-        assert parsed.reference_inputs == refs
-
-    def test_from_lambda_event_missing_reference_inputs(self):
-        event = _make_event()
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-
-        assert parsed.reference_inputs == []
-
-    def test_from_lambda_event_missing_evaluation_level_raises(self):
-        event = _make_event()
-        del event["evaluationLevel"]
-
-        with pytest.raises(KeyError):
-            ParsedEvaluationEvent.from_lambda_event(event)
-
-    def test_from_lambda_event_missing_evaluation_input_raises(self):
-        event = _make_event()
-        del event["evaluationInput"]
-
-        with pytest.raises(KeyError):
-            ParsedEvaluationEvent.from_lambda_event(event)
-
-    def test_from_lambda_event_missing_target_key_defaults(self):
-        event = _make_event()
-        del event["evaluationTarget"]
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-
-        assert parsed.target_trace_id is None
-        assert parsed.target_span_id is None
-
-
-class TestGetRequiredParams:
-    def test_uses_required_params_attribute(self):
-        metric = _mock_metric(
-            required_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT]
-        )
-        result = _get_required_params(metric)
-
-        assert result == ["input", "actual_output"]
-
-    def test_falls_back_to_static_registry(self):
-        metric = _mock_metric(name="FaithfulnessMetric")
-        result = _get_required_params(metric)
-
-        assert result == ["input", "actual_output", "retrieval_context"]
-
-    def test_falls_back_to_evaluation_params(self):
-        metric = _mock_metric(
-            name="UnknownMetric",
-            evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.RETRIEVAL_CONTEXT],
-        )
-        result = _get_required_params(metric)
-
-        assert result == ["input", "retrieval_context"]
-
-    def test_defaults_to_input_and_actual_output(self):
-        metric = _mock_metric(name="UnknownMetric")
-        result = _get_required_params(metric)
-
-        assert result == ["input", "actual_output"]
-
-    def test_unmappable_required_params_skips_to_static_registry(self):
-        metric = _mock_metric(name="GEval", required_params=["SomeTypingObject", "AnotherType"])
-        result = _get_required_params(metric)
-
-        assert result == ["input", "actual_output"]
-
-    def test_unmappable_required_params_falls_to_default(self):
-        metric = _mock_metric(name="UnknownMetric", required_params=["SomeTypingObject"])
-        result = _get_required_params(metric)
-
-        assert result == ["input", "actual_output"]
-
-    def test_empty_required_params_falls_through(self):
-        metric = _mock_metric(name="UnknownMetric", required_params=[])
-        result = _get_required_params(metric)
-
-        assert result == ["input", "actual_output"]
-
-
-class TestExtractFieldsFromSpans:
-    def test_basic_extraction(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "hello"}],
-                output_messages=[{"role": "assistant", "content": "world"}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "hello"
-        assert fields["actual_output"] == "world"
-
-    def test_tool_messages_become_retrieval_context(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "query"}],
-                output_messages=[
-                    {"role": "tool", "content": "doc chunk 1"},
-                    {"role": "tool", "content": "doc chunk 2"},
-                    {"role": "assistant", "content": "answer"},
-                ],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"]
-        assert fields["actual_output"] == "answer"
-
-    def test_tool_messages_also_set_context_for_hallucination_metric(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "query"}],
-                output_messages=[
-                    {"role": "tool", "content": "context chunk"},
-                    {"role": "assistant", "content": "answer"},
-                ],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["context"] == ["context chunk"]
-        assert fields["context"] == fields["retrieval_context"]
-
-    def test_message_content_as_dict_with_content_key(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": {"content": "nested content"}}],
-                output_messages=[{"role": "assistant", "content": {"content": "nested output"}}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "nested content"
-        assert fields["actual_output"] == "nested output"
-
-    def test_message_content_as_dict_with_message_key(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "message": "msg key input"}],
-                output_messages=[{"role": "assistant", "message": "msg key output"}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "msg key input"
-        assert fields["actual_output"] == "msg key output"
-
-    def test_message_content_as_plain_string_in_content_field(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "plain string"}],
-                output_messages=[{"role": "assistant", "content": "plain response"}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "plain string"
-        assert fields["actual_output"] == "plain response"
-
-    def test_target_trace_id_filters_records(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "relevant"}],
-                output_messages=[{"role": "assistant", "content": "relevant answer"}],
-                trace_id="target-trace",
-            ),
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "irrelevant"}],
-                output_messages=[{"role": "assistant", "content": "irrelevant answer"}],
-                trace_id="other-trace",
-            ),
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE",
-            session_spans=spans,
-            target_trace_id="target-trace",
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "relevant"
-        assert fields["actual_output"] == "relevant answer"
-
-    def test_no_target_trace_id_includes_all_records(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "first"}],
-                output_messages=[{"role": "assistant", "content": "first answer"}],
-                trace_id="trace-1",
-            ),
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "second"}],
-                output_messages=[{"role": "assistant", "content": "second answer"}],
-                trace_id="trace-2",
-            ),
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="SESSION", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "first\nsecond"
-        assert fields["actual_output"] == "first answer\nsecond answer"
-
-    def test_log_records_as_parsed_list(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "from list"}],
-                output_messages=[{"role": "assistant", "content": "from list answer"}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records, as_json_string=False)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "from list"
-        assert fields["actual_output"] == "from list answer"
-
-    def test_invalid_json_log_records_skipped(self):
-        spans = [
-            {
-                "traceId": "t1",
-                "spanId": "s1",
-                "attributes": {"_eval_log_records": "not valid json{{{"},
-            }
-        ]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields == {}
-
-    def test_span_without_log_records_skipped(self):
-        spans = [{"traceId": "t1", "spanId": "s1", "attributes": {}}]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields == {}
-
-    def test_multiple_spans_aggregated(self):
-        log_records_1 = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "q1"}],
-                output_messages=[{"role": "assistant", "content": "a1"}],
-            )
-        ]
-        log_records_2 = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "q2"}],
-                output_messages=[{"role": "assistant", "content": "a2"}],
-            )
-        ]
-        spans = [
-            _make_span_with_log_records(log_records_1, span_id="s1"),
-            _make_span_with_log_records(log_records_2, span_id="s2"),
-        ]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="SESSION", session_spans=spans
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "q1\nq2"
-        assert fields["actual_output"] == "a1\na2"
-
-    def test_reference_inputs_expected_output(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "q"}],
-                output_messages=[{"role": "assistant", "content": "a"}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE",
-            session_spans=spans,
-            reference_inputs=[{"expectedResponse": "expected answer"}],
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["expected_output"] == "expected answer"
-
-    def test_record_without_matching_trace_id_key_included(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "no trace id record"}],
-                output_messages=[{"role": "assistant", "content": "response"}],
-            ),
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        parsed = ParsedEvaluationEvent(
-            evaluation_level="TRACE",
-            session_spans=spans,
-            target_trace_id="target-trace",
-        )
-
-        fields = _extract_fields_from_spans(parsed)
-
-        assert fields["input"] == "no trace id record"
-
-
-class TestBuildTestCase:
-    def test_basic_span_extraction(self):
-        event = _make_event()
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="AnswerRelevancyMetric")
-
-        test_case = build_test_case(parsed, metric)
-
-        assert test_case.input == "What is the capital of France?"
-        assert test_case.actual_output == "The capital of France is Paris."
-
-    def test_retrieval_context_from_tool_messages(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "query"}],
-                output_messages=[
-                    {"role": "tool", "content": "doc chunk 1"},
-                    {"role": "tool", "content": "doc chunk 2"},
-                    {"role": "assistant", "content": "answer"},
-                ],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        event = _make_event(spans=spans)
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="FaithfulnessMetric")
-
-        test_case = build_test_case(parsed, metric)
-
-        assert test_case.input == "query"
-        assert test_case.actual_output == "answer"
-        assert test_case.retrieval_context == ["doc chunk 1", "doc chunk 2"]
-
-    def test_expected_output_from_reference_inputs(self):
-        refs = [{"expectedResponse": "Paris"}]
-        event = _make_event(reference_inputs=refs)
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="AnswerRelevancyMetric")
-
-        test_case = build_test_case(parsed, metric)
-
-        assert test_case.expected_output == "Paris"
-
-    def test_missing_required_field_raises_value_error(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[{"role": "user", "content": "query"}],
-                output_messages=[{"role": "assistant", "content": "answer"}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        event = _make_event(spans=spans)
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="FaithfulnessMetric")
-
-        with pytest.raises(ValueError, match="retrieval_context"):
-            build_test_case(parsed, metric)
-
-    def test_custom_field_mapper_bypasses_extraction(self):
-        event = _make_event()
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="AnswerRelevancyMetric")
-
-        def custom_mapper(raw_event):
-            return {
-                "input": "custom input",
-                "actual_output": "custom output",
-            }
-
-        test_case = build_test_case(parsed, metric, field_mapper=custom_mapper)
-
-        assert test_case.input == "custom input"
-        assert test_case.actual_output == "custom output"
-
-    def test_field_mapper_receives_reconstructed_event(self):
-        refs = [{"expectedResponse": "expected"}]
-        event = _make_event(level="TRACE", trace_ids=["t1"], reference_inputs=refs)
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="AnswerRelevancyMetric")
-
-        received_events = []
-
-        def capture_mapper(raw_event):
-            received_events.append(raw_event)
-            return {"input": "x", "actual_output": "y"}
-
-        build_test_case(parsed, metric, field_mapper=capture_mapper)
-
-        raw = received_events[0]
-        assert raw["evaluationLevel"] == "TRACE"
-        assert raw["evaluationTarget"]["traceIds"] == ["t1"]
-        assert raw["evaluationReferenceInputs"] == refs
-
-    def test_multiple_user_messages_concatenated(self):
-        log_records = [
-            _make_log_record(
-                input_messages=[
-                    {"role": "user", "content": "hello"},
-                    {"role": "user", "content": "world"},
-                ],
-                output_messages=[{"role": "assistant", "content": "hi"}],
-            )
-        ]
-        spans = [_make_span_with_log_records(log_records)]
-        event = _make_event(spans=spans)
-        parsed = ParsedEvaluationEvent.from_lambda_event(event)
-        metric = _mock_metric(name="AnswerRelevancyMetric")
-
-        test_case = build_test_case(parsed, metric)
-
-        assert test_case.input == "hello\nworld"
diff --git a/tests_integ/evaluation/test_third_party_adapters.py b/tests_integ/evaluation/test_third_party_adapters.py
new file mode 100644
index 00000000..a9f0eac6
--- /dev/null
+++ b/tests_integ/evaluation/test_third_party_adapters.py
@@ -0,0 +1,171 @@
+"""Integration tests for third-party evaluation adapters.
+
+These tests need the `deepeval` and `autoevals` packages; each test class is skipped when its package is missing.
+They verify the full adapter flow from EvaluatorInput through span parsing
+to metric execution, using real library metrics (not mocks).
+
+SETUP:
+    pip install deepeval autoevals
+
+RUN:
+    pytest tests_integ/evaluation/test_third_party_adapters.py -v
+"""
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+
+
+def _make_agent_evaluator_input(
+    user_prompt="What is the capital of France?",
+    agent_response="The capital of France is Paris.",
+    tool_messages=None,
+):
+    """Build a TRACE-level EvaluatorInput: one invoke_agent span whose event holds the user prompt and reply."""
+    output_messages = []
+    if tool_messages:
+        for msg in tool_messages:
+            output_messages.append({"role": "tool", "content": msg})
+    output_messages.append({"role": "assistant", "content": agent_response})  # reply follows any tool messages
+
+    spans = [
+        {
+            "traceId": "integ-trace-1",
+            "spanId": "integ-span-1",
+            "attributes": {"gen_ai.operation.name": "invoke_agent"},
+            "span_events": [
+                {
+                    "body": {
+                        "input": {"messages": [{"role": "user", "content": user_prompt}]},
+                        "output": {"messages": output_messages},
+                    }
+                }
+            ],
+        }
+    ]
+    return EvaluatorInput(
+        evaluation_level="TRACE",
+        session_spans=spans,
+        target_trace_id="integ-trace-1",  # same value as the span's traceId above
+    )
+
+
+class TestDeepEvalAdapterIntegration:
+    """Integration tests that run DeepEvalAdapter against real (non-mocked) DeepEval metrics."""
+
+    @pytest.fixture(autouse=True)
+    def check_deepeval(self):
+        """Skip every test in this class when deepeval cannot be imported."""
+        pytest.importorskip("deepeval")
+
+    def test_bias_metric_passes(self):
+        from deepeval.metrics import BiasMetric
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = BiasMetric(threshold=0.5)
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+        assert result.label in ("Pass", "Fail")  # only the result shape is checked, not the verdict
+
+    def test_missing_retrieval_context_returns_error(self):
+        from deepeval.metrics import FaithfulnessMetric
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = FaithfulnessMetric(threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(
+            _make_agent_evaluator_input(
+                user_prompt="Is the sky blue?",
+                agent_response="Yes, the sky is blue.",
+            )
+        )
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.errorCode == "MISSING_REQUIRED_FIELD" or result.value is not None  # either outcome accepted
+
+    def test_with_field_mapper(self):
+        from deepeval.metrics import BiasMetric
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = BiasMetric(threshold=0.5)
+        adapter = DeepEvalAdapter(
+            metric=metric,
+            field_mapper=lambda ev: {  # ignores its argument; always returns these fixed fields
+                "input": "Is Python a good language?",
+                "actual_output": "Python is a versatile programming language used widely.",
+            },
+        )
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+
+class TestAutoevalsAdapterIntegration:
+    """Integration tests that run AutoevalsAdapter against real (non-mocked) Autoevals scorers."""
+
+    @pytest.fixture(autouse=True)
+    def check_autoevals(self):
+        """Skip every test in this class when autoevals cannot be imported."""
+        pytest.importorskip("autoevals")
+
+    def test_factuality_scorer(self):
+        from autoevals import Factuality
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        evaluator_input = _make_agent_evaluator_input()
+        evaluator_input.session_spans[0]["span_events"][0]["body"]["output"]["messages"] = [
+            {"role": "assistant", "content": "The capital of France is Paris."}
+        ]  # NOTE(review): identical to the output the helper already builds by default
+
+        result = adapter(evaluator_input)
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+        assert result.label in ("Pass", "Fail")  # only the result shape is checked, not the verdict
+
+    def test_custom_threshold(self):
+        from autoevals import Factuality
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(scorer=scorer, threshold=0.9)  # NOTE(review): threshold effect is not asserted
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+
+    def test_with_field_mapper(self):
+        from autoevals import Factuality
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(
+            scorer=scorer,
+            field_mapper=lambda ev: {  # ignores its argument; always returns these fixed fields
+                "input": "What is 2+2?",
+                "actual_output": "4",
+                "expected_output": "4",
+            },
+        )
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None

From 9a9b6a75053fb7de6016a54eed9835291ce59fcb Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Tue, 30 Jun 2026 11:53:47 -0700
Subject: [PATCH 12/13] Fix review items: add reference_inputs to model,
 tighten error detection, add validate_fields to DeepEvalAdapter

---
 .../custom_code_based_evaluators/models.py    |  1 +
 .../third_party/autoevals/adapter.py          |  2 +-
 .../third_party/base.py                       | 11 ++--
 .../third_party/deepeval/adapter.py           | 19 ++++--
 .../third_party/span_parsers/common.py        |  1 +
 .../third_party/deepeval/test_adapter.py      | 60 ++++++++++++++++++-
 6 files changed, 80 insertions(+), 14 deletions(-)

diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py
index c876b145..5ff8fafa 100644
--- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py
@@ -51,6 +51,7 @@ class EvaluatorInput(BaseModel):
     session_spans: List[Dict]
     target_trace_id: Optional[str] = None
     target_span_id: Optional[str] = None
+    reference_inputs: Optional[List[Dict]] = None
     schema_version: str = "1.0"
     evaluator_id: Optional[str] = None
     evaluator_name: Optional[str] = None
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
index fa2acba3..ae96a5b5 100644
--- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
@@ -40,7 +40,7 @@ def __init__(
         self.threshold = threshold
 
     def validate_fields(self, fields: Dict[str, Any]) -> None:
-        """Validate that input and actual_output are present."""
+        """Validate minimum required fields; scorer raises on additional missing params."""
         missing = []
         if not fields.get("input"):
             missing.append("input")
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
index 1f28d2a5..3dfd545e 100644
--- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
@@ -2,11 +2,10 @@
 
 import abc
 import logging
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, Optional
 
 from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
 from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
-    SpanParseResult,
     parse_spans,
 )
 
@@ -31,8 +30,9 @@ def __init__(
 
         Args:
             field_mapper: Optional callable that receives the EvaluatorInput and
-                returns a dict of field values. Bypasses default span parsing
-                when provided.
+                returns a dict with keys: 'input', 'actual_output', and optionally
+                'expected_output', 'context', 'retrieval_context'. Bypasses default
+                span parsing when provided.
         """
         self.field_mapper = field_mapper
 
@@ -81,8 +81,7 @@ def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]:
         if self.field_mapper is not None:
             return self.field_mapper(evaluator_input)
 
-        reference_inputs = getattr(evaluator_input, "reference_inputs", None)
-        result = parse_spans(evaluator_input.session_spans, reference_inputs)
+        result = parse_spans(evaluator_input.session_spans, evaluator_input.reference_inputs)
         return result.to_dict()
 
     @abc.abstractmethod
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
index 725584ef..9c7de325 100644
--- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
@@ -45,7 +45,18 @@ def __init__(
             self.metric.model = model
 
     def validate_fields(self, fields: Dict[str, Any]) -> None:
-        """No pre-validation; let DeepEval raise on missing params."""
+        """Require non-empty 'input' and 'actual_output'; raise ValueError naming any that are missing."""
+        missing = []
+        if not fields.get("input"):
+            missing.append("input")
+        if not fields.get("actual_output"):
+            missing.append("actual_output")
+        if missing:
+            metric_name = type(self.metric).__name__
+            raise ValueError(
+                f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
+                f"Provide a field_mapper or ensure spans contain the necessary data."
+            )
 
     def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
         """Run the DeepEval metric and return formatted results."""
@@ -60,12 +71,12 @@ def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
         try:
             self.metric.measure(test_case)
         except Exception as e:
-            error_type = type(e).__name__
-            if "MissingTestCaseParams" in error_type or "missing" in str(e).lower():
+            if type(e).__name__ == "MissingTestCaseParamsError":
                 return EvaluatorOutput(
                     label="Error",
                     errorCode="MISSING_REQUIRED_FIELD",
-                    errorMessage=f"{type(self.metric).__name__} requires fields not available: {e}",
+                    errorMessage=f"{type(self.metric).__name__} requires fields not extracted from spans: {e}. "
+                    f"Provide a field_mapper to supply custom fields from your trace data.",
                 )
             raise
 
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
index 6d69dbc6..0619be8c 100644
--- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
@@ -94,6 +94,7 @@ def extract_from_agent_span_events(
     for span in session_spans:
         attributes = span.get("attributes", {})
         operation_name = attributes.get("gen_ai.operation.name")
+        # Phase 1: only invoke_agent spans are parsed; other spans are skipped (use a field_mapper for them)
         if operation_name != "invoke_agent":
             continue
 
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
index 3c8a3d39..55e40cee 100644
--- a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
@@ -107,6 +107,36 @@ def test_custom_field_mapper(self):
         assert test_case.input == "mapped input"
         assert test_case.actual_output == "mapped output"
 
+    def test_reference_inputs_populates_expected_output(self):
+        metric = _mock_metric()
+        adapter = DeepEvalAdapter(metric=metric)
+
+        evaluator_input = EvaluatorInput(
+            evaluation_level="TRACE",
+            session_spans=[
+                {
+                    "traceId": "t1",
+                    "spanId": "s1",
+                    "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                    "span_events": [
+                        {
+                            "body": {
+                                "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
+                                "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
+                            }
+                        }
+                    ],
+                }
+            ],
+            target_trace_id="t1",
+            reference_inputs=[{"expectedResponse": "AI stands for artificial intelligence."}],
+        )
+
+        result = adapter(evaluator_input)
+
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.expected_output == "AI stands for artificial intelligence."
+
     def test_model_override_sets_metric_model(self):
         metric = _mock_metric()
         DeepEvalAdapter(metric=metric, model="bedrock/anthropic.claude-3")
@@ -153,6 +183,31 @@ def test_no_agent_spans_returns_error(self):
         assert result.errorCode == "FIELD_EXTRACTION_ERROR"
         assert result.label == "Error"
 
+    def test_missing_input_returns_error(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                "span_events": [
+                    {
+                        "body": {
+                            "output": {"messages": [{"role": "assistant", "content": "answer"}]},
+                        }
+                    }
+                ],
+            }
+        ]
+        metric = _mock_metric()
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input(spans=spans))
+
+        assert result.errorCode == "MISSING_REQUIRED_FIELD"
+        assert "input" in result.errorMessage
+        assert "field_mapper" in result.errorMessage
+        metric.measure.assert_not_called()
+
     def test_metric_measure_exception_returns_error(self):
         metric = _mock_metric()
         metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout"))
@@ -166,9 +221,7 @@ def test_metric_measure_exception_returns_error(self):
     def test_missing_params_error_caught(self):
         metric = _mock_metric()
 
-        class MissingTestCaseParamsError(Exception):
-            pass
-
+        MissingTestCaseParamsError = type("MissingTestCaseParamsError", (Exception,), {})
         metric.measure = MagicMock(
             side_effect=MissingTestCaseParamsError("retrieval_context is required")
         )
@@ -178,6 +231,7 @@ class MissingTestCaseParamsError(Exception):
 
         assert result.errorCode == "MISSING_REQUIRED_FIELD"
         assert "retrieval_context" in result.errorMessage
+        assert "field_mapper" in result.errorMessage
 
     def test_never_raises(self):
         metric = _mock_metric()

From 0499d4b31377ef38075930fd4af5bce7a18e4a80 Mon Sep 17 00:00:00 2001
From: Haomiao Shi <haomiao@amazon.com>
Date: Tue, 30 Jun 2026 12:01:29 -0700
Subject: [PATCH 13/13] Adapt to upstream ReferenceInput model: remove
 duplicate field, use expected_response_text property

---
 .../evaluation/custom_code_based_evaluators/models.py      | 1 -
 .../third_party/span_parsers/base.py                       | 7 ++++---
 .../third_party/deepeval/test_adapter.py                   | 2 +-
 .../third_party/span_parsers/test_span_parsers.py          | 3 ++-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py
index 5ff8fafa..c876b145 100644
--- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py
@@ -51,7 +51,6 @@ class EvaluatorInput(BaseModel):
     session_spans: List[Dict]
     target_trace_id: Optional[str] = None
     target_span_id: Optional[str] = None
-    reference_inputs: Optional[List[Dict]] = None
     schema_version: str = "1.0"
     evaluator_id: Optional[str] = None
     evaluator_name: Optional[str] = None
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
index 3b88ff11..9869eab7 100644
--- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
@@ -28,7 +28,7 @@
 
 def parse_spans(
     session_spans: List[Dict[str, Any]],
-    reference_inputs: Optional[List[Dict[str, Any]]] = None,
+    reference_inputs: Optional[List[Any]] = None,
 ) -> SpanParseResult:
     """Parse session spans using the first matching agent-level parser.
 
@@ -38,7 +38,7 @@ def parse_spans(
 
     Args:
         session_spans: Raw ADOT span dicts from the evaluation service.
-        reference_inputs: Optional reference inputs for expected_output.
+        reference_inputs: Optional list of ReferenceInput; the first item's expected_response_text sets expected_output.
 
     Returns:
         SpanParseResult with extracted fields.
@@ -50,7 +50,8 @@ def parse_spans(
         result = parser(session_spans)
         if result is not None:
             if reference_inputs:
-                expected = reference_inputs[0].get("expectedResponse")
+                ref = reference_inputs[0]
+                expected = getattr(ref, "expected_response_text", None)
                 if expected:
                     result.expected_output = expected
             return result
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
index 55e40cee..e7efef2a 100644
--- a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
@@ -129,7 +129,7 @@ def test_reference_inputs_populates_expected_output(self):
                 }
             ],
             target_trace_id="t1",
-            reference_inputs=[{"expectedResponse": "AI stands for artificial intelligence."}],
+            reference_inputs=[{"expectedResponse": {"text": "AI stands for artificial intelligence."}}],
         )
 
         result = adapter(evaluator_input)
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
index de2a1bb5..9669e5e5 100644
--- a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
@@ -2,6 +2,7 @@
 
 import pytest
 
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import ReferenceInput
 from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
     SpanParseResult,
     parse_spans,
@@ -96,7 +97,7 @@ def test_expected_output_from_reference_inputs(self):
                 output_messages=[{"role": "assistant", "content": "a"}],
             )
         ]
-        refs = [{"expectedResponse": "expected answer"}]
+        refs = [ReferenceInput(expectedResponse={"text": "expected answer"})]
 
         result = parse_spans(spans, reference_inputs=refs)