diff --git a/.gitignore b/.gitignore
index 01fe8e22..161403e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -229,3 +229,4 @@ local_settings.py
 Dockerfile
 CLAUDE.md
 .omc/
+.deepeval/
diff --git a/pyproject.toml b/pyproject.toml
index 61520a5b..b1fc5e90 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -173,3 +173,9 @@ simulation = [
 datasets = [
     "requests>=2.31.0",
 ]
+deepeval = [
+    "deepeval>=2.0.0",
+]
+autoevals = [
+    "autoevals>=0.0.50",
+]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
new file mode 100644
index 00000000..06ba3d0a
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
@@ -0,0 +1,5 @@
+"""Third-party evaluation adapters for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+__all__ = ["BaseAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
new file mode 100644
index 00000000..40e25fc1
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
@@ -0,0 +1,5 @@
+"""Autoevals adapter for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter
+
+__all__ = ["AutoevalsAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
new file mode 100644
index 00000000..ae96a5b5
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py
@@ -0,0 +1,71 @@
+"""Autoevals adapter for AgentCore code-based evaluators."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class AutoevalsAdapter(BaseAdapter):
+    """Adapter that runs an Autoevals scorer against AgentCore evaluation events.
+
+    Example::
+
+        from autoevals import Factuality
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(scorer=scorer)
+    """
+
+    def __init__(
+        self,
+        scorer: Any,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+        threshold: float = 0.5,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()).
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict of field values. Bypasses default span parsing.
+            threshold: Minimum score (inclusive) that is labelled "Pass". Defaults to 0.5.
+        """
+        super().__init__(field_mapper=field_mapper)
+        self.scorer = scorer
+        self.threshold = threshold
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Require non-empty ``input`` and ``actual_output``; the scorer checks anything else it needs.
+
+        Raises:
+            ValueError: If either required field is missing or empty.
+        """
+        missing = [name for name in ("input", "actual_output") if not fields.get(name)]
+        if missing:
+            scorer_name = type(self.scorer).__name__
+            raise ValueError(
+                f"Field(s) {missing} required by {scorer_name} but not found in evaluation event. "
+                f"Provide a field_mapper or ensure spans contain the necessary data."
+            )
+
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Call scorer.eval() with input/output (plus ``expected`` when available) and map the result."""
+        kwargs: Dict[str, Any] = {
+            "input": fields.get("input", ""),
+            "output": fields.get("actual_output", ""),
+        }
+        if fields.get("expected_output"):
+            kwargs["expected"] = fields["expected_output"]
+
+        result = self.scorer.eval(**kwargs)
+
+        score = result.score
+        label = "Pass" if score is not None and score >= self.threshold else "Fail"  # no score counts as Fail
+        # ``metadata`` may be absent or None and ``rationale`` may be missing or None: all of these mean "".
+        explanation = (getattr(result, "metadata", None) or {}).get("rationale") or ""
+        return EvaluatorOutput(value=score, label=label, explanation=explanation)
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
new file mode 100644
index 00000000..3dfd545e
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py
@@ -0,0 +1,109 @@
+"""Base adapter for third-party evaluation framework integrations."""
+
+import abc
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
+    parse_spans,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class BaseAdapter(abc.ABC):
+    """Base adapter for third-party evaluation framework integrations.
+
+    Accepts an EvaluatorInput (from the code_based_evaluators flow),
+    extracts fields from spans using the built-in parser layer, runs the
+    evaluation via execute(), and returns an EvaluatorOutput.
+
+    Never raises: every failure, even inside a user-supplied field_mapper, becomes an "Error" output.
+    """
+
+    def __init__(
+        self,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+    ):
+        """Initialize the adapter.
+
+        Args:
+            field_mapper: Optional callable that receives the EvaluatorInput and
+                returns a dict with keys: 'input', 'actual_output', and optionally
+                'expected_output', 'context', 'retrieval_context'. Bypasses default
+                span parsing when provided.
+        """
+        self.field_mapper = field_mapper
+
+    def __call__(self, evaluator_input: EvaluatorInput, context: Any = None) -> EvaluatorOutput:
+        """Handle an evaluation invocation.
+
+        Args:
+            evaluator_input: Parsed EvaluatorInput from the code-based evaluator flow.
+            context: Lambda context object (unused).
+
+        Returns:
+            EvaluatorOutput from execute(), or an "Error" output with errorCode/errorMessage set.
+        """
+        try:
+            fields = self._extract_fields(evaluator_input)
+        except Exception as e:  # not only ValueError: field_mapper is user code and may raise anything
+            logger.error("Field extraction failed: %s", e, exc_info=not isinstance(e, ValueError))
+            return EvaluatorOutput(
+                label="Error",
+                errorCode="FIELD_EXTRACTION_ERROR",
+                errorMessage=str(e) if isinstance(e, ValueError) else f"{type(e).__name__}: {e}",
+            )
+
+        try:
+            self.validate_fields(fields)
+        except Exception as e:  # e.g. AttributeError when field_mapper returned something that is not a dict
+            logger.error("Validation failed: %s", e, exc_info=not isinstance(e, ValueError))
+            return EvaluatorOutput(
+                label="Error",
+                errorCode="MISSING_REQUIRED_FIELD",
+                errorMessage=str(e) if isinstance(e, ValueError) else f"{type(e).__name__}: {e}",
+            )
+
+        try:
+            return self.execute(fields)
+        except Exception as e:
+            logger.error("Execution failed: %s", e, exc_info=True)
+            return EvaluatorOutput(
+                label="Error",
+                errorCode="METRIC_ERROR",
+                errorMessage=f"{type(self).__name__} failed: {e}",
+            )
+
+    def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]:
+        """Return the field dict from field_mapper when one is set, otherwise from parse_spans()."""
+        if self.field_mapper is not None:
+            return self.field_mapper(evaluator_input)
+
+        result = parse_spans(evaluator_input.session_spans, evaluator_input.reference_inputs)
+        return result.to_dict()
+
+    @abc.abstractmethod
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Validate that required fields are present.
+
+        Each adapter must explicitly declare its validation behavior.
+
+        Args:
+            fields: Extracted field dict.
+
+        Raises:
+            ValueError: If required fields are missing.
+        """
+
+    @abc.abstractmethod
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Run the evaluation and return an EvaluatorOutput.
+
+        Args:
+            fields: Extracted field dict with keys like "input", "actual_output", etc.
+
+        Returns:
+            EvaluatorOutput with evaluation results.
+        """
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
new file mode 100644
index 00000000..99cf10d5
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
@@ -0,0 +1,5 @@
+"""DeepEval adapter for AgentCore code-based evaluators."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter
+
+__all__ = ["DeepEvalAdapter"]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
new file mode 100644
index 00000000..9c7de325
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py
@@ -0,0 +1,89 @@
+"""DeepEval adapter for AgentCore code-based evaluators."""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter
+
+logger = logging.getLogger(__name__)
+
+
+class DeepEvalAdapter(BaseAdapter):
+    """Bridge between AgentCore evaluation events and a single DeepEval metric.
+
+    Usage::
+
+        from deepeval.metrics import AnswerRelevancyMetric
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = AnswerRelevancyMetric(threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+    """
+
+    def __init__(
+        self,
+        metric: BaseMetric,
+        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
+        model: Optional[Any] = None,
+    ):
+        """Store the metric and optionally replace the model it uses.
+
+        Args:
+            metric: DeepEval BaseMetric instance to run (e.g. AnswerRelevancyMetric).
+            field_mapper: Optional callable mapping the EvaluatorInput to a dict of
+                LLMTestCase field values; when given, default span parsing is
+                skipped.
+            model: When not None, assigned to ``metric.model`` (the passed metric is mutated).
+        """
+        super().__init__(field_mapper=field_mapper)
+        if model is not None:
+            metric.model = model
+        self.metric = metric
+
+    def validate_fields(self, fields: Dict[str, Any]) -> None:
+        """Check that the two always-required fields are usable.
+
+        Raises:
+            ValueError: If ``input`` or ``actual_output`` is missing or empty.
+        """
+        absent = [key for key in ("input", "actual_output") if not fields.get(key)]
+        if absent:
+            metric_name = type(self.metric).__name__
+            raise ValueError(
+                f"Field(s) {absent} required by {metric_name} but not found in evaluation event. "
+                f"Provide a field_mapper or ensure spans contain the necessary data."
+            )
+
+    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
+        """Measure an LLMTestCase built from the fields; missing-param errors become an Error output."""
+        test_case = LLMTestCase(
+            input=fields.get("input", ""),
+            actual_output=fields.get("actual_output", ""),
+            expected_output=fields.get("expected_output"),
+            context=fields.get("context"),
+            retrieval_context=fields.get("retrieval_context"),
+        )
+
+        try:
+            self.metric.measure(test_case)
+        except Exception as e:
+            # Matched by class name rather than isinstance; every other exception propagates.
+            if type(e).__name__ != "MissingTestCaseParamsError":
+                raise
+            return EvaluatorOutput(
+                label="Error",
+                errorCode="MISSING_REQUIRED_FIELD",
+                errorMessage=f"{type(self.metric).__name__} requires fields not extracted from spans: {e}. "
+                f"Provide a field_mapper to supply custom fields from your trace data.",
+            )
+
+        score = self.metric.score
+        reason = getattr(self.metric, "reason", None) or ""
+        threshold = getattr(self.metric, "threshold", 0.5)
+        passed = getattr(self.metric, "success", score is not None and score >= threshold)
+        verdict = "Pass" if passed else "Fail"
+        return EvaluatorOutput(value=score, label=verdict, explanation=reason)
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
new file mode 100644
index 00000000..5388df83
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
@@ -0,0 +1,8 @@
+"""Span parsers for extracting evaluation fields from Agent SDK trace formats."""
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.base import (
+    SpanParseResult,
+    parse_spans,
+)
+
+__all__ = ["SpanParseResult", "parse_spans"]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
new file mode 100644
index 00000000..9869eab7
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py
@@ -0,0 +1,63 @@
+"""Base span parsing logic and orchestration across format-specific parsers."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.strands import (
+    parse_strands_spans,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.otel_langchain import (
+    parse_otel_langchain_spans,
+)
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.openinference import (
+    parse_openinference_spans,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_PARSERS = [  # tried in list order by parse_spans(); the first non-None result is used
+    parse_strands_spans,
+    parse_otel_langchain_spans,
+    parse_openinference_spans,
+]
+
+
+def parse_spans(
+    session_spans: List[Dict[str, Any]],
+    reference_inputs: Optional[List[Any]] = None,
+) -> SpanParseResult:
+    """Run the registered format parsers in order and return the first hit.
+
+    Each parser in ``_PARSERS`` (Strands, OTel LangChain, OpenInference) is
+    called with the spans until one returns a non-None result. If reference
+    inputs are given, a truthy ``expected_response_text`` on the first one is
+    stored as ``expected_output`` on that result.
+
+    Args:
+        session_spans: Raw ADOT span dicts from the evaluation service.
+        reference_inputs: Optional ReferenceInput list; only the first entry is read.
+
+    Returns:
+        SpanParseResult produced by the first parser that matched.
+
+    Raises:
+        ValueError: If every parser returns None.
+    """
+    # The generator keeps this lazy: once a parser matches, later ones are not called.
+    candidates = (parser(session_spans) for parser in _PARSERS)
+    parsed = next((hit for hit in candidates if hit is not None), None)
+    if parsed is None:
+        raise ValueError(
+            "Could not extract evaluation fields from spans. "
+            "No agent-level span with gen_ai.operation.name=='invoke_agent' and "
+            "valid span_events found. Provide a field_mapper for custom formats."
+        )
+    first_ref = reference_inputs[0] if reference_inputs else None
+    expected = getattr(first_ref, "expected_response_text", None)
+    if expected:
+        parsed.expected_output = expected
+    return parsed
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
new file mode 100644
index 00000000..0619be8c
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py
@@ -0,0 +1,146 @@
+"""Common span parsing utilities shared across format-specific parsers."""
+
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SpanParseResult:
+    """Evaluation fields extracted from spans.
+
+    Every field defaults to None (unset). The declaration order of the
+    fields is also the key order produced by to_dict().
+    """
+
+    input: Optional[str] = None
+    actual_output: Optional[str] = None
+    retrieval_context: Optional[List[str]] = None
+    context: Optional[List[str]] = None
+    expected_output: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the fields that are set as a plain dict.
+
+        Only None counts as unset; empty strings and empty lists are kept, and
+        keys follow the field declaration order.
+        """
+        # Names are spelled out so the key order is explicit.
+        names = ("input", "actual_output", "retrieval_context", "context", "expected_output")
+        values = ((name, getattr(self, name)) for name in names)
+        return {name: value for name, value in values if value is not None}
+
+
+def _get_message_content(message: Any) -> str:
+    """Extract text content from a message object.
+
+    A str is returned unchanged. For a dict, "content" is consulted first, then "message":
+    a str is returned as-is, a dict is recursed into, a list is joined (newlines) from its
+    str items and str "text" parts. None and [] are skipped instead of becoming "None" or
+    "[]"; other values fall back to str(value). Unsupported input yields "".
+    """
+    if not isinstance(message, dict):
+        return message if isinstance(message, str) else ""
+    for key in ("content", "message"):
+        val = message.get(key)
+        if val is None or val == []:
+            continue  # absent, null or empty: try the next key
+        if isinstance(val, (str, dict)):
+            return val if isinstance(val, str) else _get_message_content(val)
+        if isinstance(val, list):
+            raw = (v if isinstance(v, str) else v.get("text") for v in val if isinstance(v, (str, dict)))
+            texts = [t for t in raw if isinstance(t, str)]
+            if texts:
+                return "\n".join(texts)
+        return str(val)
+    return ""
+
+
+def _parse_span_event_body(body: Any) -> Dict[str, Any]:
+    """Return a span event body as a dict, decoding JSON strings; {} when that is not possible."""
+    if isinstance(body, str):
+        try:
+            body = json.loads(body)
+        except (json.JSONDecodeError, TypeError):
+            return {}
+    # json.loads can also yield a list, str, number or None; callers use dict methods on the
+    # result, so only a real JSON object is passed through.
+    return body if isinstance(body, dict) else {}
+
+
+def extract_from_agent_span_events(
+    session_spans: List[Dict[str, Any]],
+) -> Optional[SpanParseResult]:
+    """Extract evaluation fields from agent-level span events.
+
+    Looks for spans where attributes.gen_ai.operation.name == "invoke_agent",
+    then reads each span event body: user messages from input.messages,
+    assistant and tool messages from output.messages. Messages with empty
+    content are ignored.
+
+    The first user message becomes ``input``, the last assistant message
+    becomes ``actual_output`` and the tool messages (if any) become both
+    ``retrieval_context`` and ``context``.
+
+    Malformed data (non-dict spans, events or messages; missing, null or
+    wrongly typed ``attributes``/``span_events``/``messages``) is skipped
+    rather than raising.
+
+    Args:
+        session_spans: Raw ADOT span dicts.
+
+    Returns:
+        SpanParseResult if an agent span yielded at least one user or
+        assistant message, None otherwise.
+    """
+    user_messages: List[str] = []
+    assistant_messages: List[str] = []
+    tool_messages: List[str] = []
+
+    for span in session_spans or []:
+        attributes = span.get("attributes") if isinstance(span, dict) else None
+        # Phase 1: only invoke_agent spans supported; others fall through to field_mapper
+        if not isinstance(attributes, dict) or attributes.get("gen_ai.operation.name") != "invoke_agent":
+            continue
+
+        span_events = span.get("span_events")
+        for event in (span_events if isinstance(span_events, (list, tuple)) else ()):
+            body = _parse_span_event_body(event.get("body")) if isinstance(event, dict) else None
+            if not isinstance(body, dict):  # e.g. a JSON string that decoded to a list
+                continue
+
+            for role, content in _role_content_pairs(body.get("input")):
+                if role == "user":
+                    user_messages.append(content)
+
+            for role, content in _role_content_pairs(body.get("output")):
+                if role == "assistant":
+                    assistant_messages.append(content)
+                elif role == "tool":
+                    tool_messages.append(content)
+
+    if not user_messages and not assistant_messages:
+        return None
+
+    result = SpanParseResult()
+    if user_messages:
+        result.input = user_messages[0]
+    if assistant_messages:
+        result.actual_output = assistant_messages[-1]
+    if tool_messages:
+        result.retrieval_context = tool_messages
+        result.context = tool_messages
+
+    return result
+
+
+def _role_content_pairs(section: Any) -> List[Any]:
+    """Return (role, content) tuples for the dict messages in ``section`` whose content is non-empty."""
+    messages = section.get("messages") if isinstance(section, dict) else None
+    if not isinstance(messages, (list, tuple)):
+        return []
+    pairs = ((msg.get("role", ""), _get_message_content(msg)) for msg in messages if isinstance(msg, dict))
+    return [(role, content) for role, content in pairs if content]
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py
new file mode 100644
index 00000000..e500740e
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py
@@ -0,0 +1,27 @@
+"""OpenInference LangChain span parser."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+    extract_from_agent_span_events,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_openinference_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
+    """Extract evaluation fields from OpenInference LangChain spans.
+
+    Currently a plain delegation to extract_from_agent_span_events(); this
+    wrapper is where OpenInference-specific handling would go.
+
+    Args:
+        session_spans: Raw ADOT span dicts.
+
+    Returns:
+        The shared extractor's result: a SpanParseResult, or None.
+    """
+    parsed = extract_from_agent_span_events(session_spans)
+    return parsed
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py
new file mode 100644
index 00000000..f1e211c5
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py
@@ -0,0 +1,27 @@
+"""OTel LangChain span parser."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+    extract_from_agent_span_events,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_otel_langchain_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
+    """Extract evaluation fields from OTel LangChain spans.
+
+    Currently a plain delegation to extract_from_agent_span_events(); this
+    wrapper is where LangChain-specific handling would go.
+
+    Args:
+        session_spans: Raw ADOT span dicts.
+
+    Returns:
+        The shared extractor's result: a SpanParseResult, or None.
+    """
+    parsed = extract_from_agent_span_events(session_spans)
+    return parsed
diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py
new file mode 100644
index 00000000..3789ad9c
--- /dev/null
+++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py
@@ -0,0 +1,26 @@
+"""Strands Agent SDK span parser."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import (
+    SpanParseResult,
+    extract_from_agent_span_events,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_strands_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
+    """Extract evaluation fields from Strands Agent SDK spans.
+
+    Delegates to extract_from_agent_span_events() without Strands-specific handling.
+
+    Args:
+        session_spans: Raw ADOT span dicts.
+
+    Returns:
+        The shared extractor's result: a SpanParseResult, or None.
+    """
+    parsed = extract_from_agent_span_events(session_spans)
+    return parsed
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py
new file mode 100644
index 00000000..2f640817
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py
@@ -0,0 +1,201 @@
+"""Tests for AutoevalsAdapter."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter
+
+
+def _make_evaluator_input(spans=None):
+    """Build a TRACE-level EvaluatorInput targeting trace "t1".
+
+    Without ``spans``, a single invoke_agent span holding one user question and
+    one assistant answer is used.
+    """
+    if spans is None:
+        question = {"role": "user", "content": "What is AI?"}
+        answer = {"role": "assistant", "content": "AI is artificial intelligence."}
+        exchange = {"body": {"input": {"messages": [question]}, "output": {"messages": [answer]}}}
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                "span_events": [exchange],
+            }
+        ]
+    return EvaluatorInput(
+        evaluation_level="TRACE",
+        session_spans=spans,
+        target_trace_id="t1",
+    )
+
+
+def _mock_scorer(score=0.9, rationale="Good answer"):
+    """Return a MagicMock scorer named "MockScorer" whose eval() yields a fixed score and rationale."""
+    result = MagicMock()
+    result.score, result.metadata = score, {"rationale": rationale}
+
+    scorer = MagicMock()
+    # The name is set on the mock's type because the adapter reports type(scorer).__name__.
+    type(scorer).__name__ = "MockScorer"
+    scorer.eval = MagicMock(return_value=result)
+
+    return scorer
+
+
+class TestAutoevalsAdapterSuccess:
+    def test_returns_pass_when_score_above_threshold(self):
+        scorer = _mock_scorer(score=0.8)
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value == 0.8
+        assert result.label == "Pass"
+        assert result.explanation == "Good answer"
+
+    def test_returns_fail_when_score_below_threshold(self):
+        scorer = _mock_scorer(score=0.3)
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.value == 0.3
+        assert result.label == "Fail"
+
+    def test_custom_threshold(self):
+        scorer = _mock_scorer(score=0.6)
+        adapter = AutoevalsAdapter(scorer=scorer, threshold=0.7)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.label == "Fail"
+
+    def test_custom_threshold_pass(self):
+        scorer = _mock_scorer(score=0.8)
+        adapter = AutoevalsAdapter(scorer=scorer, threshold=0.7)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.label == "Pass"
+
+    def test_default_threshold_is_half(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        assert adapter.threshold == 0.5
+
+    def test_scorer_eval_called_with_input_and_output(self):
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        adapter(_make_evaluator_input())
+
+        scorer.eval.assert_called_once()
+        call_kwargs = scorer.eval.call_args[1]
+        assert call_kwargs["input"] == "What is AI?"
+        assert call_kwargs["output"] == "AI is artificial intelligence."
+
+    def test_custom_field_mapper(self):
+        """A field_mapper replaces span parsing: its values reach the scorer and the run still passes."""
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(
+            scorer=scorer,
+            field_mapper=lambda ev: {"input": "custom input", "actual_output": "custom output"},
+        )
+
+        result = adapter(_make_evaluator_input())
+
+        call_kwargs = scorer.eval.call_args[1]
+        assert call_kwargs["input"] == "custom input"
+        assert call_kwargs["output"] == "custom output"
+        # Also check the adapter output, not only what the scorer received.
+        assert result.label == "Pass"
+
+
+class TestAutoevalsAdapterErrors:
+    """Failure paths of the adapter.
+
+    Every case must come back as an EvaluatorOutput carrying an errorCode rather than raise.
+    """
+
+    def test_no_agent_spans_returns_error(self):
+        """Spans whose operation is not invoke_agent cannot be parsed."""
+        chat_span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "chat"},
+            "span_events": [],
+        }
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input(spans=[chat_span]))
+
+        assert result.errorCode == "FIELD_EXTRACTION_ERROR"
+
+    def test_missing_input_returns_error(self):
+        """An agent span holding only an assistant message lacks the required input field."""
+        answer_only = {"body": {"output": {"messages": [{"role": "assistant", "content": "answer"}]}}}
+        agent_span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "invoke_agent"},
+            "span_events": [answer_only],
+        }
+        scorer = _mock_scorer()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        result = adapter(_make_evaluator_input(spans=[agent_span]))
+
+        assert result.errorCode == "MISSING_REQUIRED_FIELD"
+        assert "input" in result.errorMessage
+
+    def test_scorer_exception_returns_error(self):
+        """A scorer failure is reported as METRIC_ERROR and carries the original message."""
+        failing = _mock_scorer()
+        failing.eval = MagicMock(side_effect=RuntimeError("API error"))
+        adapter = AutoevalsAdapter(scorer=failing)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.errorCode == "METRIC_ERROR"
+        assert "API error" in result.errorMessage
+
+    def test_never_raises(self):
+        """Even a bare Exception from the scorer yields an EvaluatorOutput instead of propagating."""
+        failing = _mock_scorer()
+        failing.eval = MagicMock(side_effect=Exception("unexpected"))
+        adapter = AutoevalsAdapter(scorer=failing)
+
+        result = adapter(_make_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.errorCode is not None
+
+
+class TestAutoevalsAdapterEdgeCases:
+    """Unusual scorer results."""
+
+    def test_score_none_returns_fail(self):
+        """A scorer that reports no score is labelled Fail."""
+        adapter = AutoevalsAdapter(scorer=_mock_scorer(score=None))
+
+        assert adapter(_make_evaluator_input()).label == "Fail"
+
+    def test_no_metadata_returns_empty_explanation(self):
+        """A result object without a metadata attribute gives an empty explanation."""
+        bare_result = MagicMock(spec=[])  # empty spec: attributes that were never set raise AttributeError
+        bare_result.score = 0.9
+        scorer = MagicMock()
+        type(scorer).__name__ = "MockScorer"
+        scorer.eval = MagicMock(return_value=bare_result)
+
+        adapter = AutoevalsAdapter(scorer=scorer)
+        result = adapter(_make_evaluator_input())
+
+        assert result.explanation == ""
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
new file mode 100644
index 00000000..e7efef2a
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py
@@ -0,0 +1,272 @@
+"""Tests for DeepEvalAdapter."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter
+
+
+def _make_evaluator_input(spans=None):
+    """Build a TRACE-level EvaluatorInput for trace "t1"; defaults to one invoke_agent span."""
+    if spans is None:
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                "span_events": [
+                    {
+                        "body": {
+                            "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
+                            "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
+                        }
+                    }
+                ],
+            }
+        ]
+    return EvaluatorInput(
+        evaluation_level="TRACE",
+        session_spans=spans,
+        target_trace_id="t1",
+    )
+
+
+def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"):
+    """Build a MagicMock metric whose measure() leaves fixed score/reason values on it."""
+    fake = MagicMock()
+    type(fake).__name__ = name
+    del fake.success  # absent by default so tests opt in by assigning it
+    fake.threshold = threshold
+
+    def _store_outcome(test_case):
+        fake.score = score
+        fake.reason = reason
+
+    # Seed the values up front too, so they are readable before measure() runs.
+    _store_outcome(None)
+    fake.measure = MagicMock(side_effect=_store_outcome)
+    return fake
+
+
+class TestDeepEvalAdapterSuccess:  # happy path: scoring, labelling and test-case construction
+    def test_returns_pass_when_score_above_threshold(self):
+        metric = _mock_metric(score=0.9, threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value == 0.9
+        assert result.label == "Pass"
+        assert result.explanation == "Looks good"
+
+    def test_returns_fail_when_score_below_threshold(self):
+        metric = _mock_metric(score=0.3, threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.value == 0.3
+        assert result.label == "Fail"
+
+    def test_returns_pass_at_exact_threshold(self):  # score == threshold counts as Pass
+        metric = _mock_metric(score=0.7, threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.label == "Pass"
+
+    def test_metric_measure_called_with_test_case(self):  # span text reaches the test case
+        metric = _mock_metric()
+        adapter = DeepEvalAdapter(metric=metric)
+
+        adapter(_make_evaluator_input())
+
+        metric.measure.assert_called_once()
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.input == "What is AI?"
+        assert test_case.actual_output == "AI is artificial intelligence."
+
+    def test_custom_field_mapper(self):  # field_mapper output replaces the span-derived fields
+        metric = _mock_metric()
+        adapter = DeepEvalAdapter(
+            metric=metric,
+            field_mapper=lambda ev: {
+                "input": "mapped input",
+                "actual_output": "mapped output",
+            },
+        )
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.value == 0.85
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.input == "mapped input"
+        assert test_case.actual_output == "mapped output"
+
+    def test_reference_inputs_populates_expected_output(self):
+        metric = _mock_metric()
+        adapter = DeepEvalAdapter(metric=metric)
+
+        evaluator_input = EvaluatorInput(
+            evaluation_level="TRACE",
+            session_spans=[
+                {
+                    "traceId": "t1",
+                    "spanId": "s1",
+                    "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                    "span_events": [
+                        {
+                            "body": {
+                                "input": {"messages": [{"role": "user", "content": "What is AI?"}]},
+                                "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]},
+                            }
+                        }
+                    ],
+                }
+            ],
+            target_trace_id="t1",
+            reference_inputs=[{"expectedResponse": {"text": "AI stands for artificial intelligence."}}],
+        )
+
+        adapter(evaluator_input)  # return value unused: only the test case given to measure() is checked
+
+        test_case = metric.measure.call_args[0][0]
+        assert test_case.expected_output == "AI stands for artificial intelligence."
+
+    def test_model_override_sets_metric_model(self):  # model is applied at construction time
+        metric = _mock_metric()
+        DeepEvalAdapter(metric=metric, model="bedrock/anthropic.claude-3")
+
+        assert metric.model == "bedrock/anthropic.claude-3"
+
+    def test_label_uses_metric_success_true(self):  # success=True wins over a below-threshold score
+        metric = _mock_metric(score=0.3, threshold=0.7)
+        metric.success = True
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.value == 0.3
+        assert result.label == "Pass"
+
+    def test_label_uses_metric_success_false(self):  # success=False wins over an above-threshold score
+        metric = _mock_metric(score=0.9, threshold=0.7)
+        metric.success = False
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.value == 0.9
+        assert result.label == "Fail"
+
+
+class TestDeepEvalAdapterErrors:  # failures are reported through EvaluatorOutput error fields
+    def test_no_agent_spans_returns_error(self):  # a lone "chat" span gives nothing to extract
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "chat"},
+                "span_events": [],
+            }
+        ]
+        metric = _mock_metric()
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input(spans=spans))
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.errorCode == "FIELD_EXTRACTION_ERROR"
+        assert result.label == "Error"
+
+    def test_missing_input_returns_error(self):  # output-only body: measure() must not be reached
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                "span_events": [
+                    {
+                        "body": {
+                            "output": {"messages": [{"role": "assistant", "content": "answer"}]},
+                        }
+                    }
+                ],
+            }
+        ]
+        metric = _mock_metric()
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input(spans=spans))
+
+        assert result.errorCode == "MISSING_REQUIRED_FIELD"
+        assert "input" in result.errorMessage
+        assert "field_mapper" in result.errorMessage
+        metric.measure.assert_not_called()
+
+    def test_metric_measure_exception_returns_error(self):  # generic measure() failure -> METRIC_ERROR
+        metric = _mock_metric()
+        metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout"))
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.errorCode == "METRIC_ERROR"
+        assert "LLM timeout" in result.errorMessage
+
+    def test_missing_params_error_caught(self):
+        metric = _mock_metric()
+        # Locally built exception type; only its class name ties it to the MISSING_REQUIRED_FIELD result below.
+        MissingTestCaseParamsError = type("MissingTestCaseParamsError", (Exception,), {})
+        metric.measure = MagicMock(
+            side_effect=MissingTestCaseParamsError("retrieval_context is required")
+        )
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert result.errorCode == "MISSING_REQUIRED_FIELD"
+        assert "retrieval_context" in result.errorMessage
+        assert "field_mapper" in result.errorMessage
+
+    def test_never_raises(self):  # a bare Exception from measure() still yields an EvaluatorOutput
+        metric = _mock_metric()
+        metric.measure = MagicMock(side_effect=Exception("unexpected"))
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.errorCode is not None
+
+
+class TestDeepEvalAdapterEdgeCases:  # boundary values: no reason, zero score, no threshold attribute
+    def test_metric_with_no_reason(self):
+        silent_metric = _mock_metric(score=0.8, reason=None)
+        adapter_under_test = DeepEvalAdapter(metric=silent_metric)
+
+        outcome = adapter_under_test(_make_evaluator_input())
+
+        assert outcome.explanation == ""
+
+    def test_metric_score_zero(self):
+        zero_metric = _mock_metric(score=0.0, threshold=0.5)
+        adapter_under_test = DeepEvalAdapter(metric=zero_metric)
+
+        outcome = adapter_under_test(_make_evaluator_input())
+
+        assert outcome.value == 0.0
+        assert outcome.label == "Fail"
+
+    def test_default_threshold_when_missing(self):
+        bare_metric = _mock_metric(score=0.6)
+        del bare_metric.threshold  # later reads of .threshold now raise AttributeError
+        adapter_under_test = DeepEvalAdapter(metric=bare_metric)
+
+        outcome = adapter_under_test(_make_evaluator_input())
+
+        assert outcome.label == "Pass"
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
new file mode 100644
index 00000000..9669e5e5
--- /dev/null
+++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py
@@ -0,0 +1,195 @@
+"""Tests for span parsers."""
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import ReferenceInput
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import (
+    SpanParseResult,
+    parse_spans,
+)
+
+
+def _make_agent_span(input_messages=None, output_messages=None, span_id="span1"):
+    """Return an invoke_agent span dict holding at most one event with the given messages."""
+    sections = {"input": input_messages, "output": output_messages}
+    # Key order (input, then output) and omission of None sections match a hand-built body.
+    payload = {
+        section: {"messages": messages}
+        for section, messages in sections.items()
+        if messages is not None
+    }
+
+    return {
+        "traceId": "abc123",
+        "spanId": span_id,
+        "attributes": {"gen_ai.operation.name": "invoke_agent"},
+        # An empty payload yields a span with no events at all.
+        "span_events": [{"body": payload}] if payload else [],
+    }
+
+
+class TestParseSpansSuccess:  # parse_spans() field extraction from well-formed invoke_agent spans
+    def test_extracts_input_and_output(self):
+        spans = [
+            _make_agent_span(
+                input_messages=[{"role": "user", "content": "What is AI?"}],
+                output_messages=[{"role": "assistant", "content": "Artificial intelligence."}],
+            )
+        ]
+
+        result = parse_spans(spans)
+
+        assert result.input == "What is AI?"
+        assert result.actual_output == "Artificial intelligence."
+
+    def test_extracts_tool_messages_as_retrieval_context(self):  # tool content fills both context fields
+        spans = [
+            _make_agent_span(
+                input_messages=[{"role": "user", "content": "query"}],
+                output_messages=[
+                    {"role": "tool", "content": "doc chunk 1"},
+                    {"role": "tool", "content": "doc chunk 2"},
+                    {"role": "assistant", "content": "answer"},
+                ],
+            )
+        ]
+
+        result = parse_spans(spans)
+
+        assert result.retrieval_context == ["doc chunk 1", "doc chunk 2"]
+        assert result.context == ["doc chunk 1", "doc chunk 2"]
+        assert result.actual_output == "answer"
+
+    def test_uses_first_user_message_as_input(self):
+        spans = [
+            _make_agent_span(
+                input_messages=[
+                    {"role": "user", "content": "first"},
+                    {"role": "user", "content": "second"},
+                ],
+                output_messages=[{"role": "assistant", "content": "reply"}],
+            )
+        ]
+
+        result = parse_spans(spans)
+
+        assert result.input == "first"
+
+    def test_uses_last_assistant_message_as_output(self):
+        spans = [
+            _make_agent_span(
+                input_messages=[{"role": "user", "content": "q"}],
+                output_messages=[
+                    {"role": "assistant", "content": "first reply"},
+                    {"role": "assistant", "content": "final reply"},
+                ],
+            )
+        ]
+
+        result = parse_spans(spans)
+
+        assert result.actual_output == "final reply"
+
+    def test_expected_output_from_reference_inputs(self):  # expectedResponse text becomes expected_output
+        spans = [
+            _make_agent_span(
+                input_messages=[{"role": "user", "content": "q"}],
+                output_messages=[{"role": "assistant", "content": "a"}],
+            )
+        ]
+        refs = [ReferenceInput(expectedResponse={"text": "expected answer"})]
+
+        result = parse_spans(spans, reference_inputs=refs)
+
+        assert result.expected_output == "expected answer"
+
+    def test_nested_content_dict(self):  # content given as {"content": ...} is unwrapped
+        spans = [
+            _make_agent_span(
+                input_messages=[{"role": "user", "content": {"content": "nested"}}],
+                output_messages=[{"role": "assistant", "content": {"content": "nested out"}}],
+            )
+        ]
+
+        result = parse_spans(spans)
+
+        assert result.input == "nested"
+        assert result.actual_output == "nested out"
+
+    def test_body_as_json_string(self):  # the event body may arrive as a JSON-encoded string
+        import json
+
+        body = {
+            "input": {"messages": [{"role": "user", "content": "hello"}]},
+            "output": {"messages": [{"role": "assistant", "content": "hi"}]},
+        }
+        span = {
+            "traceId": "t1",
+            "spanId": "s1",
+            "attributes": {"gen_ai.operation.name": "invoke_agent"},
+            "span_events": [{"body": json.dumps(body)}],
+        }
+
+        result = parse_spans([span])
+
+        assert result.input == "hello"
+        assert result.actual_output == "hi"
+
+    def test_to_dict_omits_none(self):  # fields not supplied are left out of to_dict()
+        result = SpanParseResult(input="q", actual_output="a")
+        d = result.to_dict()
+
+        assert d == {"input": "q", "actual_output": "a"}
+        assert "retrieval_context" not in d
+
+
+class TestParseSpansErrors:  # inputs without usable invoke_agent data make parse_spans() raise ValueError
+    def test_no_agent_spans_raises(self):
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "other_op"},
+                "span_events": [],
+            }
+        ]
+
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans(spans)
+
+    def test_empty_spans_raises(self):
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans([])
+
+    def test_agent_span_without_events_raises(self):  # right operation name, but nothing to read
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "invoke_agent"},
+                "span_events": [],
+            }
+        ]
+
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans(spans)
+
+    def test_non_agent_spans_ignored(self):  # a "chat" span is not used even with complete messages
+        spans = [
+            {
+                "traceId": "t1",
+                "spanId": "s1",
+                "attributes": {"gen_ai.operation.name": "chat"},
+                "span_events": [
+                    {
+                        "body": {
+                            "input": {"messages": [{"role": "user", "content": "q"}]},
+                            "output": {"messages": [{"role": "assistant", "content": "a"}]},
+                        }
+                    }
+                ],
+            }
+        ]
+
+        with pytest.raises(ValueError, match="Could not extract evaluation fields"):
+            parse_spans(spans)
diff --git a/tests_integ/evaluation/test_third_party_adapters.py b/tests_integ/evaluation/test_third_party_adapters.py
new file mode 100644
index 00000000..a9f0eac6
--- /dev/null
+++ b/tests_integ/evaluation/test_third_party_adapters.py
@@ -0,0 +1,171 @@
+"""Integration tests for third-party evaluation adapters.
+
+These tests need the `deepeval` and `autoevals` packages; a test class is skipped if its package is missing.
+They verify the full adapter flow from EvaluatorInput through span parsing
+to metric execution, using real library metrics (not mocks).
+
+SETUP:
+    pip install deepeval autoevals
+
+RUN:
+    pytest tests_integ/evaluation/test_third_party_adapters.py -v
+"""
+
+import pytest
+
+from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput
+
+
+def _make_agent_evaluator_input(
+    user_prompt="What is the capital of France?",
+    agent_response="The capital of France is Paris.",
+    tool_messages=None,
+):
+    """Return a TRACE-level EvaluatorInput wrapping one invoke_agent span for the given exchange."""
+    # Tool results, when supplied, come before the single assistant reply.
+    reply_messages = [
+        *({"role": "tool", "content": text} for text in tool_messages or ()),
+        {"role": "assistant", "content": agent_response},
+    ]
+
+    trace_spans = [
+        {
+            "traceId": "integ-trace-1",
+            "spanId": "integ-span-1",
+            "attributes": {"gen_ai.operation.name": "invoke_agent"},
+            "span_events": [
+                {
+                    "body": {
+                        "input": {"messages": [{"role": "user", "content": user_prompt}]},
+                        "output": {"messages": reply_messages},
+                    }
+                }
+            ],
+        }
+    ]
+    return EvaluatorInput(
+        evaluation_level="TRACE",
+        session_spans=trace_spans,
+        target_trace_id="integ-trace-1",
+    )
+
+
+class TestDeepEvalAdapterIntegration:
+    """Integration tests for DeepEvalAdapter with real DeepEval metrics."""
+
+    @pytest.fixture(autouse=True)
+    def check_deepeval(self):  # autouse: runs before every test in this class
+        """Skip if deepeval is not installed."""
+        pytest.importorskip("deepeval")
+
+    def test_bias_metric_passes(self):  # checks only that a score and a Pass/Fail label come back
+        from deepeval.metrics import BiasMetric
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = BiasMetric(threshold=0.5)
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+        assert result.label in ("Pass", "Fail")
+
+    def test_missing_retrieval_context_returns_error(self):  # the input is built without tool messages
+        from deepeval.metrics import FaithfulnessMetric
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = FaithfulnessMetric(threshold=0.7)
+        adapter = DeepEvalAdapter(metric=metric)
+
+        result = adapter(
+            _make_agent_evaluator_input(
+                user_prompt="Is the sky blue?",
+                agent_response="Yes, the sky is blue.",
+            )
+        )
+        # NOTE(review): the second assertion also accepts a scored result, despite the test name — confirm intended.
+        assert isinstance(result, EvaluatorOutput)
+        assert result.errorCode == "MISSING_REQUIRED_FIELD" or result.value is not None
+
+    def test_with_field_mapper(self):  # mapper supplies the fields instead of the spans
+        from deepeval.metrics import BiasMetric
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter
+
+        metric = BiasMetric(threshold=0.5)
+        adapter = DeepEvalAdapter(
+            metric=metric,
+            field_mapper=lambda ev: {
+                "input": "Is Python a good language?",
+                "actual_output": "Python is a versatile programming language used widely.",
+            },
+        )
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+
+
+class TestAutoevalsAdapterIntegration:
+    """Integration tests for AutoevalsAdapter with real Autoevals scorers."""
+
+    @pytest.fixture(autouse=True)
+    def check_autoevals(self):  # autouse: runs before every test in this class
+        """Skip if autoevals is not installed."""
+        pytest.importorskip("autoevals")
+
+    def test_factuality_scorer(self):  # checks only that a score and a Pass/Fail label come back
+        from autoevals import Factuality
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(scorer=scorer)
+
+        evaluator_input = _make_agent_evaluator_input(
+            user_prompt="What is the capital of France?",
+            agent_response="The capital of France is Paris.",
+        )
+
+        result = adapter(evaluator_input)
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+        assert result.label in ("Pass", "Fail")
+
+    def test_custom_threshold(self):  # a caller-supplied threshold is accepted and a score still returned
+        from autoevals import Factuality
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(scorer=scorer, threshold=0.9)
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None
+
+    def test_with_field_mapper(self):  # mapper supplies input, actual and expected output directly
+        from autoevals import Factuality
+
+        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter
+
+        scorer = Factuality()
+        adapter = AutoevalsAdapter(
+            scorer=scorer,
+            field_mapper=lambda ev: {
+                "input": "What is 2+2?",
+                "actual_output": "4",
+                "expected_output": "4",
+            },
+        )
+
+        result = adapter(_make_agent_evaluator_input())
+
+        assert isinstance(result, EvaluatorOutput)
+        assert result.value is not None