diff --git a/.gitignore b/.gitignore index 01fe8e22..161403e7 100644 --- a/.gitignore +++ b/.gitignore @@ -229,3 +229,4 @@ local_settings.py Dockerfile CLAUDE.md .omc/ +.deepeval/ diff --git a/pyproject.toml b/pyproject.toml index 61520a5b..b1fc5e90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -173,3 +173,9 @@ simulation = [ datasets = [ "requests>=2.31.0", ] +deepeval = [ + "deepeval>=2.0.0", +] +autoevals = [ + "autoevals>=0.0.50", +] diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py new file mode 100644 index 00000000..06ba3d0a --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py @@ -0,0 +1,5 @@ +"""Third-party evaluation adapters for AgentCore code-based evaluators.""" + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter + +__all__ = ["BaseAdapter"] diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py new file mode 100644 index 00000000..40e25fc1 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py @@ -0,0 +1,5 @@ +"""Autoevals adapter for AgentCore code-based evaluators.""" + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter + +__all__ = ["AutoevalsAdapter"] diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py new file mode 100644 index 00000000..ae96a5b5 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py @@ -0,0 +1,71 @@ 
class AutoevalsAdapter(BaseAdapter):
    """Adapter that runs an Autoevals scorer against AgentCore evaluation events.

    The scorer is invoked through ``scorer.eval(**kwargs)`` with ``input``,
    ``output`` and, when an expected output was extracted, ``expected``. The
    ``score`` attribute of the returned object is compared against
    ``threshold`` to derive the Pass/Fail label.

    Example::

        from autoevals import Factuality
        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter

        scorer = Factuality()
        adapter = AutoevalsAdapter(scorer=scorer)
    """

    def __init__(
        self,
        scorer: Any,
        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
        threshold: float = 0.5,
    ):
        """Initialize the adapter.

        Args:
            scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()).
            field_mapper: Optional callable that receives the EvaluatorInput and
                returns a dict of field values. Bypasses default span parsing.
            threshold: Score threshold for Pass/Fail determination. Defaults to 0.5.
        """
        super().__init__(field_mapper=field_mapper)
        self.scorer = scorer
        self.threshold = threshold

    def validate_fields(self, fields: Dict[str, Any]) -> None:
        """Validate minimum required fields; scorer raises on additional missing params.

        Args:
            fields: Extracted field dict.

        Raises:
            ValueError: If ``input`` or ``actual_output`` is missing or empty.
        """
        missing = [name for name in ("input", "actual_output") if not fields.get(name)]
        if missing:
            scorer_name = type(self.scorer).__name__
            raise ValueError(
                f"Field(s) {missing} required by {scorer_name} but not found in evaluation event. "
                f"Provide a field_mapper or ensure spans contain the necessary data."
            )

    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
        """Run the Autoevals scorer and return formatted results.

        Args:
            fields: Extracted field dict; ``input`` and ``actual_output`` are
                forwarded as ``input``/``output``, ``expected_output`` (when
                truthy) as ``expected``.

        Returns:
            EvaluatorOutput carrying the scorer's score, a "Pass"/"Fail" label
            and the scorer's rationale (empty string when none is available).
        """
        kwargs: Dict[str, Any] = {
            "input": fields.get("input", ""),
            "output": fields.get("actual_output", ""),
        }
        if fields.get("expected_output"):
            kwargs["expected"] = fields["expected_output"]

        result = self.scorer.eval(**kwargs)

        score = result.score
        # A missing score (None) can never satisfy the threshold, so it is a Fail.
        label = "Pass" if score is not None and score >= self.threshold else "Fail"

        # ``metadata`` may be absent or explicitly None; treat both as "no metadata"
        # instead of calling .get() on None.
        metadata = getattr(result, "metadata", None) or {}
        explanation = metadata.get("rationale") or ""

        return EvaluatorOutput(value=score, label=label, explanation=explanation)
class BaseAdapter(abc.ABC):
    """Base adapter for third-party evaluation framework integrations.

    Accepts an EvaluatorInput (from the code_based_evaluators flow),
    extracts fields from spans using the built-in parser layer (or the
    caller-supplied ``field_mapper``), validates them, runs the evaluation
    via execute(), and returns an EvaluatorOutput.

    Failures during field extraction, ``ValueError`` from validation, and any
    exception from execution are converted into an EvaluatorOutput with
    ``label="Error"`` and an ``errorCode`` rather than being raised.
    """

    def __init__(
        self,
        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
    ):
        """Initialize the adapter.

        Args:
            field_mapper: Optional callable that receives the EvaluatorInput and
                returns a dict with keys: 'input', 'actual_output', and optionally
                'expected_output', 'context', 'retrieval_context'. Bypasses default
                span parsing when provided.
        """
        self.field_mapper = field_mapper

    def __call__(self, evaluator_input: EvaluatorInput, context: Any = None) -> EvaluatorOutput:
        """Handle an evaluation invocation.

        Args:
            evaluator_input: Parsed EvaluatorInput from the code-based evaluator flow.
            context: Lambda context object (unused).

        Returns:
            EvaluatorOutput with score, label, and explanation or error fields.
        """
        try:
            fields = self._extract_fields(evaluator_input)
        except Exception as e:
            # ValueError is the documented "could not extract" signal. Anything
            # else (e.g. a KeyError inside a user-supplied field_mapper) is
            # unexpected, so keep the traceback but still return an error output.
            expected = isinstance(e, ValueError)
            logger.error("Field extraction failed: %s", e, exc_info=not expected)
            return EvaluatorOutput(
                label="Error",
                errorCode="FIELD_EXTRACTION_ERROR",
                errorMessage=str(e) if expected else f"{type(e).__name__}: {e}",
            )

        try:
            self.validate_fields(fields)
        except ValueError as e:
            logger.error("Validation failed: %s", e)
            return EvaluatorOutput(
                label="Error",
                errorCode="MISSING_REQUIRED_FIELD",
                errorMessage=str(e),
            )

        try:
            return self.execute(fields)
        except Exception as e:
            logger.error("Execution failed: %s", e, exc_info=True)
            return EvaluatorOutput(
                label="Error",
                errorCode="METRIC_ERROR",
                errorMessage=f"{type(self).__name__} failed: {e}",
            )

    def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]:
        """Extract fields from the EvaluatorInput.

        Uses ``field_mapper`` when one was supplied; otherwise parses
        ``evaluator_input.session_spans`` with ``parse_spans``.

        Raises:
            ValueError: If ``field_mapper`` returns something other than a dict,
                or if ``parse_spans`` cannot extract data.
        """
        if self.field_mapper is not None:
            fields = self.field_mapper(evaluator_input)
            # validate_fields()/execute() call .get() on the result; reject other
            # return types here so the failure is reported as an extraction error.
            if not isinstance(fields, dict):
                raise ValueError(f"field_mapper must return a dict, got {type(fields).__name__}")
            return fields

        result = parse_spans(evaluator_input.session_spans, evaluator_input.reference_inputs)
        return result.to_dict()

    @abc.abstractmethod
    def validate_fields(self, fields: Dict[str, Any]) -> None:
        """Validate that required fields are present.

        Each adapter must explicitly declare its validation behavior.

        Args:
            fields: Extracted field dict.

        Raises:
            ValueError: If required fields are missing.
        """

    @abc.abstractmethod
    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
        """Run the evaluation and return an EvaluatorOutput.

        Args:
            fields: Extracted field dict with keys like "input", "actual_output", etc.

        Returns:
            EvaluatorOutput with evaluation results.
        """
class DeepEvalAdapter(BaseAdapter):
    """Adapter that runs a DeepEval metric against AgentCore evaluation events.

    The extracted fields are wrapped in an ``LLMTestCase`` and passed to
    ``metric.measure``; the metric's ``score``, ``reason`` and ``success``
    attributes are then read back to build the EvaluatorOutput.

    Example::

        from deepeval.metrics import AnswerRelevancyMetric
        from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter

        metric = AnswerRelevancyMetric(threshold=0.7)
        adapter = DeepEvalAdapter(metric=metric)
    """

    def __init__(
        self,
        metric: BaseMetric,
        field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None,
        model: Optional[Any] = None,
    ):
        """Initialize the adapter.

        Args:
            metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric).
            field_mapper: Optional callable that receives the EvaluatorInput and
                returns a dict of LLMTestCase field values. Bypasses default span
                parsing when provided.
            model: Optional model override for the metric's LLM.
        """
        super().__init__(field_mapper=field_mapper)
        self.metric = metric
        if model is not None:
            # NOTE(review): this mutates the caller's metric instance and assigns
            # the value as-is; confirm the metric accepts a post-construction
            # ``model`` assignment of this type.
            self.metric.model = model

    def validate_fields(self, fields: Dict[str, Any]) -> None:
        """Validate that input and actual_output are present.

        Args:
            fields: Extracted field dict.

        Raises:
            ValueError: If ``input`` or ``actual_output`` is missing or empty.
        """
        missing = [name for name in ("input", "actual_output") if not fields.get(name)]
        if missing:
            metric_name = type(self.metric).__name__
            raise ValueError(
                f"Field(s) {missing} required by {metric_name} but not found in evaluation event. "
                f"Provide a field_mapper or ensure spans contain the necessary data."
            )

    def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput:
        """Run the DeepEval metric and return formatted results.

        Args:
            fields: Extracted field dict used to populate the LLMTestCase.

        Returns:
            EvaluatorOutput with the metric's score, a "Pass"/"Fail" label and
            the metric's reason, or an "Error" output with
            ``MISSING_REQUIRED_FIELD`` when the metric reports missing
            test-case parameters.

        Raises:
            Exception: Any other exception from ``metric.measure`` is re-raised.
        """
        test_case = LLMTestCase(
            input=fields.get("input", ""),
            actual_output=fields.get("actual_output", ""),
            expected_output=fields.get("expected_output"),
            context=fields.get("context"),
            retrieval_context=fields.get("retrieval_context"),
        )

        try:
            self.metric.measure(test_case)
        except Exception as e:
            # Matched by class name rather than by importing the exception type.
            if type(e).__name__ == "MissingTestCaseParamsError":
                return EvaluatorOutput(
                    label="Error",
                    errorCode="MISSING_REQUIRED_FIELD",
                    errorMessage=f"{type(self.metric).__name__} requires fields not extracted from spans: {e}. "
                    f"Provide a field_mapper to supply custom fields from your trace data.",
                )
            raise

        score = self.metric.score
        reason = getattr(self.metric, "reason", None) or ""

        # Prefer the metric's own verdict. When it is absent *or* None, fall back
        # to comparing score with threshold; guard both operands so a missing
        # value yields "Fail" rather than a TypeError.
        success = getattr(self.metric, "success", None)
        if success is None:
            threshold = getattr(self.metric, "threshold", 0.5)
            success = score is not None and threshold is not None and score >= threshold
        label = "Pass" if success else "Fail"

        return EvaluatorOutput(value=score, label=label, explanation=reason)
def parse_spans(
    session_spans: List[Dict[str, Any]],
    reference_inputs: Optional[List[Any]] = None,
) -> SpanParseResult:
    """Return the first non-None result produced by the registered span parsers.

    Parsers in ``_PARSERS`` are tried in order and evaluation stops at the
    first one that returns a result. When ``reference_inputs`` is non-empty,
    the ``expected_response_text`` attribute of its first element (if truthy)
    is copied onto the result as ``expected_output``.

    Args:
        session_spans: Raw ADOT span dicts from the evaluation service.
        reference_inputs: Optional ReferenceInput list for expected_output.

    Returns:
        SpanParseResult with extracted fields.

    Raises:
        ValueError: If no parser can extract data from the spans.
    """
    # Generator keeps the evaluation lazy: later parsers only run when the
    # earlier ones returned None.
    attempts = (candidate(session_spans) for candidate in _PARSERS)
    parsed = next((outcome for outcome in attempts if outcome is not None), None)

    if parsed is None:
        raise ValueError(
            "Could not extract evaluation fields from spans. "
            "No agent-level span with gen_ai.operation.name=='invoke_agent' and "
            "valid span_events found. Provide a field_mapper for custom formats."
        )

    if reference_inputs:
        expected_text = getattr(reference_inputs[0], "expected_response_text", None)
        if expected_text:
            parsed.expected_output = expected_text

    return parsed
@dataclass
class SpanParseResult:
    """Result of parsing spans into evaluation fields."""

    input: Optional[str] = None
    actual_output: Optional[str] = None
    retrieval_context: Optional[List[str]] = None
    context: Optional[List[str]] = None
    expected_output: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dict, omitting None values."""
        ordered = (
            ("input", self.input),
            ("actual_output", self.actual_output),
            ("retrieval_context", self.retrieval_context),
            ("context", self.context),
            ("expected_output", self.expected_output),
        )
        return {key: value for key, value in ordered if value is not None}


def _get_message_content(message: Any) -> str:
    """Extract text content from a message object.

    Strings are returned as-is. For dicts, the first of the ``content`` /
    ``message`` keys holding a non-None value is used: strings are returned,
    dicts are unwrapped recursively, and lists are joined with newlines from
    their string items and the string ``text`` of their dict items. Any other
    value is stringified. Returns "" when nothing usable is found.
    """
    if isinstance(message, str):
        return message
    if not isinstance(message, dict):
        return ""

    for key in ("content", "message"):
        val = message.get(key)
        if val is None:
            # Absent or explicit None (e.g. tool-call-only messages): there is
            # no text here, and str(None) would fabricate the text "None".
            continue
        if isinstance(val, str):
            return val
        if isinstance(val, dict):
            return _get_message_content(val)
        if isinstance(val, list):
            parts: List[str] = []
            for item in val:
                if isinstance(item, str):
                    parts.append(item)
                elif isinstance(item, dict) and isinstance(item.get("text"), str):
                    parts.append(item["text"])
            if parts:
                return "\n".join(parts)
        return str(val)
    return ""


def _parse_span_event_body(body: Any) -> Dict[str, Any]:
    """Parse the body of a span event, handling both dict and JSON string.

    Always returns a dict: undecodable strings, JSON that decodes to a
    non-object (list, number, string, null) and any other input yield ``{}``.
    """
    if isinstance(body, str):
        try:
            body = json.loads(body)
        except (json.JSONDecodeError, TypeError):
            return {}
    return body if isinstance(body, dict) else {}


def _iter_role_contents(section: Any):
    """Yield ``(role, content)`` for dict messages with non-empty text in *section*."""
    if not isinstance(section, dict):
        return
    messages = section.get("messages") or []
    if not isinstance(messages, (list, tuple)):
        return
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        content = _get_message_content(msg)
        if content:
            yield msg.get("role", ""), content


def extract_from_agent_span_events(
    session_spans: List[Dict[str, Any]],
) -> Optional[SpanParseResult]:
    """Extract evaluation fields from agent-level span events.

    Looks for spans where attributes["gen_ai.operation.name"] == "invoke_agent",
    then inspects each span event body: ``input.messages`` entries with role
    "user" and ``output.messages`` entries with role "assistant" or "tool".
    Malformed spans, events and bodies are skipped rather than raising.

    Args:
        session_spans: Raw ADOT span dicts.

    Returns:
        SpanParseResult if at least one user or assistant message was found in
        an agent span, None otherwise. ``input`` is the first user message,
        ``actual_output`` the last assistant message, and tool messages (if
        any) populate both ``retrieval_context`` and ``context``.
    """
    user_messages: List[str] = []
    assistant_messages: List[str] = []
    tool_messages: List[str] = []

    for span in session_spans:
        if not isinstance(span, dict):
            continue
        attributes = span.get("attributes") or {}
        # Phase 1: only invoke_agent spans supported; others fall through to field_mapper
        if not isinstance(attributes, dict) or attributes.get("gen_ai.operation.name") != "invoke_agent":
            continue

        for event in span.get("span_events") or []:
            if not isinstance(event, dict):
                continue
            body = _parse_span_event_body(event.get("body"))
            if not body:
                continue

            for role, content in _iter_role_contents(body.get("input")):
                if role == "user":
                    user_messages.append(content)

            for role, content in _iter_role_contents(body.get("output")):
                if role == "assistant":
                    assistant_messages.append(content)
                elif role == "tool":
                    tool_messages.append(content)

    # No agent span at all also ends up here, since nothing was collected.
    if not user_messages and not assistant_messages:
        return None

    result = SpanParseResult()
    if user_messages:
        result.input = user_messages[0]
    if assistant_messages:
        result.actual_output = assistant_messages[-1]
    if tool_messages:
        result.retrieval_context = tool_messages
        result.context = tool_messages

    return result
def parse_openinference_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
    """Parse spans emitted by OpenInference LangChain instrumentation.

    Currently a thin wrapper: the work is delegated unchanged to
    ``extract_from_agent_span_events``. It exists as the place to add
    OpenInference-specific handling should the schema diverge.

    Args:
        session_spans: Raw ADOT span dicts.

    Returns:
        Whatever ``extract_from_agent_span_events`` returns for these spans
        (a SpanParseResult, or None).
    """
    parsed = extract_from_agent_span_events(session_spans)
    return parsed
def parse_otel_langchain_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
    """Parse spans emitted by OTel LangChain instrumentation.

    Currently a thin wrapper: the work is delegated unchanged to
    ``extract_from_agent_span_events``. It exists as the place to add
    LangChain-specific handling should the schema diverge.

    Args:
        session_spans: Raw ADOT span dicts.

    Returns:
        Whatever ``extract_from_agent_span_events`` returns for these spans
        (a SpanParseResult, or None).
    """
    parsed = extract_from_agent_span_events(session_spans)
    return parsed
def parse_strands_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]:
    """Parse spans emitted by the Strands Agent SDK.

    Delegates unchanged to ``extract_from_agent_span_events``.

    Args:
        session_spans: Raw ADOT span dicts.

    Returns:
        Whatever ``extract_from_agent_span_events`` returns for these spans
        (a SpanParseResult, or None).
    """
    parsed = extract_from_agent_span_events(session_spans)
    return parsed
def _mock_scorer(score=0.9, rationale="Good answer"):
    """Build a MagicMock scorer whose ``eval`` returns a canned result.

    The returned mock reports ``"MockScorer"`` as its class name, and its
    ``eval`` yields an object exposing ``score`` and
    ``metadata == {"rationale": rationale}``.
    """
    eval_result = MagicMock()
    eval_result.score = score
    eval_result.metadata = {"rationale": rationale}

    mock = MagicMock()
    type(mock).__name__ = "MockScorer"
    mock.eval = MagicMock(return_value=eval_result)
    return mock
class TestAutoevalsAdapterErrors:
    """Error paths: each failure must surface as an EvaluatorOutput error code."""

    def test_no_agent_spans_returns_error(self):
        # A "chat" span is not an agent-level span, so extraction finds nothing.
        chat_only = [
            {
                "traceId": "t1",
                "spanId": "s1",
                "attributes": {"gen_ai.operation.name": "chat"},
                "span_events": [],
            }
        ]
        adapter = AutoevalsAdapter(scorer=_mock_scorer())

        outcome = adapter(_make_evaluator_input(spans=chat_only))

        assert outcome.errorCode == "FIELD_EXTRACTION_ERROR"

    def test_missing_input_returns_error(self):
        # Agent span carries only an assistant message; no user input.
        output_only_event = {
            "body": {
                "output": {"messages": [{"role": "assistant", "content": "answer"}]},
            }
        }
        output_only = [
            {
                "traceId": "t1",
                "spanId": "s1",
                "attributes": {"gen_ai.operation.name": "invoke_agent"},
                "span_events": [output_only_event],
            }
        ]
        adapter = AutoevalsAdapter(scorer=_mock_scorer())

        outcome = adapter(_make_evaluator_input(spans=output_only))

        assert outcome.errorCode == "MISSING_REQUIRED_FIELD"
        assert "input" in outcome.errorMessage

    def test_scorer_exception_returns_error(self):
        failing = _mock_scorer()
        failing.eval = MagicMock(side_effect=RuntimeError("API error"))
        adapter = AutoevalsAdapter(scorer=failing)

        outcome = adapter(_make_evaluator_input())

        assert outcome.errorCode == "METRIC_ERROR"
        assert "API error" in outcome.errorMessage

    def test_never_raises(self):
        failing = _mock_scorer()
        failing.eval = MagicMock(side_effect=Exception("unexpected"))
        adapter = AutoevalsAdapter(scorer=failing)

        outcome = adapter(_make_evaluator_input())

        assert isinstance(outcome, EvaluatorOutput)
        assert outcome.errorCode is not None
class TestAutoevalsAdapterEdgeCases:
    """Edge cases around missing score and missing metadata."""

    def test_score_none_returns_fail(self):
        adapter = AutoevalsAdapter(scorer=_mock_scorer(score=None))

        outcome = adapter(_make_evaluator_input())

        assert outcome.label == "Fail"

    def test_no_metadata_returns_empty_explanation(self):
        # spec=[] gives a mock with no auto-created attributes, so the result
        # object has a score but no ``metadata`` attribute.
        bare_result = MagicMock(spec=[])
        bare_result.score = 0.9

        scorer = MagicMock()
        type(scorer).__name__ = "MockScorer"
        scorer.eval = MagicMock(return_value=bare_result)

        adapter = AutoevalsAdapter(scorer=scorer)

        outcome = adapter(_make_evaluator_input())

        assert outcome.explanation == ""
def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"):
    """Create a mock metric that returns a fixed score on measure().

    The mock reports *name* as its class name, exposes ``threshold``,
    ``score`` and ``reason``, and has no ``success`` attribute. Calling
    ``measure`` re-applies ``score`` and ``reason`` to the mock.
    """
    mock = MagicMock()
    type(mock).__name__ = name
    mock.threshold = threshold
    mock.score = score
    mock.reason = reason
    # Deleting the attribute makes later access raise AttributeError instead of
    # auto-creating a child mock.
    del mock.success

    def _apply_fixed_result(test_case):
        mock.score = score
        mock.reason = reason

    mock.measure = MagicMock(side_effect=_apply_fixed_result)
    return mock
+ + def test_custom_field_mapper(self): + metric = _mock_metric() + adapter = DeepEvalAdapter( + metric=metric, + field_mapper=lambda ev: { + "input": "mapped input", + "actual_output": "mapped output", + }, + ) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.85 + test_case = metric.measure.call_args[0][0] + assert test_case.input == "mapped input" + assert test_case.actual_output == "mapped output" + + def test_reference_inputs_populates_expected_output(self): + metric = _mock_metric() + adapter = DeepEvalAdapter(metric=metric) + + evaluator_input = EvaluatorInput( + evaluation_level="TRACE", + session_spans=[ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ], + } + ], + target_trace_id="t1", + reference_inputs=[{"expectedResponse": {"text": "AI stands for artificial intelligence."}}], + ) + + result = adapter(evaluator_input) + + test_case = metric.measure.call_args[0][0] + assert test_case.expected_output == "AI stands for artificial intelligence." 
+ + def test_model_override_sets_metric_model(self): + metric = _mock_metric() + DeepEvalAdapter(metric=metric, model="bedrock/anthropic.claude-3") + + assert metric.model == "bedrock/anthropic.claude-3" + + def test_label_uses_metric_success_true(self): + metric = _mock_metric(score=0.3, threshold=0.7) + metric.success = True + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.3 + assert result.label == "Pass" + + def test_label_uses_metric_success_false(self): + metric = _mock_metric(score=0.9, threshold=0.7) + metric.success = False + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.9 + assert result.label == "Fail" + + +class TestDeepEvalAdapterErrors: + def test_no_agent_spans_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "chat"}, + "span_events": [], + } + ] + metric = _mock_metric() + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input(spans=spans)) + + assert isinstance(result, EvaluatorOutput) + assert result.errorCode == "FIELD_EXTRACTION_ERROR" + assert result.label == "Error" + + def test_missing_input_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "output": {"messages": [{"role": "assistant", "content": "answer"}]}, + } + } + ], + } + ] + metric = _mock_metric() + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input(spans=spans)) + + assert result.errorCode == "MISSING_REQUIRED_FIELD" + assert "input" in result.errorMessage + assert "field_mapper" in result.errorMessage + metric.measure.assert_not_called() + + def test_metric_measure_exception_returns_error(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout")) + adapter = 
DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.errorCode == "METRIC_ERROR" + assert "LLM timeout" in result.errorMessage + + def test_missing_params_error_caught(self): + metric = _mock_metric() + + MissingTestCaseParamsError = type("MissingTestCaseParamsError", (Exception,), {}) + metric.measure = MagicMock( + side_effect=MissingTestCaseParamsError("retrieval_context is required") + ) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.errorCode == "MISSING_REQUIRED_FIELD" + assert "retrieval_context" in result.errorMessage + assert "field_mapper" in result.errorMessage + + def test_never_raises(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=Exception("unexpected")) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.errorCode is not None + + +class TestDeepEvalAdapterEdgeCases: + def test_metric_with_no_reason(self): + metric = _mock_metric(score=0.8, reason=None) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.explanation == "" + + def test_metric_score_zero(self): + metric = _mock_metric(score=0.0, threshold=0.5) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.0 + assert result.label == "Fail" + + def test_default_threshold_when_missing(self): + metric = _mock_metric(score=0.6) + del metric.threshold + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.label == "Pass" diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py new file mode 100644 index 00000000..9669e5e5 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py @@ -0,0 +1,195 @@ +"""Tests for span parsers.""" + +import pytest + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import ReferenceInput +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import ( + SpanParseResult, + parse_spans, +) + + +def _make_agent_span(input_messages=None, output_messages=None, span_id="span1"): + """Build an agent-level span with span_events.""" + span_events = [] + body = {} + if input_messages is not None: + body["input"] = {"messages": input_messages} + if output_messages is not None: + body["output"] = {"messages": output_messages} + if body: + span_events.append({"body": body}) + + return { + "traceId": "abc123", + "spanId": span_id, + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": span_events, + } + + +class TestParseSpansSuccess: + def test_extracts_input_and_output(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "What is AI?"}], + output_messages=[{"role": "assistant", "content": "Artificial intelligence."}], + ) + ] + + result = parse_spans(spans) + + assert result.input == "What is AI?" + assert result.actual_output == "Artificial intelligence." 
+ + def test_extracts_tool_messages_as_retrieval_context(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "doc chunk 1"}, + {"role": "tool", "content": "doc chunk 2"}, + {"role": "assistant", "content": "answer"}, + ], + ) + ] + + result = parse_spans(spans) + + assert result.retrieval_context == ["doc chunk 1", "doc chunk 2"] + assert result.context == ["doc chunk 1", "doc chunk 2"] + assert result.actual_output == "answer" + + def test_uses_first_user_message_as_input(self): + spans = [ + _make_agent_span( + input_messages=[ + {"role": "user", "content": "first"}, + {"role": "user", "content": "second"}, + ], + output_messages=[{"role": "assistant", "content": "reply"}], + ) + ] + + result = parse_spans(spans) + + assert result.input == "first" + + def test_uses_last_assistant_message_as_output(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "q"}], + output_messages=[ + {"role": "assistant", "content": "first reply"}, + {"role": "assistant", "content": "final reply"}, + ], + ) + ] + + result = parse_spans(spans) + + assert result.actual_output == "final reply" + + def test_expected_output_from_reference_inputs(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "q"}], + output_messages=[{"role": "assistant", "content": "a"}], + ) + ] + refs = [ReferenceInput(expectedResponse={"text": "expected answer"})] + + result = parse_spans(spans, reference_inputs=refs) + + assert result.expected_output == "expected answer" + + def test_nested_content_dict(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": {"content": "nested"}}], + output_messages=[{"role": "assistant", "content": {"content": "nested out"}}], + ) + ] + + result = parse_spans(spans) + + assert result.input == "nested" + assert result.actual_output == "nested out" + + def test_body_as_json_string(self): + import 
json + + body = { + "input": {"messages": [{"role": "user", "content": "hello"}]}, + "output": {"messages": [{"role": "assistant", "content": "hi"}]}, + } + span = { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [{"body": json.dumps(body)}], + } + + result = parse_spans([span]) + + assert result.input == "hello" + assert result.actual_output == "hi" + + def test_to_dict_omits_none(self): + result = SpanParseResult(input="q", actual_output="a") + d = result.to_dict() + + assert d == {"input": "q", "actual_output": "a"} + assert "retrieval_context" not in d + + +class TestParseSpansErrors: + def test_no_agent_spans_raises(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "other_op"}, + "span_events": [], + } + ] + + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans(spans) + + def test_empty_spans_raises(self): + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans([]) + + def test_agent_span_without_events_raises(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [], + } + ] + + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans(spans) + + def test_non_agent_spans_ignored(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "chat"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "q"}]}, + "output": {"messages": [{"role": "assistant", "content": "a"}]}, + } + } + ], + } + ] + + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans(spans) diff --git a/tests_integ/evaluation/test_third_party_adapters.py b/tests_integ/evaluation/test_third_party_adapters.py new file mode 100644 index 00000000..a9f0eac6 --- /dev/null +++ 
b/tests_integ/evaluation/test_third_party_adapters.py @@ -0,0 +1,171 @@ +"""Integration tests for third-party evaluation adapters. + +These tests require `deepeval` and `autoevals` packages to be installed. +They verify the full adapter flow from EvaluatorInput through span parsing +to metric execution, using real library metrics (not mocks). + +SETUP: + pip install deepeval autoevals + +RUN: + pytest tests_integ/evaluation/test_third_party_adapters.py -v +""" + +import pytest + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput + + +def _make_agent_evaluator_input( + user_prompt="What is the capital of France?", + agent_response="The capital of France is Paris.", + tool_messages=None, +): + """Build an EvaluatorInput with agent-level spans.""" + output_messages = [] + if tool_messages: + for msg in tool_messages: + output_messages.append({"role": "tool", "content": msg}) + output_messages.append({"role": "assistant", "content": agent_response}) + + spans = [ + { + "traceId": "integ-trace-1", + "spanId": "integ-span-1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": user_prompt}]}, + "output": {"messages": output_messages}, + } + } + ], + } + ] + return EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="integ-trace-1", + ) + + +class TestDeepEvalAdapterIntegration: + """Integration tests for DeepEvalAdapter with real DeepEval metrics.""" + + @pytest.fixture(autouse=True) + def check_deepeval(self): + """Skip if deepeval is not installed.""" + pytest.importorskip("deepeval") + + def test_bias_metric_passes(self): + from deepeval.metrics import BiasMetric + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter + + metric = BiasMetric(threshold=0.5) + adapter = DeepEvalAdapter(metric=metric) + + result = 
adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + assert result.label in ("Pass", "Fail") + + def test_missing_retrieval_context_returns_error(self): + from deepeval.metrics import FaithfulnessMetric + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter + + metric = FaithfulnessMetric(threshold=0.7) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter( + _make_agent_evaluator_input( + user_prompt="Is the sky blue?", + agent_response="Yes, the sky is blue.", + ) + ) + + assert isinstance(result, EvaluatorOutput) + assert result.errorCode == "MISSING_REQUIRED_FIELD" or result.value is not None + + def test_with_field_mapper(self): + from deepeval.metrics import BiasMetric + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter + + metric = BiasMetric(threshold=0.5) + adapter = DeepEvalAdapter( + metric=metric, + field_mapper=lambda ev: { + "input": "Is Python a good language?", + "actual_output": "Python is a versatile programming language used widely.", + }, + ) + + result = adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + + +class TestAutoevalsAdapterIntegration: + """Integration tests for AutoevalsAdapter with real Autoevals scorers.""" + + @pytest.fixture(autouse=True) + def check_autoevals(self): + """Skip if autoevals is not installed.""" + pytest.importorskip("autoevals") + + def test_factuality_scorer(self): + from autoevals import Factuality + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter + + scorer = Factuality() + adapter = AutoevalsAdapter(scorer=scorer) + + evaluator_input = _make_agent_evaluator_input() + evaluator_input.session_spans[0]["span_events"][0]["body"]["output"]["messages"] = [ + {"role": "assistant", "content": 
"The capital of France is Paris."} + ] + + result = adapter(evaluator_input) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + assert result.label in ("Pass", "Fail") + + def test_custom_threshold(self): + from autoevals import Factuality + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter + + scorer = Factuality() + adapter = AutoevalsAdapter(scorer=scorer, threshold=0.9) + + result = adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + + def test_with_field_mapper(self): + from autoevals import Factuality + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter + + scorer = Factuality() + adapter = AutoevalsAdapter( + scorer=scorer, + field_mapper=lambda ev: { + "input": "What is 2+2?", + "actual_output": "4", + "expected_output": "4", + }, + ) + + result = adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None