From 77d16a90a2bc14adb6f46c0635a5f82254c95d21 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 10:39:26 -0700 Subject: [PATCH 01/13] Add DeepEvalHandler integration with unit tests Introduces a new integrations/deepeval/ module that adapts AgentCore Lambda evaluation events into DeepEval LLMTestCase objects, runs any BaseMetric, and returns structured score/label/explanation responses. --- .../integrations/deepeval/__init__.py | 5 + .../integrations/deepeval/handler.py | 88 +++++ .../integrations/deepeval/input_mapper.py | 191 ++++++++++ .../integrations/deepeval/__init__.py | 0 .../integrations/deepeval/test_handler.py | 230 ++++++++++++ .../deepeval/test_input_mapper.py | 331 ++++++++++++++++++ 6 files changed, 845 insertions(+) create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py new file mode 100644 index 00000000..76f6461f --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py @@ -0,0 +1,5 @@ +"""DeepEval integration for AgentCore Evaluation.""" + +from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler + +__all__ = ["DeepEvalHandler"] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py new file mode 100644 index 00000000..b339b883 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -0,0 +1,88 @@ +"""DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics.""" + +import logging +from typing import Any, Callable, Dict, Optional + +from deepeval.metrics import BaseMetric + +from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( + ParsedEvaluationEvent, + build_test_case, +) + +logger = logging.getLogger(__name__) + + +class DeepEvalHandler: + """Lambda handler that runs a DeepEval metric against AgentCore evaluation events. + + Never raises unhandled exceptions — always returns a valid response dict. + + Example:: + + from deepeval.metrics import AnswerRelevancyMetric + + metric = AnswerRelevancyMetric(threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + # Use as Lambda handler + def lambda_handler(event, context): + return handler(event, context) + """ + + def __init__( + self, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + ): + """Initialize the handler. + + Args: + metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of LLMTestCase field values. Bypasses default span + extraction when provided. + """ + self.metric = metric + self.field_mapper = field_mapper + + def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: + """Handle a Lambda invocation. + + Args: + event: Raw Lambda event dict from the evaluation service. + context: Lambda context object (unused). + + Returns: + Success: {"value": float, "label": str, "explanation": str} + Error: {"errorCode": str, "errorMessage": str} + """ + try: + parsed = ParsedEvaluationEvent.from_lambda_event(event) + except (KeyError, IndexError, TypeError) as e: + logger.error("Failed to parse evaluation event: %s", e) + return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}") + + try: + test_case = build_test_case(parsed, self.metric, self.field_mapper) + except ValueError as e: + logger.error("Missing required fields: %s", e) + return _error_response("MISSING_REQUIRED_FIELD", str(e)) + + try: + self.metric.measure(test_case) + except Exception as e: + logger.error("Metric measurement failed: %s", e, exc_info=True) + return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}") + + score = self.metric.score + reason = getattr(self.metric, "reason", None) or "" + threshold = getattr(self.metric, "threshold", 0.5) + label = "Pass" if score is not None and score >= threshold else "Fail" + + return {"value": score, "label": label, "explanation": reason} + + +def _error_response(code: str, message: str) -> Dict[str, str]: + """Build a standardized error response dict.""" + return {"errorCode": code, "errorMessage": message} diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py new file mode 100644 index 00000000..50873cf5 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -0,0 +1,191 @@ +"""Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects.""" + +import logging +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional + +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, LLMTestCaseParams + +logger = logging.getLogger(__name__) + +_PARAM_TO_FIELD: Dict[LLMTestCaseParams, str] = { + LLMTestCaseParams.INPUT: "input", + LLMTestCaseParams.ACTUAL_OUTPUT: "actual_output", + LLMTestCaseParams.EXPECTED_OUTPUT: "expected_output", + LLMTestCaseParams.CONTEXT: "context", + LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrieval_context", +} + +_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { + "AnswerRelevancyMetric": ["input", "actual_output"], + "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"], + "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"], + "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "HallucinationMetric": ["input", "actual_output", "context"], + "BiasMetric": ["input", "actual_output"], + "ToxicityMetric": ["input", "actual_output"], + "GEval": ["input", "actual_output"], + "SummarizationMetric": ["input", "actual_output"], +} + + +@dataclass +class ParsedEvaluationEvent: + """Parsed representation of the AgentCore Lambda evaluation event.""" + + evaluation_level: str + session_spans: List[Dict[str, Any]] + target_trace_id: Optional[str] = None + target_span_id: Optional[str] = None + reference_inputs: List[Dict[str, Any]] = field(default_factory=list) + + @classmethod + def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent": + """Parse a raw Lambda event dict into a structured object. + + Args: + event: Raw Lambda event payload from the evaluation service. + + Returns: + ParsedEvaluationEvent with extracted fields. + + Raises: + KeyError: If required top-level fields are missing. + """ + evaluation_input = event["evaluationInput"] + target = event.get("evaluationTarget") or {} + trace_ids = target.get("traceIds") or [] + span_ids = target.get("spanIds") or [] + + return cls( + evaluation_level=event["evaluationLevel"], + session_spans=evaluation_input["sessionSpans"], + target_trace_id=trace_ids[0] if trace_ids else None, + target_span_id=span_ids[0] if span_ids else None, + reference_inputs=event.get("evaluationReferenceInputs") or [], + ) + + +def _get_required_params(metric: BaseMetric) -> List[str]: + """Determine which LLMTestCase fields a metric requires. + + Fallback chain: + 1. metric._required_params (DeepEval internal attribute) + 2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name + 3. metric.evaluation_params (GEval special case) + 4. Default: ["input", "actual_output"] + """ + if hasattr(metric, "_required_params") and metric._required_params: + params = metric._required_params + return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + + class_name = type(metric).__name__ + if class_name in _METRIC_REQUIRED_PARAMS: + return _METRIC_REQUIRED_PARAMS[class_name] + + if hasattr(metric, "evaluation_params") and metric.evaluation_params: + params = metric.evaluation_params + return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + + return ["input", "actual_output"] + + +def _extract_fields_from_spans( + parsed: ParsedEvaluationEvent, +) -> Dict[str, Any]: + """Extract LLMTestCase fields from ADOT session spans. + + Bridges Session → LLMTestCase fields: + - input ← user messages (role=="user") + - actual_output ← assistant messages (role=="assistant") + - retrieval_context ← tool messages (role=="tool") + - expected_output ← evaluationReferenceInputs[0].expectedResponse + """ + user_messages: List[str] = [] + assistant_messages: List[str] = [] + tool_messages: List[str] = [] + + for span in parsed.session_spans: + attributes = span.get("attributes", {}) + role = attributes.get("gen_ai.message.role", "") + content = attributes.get("gen_ai.message.content", "") + + if not content: + content = attributes.get("gen_ai.completion", "") + + if role == "user" and content: + user_messages.append(content) + elif role == "assistant" and content: + assistant_messages.append(content) + elif role == "tool" and content: + tool_messages.append(content) + + fields: Dict[str, Any] = {} + + if user_messages: + fields["input"] = "\n".join(user_messages) + if assistant_messages: + fields["actual_output"] = "\n".join(assistant_messages) + if tool_messages: + fields["retrieval_context"] = tool_messages + + if parsed.reference_inputs: + expected = parsed.reference_inputs[0].get("expectedResponse") + if expected: + fields["expected_output"] = expected + + return fields + + +def build_test_case( + parsed: ParsedEvaluationEvent, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, +) -> LLMTestCase: + """Build a DeepEval LLMTestCase from a parsed evaluation event. + + Args: + parsed: The parsed Lambda event. + metric: The DeepEval metric (used to determine required fields). + field_mapper: Optional callable that receives the raw Lambda event fields + and returns a dict of LLMTestCase field values. Bypasses default + span extraction when provided. + + Returns: + An LLMTestCase ready for metric.measure(). + + Raises: + ValueError: If required fields for the metric cannot be populated. + """ + if field_mapper is not None: + raw_event = { + "evaluationLevel": parsed.evaluation_level, + "evaluationInput": {"sessionSpans": parsed.session_spans}, + "evaluationTarget": { + "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], + "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], + }, + "evaluationReferenceInputs": parsed.reference_inputs, + } + fields = field_mapper(raw_event) + else: + fields = _extract_fields_from_spans(parsed) + + required = _get_required_params(metric) + missing = [f for f in required if f not in fields or not fields[f]] + if missing: + metric_name = type(metric).__name__ + raise ValueError( + f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + return LLMTestCase( + input=fields.get("input", ""), + actual_output=fields.get("actual_output", ""), + expected_output=fields.get("expected_output"), + context=fields.get("context"), + retrieval_context=fields.get("retrieval_context"), + ) diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py new file mode 100644 index 00000000..77988ab7 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -0,0 +1,230 @@ +"""Tests for DeepEvalHandler.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler + + +def _make_event( + level="TRACE", + trace_ids=None, + spans=None, + reference_inputs=None, +): + """Build a raw Lambda event dict for testing.""" + event = { + "schemaVersion": "1.0", + "evaluationLevel": level, + "evaluationInput": { + "sessionSpans": spans + or [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": { + "gen_ai.message.role": "user", + "gen_ai.message.content": "What is AI?", + }, + }, + { + "traceId": "abc123", + "spanId": "span2", + "attributes": { + "gen_ai.message.role": "assistant", + "gen_ai.message.content": "AI is artificial intelligence.", + }, + }, + ] + }, + "evaluationTarget": {}, + } + if trace_ids is not None: + event["evaluationTarget"]["traceIds"] = trace_ids + if reference_inputs is not None: + event["evaluationReferenceInputs"] = reference_inputs + return event + + +def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"): + """Create a mock metric that returns a fixed score on measure().""" + metric = MagicMock() + type(metric).__name__ = name + metric.threshold = threshold + metric.score = score + metric.reason = reason + metric._required_params = None + del metric._required_params + del metric.evaluation_params + + def measure_side_effect(test_case): + metric.score = score + metric.reason = reason + + metric.measure = MagicMock(side_effect=measure_side_effect) + return metric + + +class TestDeepEvalHandlerSuccess: + def test_returns_pass_when_score_above_threshold(self): + metric = _mock_metric(score=0.9, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.9 + assert result["label"] == "Pass" + assert result["explanation"] == "Looks good" + + def test_returns_fail_when_score_below_threshold(self): + metric = _mock_metric(score=0.3, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.3 + assert result["label"] == "Fail" + + def test_returns_pass_at_exact_threshold(self): + metric = _mock_metric(score=0.7, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["label"] == "Pass" + + def test_metric_measure_called_with_test_case(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + handler(_make_event()) + + metric.measure.assert_called_once() + test_case = metric.measure.call_args[0][0] + assert test_case.input == "What is AI?" + assert test_case.actual_output == "AI is artificial intelligence." + + def test_context_parameter_ignored(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + mock_context = {"function_name": "my-lambda"} + + result = handler(_make_event(), mock_context) + + assert result["value"] == 0.85 + + def test_custom_field_mapper(self): + metric = _mock_metric() + handler = DeepEvalHandler( + metric=metric, + field_mapper=lambda event: { + "input": "mapped input", + "actual_output": "mapped output", + }, + ) + + result = handler(_make_event()) + + assert result["value"] == 0.85 + test_case = metric.measure.call_args[0][0] + assert test_case.input == "mapped input" + assert test_case.actual_output == "mapped output" + + +class TestDeepEvalHandlerErrors: + def test_invalid_event_returns_error(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + result = handler({}) + + assert result["errorCode"] == "INVALID_EVENT" + assert "errorMessage" in result + assert "value" not in result + + def test_missing_evaluation_input_returns_error(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + event = {"evaluationLevel": "TRACE", "evaluationTarget": {}} + result = handler(event) + + assert result["errorCode"] == "INVALID_EVENT" + + def test_missing_required_field_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "q"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "a"}, + }, + ] + metric = _mock_metric(name="FaithfulnessMetric") + handler = DeepEvalHandler(metric=metric) + + event = _make_event(spans=spans) + result = handler(event) + + assert result["errorCode"] == "MISSING_REQUIRED_FIELD" + assert "retrieval_context" in result["errorMessage"] + + def test_metric_measure_exception_returns_error(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout")) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["errorCode"] == "METRIC_ERROR" + assert "LLM timeout" in result["errorMessage"] + + def test_never_raises_on_any_input(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + for bad_input in [None, [], "string", 42, {"random": "keys"}]: + result = handler(bad_input) + assert "errorCode" in result or "value" in result + + +class TestDeepEvalHandlerEdgeCases: + def test_metric_with_no_reason(self): + metric = _mock_metric(score=0.8, reason=None) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["explanation"] == "" + + def test_metric_score_zero(self): + metric = _mock_metric(score=0.0, threshold=0.5) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.0 + assert result["label"] == "Fail" + + def test_metric_score_one(self): + metric = _mock_metric(score=1.0, threshold=0.5) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 1.0 + assert result["label"] == "Pass" + + def test_default_threshold_when_missing(self): + metric = _mock_metric(score=0.6) + del metric.threshold + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["label"] == "Pass" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py new file mode 100644 index 00000000..efab5459 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -0,0 +1,331 @@ +"""Tests for deepeval input_mapper module.""" + +from unittest.mock import MagicMock + +import pytest +from deepeval.test_case import LLMTestCaseParams + +from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( + ParsedEvaluationEvent, + _get_required_params, + build_test_case, +) + + +def _make_event( + level="TRACE", + trace_ids=None, + span_ids=None, + spans=None, + reference_inputs=None, +): + """Build a raw Lambda event dict for testing.""" + event = { + "schemaVersion": "1.0", + "evaluationLevel": level, + "evaluationInput": { + "sessionSpans": spans + or [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": { + "gen_ai.message.role": "user", + "gen_ai.message.content": "What is the capital of France?", + }, + }, + { + "traceId": "abc123", + "spanId": "span2", + "attributes": { + "gen_ai.message.role": "assistant", + "gen_ai.message.content": "The capital of France is Paris.", + }, + }, + ] + }, + "evaluationTarget": {}, + } + if trace_ids is not None: + event["evaluationTarget"]["traceIds"] = trace_ids + if span_ids is not None: + event["evaluationTarget"]["spanIds"] = span_ids + if reference_inputs is not None: + event["evaluationReferenceInputs"] = reference_inputs + return event + + +def _mock_metric(name="MockMetric", required_params=None, evaluation_params=None, threshold=0.5): + """Create a mock DeepEval metric.""" + metric = MagicMock() + type(metric).__name__ = name + metric.threshold = threshold + + if required_params is not None: + metric._required_params = required_params + else: + del metric._required_params + + if evaluation_params is not None: + metric.evaluation_params = evaluation_params + else: + del metric.evaluation_params + + return metric + + +class TestParsedEvaluationEvent: + def test_from_lambda_event_trace_level(self): + event = _make_event(level="TRACE", trace_ids=["trace-1"]) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.evaluation_level == "TRACE" + assert parsed.target_trace_id == "trace-1" + assert parsed.target_span_id is None + assert len(parsed.session_spans) == 2 + + def test_from_lambda_event_tool_call_level(self): + event = _make_event(level="TOOL_CALL", span_ids=["span-42"]) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.evaluation_level == "TOOL_CALL" + assert parsed.target_span_id == "span-42" + assert parsed.target_trace_id is None + + def test_from_lambda_event_session_level(self): + event = _make_event(level="SESSION") + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.evaluation_level == "SESSION" + assert parsed.target_trace_id is None + assert parsed.target_span_id is None + + def test_from_lambda_event_with_reference_inputs(self): + refs = [{"expectedResponse": "Paris is the capital of France."}] + event = _make_event(reference_inputs=refs) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.reference_inputs == refs + + def test_from_lambda_event_missing_reference_inputs(self): + event = _make_event() + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.reference_inputs == [] + + def test_from_lambda_event_missing_evaluation_level_raises(self): + event = _make_event() + del event["evaluationLevel"] + + with pytest.raises(KeyError): + ParsedEvaluationEvent.from_lambda_event(event) + + def test_from_lambda_event_missing_evaluation_input_raises(self): + event = _make_event() + del event["evaluationInput"] + + with pytest.raises(KeyError): + ParsedEvaluationEvent.from_lambda_event(event) + + def test_from_lambda_event_missing_target_key_defaults(self): + event = _make_event() + del event["evaluationTarget"] + parsed = ParsedEvaluationEvent.from_lambda_event(event) + + assert parsed.target_trace_id is None + assert parsed.target_span_id is None + + +class TestGetRequiredParams: + def test_uses_required_params_attribute(self): + metric = _mock_metric( + required_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT] + ) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + def test_falls_back_to_static_registry(self): + metric = _mock_metric(name="FaithfulnessMetric") + result = _get_required_params(metric) + + assert result == ["input", "actual_output", "retrieval_context"] + + def test_falls_back_to_evaluation_params(self): + metric = _mock_metric( + name="UnknownMetric", + evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT], + ) + result = _get_required_params(metric) + + assert result == ["input", "retrieval_context"] + + def test_defaults_to_input_and_actual_output(self): + metric = _mock_metric(name="UnknownMetric") + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + def test_empty_required_params_falls_through(self): + metric = _mock_metric(name="UnknownMetric", required_params=[]) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + +class TestBuildTestCase: + def test_basic_span_extraction(self): + event = _make_event() + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "What is the capital of France?" + assert test_case.actual_output == "The capital of France is Paris." + + def test_retrieval_context_from_tool_spans(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 1"}, + }, + { + "traceId": "t1", + "spanId": "s3", + "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 2"}, + }, + { + "traceId": "t1", + "spanId": "s4", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="FaithfulnessMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "query" + assert test_case.actual_output == "answer" + assert test_case.retrieval_context == ["doc chunk 1", "doc chunk 2"] + + def test_expected_output_from_reference_inputs(self): + refs = [{"expectedResponse": "Paris"}] + event = _make_event(reference_inputs=refs) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.expected_output == "Paris" + + def test_missing_required_field_raises_value_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="FaithfulnessMetric") + + with pytest.raises(ValueError, match="retrieval_context"): + build_test_case(parsed, metric) + + def test_custom_field_mapper_bypasses_extraction(self): + event = _make_event() + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + def custom_mapper(raw_event): + return { + "input": "custom input", + "actual_output": "custom output", + } + + test_case = build_test_case(parsed, metric, field_mapper=custom_mapper) + + assert test_case.input == "custom input" + assert test_case.actual_output == "custom output" + + def test_field_mapper_receives_reconstructed_event(self): + refs = [{"expectedResponse": "expected"}] + event = _make_event(level="TRACE", trace_ids=["t1"], reference_inputs=refs) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + received_events = [] + + def capture_mapper(raw_event): + received_events.append(raw_event) + return {"input": "x", "actual_output": "y"} + + build_test_case(parsed, metric, field_mapper=capture_mapper) + + raw = received_events[0] + assert raw["evaluationLevel"] == "TRACE" + assert raw["evaluationTarget"]["traceIds"] == ["t1"] + assert raw["evaluationReferenceInputs"] == refs + + def test_multiple_user_messages_concatenated(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "hello"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "world"}, + }, + { + "traceId": "t1", + "spanId": "s3", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "hi"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "hello\nworld" + + def test_gen_ai_completion_fallback(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.message.role": "user", "gen_ai.completion": "fallback input"}, + }, + { + "traceId": "t1", + "spanId": "s2", + "attributes": {"gen_ai.message.role": "assistant", "gen_ai.completion": "fallback output"}, + }, + ] + event = _make_event(spans=spans) + parsed = ParsedEvaluationEvent.from_lambda_event(event) + metric = _mock_metric(name="AnswerRelevancyMetric") + + test_case = build_test_case(parsed, metric) + + assert test_case.input == "fallback input" + assert test_case.actual_output == "fallback output" From 402ea7891e0175a0d00255ee69fe62888ba2a631 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 11:38:57 -0700 Subject: [PATCH 02/13] Fix span extraction to use real AgentCore _eval_log_records structure --- .../integrations/deepeval/input_mapper.py | 94 +++- .../integrations/deepeval/test_handler.py | 57 +-- .../deepeval/test_input_mapper.py | 402 ++++++++++++++---- 3 files changed, 415 insertions(+), 138 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index 50873cf5..cd67845f 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -1,5 +1,6 @@ """Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects.""" +import json import logging from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional @@ -92,15 +93,36 @@ def _get_required_params(metric: BaseMetric) -> List[str]: return ["input", "actual_output"] +def _get_message_content(message: Any) -> str: + """Extract text content from a message object. + + Message content can be a dict with a "content" or "message" key, or a plain string. + Handles one level of nesting (e.g. {"content": {"content": "text"}}). + """ + if isinstance(message, str): + return message + if isinstance(message, dict): + for key in ("content", "message"): + if key in message: + val = message[key] + if isinstance(val, str): + return val + if isinstance(val, dict): + return _get_message_content(val) + return str(val) + return "" + + def _extract_fields_from_spans( parsed: ParsedEvaluationEvent, ) -> Dict[str, Any]: - """Extract LLMTestCase fields from ADOT session spans. + """Extract LLMTestCase fields from AgentCore session spans. - Bridges Session → LLMTestCase fields: - - input ← user messages (role=="user") - - actual_output ← assistant messages (role=="assistant") - - retrieval_context ← tool messages (role=="tool") + Parses _eval_log_records from span attributes, filters by target_trace_id, + and extracts messages by role: + - input ← input messages where role=="user" + - actual_output ← output messages where role=="assistant" + - retrieval_context ← output messages where role=="tool" - expected_output ← evaluationReferenceInputs[0].expectedResponse """ user_messages: List[str] = [] @@ -109,18 +131,56 @@ def _extract_fields_from_spans( for span in parsed.session_spans: attributes = span.get("attributes", {}) - role = attributes.get("gen_ai.message.role", "") - content = attributes.get("gen_ai.message.content", "") - - if not content: - content = attributes.get("gen_ai.completion", "") - - if role == "user" and content: - user_messages.append(content) - elif role == "assistant" and content: - assistant_messages.append(content) - elif role == "tool" and content: - tool_messages.append(content) + log_records_raw = attributes.get("_eval_log_records") + if not log_records_raw: + continue + + if isinstance(log_records_raw, str): + try: + log_records = json.loads(log_records_raw) + except (json.JSONDecodeError, TypeError): + logger.debug("Failed to parse _eval_log_records as JSON") + continue + else: + log_records = log_records_raw + + if not isinstance(log_records, list): + continue + + for record in log_records: + if not isinstance(record, dict): + continue + + if parsed.target_trace_id: + record_trace_id = record.get("traceId") or record.get("trace_id") + if record_trace_id and record_trace_id != parsed.target_trace_id: + continue + + body = record.get("body", {}) + if not isinstance(body, dict): + continue + + input_data = body.get("input", {}) + if isinstance(input_data, dict): + for msg in input_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "user" and content: + user_messages.append(content) + + output_data = body.get("output", {}) + if isinstance(output_data, dict): + for msg in output_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "assistant" and content: + assistant_messages.append(content) + elif role == "tool" and content: + tool_messages.append(content) fields: Dict[str, Any] = {} diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 77988ab7..c3fa98ae 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -1,5 +1,6 @@ """Tests for DeepEvalHandler.""" +import json from unittest.mock import MagicMock, patch import pytest @@ -14,30 +15,27 @@ def _make_event( reference_inputs=None, ): """Build a raw Lambda event dict for testing.""" + if spans is None: + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ] + spans = [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + event = { "schemaVersion": "1.0", "evaluationLevel": level, - "evaluationInput": { - "sessionSpans": spans - or [ - { - "traceId": "abc123", - "spanId": "span1", - "attributes": { - "gen_ai.message.role": "user", - "gen_ai.message.content": "What is AI?", - }, - }, - { - "traceId": "abc123", - "spanId": "span2", - "attributes": { - "gen_ai.message.role": "assistant", - "gen_ai.message.content": "AI is artificial intelligence.", - }, - }, - ] - }, + "evaluationInput": {"sessionSpans": spans}, "evaluationTarget": {}, } if trace_ids is not None: @@ -153,17 +151,20 @@ def test_missing_evaluation_input_returns_error(self): assert result["errorCode"] == "INVALID_EVENT" def test_missing_required_field_returns_error(self): + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "q"}]}, + "output": {"messages": [{"role": "assistant", "content": "a"}]}, + } + } + ] spans = [ { "traceId": "t1", "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "q"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "a"}, - }, + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } ] metric = _mock_metric(name="FaithfulnessMetric") handler = DeepEvalHandler(metric=metric) diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index efab5459..67447f48 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -1,5 +1,6 @@ """Tests for deepeval input_mapper module.""" +import json from unittest.mock import MagicMock import pytest @@ -7,11 +8,38 @@ from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( ParsedEvaluationEvent, + _extract_fields_from_spans, _get_required_params, build_test_case, ) +def _make_log_record( + input_messages=None, + output_messages=None, + trace_id=None, +): + """Build a single log record dict.""" + record = {"body": {}} + if input_messages is not None: + record["body"]["input"] = {"messages": input_messages} + if output_messages is not None: + record["body"]["output"] = {"messages": output_messages} + if trace_id is not None: + record["traceId"] = trace_id + return record + + +def _make_span_with_log_records(log_records, span_id="span1", as_json_string=True): + """Build a span dict with _eval_log_records in attributes.""" + value = json.dumps(log_records) if as_json_string else log_records + return { + "traceId": "abc123", + "spanId": span_id, + "attributes": {"_eval_log_records": value}, + } + + def _make_event( level="TRACE", trace_ids=None, @@ -20,30 +48,19 @@ def _make_event( reference_inputs=None, ): """Build a raw Lambda event dict for testing.""" + if spans is None: + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "What is the capital of France?"}], + output_messages=[{"role": "assistant", "content": "The capital of France is Paris."}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + event = { "schemaVersion": "1.0", "evaluationLevel": level, - "evaluationInput": { - "sessionSpans": spans - or [ - { - "traceId": "abc123", - "spanId": "span1", - "attributes": { - "gen_ai.message.role": "user", - "gen_ai.message.content": "What is the capital of France?", - }, - }, - { - "traceId": "abc123", - "spanId": "span2", - "attributes": { - "gen_ai.message.role": "assistant", - "gen_ai.message.content": "The capital of France is Paris.", - }, - }, - ] - }, + "evaluationInput": {"sessionSpans": spans}, "evaluationTarget": {}, } if trace_ids is not None: @@ -82,7 +99,7 @@ def test_from_lambda_event_trace_level(self): assert parsed.evaluation_level == "TRACE" assert parsed.target_trace_id == "trace-1" assert parsed.target_span_id is None - assert len(parsed.session_spans) == 2 + assert len(parsed.session_spans) == 1 def test_from_lambda_event_tool_call_level(self): event = _make_event(level="TOOL_CALL", span_ids=["span-42"]) @@ -173,6 +190,250 @@ def test_empty_required_params_falls_through(self): assert result == ["input", "actual_output"] +class TestExtractFieldsFromSpans: + def test_basic_extraction(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "hello"}], + output_messages=[{"role": "assistant", "content": "world"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "hello" + assert fields["actual_output"] == "world" + + def test_tool_messages_become_retrieval_context(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "doc chunk 1"}, + {"role": "tool", "content": "doc chunk 2"}, + {"role": "assistant", "content": "answer"}, + ], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"] + assert fields["actual_output"] == "answer" + + def test_message_content_as_dict_with_content_key(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": {"content": "nested content"}}], + output_messages=[{"role": "assistant", "content": {"content": "nested output"}}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "nested content" + assert fields["actual_output"] == "nested output" + + def test_message_content_as_dict_with_message_key(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "message": "msg key input"}], + output_messages=[{"role": "assistant", "message": "msg key output"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "msg key input" + assert fields["actual_output"] == "msg key output" + + def test_message_content_as_plain_string_in_content_field(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "plain string"}], + output_messages=[{"role": "assistant", "content": "plain response"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "plain string" + assert fields["actual_output"] == "plain response" + + def test_target_trace_id_filters_records(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "relevant"}], + output_messages=[{"role": "assistant", "content": "relevant answer"}], + trace_id="target-trace", + ), + _make_log_record( + input_messages=[{"role": "user", "content": "irrelevant"}], + output_messages=[{"role": "assistant", "content": "irrelevant answer"}], + trace_id="other-trace", + ), + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="target-trace", + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "relevant" + assert fields["actual_output"] == "relevant answer" + + def test_no_target_trace_id_includes_all_records(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "first"}], + output_messages=[{"role": "assistant", "content": "first answer"}], + trace_id="trace-1", + ), + _make_log_record( + input_messages=[{"role": "user", "content": "second"}], + output_messages=[{"role": "assistant", "content": "second answer"}], + trace_id="trace-2", + ), + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="SESSION", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "first\nsecond" + assert fields["actual_output"] == "first answer\nsecond answer" + + def test_log_records_as_parsed_list(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "from list"}], + output_messages=[{"role": "assistant", "content": "from list answer"}], + ) + ] + spans = [_make_span_with_log_records(log_records, as_json_string=False)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "from list" + assert fields["actual_output"] == "from list answer" + + def test_invalid_json_log_records_skipped(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": "not valid json{{{"}, + } + ] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields == {} + + def test_span_without_log_records_skipped(self): + spans = [{"traceId": "t1", "spanId": "s1", "attributes": {}}] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields == {} + + def test_multiple_spans_aggregated(self): + log_records_1 = [ + _make_log_record( + input_messages=[{"role": "user", "content": "q1"}], + output_messages=[{"role": "assistant", "content": "a1"}], + ) + ] + log_records_2 = [ + _make_log_record( + input_messages=[{"role": "user", "content": "q2"}], + output_messages=[{"role": "assistant", "content": "a2"}], + ) + ] + spans = [ + _make_span_with_log_records(log_records_1, span_id="s1"), + _make_span_with_log_records(log_records_2, span_id="s2"), + ] + parsed = ParsedEvaluationEvent( + evaluation_level="SESSION", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "q1\nq2" + assert fields["actual_output"] == "a1\na2" + + def test_reference_inputs_expected_output(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "q"}], + output_messages=[{"role": "assistant", "content": "a"}], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", + session_spans=spans, + reference_inputs=[{"expectedResponse": "expected answer"}], + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["expected_output"] == "expected answer" + + def test_record_without_matching_trace_id_key_included(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "no trace id record"}], + output_messages=[{"role": "assistant", "content": "response"}], + ), + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="target-trace", + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["input"] == "no trace id record" + + class TestBuildTestCase: def test_basic_span_extraction(self): event = _make_event() @@ -184,29 +445,18 @@ def test_basic_span_extraction(self): assert test_case.input == "What is the capital of France?" assert test_case.actual_output == "The capital of France is Paris." - def test_retrieval_context_from_tool_spans(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 1"}, - }, - { - "traceId": "t1", - "spanId": "s3", - "attributes": {"gen_ai.message.role": "tool", "gen_ai.message.content": "doc chunk 2"}, - }, - { - "traceId": "t1", - "spanId": "s4", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, - }, + def test_retrieval_context_from_tool_messages(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "doc chunk 1"}, + {"role": "tool", "content": "doc chunk 2"}, + {"role": "assistant", "content": "answer"}, + ], + ) ] + spans = [_make_span_with_log_records(log_records)] event = _make_event(spans=spans) parsed = ParsedEvaluationEvent.from_lambda_event(event) metric = _mock_metric(name="FaithfulnessMetric") @@ -228,18 +478,13 @@ def test_expected_output_from_reference_inputs(self): assert test_case.expected_output == "Paris" def test_missing_required_field_raises_value_error(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "query"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "answer"}, - }, + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[{"role": "assistant", "content": "answer"}], + ) ] + spans = [_make_span_with_log_records(log_records)] event = _make_event(spans=spans) parsed = ParsedEvaluationEvent.from_lambda_event(event) metric = _mock_metric(name="FaithfulnessMetric") @@ -283,23 +528,16 @@ def capture_mapper(raw_event): assert raw["evaluationReferenceInputs"] == refs def test_multiple_user_messages_concatenated(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "hello"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "user", "gen_ai.message.content": "world"}, - }, - { - "traceId": "t1", - "spanId": "s3", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.message.content": "hi"}, - }, + log_records = [ + _make_log_record( + input_messages=[ + {"role": "user", "content": "hello"}, + {"role": "user", "content": "world"}, + ], + output_messages=[{"role": "assistant", "content": "hi"}], + ) ] + spans = [_make_span_with_log_records(log_records)] event = _make_event(spans=spans) parsed = ParsedEvaluationEvent.from_lambda_event(event) metric = _mock_metric(name="AnswerRelevancyMetric") @@ -307,25 +545,3 @@ def test_multiple_user_messages_concatenated(self): test_case = build_test_case(parsed, metric) assert test_case.input == "hello\nworld" - - def test_gen_ai_completion_fallback(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"gen_ai.message.role": "user", "gen_ai.completion": "fallback input"}, - }, - { - "traceId": "t1", - "spanId": "s2", - "attributes": {"gen_ai.message.role": "assistant", "gen_ai.completion": "fallback output"}, - }, - ] - event = _make_event(spans=spans) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="AnswerRelevancyMetric") - - test_case = build_test_case(parsed, metric) - - assert test_case.input == "fallback input" - assert test_case.actual_output == "fallback output" From e9ef47d40b40027bdd917dc551698404a834efad Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:14:10 -0700 Subject: [PATCH 03/13] Set context field from tool messages for HallucinationMetric support --- .../integrations/deepeval/input_mapper.py | 1 + .../deepeval/test_input_mapper.py | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index cd67845f..39182636 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -190,6 +190,7 @@ def _extract_fields_from_spans( fields["actual_output"] = "\n".join(assistant_messages) if tool_messages: fields["retrieval_context"] = tool_messages + fields["context"] = tool_messages if parsed.reference_inputs: expected = parsed.reference_inputs[0].get("expectedResponse") diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index 67447f48..ca661128 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -229,6 +229,26 @@ def test_tool_messages_become_retrieval_context(self): assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"] assert fields["actual_output"] == "answer" + def test_tool_messages_also_set_context_for_hallucination_metric(self): + log_records = [ + _make_log_record( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "context chunk"}, + {"role": "assistant", "content": "answer"}, + ], + ) + ] + spans = [_make_span_with_log_records(log_records)] + parsed = ParsedEvaluationEvent( + evaluation_level="TRACE", session_spans=spans + ) + + fields = _extract_fields_from_spans(parsed) + + assert fields["context"] == ["context chunk"] + assert fields["context"] == fields["retrieval_context"] + def test_message_content_as_dict_with_content_key(self): log_records = [ _make_log_record( From f97827e892f8c431d0e5d258bd16cc92ef05d447 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:36:35 -0700 Subject: [PATCH 04/13] Use metric.success for label instead of manual threshold comparison --- .../integrations/deepeval/handler.py | 3 +- .../integrations/deepeval/test_handler.py | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index b339b883..4893889c 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -78,7 +78,8 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] score = self.metric.score reason = getattr(self.metric, "reason", None) or "" threshold = getattr(self.metric, "threshold", 0.5) - label = "Pass" if score is not None and score >= threshold else "Fail" + success = getattr(self.metric, "success", score is not None and score >= threshold) + label = "Pass" if success else "Fail" return {"value": score, "label": label, "explanation": reason} diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index c3fa98ae..009f5e54 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -55,6 +55,7 @@ def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetri metric._required_params = None del metric._required_params del metric.evaluation_params + del metric.success def measure_side_effect(test_case): metric.score = score @@ -229,3 +230,31 @@ def test_default_threshold_when_missing(self): result = handler(_make_event()) assert result["label"] == "Pass" + + def test_label_uses_metric_success_true(self): + metric = _mock_metric(score=0.3, threshold=0.7) + metric.success = True + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.3 + assert result["label"] == "Pass" + + def test_label_uses_metric_success_false(self): + metric = _mock_metric(score=0.9, threshold=0.7) + metric.success = False + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.9 + assert result["label"] == "Fail" + + def test_label_falls_back_to_threshold_when_no_success(self): + metric = _mock_metric(score=0.8, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["label"] == "Pass" From 9d256aea85fdc1718349562691bb3055301b9410 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:42:07 -0700 Subject: [PATCH 05/13] Add model override and timeout enforcement to DeepEvalHandler --- .../integrations/deepeval/handler.py | 49 ++++++++++++++- .../integrations/deepeval/test_handler.py | 61 +++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index 4893889c..c71ed6da 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -1,6 +1,7 @@ """DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics.""" import logging +import threading from typing import Any, Callable, Dict, Optional from deepeval.metrics import BaseMetric @@ -30,10 +31,14 @@ def lambda_handler(event, context): return handler(event, context) """ + DEFAULT_TIMEOUT = 290 + def __init__( self, metric: BaseMetric, field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + model: Optional[str] = None, + timeout: Optional[int] = None, ): """Initialize the handler. @@ -42,9 +47,15 @@ def __init__( field_mapper: Optional callable that receives the raw Lambda event and returns a dict of LLMTestCase field values. Bypasses default span extraction when provided. + model: Optional model identifier to override the metric's LLM + (e.g. a Bedrock model string instead of the default OpenAI model). + timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 + (slightly under Lambda's 300s max). Set to None to disable. """ self.metric = metric self.field_mapper = field_mapper + self.model = model + self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: """Handle a Lambda invocation. @@ -69,8 +80,16 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] logger.error("Missing required fields: %s", e) return _error_response("MISSING_REQUIRED_FIELD", str(e)) + if self.model is not None: + self.metric.model = self.model + try: - self.metric.measure(test_case) + self._measure_with_timeout(test_case) + except _MetricTimeout: + return _error_response( + "METRIC_TIMEOUT", + f"{type(self.metric).__name__} exceeded {self.timeout}s timeout.", + ) except Exception as e: logger.error("Metric measurement failed: %s", e, exc_info=True) return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}") @@ -83,6 +102,34 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] return {"value": score, "label": label, "explanation": reason} + def _measure_with_timeout(self, test_case: Any) -> None: + """Run metric.measure with a thread-based timeout.""" + if self.timeout <= 0: + self.metric.measure(test_case) + return + + exception_holder: list = [] + + def target(): + try: + self.metric.measure(test_case) + except Exception as e: + exception_holder.append(e) + + thread = threading.Thread(target=target, daemon=True) + thread.start() + thread.join(timeout=self.timeout) + + if thread.is_alive(): + raise _MetricTimeout() + + if exception_holder: + raise exception_holder[0] + + +class _MetricTimeout(Exception): + """Raised when metric.measure exceeds the configured timeout.""" + def _error_response(code: str, message: str) -> Dict[str, str]: """Build a standardized error response dict.""" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 009f5e54..9867969b 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -1,6 +1,7 @@ """Tests for DeepEvalHandler.""" import json +import time from unittest.mock import MagicMock, patch import pytest @@ -258,3 +259,63 @@ def test_label_falls_back_to_threshold_when_no_success(self): result = handler(_make_event()) assert result["label"] == "Pass" + + def test_model_override_sets_metric_model(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3") + + handler(_make_event()) + + assert metric.model == "bedrock/anthropic.claude-3" + + def test_no_model_override_leaves_metric_unchanged(self): + metric = _mock_metric() + metric.model = "original-model" + handler = DeepEvalHandler(metric=metric) + + handler(_make_event()) + + assert metric.model == "original-model" + + +class TestDeepEvalHandlerTimeout: + def test_timeout_returns_error(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=lambda tc: time.sleep(5)) + handler = DeepEvalHandler(metric=metric, timeout=1) + + result = handler(_make_event()) + + assert result["errorCode"] == "METRIC_TIMEOUT" + assert "1s timeout" in result["errorMessage"] + + def test_no_timeout_when_measure_completes_in_time(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric, timeout=10) + + result = handler(_make_event()) + + assert result["value"] == 0.85 + assert "errorCode" not in result + + def test_default_timeout_is_290(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + assert handler.timeout == 290 + + def test_custom_timeout_value(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric, timeout=60) + + assert handler.timeout == 60 + + def test_metric_exception_still_propagates_with_timeout(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=RuntimeError("LLM error")) + handler = DeepEvalHandler(metric=metric, timeout=10) + + result = handler(_make_event()) + + assert result["errorCode"] == "METRIC_ERROR" + assert "LLM error" in result["errorMessage"] From c142d50cd4b367b49df45b2a04e332f363d4fccc Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 12:56:33 -0700 Subject: [PATCH 06/13] Add model override, timeout enforcement, use metric.success, fix SingleTurnParams deprecation --- .../evaluation/integrations/deepeval/handler.py | 7 ++++--- .../integrations/deepeval/input_mapper.py | 14 +++++++------- .../integrations/deepeval/test_input_mapper.py | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index c71ed6da..ed261727 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -37,7 +37,7 @@ def __init__( self, metric: BaseMetric, field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, - model: Optional[str] = None, + model: Optional[Any] = None, timeout: Optional[int] = None, ): """Initialize the handler. @@ -47,8 +47,9 @@ def __init__( field_mapper: Optional callable that receives the raw Lambda event and returns a dict of LLMTestCase field values. Bypasses default span extraction when provided. - model: Optional model identifier to override the metric's LLM - (e.g. a Bedrock model string instead of the default OpenAI model). + model: Optional model override for the metric's LLM. Can be a string + model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM + subclass instance. timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 (slightly under Lambda's 300s max). Set to None to disable. """ diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index 39182636..47e75c0c 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -6,16 +6,16 @@ from typing import Any, Callable, Dict, List, Optional from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.test_case import LLMTestCase, SingleTurnParams logger = logging.getLogger(__name__) -_PARAM_TO_FIELD: Dict[LLMTestCaseParams, str] = { - LLMTestCaseParams.INPUT: "input", - LLMTestCaseParams.ACTUAL_OUTPUT: "actual_output", - LLMTestCaseParams.EXPECTED_OUTPUT: "expected_output", - LLMTestCaseParams.CONTEXT: "context", - LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrieval_context", +_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = { + SingleTurnParams.INPUT: "input", + SingleTurnParams.ACTUAL_OUTPUT: "actual_output", + SingleTurnParams.EXPECTED_OUTPUT: "expected_output", + SingleTurnParams.CONTEXT: "context", + SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context", } _METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index ca661128..6d2a5420 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -4,7 +4,7 @@ from unittest.mock import MagicMock import pytest -from deepeval.test_case import LLMTestCaseParams +from deepeval.test_case import SingleTurnParams from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( ParsedEvaluationEvent, @@ -156,7 +156,7 @@ def test_from_lambda_event_missing_target_key_defaults(self): class TestGetRequiredParams: def test_uses_required_params_attribute(self): metric = _mock_metric( - required_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT] + required_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT] ) result = _get_required_params(metric) @@ -171,7 +171,7 @@ def test_falls_back_to_static_registry(self): def test_falls_back_to_evaluation_params(self): metric = _mock_metric( name="UnknownMetric", - evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT], + evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.RETRIEVAL_CONTEXT], ) result = _get_required_params(metric) From 3ccc98cf9ae0939dfbb7ca07d6290f82752d1a46 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 16:42:01 -0700 Subject: [PATCH 07/13] Fix _get_required_params to handle GEval unmappable typing params --- .deepeval/.deepeval_telemetry.txt | 2 ++ .../evaluation/integrations/deepeval/input_mapper.py | 3 ++- .../integrations/deepeval/test_input_mapper.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 .deepeval/.deepeval_telemetry.txt diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt new file mode 100644 index 00000000..916744ae --- /dev/null +++ b/.deepeval/.deepeval_telemetry.txt @@ -0,0 +1,2 @@ +DEEPEVAL_ID=f26d66a4-b0b0-4096-859f-89f1ddf7ceee +DEEPEVAL_STATUS=old diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py index 47e75c0c..941afce2 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py @@ -80,7 +80,8 @@ def _get_required_params(metric: BaseMetric) -> List[str]: """ if hasattr(metric, "_required_params") and metric._required_params: params = metric._required_params - return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + if all(p in _PARAM_TO_FIELD for p in params): + return [_PARAM_TO_FIELD[p] for p in params] class_name = type(metric).__name__ if class_name in _METRIC_REQUIRED_PARAMS: diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index 6d2a5420..1d90a689 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -183,6 +183,18 @@ def test_defaults_to_input_and_actual_output(self): assert result == ["input", "actual_output"] + def test_unmappable_required_params_skips_to_static_registry(self): + metric = _mock_metric(name="GEval", required_params=["SomeTypingObject", "AnotherType"]) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + + def test_unmappable_required_params_falls_to_default(self): + metric = _mock_metric(name="UnknownMetric", required_params=["SomeTypingObject"]) + result = _get_required_params(metric) + + assert result == ["input", "actual_output"] + def test_empty_required_params_falls_through(self): metric = _mock_metric(name="UnknownMetric", required_params=[]) result = _get_required_params(metric) From a884f912d4f3faa18fb3c978ed1ad1a41db1f19b Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Mon, 15 Jun 2026 16:50:12 -0700 Subject: [PATCH 08/13] Add .deepeval/ to gitignore --- .deepeval/.deepeval_telemetry.txt | 2 -- .gitignore | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 .deepeval/.deepeval_telemetry.txt diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt deleted file mode 100644 index 916744ae..00000000 --- a/.deepeval/.deepeval_telemetry.txt +++ /dev/null @@ -1,2 +0,0 @@ -DEEPEVAL_ID=f26d66a4-b0b0-4096-859f-89f1ddf7ceee -DEEPEVAL_STATUS=old diff --git a/.gitignore b/.gitignore index 01fe8e22..161403e7 100644 --- a/.gitignore +++ b/.gitignore @@ -229,3 +229,4 @@ local_settings.py Dockerfile CLAUDE.md .omc/ +.deepeval/ From 6ac198cc5ece549997dd9589695277225944fd1e Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Tue, 16 Jun 2026 15:26:05 -0700 Subject: [PATCH 09/13] Move model override to init to avoid per-call mutation --- .../evaluation/integrations/deepeval/handler.py | 6 ++---- .../evaluation/integrations/deepeval/test_handler.py | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py index ed261727..0e91bafe 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py @@ -55,8 +55,9 @@ def __init__( """ self.metric = metric self.field_mapper = field_mapper - self.model = model self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT + if model is not None: + self.metric.model = model def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: """Handle a Lambda invocation. @@ -81,9 +82,6 @@ def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any] logger.error("Missing required fields: %s", e) return _error_response("MISSING_REQUIRED_FIELD", str(e)) - if self.model is not None: - self.metric.model = self.model - try: self._measure_with_timeout(test_case) except _MetricTimeout: diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 9867969b..77961f14 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -264,8 +264,6 @@ def test_model_override_sets_metric_model(self): metric = _mock_metric() handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3") - handler(_make_event()) - assert metric.model == "bedrock/anthropic.claude-3" def test_no_model_override_leaves_metric_unchanged(self): From 8d415e5791ba260a7253c196a92c861c7e44e34f Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Wed, 24 Jun 2026 16:25:17 -0700 Subject: [PATCH 10/13] Refactor to BaseAdapter framework with DeepEval/Autoevals adapters and EvaluatorInput support --- .../evaluation/integrations/__init__.py | 4 + .../integrations/autoevals/__init__.py | 5 + .../integrations/autoevals/adapter.py | 72 +++++ .../evaluation/integrations/base.py | 302 ++++++++++++++++++ .../integrations/deepeval/__init__.py | 4 +- .../integrations/deepeval/adapter.py | 189 +++++++++++ .../integrations/deepeval/handler.py | 135 -------- .../integrations/deepeval/input_mapper.py | 253 --------------- .../integrations/autoevals/__init__.py | 0 .../integrations/autoevals/test_adapter.py | 217 +++++++++++++ .../integrations/deepeval/test_handler.py | 112 ++++++- .../deepeval/test_input_mapper.py | 8 +- 12 files changed, 906 insertions(+), 395 deletions(-) create mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/base.py create mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py create mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py diff --git a/src/bedrock_agentcore/evaluation/integrations/__init__.py b/src/bedrock_agentcore/evaluation/integrations/__init__.py index 33048d5d..a1ff7691 100644 --- a/src/bedrock_agentcore/evaluation/integrations/__init__.py +++ b/src/bedrock_agentcore/evaluation/integrations/__init__.py @@ -1 +1,5 @@ """AgentCore Evaluation integrations.""" + +from bedrock_agentcore.evaluation.integrations.base import BaseAdapter, ParsedEvaluationEvent + +__all__ = ["BaseAdapter", "ParsedEvaluationEvent"] diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py new file mode 100644 index 00000000..0bc3b4ff --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py @@ -0,0 +1,5 @@ +"""Autoevals integration for AgentCore Evaluation.""" + +from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter + +__all__ = ["AutoevalsAdapter"] diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py new file mode 100644 index 00000000..fe89435e --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py @@ -0,0 +1,72 @@ +"""Autoevals adapter for AgentCore evaluation integrations.""" + +import logging +from typing import Any, Callable, Dict, Optional + +from bedrock_agentcore.evaluation.integrations.base import BaseAdapter + +logger = logging.getLogger(__name__) + + +class AutoevalsAdapter(BaseAdapter): + """Adapter that runs an Autoevals scorer against AgentCore evaluation events. + + Example:: + + from autoevals import Factuality + + scorer = Factuality() + handler = AutoevalsAdapter(scorer=scorer) + + # Use as Lambda handler + def lambda_handler(event, context): + return handler(event, context) + """ + + def __init__( + self, + scorer: Any, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + timeout: Optional[int] = None, + ): + """Initialize the adapter. + + Args: + scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()). + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of field values. Bypasses default span extraction. + timeout: Maximum seconds to allow for scorer.eval(). Defaults to 290. + """ + super().__init__(field_mapper=field_mapper, timeout=timeout) + self.scorer = scorer + + def validate_fields(self, fields: Dict[str, Any]) -> None: + """Validate that input and actual_output are present.""" + missing = [] + if not fields.get("input"): + missing.append("input") + if not fields.get("actual_output"): + missing.append("actual_output") + if missing: + scorer_name = type(self.scorer).__name__ + raise ValueError( + f"Field(s) {missing} required by {scorer_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run the Autoevals scorer and return formatted results.""" + kwargs: Dict[str, Any] = { + "input": fields.get("input", ""), + "output": fields.get("actual_output", ""), + } + if fields.get("expected_output"): + kwargs["expected"] = fields["expected_output"] + + result = self.scorer.eval(**kwargs) + + score = result.score + label = "Pass" if score is not None and score >= 0.5 else "Fail" + explanation = getattr(result, "metadata", {}).get("rationale", "") if hasattr(result, "metadata") else "" + + return {"value": score, "label": label, "explanation": explanation} diff --git a/src/bedrock_agentcore/evaluation/integrations/base.py b/src/bedrock_agentcore/evaluation/integrations/base.py new file mode 100644 index 00000000..a10f6606 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/base.py @@ -0,0 +1,302 @@ +"""Base adapter for AgentCore evaluation integrations.""" + +import abc +import json +import logging +import threading +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Union + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput + +logger = logging.getLogger(__name__) + + +@dataclass +class ParsedEvaluationEvent: + """Parsed representation of the AgentCore Lambda evaluation event.""" + + evaluation_level: str + session_spans: List[Dict[str, Any]] + target_trace_id: Optional[str] = None + target_span_id: Optional[str] = None + reference_inputs: List[Dict[str, Any]] = field(default_factory=list) + + @classmethod + def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent": + """Parse a raw Lambda event dict into a structured object. + + Args: + event: Raw Lambda event payload from the evaluation service. + + Returns: + ParsedEvaluationEvent with extracted fields. + + Raises: + KeyError: If required top-level fields are missing. + """ + evaluation_input = event["evaluationInput"] + target = event.get("evaluationTarget") or {} + trace_ids = target.get("traceIds") or [] + span_ids = target.get("spanIds") or [] + + return cls( + evaluation_level=event["evaluationLevel"], + session_spans=evaluation_input["sessionSpans"], + target_trace_id=trace_ids[0] if trace_ids else None, + target_span_id=span_ids[0] if span_ids else None, + reference_inputs=event.get("evaluationReferenceInputs") or [], + ) + + +def _get_message_content(message: Any) -> str: + """Extract text content from a message object. + + Message content can be a dict with a "content" or "message" key, or a plain string. + Handles one level of nesting (e.g. {"content": {"content": "text"}}). + """ + if isinstance(message, str): + return message + if isinstance(message, dict): + for key in ("content", "message"): + if key in message: + val = message[key] + if isinstance(val, str): + return val + if isinstance(val, dict): + return _get_message_content(val) + return str(val) + return "" + + +def extract_fields_from_spans( + parsed: ParsedEvaluationEvent, +) -> Dict[str, Any]: + """Extract evaluation fields from AgentCore session spans. + + Parses _eval_log_records from span attributes, filters by target_trace_id, + and extracts messages by role: + - input ← input messages where role=="user" + - actual_output ← output messages where role=="assistant" + - retrieval_context ← output messages where role=="tool" + - context ← same as retrieval_context + - expected_output ← evaluationReferenceInputs[0].expectedResponse + """ + user_messages: List[str] = [] + assistant_messages: List[str] = [] + tool_messages: List[str] = [] + + for span in parsed.session_spans: + attributes = span.get("attributes", {}) + log_records_raw = attributes.get("_eval_log_records") + if not log_records_raw: + continue + + if isinstance(log_records_raw, str): + try: + log_records = json.loads(log_records_raw) + except (json.JSONDecodeError, TypeError): + logger.debug("Failed to parse _eval_log_records as JSON") + continue + else: + log_records = log_records_raw + + if not isinstance(log_records, list): + continue + + for record in log_records: + if not isinstance(record, dict): + continue + + if parsed.target_trace_id: + record_trace_id = record.get("traceId") or record.get("trace_id") + if record_trace_id and record_trace_id != parsed.target_trace_id: + continue + + body = record.get("body", {}) + if not isinstance(body, dict): + continue + + input_data = body.get("input", {}) + if isinstance(input_data, dict): + for msg in input_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "user" and content: + user_messages.append(content) + + output_data = body.get("output", {}) + if isinstance(output_data, dict): + for msg in output_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "assistant" and content: + assistant_messages.append(content) + elif role == "tool" and content: + tool_messages.append(content) + + fields: Dict[str, Any] = {} + + if user_messages: + fields["input"] = "\n".join(user_messages) + if assistant_messages: + fields["actual_output"] = "\n".join(assistant_messages) + if tool_messages: + fields["retrieval_context"] = tool_messages + fields["context"] = tool_messages + + if parsed.reference_inputs: + expected = parsed.reference_inputs[0].get("expectedResponse") + if expected: + fields["expected_output"] = expected + + return fields + + +class _ExecutionTimeout(Exception): + """Raised when execution exceeds the configured timeout.""" + + +def _error_response(code: str, message: str) -> Dict[str, str]: + """Build a standardized error response dict.""" + return {"errorCode": code, "errorMessage": message} + + +class BaseAdapter(abc.ABC): + """Base adapter for evaluation framework integrations. + + Subclasses only need to implement execute(fields) which runs the actual + evaluation logic and returns (score, label, explanation). + + Never raises unhandled exceptions — always returns a valid response dict. + """ + + DEFAULT_TIMEOUT = 290 + + def __init__( + self, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + timeout: Optional[int] = None, + ): + """Initialize the adapter. + + Args: + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of field values. Bypasses default span extraction. + timeout: Maximum seconds to allow for execute(). Defaults to 290 + (slightly under Lambda's 300s max). + """ + self.field_mapper = field_mapper + self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT + + def __call__(self, event: Union[Dict[str, Any], EvaluatorInput], context: Any = None) -> Dict[str, Any]: + """Handle a Lambda invocation. + + Args: + event: Either a raw Lambda event dict or an EvaluatorInput instance + from bedrock_agentcore.evaluation.custom_code_based_evaluators.models. + context: Lambda context object (unused). + + Returns: + Success: {"value": float, "label": str, "explanation": str} + Error: {"errorCode": str, "errorMessage": str} + """ + try: + if isinstance(event, EvaluatorInput): + parsed = ParsedEvaluationEvent( + evaluation_level=event.evaluation_level, + session_spans=event.session_spans, + target_trace_id=event.target_trace_id, + target_span_id=event.target_span_id, + reference_inputs=getattr(event, "reference_inputs", []) or [], + ) + else: + parsed = ParsedEvaluationEvent.from_lambda_event(event) + except (KeyError, IndexError, TypeError) as e: + logger.error("Failed to parse evaluation event: %s", e) + return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}") + + try: + fields = self._extract_fields(parsed) + except ValueError as e: + logger.error("Missing required fields: %s", e) + return _error_response("MISSING_REQUIRED_FIELD", str(e)) + + try: + result = self._execute_with_timeout(fields) + except _ExecutionTimeout: + return _error_response( + "METRIC_TIMEOUT", + f"{type(self).__name__} exceeded {self.timeout}s timeout.", + ) + except Exception as e: + logger.error("Execution failed: %s", e, exc_info=True) + return _error_response("METRIC_ERROR", f"{type(self).__name__} failed: {e}") + + return result + + def _extract_fields(self, parsed: ParsedEvaluationEvent) -> Dict[str, Any]: + """Extract fields from event, using field_mapper if provided.""" + if self.field_mapper is not None: + raw_event = { + "evaluationLevel": parsed.evaluation_level, + "evaluationInput": {"sessionSpans": parsed.session_spans}, + "evaluationTarget": { + "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], + "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], + }, + "evaluationReferenceInputs": parsed.reference_inputs, + } + return self.field_mapper(raw_event) + + fields = extract_fields_from_spans(parsed) + self.validate_fields(fields) + return fields + + def validate_fields(self, fields: Dict[str, Any]) -> None: + """Validate that required fields are present. + + Override in subclasses to enforce field requirements. + Default implementation does nothing. + """ + + @abc.abstractmethod + def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run the evaluation and return the response dict. + + Args: + fields: Extracted field dict with keys like "input", "actual_output", etc. + + Returns: + {"value": float, "label": str, "explanation": str} + """ + + def _execute_with_timeout(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run execute() with a thread-based timeout.""" + if self.timeout <= 0: + return self.execute(fields) + + result_holder: list = [] + exception_holder: list = [] + + def target(): + try: + result_holder.append(self.execute(fields)) + except Exception as e: + exception_holder.append(e) + + thread = threading.Thread(target=target, daemon=True) + thread.start() + thread.join(timeout=self.timeout) + + if thread.is_alive(): + raise _ExecutionTimeout() + + if exception_holder: + raise exception_holder[0] + + return result_holder[0] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py index 76f6461f..adb6ba44 100644 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py @@ -1,5 +1,5 @@ """DeepEval integration for AgentCore Evaluation.""" -from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler +from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler -__all__ = ["DeepEvalHandler"] +__all__ = ["DeepEvalAdapter", "DeepEvalHandler"] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py new file mode 100644 index 00000000..e8748782 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py @@ -0,0 +1,189 @@ +"""DeepEval adapter for AgentCore evaluation integrations.""" + +import logging +from typing import Any, Callable, Dict, List, Optional + +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, SingleTurnParams + +from bedrock_agentcore.evaluation.integrations.base import ( + BaseAdapter, + ParsedEvaluationEvent, + extract_fields_from_spans, +) + +logger = logging.getLogger(__name__) + +_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = { + SingleTurnParams.INPUT: "input", + SingleTurnParams.ACTUAL_OUTPUT: "actual_output", + SingleTurnParams.EXPECTED_OUTPUT: "expected_output", + SingleTurnParams.CONTEXT: "context", + SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context", +} + +_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { + "AnswerRelevancyMetric": ["input", "actual_output"], + "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"], + "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"], + "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"], + "HallucinationMetric": ["input", "actual_output", "context"], + "BiasMetric": ["input", "actual_output"], + "ToxicityMetric": ["input", "actual_output"], + "GEval": ["input", "actual_output"], + "SummarizationMetric": ["input", "actual_output"], +} + + +def _get_required_params(metric: BaseMetric) -> List[str]: + """Determine which LLMTestCase fields a metric requires. + + Fallback chain: + 1. metric._required_params (DeepEval internal attribute) + 2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name + 3. metric.evaluation_params (GEval special case) + 4. Default: ["input", "actual_output"] + """ + if hasattr(metric, "_required_params") and metric._required_params: + params = metric._required_params + if all(p in _PARAM_TO_FIELD for p in params): + return [_PARAM_TO_FIELD[p] for p in params] + + class_name = type(metric).__name__ + if class_name in _METRIC_REQUIRED_PARAMS: + return _METRIC_REQUIRED_PARAMS[class_name] + + if hasattr(metric, "evaluation_params") and metric.evaluation_params: + params = metric.evaluation_params + return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] + + return ["input", "actual_output"] + + +class DeepEvalAdapter(BaseAdapter): + """Adapter that runs a DeepEval metric against AgentCore evaluation events. + + Example:: + + from deepeval.metrics import AnswerRelevancyMetric + + metric = AnswerRelevancyMetric(threshold=0.7) + handler = DeepEvalAdapter(metric=metric) + + # Use as Lambda handler + def lambda_handler(event, context): + return handler(event, context) + """ + + def __init__( + self, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + model: Optional[Any] = None, + timeout: Optional[int] = None, + ): + """Initialize the adapter. + + Args: + metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). + field_mapper: Optional callable that receives the raw Lambda event and + returns a dict of LLMTestCase field values. Bypasses default span + extraction when provided. + model: Optional model override for the metric's LLM. Can be a string + model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM + subclass instance. + timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 + (slightly under Lambda's 300s max). + """ + super().__init__(field_mapper=field_mapper, timeout=timeout) + self.metric = metric + if model is not None: + self.metric.model = model + + def validate_fields(self, fields: Dict[str, Any]) -> None: + """Validate that fields required by the metric are present.""" + required = _get_required_params(self.metric) + missing = [f for f in required if f not in fields or not fields[f]] + if missing: + metric_name = type(self.metric).__name__ + raise ValueError( + f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: + """Run the DeepEval metric and return formatted results.""" + test_case = LLMTestCase( + input=fields.get("input", ""), + actual_output=fields.get("actual_output", ""), + expected_output=fields.get("expected_output"), + context=fields.get("context"), + retrieval_context=fields.get("retrieval_context"), + ) + + self.metric.measure(test_case) + + score = self.metric.score + reason = getattr(self.metric, "reason", None) or "" + threshold = getattr(self.metric, "threshold", 0.5) + success = getattr(self.metric, "success", score is not None and score >= threshold) + label = "Pass" if success else "Fail" + + return {"value": score, "label": label, "explanation": reason} + + +def build_test_case( + parsed: ParsedEvaluationEvent, + metric: BaseMetric, + field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, +) -> LLMTestCase: + """Build a DeepEval LLMTestCase from a parsed evaluation event. + + Args: + parsed: The parsed Lambda event. + metric: The DeepEval metric (used to determine required fields). + field_mapper: Optional callable that receives the raw Lambda event fields + and returns a dict of LLMTestCase field values. Bypasses default + span extraction when provided. + + Returns: + An LLMTestCase ready for metric.measure(). + + Raises: + ValueError: If required fields for the metric cannot be populated. + """ + if field_mapper is not None: + raw_event = { + "evaluationLevel": parsed.evaluation_level, + "evaluationInput": {"sessionSpans": parsed.session_spans}, + "evaluationTarget": { + "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], + "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], + }, + "evaluationReferenceInputs": parsed.reference_inputs, + } + fields = field_mapper(raw_event) + else: + fields = extract_fields_from_spans(parsed) + + required = _get_required_params(metric) + missing = [f for f in required if f not in fields or not fields[f]] + if missing: + metric_name = type(metric).__name__ + raise ValueError( + f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) + + return LLMTestCase( + input=fields.get("input", ""), + actual_output=fields.get("actual_output", ""), + expected_output=fields.get("expected_output"), + context=fields.get("context"), + retrieval_context=fields.get("retrieval_context"), + ) + + +# Backward-compatible alias +DeepEvalHandler = DeepEvalAdapter diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py deleted file mode 100644 index 0e91bafe..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/handler.py +++ /dev/null @@ -1,135 +0,0 @@ -"""DeepEval handler that adapts AgentCore Lambda evaluation events to DeepEval metrics.""" - -import logging -import threading -from typing import Any, Callable, Dict, Optional - -from deepeval.metrics import BaseMetric - -from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( - ParsedEvaluationEvent, - build_test_case, -) - -logger = logging.getLogger(__name__) - - -class DeepEvalHandler: - """Lambda handler that runs a DeepEval metric against AgentCore evaluation events. - - Never raises unhandled exceptions — always returns a valid response dict. - - Example:: - - from deepeval.metrics import AnswerRelevancyMetric - - metric = AnswerRelevancyMetric(threshold=0.7) - handler = DeepEvalHandler(metric=metric) - - # Use as Lambda handler - def lambda_handler(event, context): - return handler(event, context) - """ - - DEFAULT_TIMEOUT = 290 - - def __init__( - self, - metric: BaseMetric, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, - model: Optional[Any] = None, - timeout: Optional[int] = None, - ): - """Initialize the handler. - - Args: - metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). - field_mapper: Optional callable that receives the raw Lambda event and - returns a dict of LLMTestCase field values. Bypasses default span - extraction when provided. - model: Optional model override for the metric's LLM. Can be a string - model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM - subclass instance. - timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 - (slightly under Lambda's 300s max). Set to None to disable. - """ - self.metric = metric - self.field_mapper = field_mapper - self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT - if model is not None: - self.metric.model = model - - def __call__(self, event: Dict[str, Any], context: Any = None) -> Dict[str, Any]: - """Handle a Lambda invocation. - - Args: - event: Raw Lambda event dict from the evaluation service. - context: Lambda context object (unused). - - Returns: - Success: {"value": float, "label": str, "explanation": str} - Error: {"errorCode": str, "errorMessage": str} - """ - try: - parsed = ParsedEvaluationEvent.from_lambda_event(event) - except (KeyError, IndexError, TypeError) as e: - logger.error("Failed to parse evaluation event: %s", e) - return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}") - - try: - test_case = build_test_case(parsed, self.metric, self.field_mapper) - except ValueError as e: - logger.error("Missing required fields: %s", e) - return _error_response("MISSING_REQUIRED_FIELD", str(e)) - - try: - self._measure_with_timeout(test_case) - except _MetricTimeout: - return _error_response( - "METRIC_TIMEOUT", - f"{type(self.metric).__name__} exceeded {self.timeout}s timeout.", - ) - except Exception as e: - logger.error("Metric measurement failed: %s", e, exc_info=True) - return _error_response("METRIC_ERROR", f"{type(self.metric).__name__} failed: {e}") - - score = self.metric.score - reason = getattr(self.metric, "reason", None) or "" - threshold = getattr(self.metric, "threshold", 0.5) - success = getattr(self.metric, "success", score is not None and score >= threshold) - label = "Pass" if success else "Fail" - - return {"value": score, "label": label, "explanation": reason} - - def _measure_with_timeout(self, test_case: Any) -> None: - """Run metric.measure with a thread-based timeout.""" - if self.timeout <= 0: - self.metric.measure(test_case) - return - - exception_holder: list = [] - - def target(): - try: - self.metric.measure(test_case) - except Exception as e: - exception_holder.append(e) - - thread = threading.Thread(target=target, daemon=True) - thread.start() - thread.join(timeout=self.timeout) - - if thread.is_alive(): - raise _MetricTimeout() - - if exception_holder: - raise exception_holder[0] - - -class _MetricTimeout(Exception): - """Raised when metric.measure exceeds the configured timeout.""" - - -def _error_response(code: str, message: str) -> Dict[str, str]: - """Build a standardized error response dict.""" - return {"errorCode": code, "errorMessage": message} diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py deleted file mode 100644 index 941afce2..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/input_mapper.py +++ /dev/null @@ -1,253 +0,0 @@ -"""Map AgentCore Lambda evaluation events to DeepEval LLMTestCase objects.""" - -import json -import logging -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional - -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase, SingleTurnParams - -logger = logging.getLogger(__name__) - -_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = { - SingleTurnParams.INPUT: "input", - SingleTurnParams.ACTUAL_OUTPUT: "actual_output", - SingleTurnParams.EXPECTED_OUTPUT: "expected_output", - SingleTurnParams.CONTEXT: "context", - SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context", -} - -_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { - "AnswerRelevancyMetric": ["input", "actual_output"], - "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"], - "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"], - "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"], - "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"], - "HallucinationMetric": ["input", "actual_output", "context"], - "BiasMetric": ["input", "actual_output"], - "ToxicityMetric": ["input", "actual_output"], - "GEval": ["input", "actual_output"], - "SummarizationMetric": ["input", "actual_output"], -} - - -@dataclass -class ParsedEvaluationEvent: - """Parsed representation of the AgentCore Lambda evaluation event.""" - - evaluation_level: str - session_spans: List[Dict[str, Any]] - target_trace_id: Optional[str] = None - target_span_id: Optional[str] = None - reference_inputs: List[Dict[str, Any]] = field(default_factory=list) - - @classmethod - def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent": - """Parse a raw Lambda event dict into a structured object. - - Args: - event: Raw Lambda event payload from the evaluation service. - - Returns: - ParsedEvaluationEvent with extracted fields. - - Raises: - KeyError: If required top-level fields are missing. - """ - evaluation_input = event["evaluationInput"] - target = event.get("evaluationTarget") or {} - trace_ids = target.get("traceIds") or [] - span_ids = target.get("spanIds") or [] - - return cls( - evaluation_level=event["evaluationLevel"], - session_spans=evaluation_input["sessionSpans"], - target_trace_id=trace_ids[0] if trace_ids else None, - target_span_id=span_ids[0] if span_ids else None, - reference_inputs=event.get("evaluationReferenceInputs") or [], - ) - - -def _get_required_params(metric: BaseMetric) -> List[str]: - """Determine which LLMTestCase fields a metric requires. - - Fallback chain: - 1. metric._required_params (DeepEval internal attribute) - 2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name - 3. metric.evaluation_params (GEval special case) - 4. Default: ["input", "actual_output"] - """ - if hasattr(metric, "_required_params") and metric._required_params: - params = metric._required_params - if all(p in _PARAM_TO_FIELD for p in params): - return [_PARAM_TO_FIELD[p] for p in params] - - class_name = type(metric).__name__ - if class_name in _METRIC_REQUIRED_PARAMS: - return _METRIC_REQUIRED_PARAMS[class_name] - - if hasattr(metric, "evaluation_params") and metric.evaluation_params: - params = metric.evaluation_params - return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] - - return ["input", "actual_output"] - - -def _get_message_content(message: Any) -> str: - """Extract text content from a message object. - - Message content can be a dict with a "content" or "message" key, or a plain string. - Handles one level of nesting (e.g. {"content": {"content": "text"}}). - """ - if isinstance(message, str): - return message - if isinstance(message, dict): - for key in ("content", "message"): - if key in message: - val = message[key] - if isinstance(val, str): - return val - if isinstance(val, dict): - return _get_message_content(val) - return str(val) - return "" - - -def _extract_fields_from_spans( - parsed: ParsedEvaluationEvent, -) -> Dict[str, Any]: - """Extract LLMTestCase fields from AgentCore session spans. - - Parses _eval_log_records from span attributes, filters by target_trace_id, - and extracts messages by role: - - input ← input messages where role=="user" - - actual_output ← output messages where role=="assistant" - - retrieval_context ← output messages where role=="tool" - - expected_output ← evaluationReferenceInputs[0].expectedResponse - """ - user_messages: List[str] = [] - assistant_messages: List[str] = [] - tool_messages: List[str] = [] - - for span in parsed.session_spans: - attributes = span.get("attributes", {}) - log_records_raw = attributes.get("_eval_log_records") - if not log_records_raw: - continue - - if isinstance(log_records_raw, str): - try: - log_records = json.loads(log_records_raw) - except (json.JSONDecodeError, TypeError): - logger.debug("Failed to parse _eval_log_records as JSON") - continue - else: - log_records = log_records_raw - - if not isinstance(log_records, list): - continue - - for record in log_records: - if not isinstance(record, dict): - continue - - if parsed.target_trace_id: - record_trace_id = record.get("traceId") or record.get("trace_id") - if record_trace_id and record_trace_id != parsed.target_trace_id: - continue - - body = record.get("body", {}) - if not isinstance(body, dict): - continue - - input_data = body.get("input", {}) - if isinstance(input_data, dict): - for msg in input_data.get("messages", []): - if not isinstance(msg, dict): - continue - role = msg.get("role", "") - content = _get_message_content(msg) - if role == "user" and content: - user_messages.append(content) - - output_data = body.get("output", {}) - if isinstance(output_data, dict): - for msg in output_data.get("messages", []): - if not isinstance(msg, dict): - continue - role = msg.get("role", "") - content = _get_message_content(msg) - if role == "assistant" and content: - assistant_messages.append(content) - elif role == "tool" and content: - tool_messages.append(content) - - fields: Dict[str, Any] = {} - - if user_messages: - fields["input"] = "\n".join(user_messages) - if assistant_messages: - fields["actual_output"] = "\n".join(assistant_messages) - if tool_messages: - fields["retrieval_context"] = tool_messages - fields["context"] = tool_messages - - if parsed.reference_inputs: - expected = parsed.reference_inputs[0].get("expectedResponse") - if expected: - fields["expected_output"] = expected - - return fields - - -def build_test_case( - parsed: ParsedEvaluationEvent, - metric: BaseMetric, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, -) -> LLMTestCase: - """Build a DeepEval LLMTestCase from a parsed evaluation event. - - Args: - parsed: The parsed Lambda event. - metric: The DeepEval metric (used to determine required fields). - field_mapper: Optional callable that receives the raw Lambda event fields - and returns a dict of LLMTestCase field values. Bypasses default - span extraction when provided. - - Returns: - An LLMTestCase ready for metric.measure(). - - Raises: - ValueError: If required fields for the metric cannot be populated. - """ - if field_mapper is not None: - raw_event = { - "evaluationLevel": parsed.evaluation_level, - "evaluationInput": {"sessionSpans": parsed.session_spans}, - "evaluationTarget": { - "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], - "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], - }, - "evaluationReferenceInputs": parsed.reference_inputs, - } - fields = field_mapper(raw_event) - else: - fields = _extract_fields_from_spans(parsed) - - required = _get_required_params(metric) - missing = [f for f in required if f not in fields or not fields[f]] - if missing: - metric_name = type(metric).__name__ - raise ValueError( - f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " - f"Provide a field_mapper or ensure spans contain the necessary data." - ) - - return LLMTestCase( - input=fields.get("input", ""), - actual_output=fields.get("actual_output", ""), - expected_output=fields.get("expected_output"), - context=fields.get("context"), - retrieval_context=fields.get("retrieval_context"), - ) diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py new file mode 100644 index 00000000..17f674bd --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py @@ -0,0 +1,217 @@ +"""Tests for AutoevalsAdapter.""" + +import json +import time +from unittest.mock import MagicMock + +import pytest + +from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter + + +def _make_event( + level="TRACE", + trace_ids=None, + spans=None, + reference_inputs=None, +): + """Build a raw Lambda event dict for testing.""" + if spans is None: + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ] + spans = [ + { + "traceId": "abc123", + "spanId": "span1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + + event = { + "schemaVersion": "1.0", + "evaluationLevel": level, + "evaluationInput": {"sessionSpans": spans}, + "evaluationTarget": {}, + } + if trace_ids is not None: + event["evaluationTarget"]["traceIds"] = trace_ids + if reference_inputs is not None: + event["evaluationReferenceInputs"] = reference_inputs + return event + + +def _mock_scorer(score=0.9, rationale="Good answer"): + """Create a mock Autoevals scorer.""" + scorer = MagicMock() + type(scorer).__name__ = "MockScorer" + + result = MagicMock() + result.score = score + result.metadata = {"rationale": rationale} + + scorer.eval = MagicMock(return_value=result) + return scorer + + +class TestAutoevalsAdapterSuccess: + def test_returns_pass_when_score_above_half(self): + scorer = _mock_scorer(score=0.8) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["value"] == 0.8 + assert result["label"] == "Pass" + assert result["explanation"] == "Good answer" + + def test_returns_fail_when_score_below_half(self): + scorer = _mock_scorer(score=0.3) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["value"] == 0.3 + assert result["label"] == "Fail" + + def test_scorer_eval_called_with_input_and_output(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + adapter(_make_event()) + + scorer.eval.assert_called_once() + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["input"] == "What is AI?" + assert call_kwargs["output"] == "AI is artificial intelligence." + + def test_expected_output_passed_as_expected(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + refs = [{"expectedResponse": "AI stands for artificial intelligence."}] + result = adapter(_make_event(reference_inputs=refs)) + + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["expected"] == "AI stands for artificial intelligence." + + def test_no_expected_output_omits_expected_kwarg(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + adapter(_make_event()) + + call_kwargs = scorer.eval.call_args[1] + assert "expected" not in call_kwargs + + def test_custom_field_mapper(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter( + scorer=scorer, + field_mapper=lambda event: { + "input": "custom input", + "actual_output": "custom output", + }, + ) + + result = adapter(_make_event()) + + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["input"] == "custom input" + assert call_kwargs["output"] == "custom output" + + +class TestAutoevalsAdapterErrors: + def test_invalid_event_returns_error(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter({}) + + assert result["errorCode"] == "INVALID_EVENT" + + def test_missing_input_returns_error(self): + log_records = [ + { + "body": { + "output": {"messages": [{"role": "assistant", "content": "answer"}]}, + } + } + ] + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event(spans=spans)) + + assert result["errorCode"] == "MISSING_REQUIRED_FIELD" + assert "input" in result["errorMessage"] + + def test_scorer_exception_returns_error(self): + scorer = _mock_scorer() + scorer.eval = MagicMock(side_effect=RuntimeError("API error")) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["errorCode"] == "METRIC_ERROR" + assert "API error" in result["errorMessage"] + + def test_never_raises_on_bad_input(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + for bad_input in [None, [], "string", 42]: + result = adapter(bad_input) + assert "errorCode" in result + + +class TestAutoevalsAdapterTimeout: + def test_timeout_returns_error(self): + scorer = _mock_scorer() + scorer.eval = MagicMock(side_effect=lambda **kw: time.sleep(5)) + adapter = AutoevalsAdapter(scorer=scorer, timeout=1) + + result = adapter(_make_event()) + + assert result["errorCode"] == "METRIC_TIMEOUT" + + def test_default_timeout_is_290(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + assert adapter.timeout == 290 + + +class TestAutoevalsAdapterEdgeCases: + def test_score_none_returns_fail(self): + scorer = _mock_scorer(score=None) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["label"] == "Fail" + + def test_no_metadata_returns_empty_explanation(self): + scorer = MagicMock() + type(scorer).__name__ = "MockScorer" + result_obj = MagicMock(spec=[]) + result_obj.score = 0.9 + scorer.eval = MagicMock(return_value=result_obj) + + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_event()) + + assert result["explanation"] == "" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py index 77961f14..67bfda3d 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py @@ -1,4 +1,4 @@ -"""Tests for DeepEvalHandler.""" +"""Tests for DeepEvalHandler and DeepEvalAdapter.""" import json import time @@ -6,7 +6,9 @@ import pytest -from bedrock_agentcore.evaluation.integrations.deepeval.handler import DeepEvalHandler +from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler +from bedrock_agentcore.evaluation.integrations.base import BaseAdapter +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput def _make_event( @@ -317,3 +319,109 @@ def test_metric_exception_still_propagates_with_timeout(self): assert result["errorCode"] == "METRIC_ERROR" assert "LLM error" in result["errorMessage"] + + +class TestBackwardCompatibility: + def test_handler_is_alias_for_adapter(self): + assert DeepEvalHandler is DeepEvalAdapter + + def test_adapter_is_subclass_of_base(self): + assert issubclass(DeepEvalAdapter, BaseAdapter) + + def test_import_from_init(self): + from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalHandler as H + from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalAdapter as A + + assert H is A + + def test_handler_works_same_as_before(self): + metric = _mock_metric(score=0.9, threshold=0.7) + handler = DeepEvalHandler(metric=metric) + + result = handler(_make_event()) + + assert result["value"] == 0.9 + assert result["label"] == "Pass" + + +class TestEvaluatorInputAcceptance: + def _make_evaluator_input(self): + log_records = [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "Hello"}]}, + "output": {"messages": [{"role": "assistant", "content": "Hi there"}]}, + } + } + ] + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + return EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="t1", + target_span_id=None, + ) + + def test_accepts_evaluator_input(self): + metric = _mock_metric(score=0.95) + handler = DeepEvalHandler(metric=metric) + + result = handler(self._make_evaluator_input()) + + assert result["value"] == 0.95 + assert result["label"] == "Pass" + + def test_evaluator_input_extracts_fields_correctly(self): + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + handler(self._make_evaluator_input()) + + test_case = metric.measure.call_args[0][0] + assert test_case.input == "Hello" + assert test_case.actual_output == "Hi there" + + def test_evaluator_input_with_trace_id_filtering(self): + log_records = [ + { + "traceId": "target", + "body": { + "input": {"messages": [{"role": "user", "content": "relevant"}]}, + "output": {"messages": [{"role": "assistant", "content": "yes"}]}, + }, + }, + { + "traceId": "other", + "body": { + "input": {"messages": [{"role": "user", "content": "irrelevant"}]}, + "output": {"messages": [{"role": "assistant", "content": "no"}]}, + }, + }, + ] + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"_eval_log_records": json.dumps(log_records)}, + } + ] + evaluator_input = EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="target", + ) + + metric = _mock_metric() + handler = DeepEvalHandler(metric=metric) + + handler(evaluator_input) + + test_case = metric.measure.call_args[0][0] + assert test_case.input == "relevant" + assert test_case.actual_output == "yes" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py index 1d90a689..2d6fbaea 100644 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py @@ -1,4 +1,4 @@ -"""Tests for deepeval input_mapper module.""" +"""Tests for deepeval input mapping and test case building.""" import json from unittest.mock import MagicMock @@ -6,9 +6,11 @@ import pytest from deepeval.test_case import SingleTurnParams -from bedrock_agentcore.evaluation.integrations.deepeval.input_mapper import ( +from bedrock_agentcore.evaluation.integrations.base import ( ParsedEvaluationEvent, - _extract_fields_from_spans, + extract_fields_from_spans as _extract_fields_from_spans, +) +from bedrock_agentcore.evaluation.integrations.deepeval.adapter import ( _get_required_params, build_test_case, ) From 8627ab09409cf69c52fe937e1442ef75635df68e Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Sat, 27 Jun 2026 08:35:38 -0700 Subject: [PATCH 11/13] Major refactor: move to custom_code_based_evaluators, add span parser layer, simplify per TJ/Irene feedback --- pyproject.toml | 6 + .../third_party/__init__.py | 5 + .../third_party/autoevals/__init__.py | 5 + .../third_party}/autoevals/adapter.py | 31 +- .../third_party/base.py | 110 ++++ .../third_party/deepeval/__init__.py | 5 + .../third_party/deepeval/adapter.py | 78 +++ .../third_party/span_parsers/__init__.py | 8 + .../third_party/span_parsers/base.py | 62 ++ .../third_party/span_parsers/common.py | 145 +++++ .../third_party/span_parsers/openinference.py | 27 + .../span_parsers/otel_langchain.py | 27 + .../third_party/span_parsers/strands.py | 26 + .../evaluation/integrations/__init__.py | 4 - .../integrations/autoevals/__init__.py | 5 - .../evaluation/integrations/base.py | 302 --------- .../integrations/deepeval/__init__.py | 5 - .../integrations/deepeval/adapter.py | 189 ------ .../third_party}/__init__.py | 0 .../third_party/autoevals}/__init__.py | 0 .../third_party/autoevals/test_adapter.py | 201 ++++++ .../third_party/deepeval/__init__.py | 0 .../third_party/deepeval/test_adapter.py | 218 +++++++ .../third_party/span_parsers/__init__.py | 0 .../span_parsers/test_span_parsers.py | 194 ++++++ .../integrations/autoevals/test_adapter.py | 217 ------- .../integrations/deepeval/test_handler.py | 427 ------------- .../deepeval/test_input_mapper.py | 581 ------------------ .../evaluation/test_third_party_adapters.py | 171 ++++++ 29 files changed, 1303 insertions(+), 1746 deletions(-) create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py rename src/bedrock_agentcore/evaluation/{integrations => custom_code_based_evaluators/third_party}/autoevals/adapter.py (63%) create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py create mode 100644 src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/base.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py delete mode 100644 src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py rename tests/bedrock_agentcore/evaluation/{integrations/autoevals => custom_code_based_evaluators/third_party}/__init__.py (100%) rename tests/bedrock_agentcore/evaluation/{integrations/deepeval => custom_code_based_evaluators/third_party/autoevals}/__init__.py (100%) create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py create mode 100644 tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py delete mode 100644 tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py delete mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py delete mode 100644 tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py create mode 100644 tests_integ/evaluation/test_third_party_adapters.py diff --git a/pyproject.toml b/pyproject.toml index 61520a5b..b1fc5e90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -173,3 +173,9 @@ simulation = [ datasets = [ "requests>=2.31.0", ] +deepeval = [ + "deepeval>=2.0.0", +] +autoevals = [ + "autoevals>=0.0.50", +] diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py new file mode 100644 index 00000000..06ba3d0a --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py @@ -0,0 +1,5 @@ +"""Third-party evaluation adapters for AgentCore code-based evaluators.""" + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter + +__all__ = ["BaseAdapter"] diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py new file mode 100644 index 00000000..40e25fc1 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py @@ -0,0 +1,5 @@ +"""Autoevals adapter for AgentCore code-based evaluators.""" + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter + +__all__ = ["AutoevalsAdapter"] diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py similarity index 63% rename from src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py rename to src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py index fe89435e..fa2acba3 100644 --- a/src/bedrock_agentcore/evaluation/integrations/autoevals/adapter.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py @@ -1,9 +1,10 @@ -"""Autoevals adapter for AgentCore evaluation integrations.""" +"""Autoevals adapter for AgentCore code-based evaluators.""" import logging from typing import Any, Callable, Dict, Optional -from bedrock_agentcore.evaluation.integrations.base import BaseAdapter +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter logger = logging.getLogger(__name__) @@ -14,31 +15,29 @@ class AutoevalsAdapter(BaseAdapter): Example:: from autoevals import Factuality + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter scorer = Factuality() - handler = AutoevalsAdapter(scorer=scorer) - - # Use as Lambda handler - def lambda_handler(event, context): - return handler(event, context) + adapter = AutoevalsAdapter(scorer=scorer) """ def __init__( self, scorer: Any, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, - timeout: Optional[int] = None, + field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None, + threshold: float = 0.5, ): """Initialize the adapter. Args: scorer: An Autoevals scorer instance (e.g. Factuality(), ClosedQA()). - field_mapper: Optional callable that receives the raw Lambda event and - returns a dict of field values. Bypasses default span extraction. - timeout: Maximum seconds to allow for scorer.eval(). Defaults to 290. + field_mapper: Optional callable that receives the EvaluatorInput and + returns a dict of field values. Bypasses default span parsing. + threshold: Score threshold for Pass/Fail determination. Defaults to 0.5. """ - super().__init__(field_mapper=field_mapper, timeout=timeout) + super().__init__(field_mapper=field_mapper) self.scorer = scorer + self.threshold = threshold def validate_fields(self, fields: Dict[str, Any]) -> None: """Validate that input and actual_output are present.""" @@ -54,7 +53,7 @@ def validate_fields(self, fields: Dict[str, Any]) -> None: f"Provide a field_mapper or ensure spans contain the necessary data." ) - def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: + def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput: """Run the Autoevals scorer and return formatted results.""" kwargs: Dict[str, Any] = { "input": fields.get("input", ""), @@ -66,7 +65,7 @@ def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: result = self.scorer.eval(**kwargs) score = result.score - label = "Pass" if score is not None and score >= 0.5 else "Fail" + label = "Pass" if score is not None and score >= self.threshold else "Fail" explanation = getattr(result, "metadata", {}).get("rationale", "") if hasattr(result, "metadata") else "" - return {"value": score, "label": label, "explanation": explanation} + return EvaluatorOutput(value=score, label=label, explanation=explanation) diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py new file mode 100644 index 00000000..1f28d2a5 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py @@ -0,0 +1,110 @@ +"""Base adapter for third-party evaluation framework integrations.""" + +import abc +import logging +from typing import Any, Callable, Dict, List, Optional, Union + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import ( + SpanParseResult, + parse_spans, +) + +logger = logging.getLogger(__name__) + + +class BaseAdapter(abc.ABC): + """Base adapter for third-party evaluation framework integrations. + + Accepts an EvaluatorInput (from the code_based_evaluators flow), + extracts fields from spans using the built-in parser layer, runs the + evaluation via execute(), and returns an EvaluatorOutput. + + Never raises unhandled exceptions — always returns a valid EvaluatorOutput. + """ + + def __init__( + self, + field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None, + ): + """Initialize the adapter. + + Args: + field_mapper: Optional callable that receives the EvaluatorInput and + returns a dict of field values. Bypasses default span parsing + when provided. + """ + self.field_mapper = field_mapper + + def __call__(self, evaluator_input: EvaluatorInput, context: Any = None) -> EvaluatorOutput: + """Handle an evaluation invocation. + + Args: + evaluator_input: Parsed EvaluatorInput from the code-based evaluator flow. + context: Lambda context object (unused). + + Returns: + EvaluatorOutput with score, label, and explanation or error fields. + """ + try: + fields = self._extract_fields(evaluator_input) + except ValueError as e: + logger.error("Field extraction failed: %s", e) + return EvaluatorOutput( + label="Error", + errorCode="FIELD_EXTRACTION_ERROR", + errorMessage=str(e), + ) + + try: + self.validate_fields(fields) + except ValueError as e: + logger.error("Validation failed: %s", e) + return EvaluatorOutput( + label="Error", + errorCode="MISSING_REQUIRED_FIELD", + errorMessage=str(e), + ) + + try: + return self.execute(fields) + except Exception as e: + logger.error("Execution failed: %s", e, exc_info=True) + return EvaluatorOutput( + label="Error", + errorCode="METRIC_ERROR", + errorMessage=f"{type(self).__name__} failed: {e}", + ) + + def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]: + """Extract fields from the EvaluatorInput.""" + if self.field_mapper is not None: + return self.field_mapper(evaluator_input) + + reference_inputs = getattr(evaluator_input, "reference_inputs", None) + result = parse_spans(evaluator_input.session_spans, reference_inputs) + return result.to_dict() + + @abc.abstractmethod + def validate_fields(self, fields: Dict[str, Any]) -> None: + """Validate that required fields are present. + + Each adapter must explicitly declare its validation behavior. + + Args: + fields: Extracted field dict. + + Raises: + ValueError: If required fields are missing. + """ + + @abc.abstractmethod + def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput: + """Run the evaluation and return an EvaluatorOutput. + + Args: + fields: Extracted field dict with keys like "input", "actual_output", etc. + + Returns: + EvaluatorOutput with evaluation results. + """ diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py new file mode 100644 index 00000000..99cf10d5 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py @@ -0,0 +1,5 @@ +"""DeepEval adapter for AgentCore code-based evaluators.""" + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter + +__all__ = ["DeepEvalAdapter"] diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py new file mode 100644 index 00000000..725584ef --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py @@ -0,0 +1,78 @@ +"""DeepEval adapter for AgentCore code-based evaluators.""" + +import logging +from typing import Any, Callable, Dict, Optional + +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.base import BaseAdapter + +logger = logging.getLogger(__name__) + + +class DeepEvalAdapter(BaseAdapter): + """Adapter that runs a DeepEval metric against AgentCore evaluation events. + + Example:: + + from deepeval.metrics import AnswerRelevancyMetric + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter + + metric = AnswerRelevancyMetric(threshold=0.7) + adapter = DeepEvalAdapter(metric=metric) + """ + + def __init__( + self, + metric: BaseMetric, + field_mapper: Optional[Callable[[EvaluatorInput], Dict[str, Any]]] = None, + model: Optional[Any] = None, + ): + """Initialize the adapter. + + Args: + metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). + field_mapper: Optional callable that receives the EvaluatorInput and + returns a dict of LLMTestCase field values. Bypasses default span + parsing when provided. + model: Optional model override for the metric's LLM. + """ + super().__init__(field_mapper=field_mapper) + self.metric = metric + if model is not None: + self.metric.model = model + + def validate_fields(self, fields: Dict[str, Any]) -> None: + """No pre-validation; let DeepEval raise on missing params.""" + + def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput: + """Run the DeepEval metric and return formatted results.""" + test_case = LLMTestCase( + input=fields.get("input", ""), + actual_output=fields.get("actual_output", ""), + expected_output=fields.get("expected_output"), + context=fields.get("context"), + retrieval_context=fields.get("retrieval_context"), + ) + + try: + self.metric.measure(test_case) + except Exception as e: + error_type = type(e).__name__ + if "MissingTestCaseParams" in error_type or "missing" in str(e).lower(): + return EvaluatorOutput( + label="Error", + errorCode="MISSING_REQUIRED_FIELD", + errorMessage=f"{type(self.metric).__name__} requires fields not available: {e}", + ) + raise + + score = self.metric.score + reason = getattr(self.metric, "reason", None) or "" + threshold = getattr(self.metric, "threshold", 0.5) + success = getattr(self.metric, "success", score is not None and score >= threshold) + label = "Pass" if success else "Fail" + + return EvaluatorOutput(value=score, label=label, explanation=reason) diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py new file mode 100644 index 00000000..5388df83 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py @@ -0,0 +1,8 @@ +"""Span parsers for extracting evaluation fields from Agent SDK trace formats.""" + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.base import ( + SpanParseResult, + parse_spans, +) + +__all__ = ["SpanParseResult", "parse_spans"] diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py new file mode 100644 index 00000000..3b88ff11 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py @@ -0,0 +1,62 @@ +"""Base span parsing logic and orchestration across format-specific parsers.""" + +import logging +from typing import Any, Dict, List, Optional + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import ( + SpanParseResult, +) +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.strands import ( + parse_strands_spans, +) +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.otel_langchain import ( + parse_otel_langchain_spans, +) +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.openinference import ( + parse_openinference_spans, +) + +logger = logging.getLogger(__name__) + + +_PARSERS = [ + parse_strands_spans, + parse_otel_langchain_spans, + parse_openinference_spans, +] + + +def parse_spans( + session_spans: List[Dict[str, Any]], + reference_inputs: Optional[List[Dict[str, Any]]] = None, +) -> SpanParseResult: + """Parse session spans using the first matching agent-level parser. + + Iterates through format-specific parsers (Strands, OTel LangChain, + OpenInference) and returns the result from the first one that + successfully extracts data. + + Args: + session_spans: Raw ADOT span dicts from the evaluation service. + reference_inputs: Optional reference inputs for expected_output. + + Returns: + SpanParseResult with extracted fields. + + Raises: + ValueError: If no parser can extract data from the spans. + """ + for parser in _PARSERS: + result = parser(session_spans) + if result is not None: + if reference_inputs: + expected = reference_inputs[0].get("expectedResponse") + if expected: + result.expected_output = expected + return result + + raise ValueError( + "Could not extract evaluation fields from spans. " + "No agent-level span with gen_ai.operation.name=='invoke_agent' and " + "valid span_events found. Provide a field_mapper for custom formats." + ) diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py new file mode 100644 index 00000000..6d69dbc6 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py @@ -0,0 +1,145 @@ +"""Common span parsing utilities shared across format-specific parsers.""" + +import json +import logging +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + + +@dataclass +class SpanParseResult: + """Result of parsing spans into evaluation fields.""" + + input: Optional[str] = None + actual_output: Optional[str] = None + retrieval_context: Optional[List[str]] = None + context: Optional[List[str]] = None + expected_output: Optional[str] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dict, omitting None values.""" + result: Dict[str, Any] = {} + if self.input is not None: + result["input"] = self.input + if self.actual_output is not None: + result["actual_output"] = self.actual_output + if self.retrieval_context is not None: + result["retrieval_context"] = self.retrieval_context + if self.context is not None: + result["context"] = self.context + if self.expected_output is not None: + result["expected_output"] = self.expected_output + return result + + +def _get_message_content(message: Any) -> str: + """Extract text content from a message object.""" + if isinstance(message, str): + return message + if isinstance(message, dict): + for key in ("content", "message"): + if key in message: + val = message[key] + if isinstance(val, str): + return val + if isinstance(val, dict): + return _get_message_content(val) + if isinstance(val, list): + parts = [] + for item in val: + if isinstance(item, str): + parts.append(item) + elif isinstance(item, dict) and "text" in item: + parts.append(item["text"]) + if parts: + return "\n".join(parts) + return str(val) + return "" + + +def _parse_span_event_body(body: Any) -> Dict[str, Any]: + """Parse the body of a span event, handling both dict and JSON string.""" + if isinstance(body, str): + try: + return json.loads(body) + except (json.JSONDecodeError, TypeError): + return {} + if isinstance(body, dict): + return body + return {} + + +def extract_from_agent_span_events( + session_spans: List[Dict[str, Any]], +) -> Optional[SpanParseResult]: + """Extract evaluation fields from agent-level span events. + + Looks for spans where attributes.gen_ai.operation.name == "invoke_agent", + then inspects span_events for input/output messages. + + Args: + session_spans: Raw ADOT span dicts. + + Returns: + SpanParseResult if agent span with valid events found, None otherwise. + """ + user_messages: List[str] = [] + assistant_messages: List[str] = [] + tool_messages: List[str] = [] + + found_agent_span = False + + for span in session_spans: + attributes = span.get("attributes", {}) + operation_name = attributes.get("gen_ai.operation.name") + if operation_name != "invoke_agent": + continue + + found_agent_span = True + span_events = span.get("span_events", []) + + for event in span_events: + body = _parse_span_event_body(event.get("body")) + if not body: + continue + + input_data = body.get("input", {}) + if isinstance(input_data, dict): + for msg in input_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "user" and content: + user_messages.append(content) + + output_data = body.get("output", {}) + if isinstance(output_data, dict): + for msg in output_data.get("messages", []): + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = _get_message_content(msg) + if role == "assistant" and content: + assistant_messages.append(content) + elif role == "tool" and content: + tool_messages.append(content) + + if not found_agent_span: + return None + + if not user_messages and not assistant_messages: + return None + + result = SpanParseResult() + if user_messages: + result.input = user_messages[0] + if assistant_messages: + result.actual_output = assistant_messages[-1] + if tool_messages: + result.retrieval_context = tool_messages + result.context = tool_messages + + return result diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py new file mode 100644 index 00000000..e500740e --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/openinference.py @@ -0,0 +1,27 @@ +"""OpenInference LangChain span parser.""" + +import logging +from typing import Any, Dict, List, Optional + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import ( + SpanParseResult, + extract_from_agent_span_events, +) + +logger = logging.getLogger(__name__) + + +def parse_openinference_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]: + """Parse spans from OpenInference LangChain instrumentation format. + + Uses the same agent-level semantic signal (gen_ai.operation.name == "invoke_agent") + and span_events extraction. OpenInference-specific divergence can be added here + as schemas evolve. + + Args: + session_spans: Raw ADOT span dicts. + + Returns: + SpanParseResult if agent spans found, None otherwise. + """ + return extract_from_agent_span_events(session_spans) diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py new file mode 100644 index 00000000..f1e211c5 --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/otel_langchain.py @@ -0,0 +1,27 @@ +"""OTel LangChain span parser.""" + +import logging +from typing import Any, Dict, List, Optional + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import ( + SpanParseResult, + extract_from_agent_span_events, +) + +logger = logging.getLogger(__name__) + + +def parse_otel_langchain_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]: + """Parse spans from OTel LangChain instrumentation format. + + Uses the same agent-level semantic signal (gen_ai.operation.name == "invoke_agent") + and span_events extraction. LangChain-specific divergence can be added here + as schemas evolve. + + Args: + session_spans: Raw ADOT span dicts. + + Returns: + SpanParseResult if agent spans found, None otherwise. + """ + return extract_from_agent_span_events(session_spans) diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py new file mode 100644 index 00000000..3789ad9c --- /dev/null +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/strands.py @@ -0,0 +1,26 @@ +"""Strands Agent SDK span parser.""" + +import logging +from typing import Any, Dict, List, Optional + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers.common import ( + SpanParseResult, + extract_from_agent_span_events, +) + +logger = logging.getLogger(__name__) + + +def parse_strands_spans(session_spans: List[Dict[str, Any]]) -> Optional[SpanParseResult]: + """Parse spans from Strands Agent SDK format. + + Looks for spans with gen_ai.operation.name == "invoke_agent" and + extracts input/output from span_events. + + Args: + session_spans: Raw ADOT span dicts. + + Returns: + SpanParseResult if agent spans found, None otherwise. + """ + return extract_from_agent_span_events(session_spans) diff --git a/src/bedrock_agentcore/evaluation/integrations/__init__.py b/src/bedrock_agentcore/evaluation/integrations/__init__.py index a1ff7691..33048d5d 100644 --- a/src/bedrock_agentcore/evaluation/integrations/__init__.py +++ b/src/bedrock_agentcore/evaluation/integrations/__init__.py @@ -1,5 +1 @@ """AgentCore Evaluation integrations.""" - -from bedrock_agentcore.evaluation.integrations.base import BaseAdapter, ParsedEvaluationEvent - -__all__ = ["BaseAdapter", "ParsedEvaluationEvent"] diff --git a/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py deleted file mode 100644 index 0bc3b4ff..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Autoevals integration for AgentCore Evaluation.""" - -from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter - -__all__ = ["AutoevalsAdapter"] diff --git a/src/bedrock_agentcore/evaluation/integrations/base.py b/src/bedrock_agentcore/evaluation/integrations/base.py deleted file mode 100644 index a10f6606..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/base.py +++ /dev/null @@ -1,302 +0,0 @@ -"""Base adapter for AgentCore evaluation integrations.""" - -import abc -import json -import logging -import threading -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Union - -from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput - -logger = logging.getLogger(__name__) - - -@dataclass -class ParsedEvaluationEvent: - """Parsed representation of the AgentCore Lambda evaluation event.""" - - evaluation_level: str - session_spans: List[Dict[str, Any]] - target_trace_id: Optional[str] = None - target_span_id: Optional[str] = None - reference_inputs: List[Dict[str, Any]] = field(default_factory=list) - - @classmethod - def from_lambda_event(cls, event: Dict[str, Any]) -> "ParsedEvaluationEvent": - """Parse a raw Lambda event dict into a structured object. - - Args: - event: Raw Lambda event payload from the evaluation service. - - Returns: - ParsedEvaluationEvent with extracted fields. - - Raises: - KeyError: If required top-level fields are missing. - """ - evaluation_input = event["evaluationInput"] - target = event.get("evaluationTarget") or {} - trace_ids = target.get("traceIds") or [] - span_ids = target.get("spanIds") or [] - - return cls( - evaluation_level=event["evaluationLevel"], - session_spans=evaluation_input["sessionSpans"], - target_trace_id=trace_ids[0] if trace_ids else None, - target_span_id=span_ids[0] if span_ids else None, - reference_inputs=event.get("evaluationReferenceInputs") or [], - ) - - -def _get_message_content(message: Any) -> str: - """Extract text content from a message object. - - Message content can be a dict with a "content" or "message" key, or a plain string. - Handles one level of nesting (e.g. {"content": {"content": "text"}}). - """ - if isinstance(message, str): - return message - if isinstance(message, dict): - for key in ("content", "message"): - if key in message: - val = message[key] - if isinstance(val, str): - return val - if isinstance(val, dict): - return _get_message_content(val) - return str(val) - return "" - - -def extract_fields_from_spans( - parsed: ParsedEvaluationEvent, -) -> Dict[str, Any]: - """Extract evaluation fields from AgentCore session spans. - - Parses _eval_log_records from span attributes, filters by target_trace_id, - and extracts messages by role: - - input ← input messages where role=="user" - - actual_output ← output messages where role=="assistant" - - retrieval_context ← output messages where role=="tool" - - context ← same as retrieval_context - - expected_output ← evaluationReferenceInputs[0].expectedResponse - """ - user_messages: List[str] = [] - assistant_messages: List[str] = [] - tool_messages: List[str] = [] - - for span in parsed.session_spans: - attributes = span.get("attributes", {}) - log_records_raw = attributes.get("_eval_log_records") - if not log_records_raw: - continue - - if isinstance(log_records_raw, str): - try: - log_records = json.loads(log_records_raw) - except (json.JSONDecodeError, TypeError): - logger.debug("Failed to parse _eval_log_records as JSON") - continue - else: - log_records = log_records_raw - - if not isinstance(log_records, list): - continue - - for record in log_records: - if not isinstance(record, dict): - continue - - if parsed.target_trace_id: - record_trace_id = record.get("traceId") or record.get("trace_id") - if record_trace_id and record_trace_id != parsed.target_trace_id: - continue - - body = record.get("body", {}) - if not isinstance(body, dict): - continue - - input_data = body.get("input", {}) - if isinstance(input_data, dict): - for msg in input_data.get("messages", []): - if not isinstance(msg, dict): - continue - role = msg.get("role", "") - content = _get_message_content(msg) - if role == "user" and content: - user_messages.append(content) - - output_data = body.get("output", {}) - if isinstance(output_data, dict): - for msg in output_data.get("messages", []): - if not isinstance(msg, dict): - continue - role = msg.get("role", "") - content = _get_message_content(msg) - if role == "assistant" and content: - assistant_messages.append(content) - elif role == "tool" and content: - tool_messages.append(content) - - fields: Dict[str, Any] = {} - - if user_messages: - fields["input"] = "\n".join(user_messages) - if assistant_messages: - fields["actual_output"] = "\n".join(assistant_messages) - if tool_messages: - fields["retrieval_context"] = tool_messages - fields["context"] = tool_messages - - if parsed.reference_inputs: - expected = parsed.reference_inputs[0].get("expectedResponse") - if expected: - fields["expected_output"] = expected - - return fields - - -class _ExecutionTimeout(Exception): - """Raised when execution exceeds the configured timeout.""" - - -def _error_response(code: str, message: str) -> Dict[str, str]: - """Build a standardized error response dict.""" - return {"errorCode": code, "errorMessage": message} - - -class BaseAdapter(abc.ABC): - """Base adapter for evaluation framework integrations. - - Subclasses only need to implement execute(fields) which runs the actual - evaluation logic and returns (score, label, explanation). - - Never raises unhandled exceptions — always returns a valid response dict. - """ - - DEFAULT_TIMEOUT = 290 - - def __init__( - self, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, - timeout: Optional[int] = None, - ): - """Initialize the adapter. - - Args: - field_mapper: Optional callable that receives the raw Lambda event and - returns a dict of field values. Bypasses default span extraction. - timeout: Maximum seconds to allow for execute(). Defaults to 290 - (slightly under Lambda's 300s max). - """ - self.field_mapper = field_mapper - self.timeout = timeout if timeout is not None else self.DEFAULT_TIMEOUT - - def __call__(self, event: Union[Dict[str, Any], EvaluatorInput], context: Any = None) -> Dict[str, Any]: - """Handle a Lambda invocation. - - Args: - event: Either a raw Lambda event dict or an EvaluatorInput instance - from bedrock_agentcore.evaluation.custom_code_based_evaluators.models. - context: Lambda context object (unused). - - Returns: - Success: {"value": float, "label": str, "explanation": str} - Error: {"errorCode": str, "errorMessage": str} - """ - try: - if isinstance(event, EvaluatorInput): - parsed = ParsedEvaluationEvent( - evaluation_level=event.evaluation_level, - session_spans=event.session_spans, - target_trace_id=event.target_trace_id, - target_span_id=event.target_span_id, - reference_inputs=getattr(event, "reference_inputs", []) or [], - ) - else: - parsed = ParsedEvaluationEvent.from_lambda_event(event) - except (KeyError, IndexError, TypeError) as e: - logger.error("Failed to parse evaluation event: %s", e) - return _error_response("INVALID_EVENT", f"Failed to parse evaluation event: {e}") - - try: - fields = self._extract_fields(parsed) - except ValueError as e: - logger.error("Missing required fields: %s", e) - return _error_response("MISSING_REQUIRED_FIELD", str(e)) - - try: - result = self._execute_with_timeout(fields) - except _ExecutionTimeout: - return _error_response( - "METRIC_TIMEOUT", - f"{type(self).__name__} exceeded {self.timeout}s timeout.", - ) - except Exception as e: - logger.error("Execution failed: %s", e, exc_info=True) - return _error_response("METRIC_ERROR", f"{type(self).__name__} failed: {e}") - - return result - - def _extract_fields(self, parsed: ParsedEvaluationEvent) -> Dict[str, Any]: - """Extract fields from event, using field_mapper if provided.""" - if self.field_mapper is not None: - raw_event = { - "evaluationLevel": parsed.evaluation_level, - "evaluationInput": {"sessionSpans": parsed.session_spans}, - "evaluationTarget": { - "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], - "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], - }, - "evaluationReferenceInputs": parsed.reference_inputs, - } - return self.field_mapper(raw_event) - - fields = extract_fields_from_spans(parsed) - self.validate_fields(fields) - return fields - - def validate_fields(self, fields: Dict[str, Any]) -> None: - """Validate that required fields are present. - - Override in subclasses to enforce field requirements. - Default implementation does nothing. - """ - - @abc.abstractmethod - def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: - """Run the evaluation and return the response dict. - - Args: - fields: Extracted field dict with keys like "input", "actual_output", etc. - - Returns: - {"value": float, "label": str, "explanation": str} - """ - - def _execute_with_timeout(self, fields: Dict[str, Any]) -> Dict[str, Any]: - """Run execute() with a thread-based timeout.""" - if self.timeout <= 0: - return self.execute(fields) - - result_holder: list = [] - exception_holder: list = [] - - def target(): - try: - result_holder.append(self.execute(fields)) - except Exception as e: - exception_holder.append(e) - - thread = threading.Thread(target=target, daemon=True) - thread.start() - thread.join(timeout=self.timeout) - - if thread.is_alive(): - raise _ExecutionTimeout() - - if exception_holder: - raise exception_holder[0] - - return result_holder[0] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py deleted file mode 100644 index adb6ba44..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""DeepEval integration for AgentCore Evaluation.""" - -from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler - -__all__ = ["DeepEvalAdapter", "DeepEvalHandler"] diff --git a/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py deleted file mode 100644 index e8748782..00000000 --- a/src/bedrock_agentcore/evaluation/integrations/deepeval/adapter.py +++ /dev/null @@ -1,189 +0,0 @@ -"""DeepEval adapter for AgentCore evaluation integrations.""" - -import logging -from typing import Any, Callable, Dict, List, Optional - -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase, SingleTurnParams - -from bedrock_agentcore.evaluation.integrations.base import ( - BaseAdapter, - ParsedEvaluationEvent, - extract_fields_from_spans, -) - -logger = logging.getLogger(__name__) - -_PARAM_TO_FIELD: Dict[SingleTurnParams, str] = { - SingleTurnParams.INPUT: "input", - SingleTurnParams.ACTUAL_OUTPUT: "actual_output", - SingleTurnParams.EXPECTED_OUTPUT: "expected_output", - SingleTurnParams.CONTEXT: "context", - SingleTurnParams.RETRIEVAL_CONTEXT: "retrieval_context", -} - -_METRIC_REQUIRED_PARAMS: Dict[str, List[str]] = { - "AnswerRelevancyMetric": ["input", "actual_output"], - "FaithfulnessMetric": ["input", "actual_output", "retrieval_context"], - "ContextualRelevancyMetric": ["input", "actual_output", "retrieval_context"], - "ContextualPrecisionMetric": ["input", "actual_output", "expected_output", "retrieval_context"], - "ContextualRecallMetric": ["input", "actual_output", "expected_output", "retrieval_context"], - "HallucinationMetric": ["input", "actual_output", "context"], - "BiasMetric": ["input", "actual_output"], - "ToxicityMetric": ["input", "actual_output"], - "GEval": ["input", "actual_output"], - "SummarizationMetric": ["input", "actual_output"], -} - - -def _get_required_params(metric: BaseMetric) -> List[str]: - """Determine which LLMTestCase fields a metric requires. - - Fallback chain: - 1. metric._required_params (DeepEval internal attribute) - 2. Static registry _METRIC_REQUIRED_PARAMS keyed by class name - 3. metric.evaluation_params (GEval special case) - 4. Default: ["input", "actual_output"] - """ - if hasattr(metric, "_required_params") and metric._required_params: - params = metric._required_params - if all(p in _PARAM_TO_FIELD for p in params): - return [_PARAM_TO_FIELD[p] for p in params] - - class_name = type(metric).__name__ - if class_name in _METRIC_REQUIRED_PARAMS: - return _METRIC_REQUIRED_PARAMS[class_name] - - if hasattr(metric, "evaluation_params") and metric.evaluation_params: - params = metric.evaluation_params - return [_PARAM_TO_FIELD.get(p, str(p).lower()) for p in params] - - return ["input", "actual_output"] - - -class DeepEvalAdapter(BaseAdapter): - """Adapter that runs a DeepEval metric against AgentCore evaluation events. - - Example:: - - from deepeval.metrics import AnswerRelevancyMetric - - metric = AnswerRelevancyMetric(threshold=0.7) - handler = DeepEvalAdapter(metric=metric) - - # Use as Lambda handler - def lambda_handler(event, context): - return handler(event, context) - """ - - def __init__( - self, - metric: BaseMetric, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, - model: Optional[Any] = None, - timeout: Optional[int] = None, - ): - """Initialize the adapter. - - Args: - metric: A DeepEval BaseMetric instance (e.g. AnswerRelevancyMetric). - field_mapper: Optional callable that receives the raw Lambda event and - returns a dict of LLMTestCase field values. Bypasses default span - extraction when provided. - model: Optional model override for the metric's LLM. Can be a string - model ID (e.g. "bedrock/anthropic.claude-3") or a DeepEvalBaseLLM - subclass instance. - timeout: Maximum seconds to allow for metric.measure(). Defaults to 290 - (slightly under Lambda's 300s max). - """ - super().__init__(field_mapper=field_mapper, timeout=timeout) - self.metric = metric - if model is not None: - self.metric.model = model - - def validate_fields(self, fields: Dict[str, Any]) -> None: - """Validate that fields required by the metric are present.""" - required = _get_required_params(self.metric) - missing = [f for f in required if f not in fields or not fields[f]] - if missing: - metric_name = type(self.metric).__name__ - raise ValueError( - f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " - f"Provide a field_mapper or ensure spans contain the necessary data." - ) - - def execute(self, fields: Dict[str, Any]) -> Dict[str, Any]: - """Run the DeepEval metric and return formatted results.""" - test_case = LLMTestCase( - input=fields.get("input", ""), - actual_output=fields.get("actual_output", ""), - expected_output=fields.get("expected_output"), - context=fields.get("context"), - retrieval_context=fields.get("retrieval_context"), - ) - - self.metric.measure(test_case) - - score = self.metric.score - reason = getattr(self.metric, "reason", None) or "" - threshold = getattr(self.metric, "threshold", 0.5) - success = getattr(self.metric, "success", score is not None and score >= threshold) - label = "Pass" if success else "Fail" - - return {"value": score, "label": label, "explanation": reason} - - -def build_test_case( - parsed: ParsedEvaluationEvent, - metric: BaseMetric, - field_mapper: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, -) -> LLMTestCase: - """Build a DeepEval LLMTestCase from a parsed evaluation event. - - Args: - parsed: The parsed Lambda event. - metric: The DeepEval metric (used to determine required fields). - field_mapper: Optional callable that receives the raw Lambda event fields - and returns a dict of LLMTestCase field values. Bypasses default - span extraction when provided. - - Returns: - An LLMTestCase ready for metric.measure(). - - Raises: - ValueError: If required fields for the metric cannot be populated. - """ - if field_mapper is not None: - raw_event = { - "evaluationLevel": parsed.evaluation_level, - "evaluationInput": {"sessionSpans": parsed.session_spans}, - "evaluationTarget": { - "traceIds": [parsed.target_trace_id] if parsed.target_trace_id else [], - "spanIds": [parsed.target_span_id] if parsed.target_span_id else [], - }, - "evaluationReferenceInputs": parsed.reference_inputs, - } - fields = field_mapper(raw_event) - else: - fields = extract_fields_from_spans(parsed) - - required = _get_required_params(metric) - missing = [f for f in required if f not in fields or not fields[f]] - if missing: - metric_name = type(metric).__name__ - raise ValueError( - f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " - f"Provide a field_mapper or ensure spans contain the necessary data." - ) - - return LLMTestCase( - input=fields.get("input", ""), - actual_output=fields.get("actual_output", ""), - expected_output=fields.get("expected_output"), - context=fields.get("context"), - retrieval_context=fields.get("retrieval_context"), - ) - - -# Backward-compatible alias -DeepEvalHandler = DeepEvalAdapter diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py similarity index 100% rename from tests/bedrock_agentcore/evaluation/integrations/autoevals/__init__.py rename to tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/__init__.py diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py similarity index 100% rename from tests/bedrock_agentcore/evaluation/integrations/deepeval/__init__.py rename to tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/__init__.py diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py new file mode 100644 index 00000000..2f640817 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/test_adapter.py @@ -0,0 +1,201 @@ +"""Tests for AutoevalsAdapter.""" + +from unittest.mock import MagicMock + +import pytest + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals.adapter import AutoevalsAdapter + + +def _make_evaluator_input(spans=None): + """Build an EvaluatorInput with agent-level spans.""" + if spans is None: + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ], + } + ] + return EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="t1", + ) + + +def _mock_scorer(score=0.9, rationale="Good answer"): + """Create a mock Autoevals scorer.""" + scorer = MagicMock() + type(scorer).__name__ = "MockScorer" + + result = MagicMock() + result.score = score + result.metadata = {"rationale": rationale} + + scorer.eval = MagicMock(return_value=result) + return scorer + + +class TestAutoevalsAdapterSuccess: + def test_returns_pass_when_score_above_threshold(self): + scorer = _mock_scorer(score=0.8) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value == 0.8 + assert result.label == "Pass" + assert result.explanation == "Good answer" + + def test_returns_fail_when_score_below_threshold(self): + scorer = _mock_scorer(score=0.3) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.3 + assert result.label == "Fail" + + def test_custom_threshold(self): + scorer = _mock_scorer(score=0.6) + adapter = AutoevalsAdapter(scorer=scorer, threshold=0.7) + + result = adapter(_make_evaluator_input()) + + assert result.label == "Fail" + + def test_custom_threshold_pass(self): + scorer = _mock_scorer(score=0.8) + adapter = AutoevalsAdapter(scorer=scorer, threshold=0.7) + + result = adapter(_make_evaluator_input()) + + assert result.label == "Pass" + + def test_default_threshold_is_half(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + assert adapter.threshold == 0.5 + + def test_scorer_eval_called_with_input_and_output(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + adapter(_make_evaluator_input()) + + scorer.eval.assert_called_once() + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["input"] == "What is AI?" + assert call_kwargs["output"] == "AI is artificial intelligence." + + def test_custom_field_mapper(self): + scorer = _mock_scorer() + adapter = AutoevalsAdapter( + scorer=scorer, + field_mapper=lambda ev: { + "input": "custom input", + "actual_output": "custom output", + }, + ) + + result = adapter(_make_evaluator_input()) + + call_kwargs = scorer.eval.call_args[1] + assert call_kwargs["input"] == "custom input" + assert call_kwargs["output"] == "custom output" + + +class TestAutoevalsAdapterErrors: + def test_no_agent_spans_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "chat"}, + "span_events": [], + } + ] + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input(spans=spans)) + + assert result.errorCode == "FIELD_EXTRACTION_ERROR" + + def test_missing_input_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "output": {"messages": [{"role": "assistant", "content": "answer"}]}, + } + } + ], + } + ] + scorer = _mock_scorer() + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input(spans=spans)) + + assert result.errorCode == "MISSING_REQUIRED_FIELD" + assert "input" in result.errorMessage + + def test_scorer_exception_returns_error(self): + scorer = _mock_scorer() + scorer.eval = MagicMock(side_effect=RuntimeError("API error")) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input()) + + assert result.errorCode == "METRIC_ERROR" + assert "API error" in result.errorMessage + + def test_never_raises(self): + scorer = _mock_scorer() + scorer.eval = MagicMock(side_effect=Exception("unexpected")) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.errorCode is not None + + +class TestAutoevalsAdapterEdgeCases: + def test_score_none_returns_fail(self): + scorer = _mock_scorer(score=None) + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input()) + + assert result.label == "Fail" + + def test_no_metadata_returns_empty_explanation(self): + scorer = MagicMock() + type(scorer).__name__ = "MockScorer" + result_obj = MagicMock(spec=[]) + result_obj.score = 0.9 + scorer.eval = MagicMock(return_value=result_obj) + + adapter = AutoevalsAdapter(scorer=scorer) + + result = adapter(_make_evaluator_input()) + + assert result.explanation == "" diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py new file mode 100644 index 00000000..3c8a3d39 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py @@ -0,0 +1,218 @@ +"""Tests for DeepEvalAdapter.""" + +from unittest.mock import MagicMock + +import pytest + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval.adapter import DeepEvalAdapter + + +def _make_evaluator_input(spans=None): + """Build an EvaluatorInput with agent-level spans.""" + if spans is None: + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ], + } + ] + return EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="t1", + ) + + +def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"): + """Create a mock metric that returns a fixed score on measure().""" + metric = MagicMock() + type(metric).__name__ = name + metric.threshold = threshold + metric.score = score + metric.reason = reason + del metric.success + + def measure_side_effect(test_case): + metric.score = score + metric.reason = reason + + metric.measure = MagicMock(side_effect=measure_side_effect) + return metric + + +class TestDeepEvalAdapterSuccess: + def test_returns_pass_when_score_above_threshold(self): + metric = _mock_metric(score=0.9, threshold=0.7) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value == 0.9 + assert result.label == "Pass" + assert result.explanation == "Looks good" + + def test_returns_fail_when_score_below_threshold(self): + metric = _mock_metric(score=0.3, threshold=0.7) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.3 + assert result.label == "Fail" + + def test_returns_pass_at_exact_threshold(self): + metric = _mock_metric(score=0.7, threshold=0.7) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.label == "Pass" + + def test_metric_measure_called_with_test_case(self): + metric = _mock_metric() + adapter = DeepEvalAdapter(metric=metric) + + adapter(_make_evaluator_input()) + + metric.measure.assert_called_once() + test_case = metric.measure.call_args[0][0] + assert test_case.input == "What is AI?" + assert test_case.actual_output == "AI is artificial intelligence." + + def test_custom_field_mapper(self): + metric = _mock_metric() + adapter = DeepEvalAdapter( + metric=metric, + field_mapper=lambda ev: { + "input": "mapped input", + "actual_output": "mapped output", + }, + ) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.85 + test_case = metric.measure.call_args[0][0] + assert test_case.input == "mapped input" + assert test_case.actual_output == "mapped output" + + def test_model_override_sets_metric_model(self): + metric = _mock_metric() + DeepEvalAdapter(metric=metric, model="bedrock/anthropic.claude-3") + + assert metric.model == "bedrock/anthropic.claude-3" + + def test_label_uses_metric_success_true(self): + metric = _mock_metric(score=0.3, threshold=0.7) + metric.success = True + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.3 + assert result.label == "Pass" + + def test_label_uses_metric_success_false(self): + metric = _mock_metric(score=0.9, threshold=0.7) + metric.success = False + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.9 + assert result.label == "Fail" + + +class TestDeepEvalAdapterErrors: + def test_no_agent_spans_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "chat"}, + "span_events": [], + } + ] + metric = _mock_metric() + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input(spans=spans)) + + assert isinstance(result, EvaluatorOutput) + assert result.errorCode == "FIELD_EXTRACTION_ERROR" + assert result.label == "Error" + + def test_metric_measure_exception_returns_error(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout")) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.errorCode == "METRIC_ERROR" + assert "LLM timeout" in result.errorMessage + + def test_missing_params_error_caught(self): + metric = _mock_metric() + + class MissingTestCaseParamsError(Exception): + pass + + metric.measure = MagicMock( + side_effect=MissingTestCaseParamsError("retrieval_context is required") + ) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.errorCode == "MISSING_REQUIRED_FIELD" + assert "retrieval_context" in result.errorMessage + + def test_never_raises(self): + metric = _mock_metric() + metric.measure = MagicMock(side_effect=Exception("unexpected")) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.errorCode is not None + + +class TestDeepEvalAdapterEdgeCases: + def test_metric_with_no_reason(self): + metric = _mock_metric(score=0.8, reason=None) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.explanation == "" + + def test_metric_score_zero(self): + metric = _mock_metric(score=0.0, threshold=0.5) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.value == 0.0 + assert result.label == "Fail" + + def test_default_threshold_when_missing(self): + metric = _mock_metric(score=0.6) + del metric.threshold + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input()) + + assert result.label == "Pass" diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py new file mode 100644 index 00000000..de2a1bb5 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py @@ -0,0 +1,194 @@ +"""Tests for span parsers.""" + +import pytest + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import ( + SpanParseResult, + parse_spans, +) + + +def _make_agent_span(input_messages=None, output_messages=None, span_id="span1"): + """Build an agent-level span with span_events.""" + span_events = [] + body = {} + if input_messages is not None: + body["input"] = {"messages": input_messages} + if output_messages is not None: + body["output"] = {"messages": output_messages} + if body: + span_events.append({"body": body}) + + return { + "traceId": "abc123", + "spanId": span_id, + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": span_events, + } + + +class TestParseSpansSuccess: + def test_extracts_input_and_output(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "What is AI?"}], + output_messages=[{"role": "assistant", "content": "Artificial intelligence."}], + ) + ] + + result = parse_spans(spans) + + assert result.input == "What is AI?" + assert result.actual_output == "Artificial intelligence." + + def test_extracts_tool_messages_as_retrieval_context(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "query"}], + output_messages=[ + {"role": "tool", "content": "doc chunk 1"}, + {"role": "tool", "content": "doc chunk 2"}, + {"role": "assistant", "content": "answer"}, + ], + ) + ] + + result = parse_spans(spans) + + assert result.retrieval_context == ["doc chunk 1", "doc chunk 2"] + assert result.context == ["doc chunk 1", "doc chunk 2"] + assert result.actual_output == "answer" + + def test_uses_first_user_message_as_input(self): + spans = [ + _make_agent_span( + input_messages=[ + {"role": "user", "content": "first"}, + {"role": "user", "content": "second"}, + ], + output_messages=[{"role": "assistant", "content": "reply"}], + ) + ] + + result = parse_spans(spans) + + assert result.input == "first" + + def test_uses_last_assistant_message_as_output(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "q"}], + output_messages=[ + {"role": "assistant", "content": "first reply"}, + {"role": "assistant", "content": "final reply"}, + ], + ) + ] + + result = parse_spans(spans) + + assert result.actual_output == "final reply" + + def test_expected_output_from_reference_inputs(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": "q"}], + output_messages=[{"role": "assistant", "content": "a"}], + ) + ] + refs = [{"expectedResponse": "expected answer"}] + + result = parse_spans(spans, reference_inputs=refs) + + assert result.expected_output == "expected answer" + + def test_nested_content_dict(self): + spans = [ + _make_agent_span( + input_messages=[{"role": "user", "content": {"content": "nested"}}], + output_messages=[{"role": "assistant", "content": {"content": "nested out"}}], + ) + ] + + result = parse_spans(spans) + + assert result.input == "nested" + assert result.actual_output == "nested out" + + def test_body_as_json_string(self): + import json + + body = { + "input": {"messages": [{"role": "user", "content": "hello"}]}, + "output": {"messages": [{"role": "assistant", "content": "hi"}]}, + } + span = { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [{"body": json.dumps(body)}], + } + + result = parse_spans([span]) + + assert result.input == "hello" + assert result.actual_output == "hi" + + def test_to_dict_omits_none(self): + result = SpanParseResult(input="q", actual_output="a") + d = result.to_dict() + + assert d == {"input": "q", "actual_output": "a"} + assert "retrieval_context" not in d + + +class TestParseSpansErrors: + def test_no_agent_spans_raises(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "other_op"}, + "span_events": [], + } + ] + + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans(spans) + + def test_empty_spans_raises(self): + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans([]) + + def test_agent_span_without_events_raises(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [], + } + ] + + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans(spans) + + def test_non_agent_spans_ignored(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "chat"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "q"}]}, + "output": {"messages": [{"role": "assistant", "content": "a"}]}, + } + } + ], + } + ] + + with pytest.raises(ValueError, match="Could not extract evaluation fields"): + parse_spans(spans) diff --git a/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py b/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py deleted file mode 100644 index 17f674bd..00000000 --- a/tests/bedrock_agentcore/evaluation/integrations/autoevals/test_adapter.py +++ /dev/null @@ -1,217 +0,0 @@ -"""Tests for AutoevalsAdapter.""" - -import json -import time -from unittest.mock import MagicMock - -import pytest - -from bedrock_agentcore.evaluation.integrations.autoevals.adapter import AutoevalsAdapter - - -def _make_event( - level="TRACE", - trace_ids=None, - spans=None, - reference_inputs=None, -): - """Build a raw Lambda event dict for testing.""" - if spans is None: - log_records = [ - { - "body": { - "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, - "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, - } - } - ] - spans = [ - { - "traceId": "abc123", - "spanId": "span1", - "attributes": {"_eval_log_records": json.dumps(log_records)}, - } - ] - - event = { - "schemaVersion": "1.0", - "evaluationLevel": level, - "evaluationInput": {"sessionSpans": spans}, - "evaluationTarget": {}, - } - if trace_ids is not None: - event["evaluationTarget"]["traceIds"] = trace_ids - if reference_inputs is not None: - event["evaluationReferenceInputs"] = reference_inputs - return event - - -def _mock_scorer(score=0.9, rationale="Good answer"): - """Create a mock Autoevals scorer.""" - scorer = MagicMock() - type(scorer).__name__ = "MockScorer" - - result = MagicMock() - result.score = score - result.metadata = {"rationale": rationale} - - scorer.eval = MagicMock(return_value=result) - return scorer - - -class TestAutoevalsAdapterSuccess: - def test_returns_pass_when_score_above_half(self): - scorer = _mock_scorer(score=0.8) - adapter = AutoevalsAdapter(scorer=scorer) - - result = adapter(_make_event()) - - assert result["value"] == 0.8 - assert result["label"] == "Pass" - assert result["explanation"] == "Good answer" - - def test_returns_fail_when_score_below_half(self): - scorer = _mock_scorer(score=0.3) - adapter = AutoevalsAdapter(scorer=scorer) - - result = adapter(_make_event()) - - assert result["value"] == 0.3 - assert result["label"] == "Fail" - - def test_scorer_eval_called_with_input_and_output(self): - scorer = _mock_scorer() - adapter = AutoevalsAdapter(scorer=scorer) - - adapter(_make_event()) - - scorer.eval.assert_called_once() - call_kwargs = scorer.eval.call_args[1] - assert call_kwargs["input"] == "What is AI?" - assert call_kwargs["output"] == "AI is artificial intelligence." - - def test_expected_output_passed_as_expected(self): - scorer = _mock_scorer() - adapter = AutoevalsAdapter(scorer=scorer) - - refs = [{"expectedResponse": "AI stands for artificial intelligence."}] - result = adapter(_make_event(reference_inputs=refs)) - - call_kwargs = scorer.eval.call_args[1] - assert call_kwargs["expected"] == "AI stands for artificial intelligence." - - def test_no_expected_output_omits_expected_kwarg(self): - scorer = _mock_scorer() - adapter = AutoevalsAdapter(scorer=scorer) - - adapter(_make_event()) - - call_kwargs = scorer.eval.call_args[1] - assert "expected" not in call_kwargs - - def test_custom_field_mapper(self): - scorer = _mock_scorer() - adapter = AutoevalsAdapter( - scorer=scorer, - field_mapper=lambda event: { - "input": "custom input", - "actual_output": "custom output", - }, - ) - - result = adapter(_make_event()) - - call_kwargs = scorer.eval.call_args[1] - assert call_kwargs["input"] == "custom input" - assert call_kwargs["output"] == "custom output" - - -class TestAutoevalsAdapterErrors: - def test_invalid_event_returns_error(self): - scorer = _mock_scorer() - adapter = AutoevalsAdapter(scorer=scorer) - - result = adapter({}) - - assert result["errorCode"] == "INVALID_EVENT" - - def test_missing_input_returns_error(self): - log_records = [ - { - "body": { - "output": {"messages": [{"role": "assistant", "content": "answer"}]}, - } - } - ] - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"_eval_log_records": json.dumps(log_records)}, - } - ] - scorer = _mock_scorer() - adapter = AutoevalsAdapter(scorer=scorer) - - result = adapter(_make_event(spans=spans)) - - assert result["errorCode"] == "MISSING_REQUIRED_FIELD" - assert "input" in result["errorMessage"] - - def test_scorer_exception_returns_error(self): - scorer = _mock_scorer() - scorer.eval = MagicMock(side_effect=RuntimeError("API error")) - adapter = AutoevalsAdapter(scorer=scorer) - - result = adapter(_make_event()) - - assert result["errorCode"] == "METRIC_ERROR" - assert "API error" in result["errorMessage"] - - def test_never_raises_on_bad_input(self): - scorer = _mock_scorer() - adapter = AutoevalsAdapter(scorer=scorer) - - for bad_input in [None, [], "string", 42]: - result = adapter(bad_input) - assert "errorCode" in result - - -class TestAutoevalsAdapterTimeout: - def test_timeout_returns_error(self): - scorer = _mock_scorer() - scorer.eval = MagicMock(side_effect=lambda **kw: time.sleep(5)) - adapter = AutoevalsAdapter(scorer=scorer, timeout=1) - - result = adapter(_make_event()) - - assert result["errorCode"] == "METRIC_TIMEOUT" - - def test_default_timeout_is_290(self): - scorer = _mock_scorer() - adapter = AutoevalsAdapter(scorer=scorer) - - assert adapter.timeout == 290 - - -class TestAutoevalsAdapterEdgeCases: - def test_score_none_returns_fail(self): - scorer = _mock_scorer(score=None) - adapter = AutoevalsAdapter(scorer=scorer) - - result = adapter(_make_event()) - - assert result["label"] == "Fail" - - def test_no_metadata_returns_empty_explanation(self): - scorer = MagicMock() - type(scorer).__name__ = "MockScorer" - result_obj = MagicMock(spec=[]) - result_obj.score = 0.9 - scorer.eval = MagicMock(return_value=result_obj) - - adapter = AutoevalsAdapter(scorer=scorer) - - result = adapter(_make_event()) - - assert result["explanation"] == "" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py deleted file mode 100644 index 67bfda3d..00000000 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_handler.py +++ /dev/null @@ -1,427 +0,0 @@ -"""Tests for DeepEvalHandler and DeepEvalAdapter.""" - -import json -import time -from unittest.mock import MagicMock, patch - -import pytest - -from bedrock_agentcore.evaluation.integrations.deepeval.adapter import DeepEvalAdapter, DeepEvalHandler -from bedrock_agentcore.evaluation.integrations.base import BaseAdapter -from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput - - -def _make_event( - level="TRACE", - trace_ids=None, - spans=None, - reference_inputs=None, -): - """Build a raw Lambda event dict for testing.""" - if spans is None: - log_records = [ - { - "body": { - "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, - "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, - } - } - ] - spans = [ - { - "traceId": "abc123", - "spanId": "span1", - "attributes": {"_eval_log_records": json.dumps(log_records)}, - } - ] - - event = { - "schemaVersion": "1.0", - "evaluationLevel": level, - "evaluationInput": {"sessionSpans": spans}, - "evaluationTarget": {}, - } - if trace_ids is not None: - event["evaluationTarget"]["traceIds"] = trace_ids - if reference_inputs is not None: - event["evaluationReferenceInputs"] = reference_inputs - return event - - -def _mock_metric(score=0.85, reason="Looks good", threshold=0.7, name="MockMetric"): - """Create a mock metric that returns a fixed score on measure().""" - metric = MagicMock() - type(metric).__name__ = name - metric.threshold = threshold - metric.score = score - metric.reason = reason - metric._required_params = None - del metric._required_params - del metric.evaluation_params - del metric.success - - def measure_side_effect(test_case): - metric.score = score - metric.reason = reason - - metric.measure = MagicMock(side_effect=measure_side_effect) - return metric - - -class TestDeepEvalHandlerSuccess: - def test_returns_pass_when_score_above_threshold(self): - metric = _mock_metric(score=0.9, threshold=0.7) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["value"] == 0.9 - assert result["label"] == "Pass" - assert result["explanation"] == "Looks good" - - def test_returns_fail_when_score_below_threshold(self): - metric = _mock_metric(score=0.3, threshold=0.7) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["value"] == 0.3 - assert result["label"] == "Fail" - - def test_returns_pass_at_exact_threshold(self): - metric = _mock_metric(score=0.7, threshold=0.7) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["label"] == "Pass" - - def test_metric_measure_called_with_test_case(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - - handler(_make_event()) - - metric.measure.assert_called_once() - test_case = metric.measure.call_args[0][0] - assert test_case.input == "What is AI?" - assert test_case.actual_output == "AI is artificial intelligence." - - def test_context_parameter_ignored(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - mock_context = {"function_name": "my-lambda"} - - result = handler(_make_event(), mock_context) - - assert result["value"] == 0.85 - - def test_custom_field_mapper(self): - metric = _mock_metric() - handler = DeepEvalHandler( - metric=metric, - field_mapper=lambda event: { - "input": "mapped input", - "actual_output": "mapped output", - }, - ) - - result = handler(_make_event()) - - assert result["value"] == 0.85 - test_case = metric.measure.call_args[0][0] - assert test_case.input == "mapped input" - assert test_case.actual_output == "mapped output" - - -class TestDeepEvalHandlerErrors: - def test_invalid_event_returns_error(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - - result = handler({}) - - assert result["errorCode"] == "INVALID_EVENT" - assert "errorMessage" in result - assert "value" not in result - - def test_missing_evaluation_input_returns_error(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - - event = {"evaluationLevel": "TRACE", "evaluationTarget": {}} - result = handler(event) - - assert result["errorCode"] == "INVALID_EVENT" - - def test_missing_required_field_returns_error(self): - log_records = [ - { - "body": { - "input": {"messages": [{"role": "user", "content": "q"}]}, - "output": {"messages": [{"role": "assistant", "content": "a"}]}, - } - } - ] - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"_eval_log_records": json.dumps(log_records)}, - } - ] - metric = _mock_metric(name="FaithfulnessMetric") - handler = DeepEvalHandler(metric=metric) - - event = _make_event(spans=spans) - result = handler(event) - - assert result["errorCode"] == "MISSING_REQUIRED_FIELD" - assert "retrieval_context" in result["errorMessage"] - - def test_metric_measure_exception_returns_error(self): - metric = _mock_metric() - metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout")) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["errorCode"] == "METRIC_ERROR" - assert "LLM timeout" in result["errorMessage"] - - def test_never_raises_on_any_input(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - - for bad_input in [None, [], "string", 42, {"random": "keys"}]: - result = handler(bad_input) - assert "errorCode" in result or "value" in result - - -class TestDeepEvalHandlerEdgeCases: - def test_metric_with_no_reason(self): - metric = _mock_metric(score=0.8, reason=None) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["explanation"] == "" - - def test_metric_score_zero(self): - metric = _mock_metric(score=0.0, threshold=0.5) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["value"] == 0.0 - assert result["label"] == "Fail" - - def test_metric_score_one(self): - metric = _mock_metric(score=1.0, threshold=0.5) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["value"] == 1.0 - assert result["label"] == "Pass" - - def test_default_threshold_when_missing(self): - metric = _mock_metric(score=0.6) - del metric.threshold - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["label"] == "Pass" - - def test_label_uses_metric_success_true(self): - metric = _mock_metric(score=0.3, threshold=0.7) - metric.success = True - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["value"] == 0.3 - assert result["label"] == "Pass" - - def test_label_uses_metric_success_false(self): - metric = _mock_metric(score=0.9, threshold=0.7) - metric.success = False - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["value"] == 0.9 - assert result["label"] == "Fail" - - def test_label_falls_back_to_threshold_when_no_success(self): - metric = _mock_metric(score=0.8, threshold=0.7) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["label"] == "Pass" - - def test_model_override_sets_metric_model(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric, model="bedrock/anthropic.claude-3") - - assert metric.model == "bedrock/anthropic.claude-3" - - def test_no_model_override_leaves_metric_unchanged(self): - metric = _mock_metric() - metric.model = "original-model" - handler = DeepEvalHandler(metric=metric) - - handler(_make_event()) - - assert metric.model == "original-model" - - -class TestDeepEvalHandlerTimeout: - def test_timeout_returns_error(self): - metric = _mock_metric() - metric.measure = MagicMock(side_effect=lambda tc: time.sleep(5)) - handler = DeepEvalHandler(metric=metric, timeout=1) - - result = handler(_make_event()) - - assert result["errorCode"] == "METRIC_TIMEOUT" - assert "1s timeout" in result["errorMessage"] - - def test_no_timeout_when_measure_completes_in_time(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric, timeout=10) - - result = handler(_make_event()) - - assert result["value"] == 0.85 - assert "errorCode" not in result - - def test_default_timeout_is_290(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - - assert handler.timeout == 290 - - def test_custom_timeout_value(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric, timeout=60) - - assert handler.timeout == 60 - - def test_metric_exception_still_propagates_with_timeout(self): - metric = _mock_metric() - metric.measure = MagicMock(side_effect=RuntimeError("LLM error")) - handler = DeepEvalHandler(metric=metric, timeout=10) - - result = handler(_make_event()) - - assert result["errorCode"] == "METRIC_ERROR" - assert "LLM error" in result["errorMessage"] - - -class TestBackwardCompatibility: - def test_handler_is_alias_for_adapter(self): - assert DeepEvalHandler is DeepEvalAdapter - - def test_adapter_is_subclass_of_base(self): - assert issubclass(DeepEvalAdapter, BaseAdapter) - - def test_import_from_init(self): - from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalHandler as H - from bedrock_agentcore.evaluation.integrations.deepeval import DeepEvalAdapter as A - - assert H is A - - def test_handler_works_same_as_before(self): - metric = _mock_metric(score=0.9, threshold=0.7) - handler = DeepEvalHandler(metric=metric) - - result = handler(_make_event()) - - assert result["value"] == 0.9 - assert result["label"] == "Pass" - - -class TestEvaluatorInputAcceptance: - def _make_evaluator_input(self): - log_records = [ - { - "body": { - "input": {"messages": [{"role": "user", "content": "Hello"}]}, - "output": {"messages": [{"role": "assistant", "content": "Hi there"}]}, - } - } - ] - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"_eval_log_records": json.dumps(log_records)}, - } - ] - return EvaluatorInput( - evaluation_level="TRACE", - session_spans=spans, - target_trace_id="t1", - target_span_id=None, - ) - - def test_accepts_evaluator_input(self): - metric = _mock_metric(score=0.95) - handler = DeepEvalHandler(metric=metric) - - result = handler(self._make_evaluator_input()) - - assert result["value"] == 0.95 - assert result["label"] == "Pass" - - def test_evaluator_input_extracts_fields_correctly(self): - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - - handler(self._make_evaluator_input()) - - test_case = metric.measure.call_args[0][0] - assert test_case.input == "Hello" - assert test_case.actual_output == "Hi there" - - def test_evaluator_input_with_trace_id_filtering(self): - log_records = [ - { - "traceId": "target", - "body": { - "input": {"messages": [{"role": "user", "content": "relevant"}]}, - "output": {"messages": [{"role": "assistant", "content": "yes"}]}, - }, - }, - { - "traceId": "other", - "body": { - "input": {"messages": [{"role": "user", "content": "irrelevant"}]}, - "output": {"messages": [{"role": "assistant", "content": "no"}]}, - }, - }, - ] - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"_eval_log_records": json.dumps(log_records)}, - } - ] - evaluator_input = EvaluatorInput( - evaluation_level="TRACE", - session_spans=spans, - target_trace_id="target", - ) - - metric = _mock_metric() - handler = DeepEvalHandler(metric=metric) - - handler(evaluator_input) - - test_case = metric.measure.call_args[0][0] - assert test_case.input == "relevant" - assert test_case.actual_output == "yes" diff --git a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py b/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py deleted file mode 100644 index 2d6fbaea..00000000 --- a/tests/bedrock_agentcore/evaluation/integrations/deepeval/test_input_mapper.py +++ /dev/null @@ -1,581 +0,0 @@ -"""Tests for deepeval input mapping and test case building.""" - -import json -from unittest.mock import MagicMock - -import pytest -from deepeval.test_case import SingleTurnParams - -from bedrock_agentcore.evaluation.integrations.base import ( - ParsedEvaluationEvent, - extract_fields_from_spans as _extract_fields_from_spans, -) -from bedrock_agentcore.evaluation.integrations.deepeval.adapter import ( - _get_required_params, - build_test_case, -) - - -def _make_log_record( - input_messages=None, - output_messages=None, - trace_id=None, -): - """Build a single log record dict.""" - record = {"body": {}} - if input_messages is not None: - record["body"]["input"] = {"messages": input_messages} - if output_messages is not None: - record["body"]["output"] = {"messages": output_messages} - if trace_id is not None: - record["traceId"] = trace_id - return record - - -def _make_span_with_log_records(log_records, span_id="span1", as_json_string=True): - """Build a span dict with _eval_log_records in attributes.""" - value = json.dumps(log_records) if as_json_string else log_records - return { - "traceId": "abc123", - "spanId": span_id, - "attributes": {"_eval_log_records": value}, - } - - -def _make_event( - level="TRACE", - trace_ids=None, - span_ids=None, - spans=None, - reference_inputs=None, -): - """Build a raw Lambda event dict for testing.""" - if spans is None: - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "What is the capital of France?"}], - output_messages=[{"role": "assistant", "content": "The capital of France is Paris."}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - - event = { - "schemaVersion": "1.0", - "evaluationLevel": level, - "evaluationInput": {"sessionSpans": spans}, - "evaluationTarget": {}, - } - if trace_ids is not None: - event["evaluationTarget"]["traceIds"] = trace_ids - if span_ids is not None: - event["evaluationTarget"]["spanIds"] = span_ids - if reference_inputs is not None: - event["evaluationReferenceInputs"] = reference_inputs - return event - - -def _mock_metric(name="MockMetric", required_params=None, evaluation_params=None, threshold=0.5): - """Create a mock DeepEval metric.""" - metric = MagicMock() - type(metric).__name__ = name - metric.threshold = threshold - - if required_params is not None: - metric._required_params = required_params - else: - del metric._required_params - - if evaluation_params is not None: - metric.evaluation_params = evaluation_params - else: - del metric.evaluation_params - - return metric - - -class TestParsedEvaluationEvent: - def test_from_lambda_event_trace_level(self): - event = _make_event(level="TRACE", trace_ids=["trace-1"]) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - - assert parsed.evaluation_level == "TRACE" - assert parsed.target_trace_id == "trace-1" - assert parsed.target_span_id is None - assert len(parsed.session_spans) == 1 - - def test_from_lambda_event_tool_call_level(self): - event = _make_event(level="TOOL_CALL", span_ids=["span-42"]) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - - assert parsed.evaluation_level == "TOOL_CALL" - assert parsed.target_span_id == "span-42" - assert parsed.target_trace_id is None - - def test_from_lambda_event_session_level(self): - event = _make_event(level="SESSION") - parsed = ParsedEvaluationEvent.from_lambda_event(event) - - assert parsed.evaluation_level == "SESSION" - assert parsed.target_trace_id is None - assert parsed.target_span_id is None - - def test_from_lambda_event_with_reference_inputs(self): - refs = [{"expectedResponse": "Paris is the capital of France."}] - event = _make_event(reference_inputs=refs) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - - assert parsed.reference_inputs == refs - - def test_from_lambda_event_missing_reference_inputs(self): - event = _make_event() - parsed = ParsedEvaluationEvent.from_lambda_event(event) - - assert parsed.reference_inputs == [] - - def test_from_lambda_event_missing_evaluation_level_raises(self): - event = _make_event() - del event["evaluationLevel"] - - with pytest.raises(KeyError): - ParsedEvaluationEvent.from_lambda_event(event) - - def test_from_lambda_event_missing_evaluation_input_raises(self): - event = _make_event() - del event["evaluationInput"] - - with pytest.raises(KeyError): - ParsedEvaluationEvent.from_lambda_event(event) - - def test_from_lambda_event_missing_target_key_defaults(self): - event = _make_event() - del event["evaluationTarget"] - parsed = ParsedEvaluationEvent.from_lambda_event(event) - - assert parsed.target_trace_id is None - assert parsed.target_span_id is None - - -class TestGetRequiredParams: - def test_uses_required_params_attribute(self): - metric = _mock_metric( - required_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT] - ) - result = _get_required_params(metric) - - assert result == ["input", "actual_output"] - - def test_falls_back_to_static_registry(self): - metric = _mock_metric(name="FaithfulnessMetric") - result = _get_required_params(metric) - - assert result == ["input", "actual_output", "retrieval_context"] - - def test_falls_back_to_evaluation_params(self): - metric = _mock_metric( - name="UnknownMetric", - evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.RETRIEVAL_CONTEXT], - ) - result = _get_required_params(metric) - - assert result == ["input", "retrieval_context"] - - def test_defaults_to_input_and_actual_output(self): - metric = _mock_metric(name="UnknownMetric") - result = _get_required_params(metric) - - assert result == ["input", "actual_output"] - - def test_unmappable_required_params_skips_to_static_registry(self): - metric = _mock_metric(name="GEval", required_params=["SomeTypingObject", "AnotherType"]) - result = _get_required_params(metric) - - assert result == ["input", "actual_output"] - - def test_unmappable_required_params_falls_to_default(self): - metric = _mock_metric(name="UnknownMetric", required_params=["SomeTypingObject"]) - result = _get_required_params(metric) - - assert result == ["input", "actual_output"] - - def test_empty_required_params_falls_through(self): - metric = _mock_metric(name="UnknownMetric", required_params=[]) - result = _get_required_params(metric) - - assert result == ["input", "actual_output"] - - -class TestExtractFieldsFromSpans: - def test_basic_extraction(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "hello"}], - output_messages=[{"role": "assistant", "content": "world"}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "hello" - assert fields["actual_output"] == "world" - - def test_tool_messages_become_retrieval_context(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "query"}], - output_messages=[ - {"role": "tool", "content": "doc chunk 1"}, - {"role": "tool", "content": "doc chunk 2"}, - {"role": "assistant", "content": "answer"}, - ], - ) - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["retrieval_context"] == ["doc chunk 1", "doc chunk 2"] - assert fields["actual_output"] == "answer" - - def test_tool_messages_also_set_context_for_hallucination_metric(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "query"}], - output_messages=[ - {"role": "tool", "content": "context chunk"}, - {"role": "assistant", "content": "answer"}, - ], - ) - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["context"] == ["context chunk"] - assert fields["context"] == fields["retrieval_context"] - - def test_message_content_as_dict_with_content_key(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": {"content": "nested content"}}], - output_messages=[{"role": "assistant", "content": {"content": "nested output"}}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "nested content" - assert fields["actual_output"] == "nested output" - - def test_message_content_as_dict_with_message_key(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "message": "msg key input"}], - output_messages=[{"role": "assistant", "message": "msg key output"}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "msg key input" - assert fields["actual_output"] == "msg key output" - - def test_message_content_as_plain_string_in_content_field(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "plain string"}], - output_messages=[{"role": "assistant", "content": "plain response"}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "plain string" - assert fields["actual_output"] == "plain response" - - def test_target_trace_id_filters_records(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "relevant"}], - output_messages=[{"role": "assistant", "content": "relevant answer"}], - trace_id="target-trace", - ), - _make_log_record( - input_messages=[{"role": "user", "content": "irrelevant"}], - output_messages=[{"role": "assistant", "content": "irrelevant answer"}], - trace_id="other-trace", - ), - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", - session_spans=spans, - target_trace_id="target-trace", - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "relevant" - assert fields["actual_output"] == "relevant answer" - - def test_no_target_trace_id_includes_all_records(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "first"}], - output_messages=[{"role": "assistant", "content": "first answer"}], - trace_id="trace-1", - ), - _make_log_record( - input_messages=[{"role": "user", "content": "second"}], - output_messages=[{"role": "assistant", "content": "second answer"}], - trace_id="trace-2", - ), - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="SESSION", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "first\nsecond" - assert fields["actual_output"] == "first answer\nsecond answer" - - def test_log_records_as_parsed_list(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "from list"}], - output_messages=[{"role": "assistant", "content": "from list answer"}], - ) - ] - spans = [_make_span_with_log_records(log_records, as_json_string=False)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "from list" - assert fields["actual_output"] == "from list answer" - - def test_invalid_json_log_records_skipped(self): - spans = [ - { - "traceId": "t1", - "spanId": "s1", - "attributes": {"_eval_log_records": "not valid json{{{"}, - } - ] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields == {} - - def test_span_without_log_records_skipped(self): - spans = [{"traceId": "t1", "spanId": "s1", "attributes": {}}] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields == {} - - def test_multiple_spans_aggregated(self): - log_records_1 = [ - _make_log_record( - input_messages=[{"role": "user", "content": "q1"}], - output_messages=[{"role": "assistant", "content": "a1"}], - ) - ] - log_records_2 = [ - _make_log_record( - input_messages=[{"role": "user", "content": "q2"}], - output_messages=[{"role": "assistant", "content": "a2"}], - ) - ] - spans = [ - _make_span_with_log_records(log_records_1, span_id="s1"), - _make_span_with_log_records(log_records_2, span_id="s2"), - ] - parsed = ParsedEvaluationEvent( - evaluation_level="SESSION", session_spans=spans - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "q1\nq2" - assert fields["actual_output"] == "a1\na2" - - def test_reference_inputs_expected_output(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "q"}], - output_messages=[{"role": "assistant", "content": "a"}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", - session_spans=spans, - reference_inputs=[{"expectedResponse": "expected answer"}], - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["expected_output"] == "expected answer" - - def test_record_without_matching_trace_id_key_included(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "no trace id record"}], - output_messages=[{"role": "assistant", "content": "response"}], - ), - ] - spans = [_make_span_with_log_records(log_records)] - parsed = ParsedEvaluationEvent( - evaluation_level="TRACE", - session_spans=spans, - target_trace_id="target-trace", - ) - - fields = _extract_fields_from_spans(parsed) - - assert fields["input"] == "no trace id record" - - -class TestBuildTestCase: - def test_basic_span_extraction(self): - event = _make_event() - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="AnswerRelevancyMetric") - - test_case = build_test_case(parsed, metric) - - assert test_case.input == "What is the capital of France?" - assert test_case.actual_output == "The capital of France is Paris." - - def test_retrieval_context_from_tool_messages(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "query"}], - output_messages=[ - {"role": "tool", "content": "doc chunk 1"}, - {"role": "tool", "content": "doc chunk 2"}, - {"role": "assistant", "content": "answer"}, - ], - ) - ] - spans = [_make_span_with_log_records(log_records)] - event = _make_event(spans=spans) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="FaithfulnessMetric") - - test_case = build_test_case(parsed, metric) - - assert test_case.input == "query" - assert test_case.actual_output == "answer" - assert test_case.retrieval_context == ["doc chunk 1", "doc chunk 2"] - - def test_expected_output_from_reference_inputs(self): - refs = [{"expectedResponse": "Paris"}] - event = _make_event(reference_inputs=refs) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="AnswerRelevancyMetric") - - test_case = build_test_case(parsed, metric) - - assert test_case.expected_output == "Paris" - - def test_missing_required_field_raises_value_error(self): - log_records = [ - _make_log_record( - input_messages=[{"role": "user", "content": "query"}], - output_messages=[{"role": "assistant", "content": "answer"}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - event = _make_event(spans=spans) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="FaithfulnessMetric") - - with pytest.raises(ValueError, match="retrieval_context"): - build_test_case(parsed, metric) - - def test_custom_field_mapper_bypasses_extraction(self): - event = _make_event() - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="AnswerRelevancyMetric") - - def custom_mapper(raw_event): - return { - "input": "custom input", - "actual_output": "custom output", - } - - test_case = build_test_case(parsed, metric, field_mapper=custom_mapper) - - assert test_case.input == "custom input" - assert test_case.actual_output == "custom output" - - def test_field_mapper_receives_reconstructed_event(self): - refs = [{"expectedResponse": "expected"}] - event = _make_event(level="TRACE", trace_ids=["t1"], reference_inputs=refs) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="AnswerRelevancyMetric") - - received_events = [] - - def capture_mapper(raw_event): - received_events.append(raw_event) - return {"input": "x", "actual_output": "y"} - - build_test_case(parsed, metric, field_mapper=capture_mapper) - - raw = received_events[0] - assert raw["evaluationLevel"] == "TRACE" - assert raw["evaluationTarget"]["traceIds"] == ["t1"] - assert raw["evaluationReferenceInputs"] == refs - - def test_multiple_user_messages_concatenated(self): - log_records = [ - _make_log_record( - input_messages=[ - {"role": "user", "content": "hello"}, - {"role": "user", "content": "world"}, - ], - output_messages=[{"role": "assistant", "content": "hi"}], - ) - ] - spans = [_make_span_with_log_records(log_records)] - event = _make_event(spans=spans) - parsed = ParsedEvaluationEvent.from_lambda_event(event) - metric = _mock_metric(name="AnswerRelevancyMetric") - - test_case = build_test_case(parsed, metric) - - assert test_case.input == "hello\nworld" diff --git a/tests_integ/evaluation/test_third_party_adapters.py b/tests_integ/evaluation/test_third_party_adapters.py new file mode 100644 index 00000000..a9f0eac6 --- /dev/null +++ b/tests_integ/evaluation/test_third_party_adapters.py @@ -0,0 +1,171 @@ +"""Integration tests for third-party evaluation adapters. + +These tests require `deepeval` and `autoevals` packages to be installed. +They verify the full adapter flow from EvaluatorInput through span parsing +to metric execution, using real library metrics (not mocks). + +SETUP: + pip install deepeval autoevals + +RUN: + pytest tests_integ/evaluation/test_third_party_adapters.py -v +""" + +import pytest + +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput + + +def _make_agent_evaluator_input( + user_prompt="What is the capital of France?", + agent_response="The capital of France is Paris.", + tool_messages=None, +): + """Build an EvaluatorInput with agent-level spans.""" + output_messages = [] + if tool_messages: + for msg in tool_messages: + output_messages.append({"role": "tool", "content": msg}) + output_messages.append({"role": "assistant", "content": agent_response}) + + spans = [ + { + "traceId": "integ-trace-1", + "spanId": "integ-span-1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": user_prompt}]}, + "output": {"messages": output_messages}, + } + } + ], + } + ] + return EvaluatorInput( + evaluation_level="TRACE", + session_spans=spans, + target_trace_id="integ-trace-1", + ) + + +class TestDeepEvalAdapterIntegration: + """Integration tests for DeepEvalAdapter with real DeepEval metrics.""" + + @pytest.fixture(autouse=True) + def check_deepeval(self): + """Skip if deepeval is not installed.""" + pytest.importorskip("deepeval") + + def test_bias_metric_passes(self): + from deepeval.metrics import BiasMetric + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter + + metric = BiasMetric(threshold=0.5) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + assert result.label in ("Pass", "Fail") + + def test_missing_retrieval_context_returns_error(self): + from deepeval.metrics import FaithfulnessMetric + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter + + metric = FaithfulnessMetric(threshold=0.7) + adapter = DeepEvalAdapter(metric=metric) + + result = adapter( + _make_agent_evaluator_input( + user_prompt="Is the sky blue?", + agent_response="Yes, the sky is blue.", + ) + ) + + assert isinstance(result, EvaluatorOutput) + assert result.errorCode == "MISSING_REQUIRED_FIELD" or result.value is not None + + def test_with_field_mapper(self): + from deepeval.metrics import BiasMetric + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.deepeval import DeepEvalAdapter + + metric = BiasMetric(threshold=0.5) + adapter = DeepEvalAdapter( + metric=metric, + field_mapper=lambda ev: { + "input": "Is Python a good language?", + "actual_output": "Python is a versatile programming language used widely.", + }, + ) + + result = adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + + +class TestAutoevalsAdapterIntegration: + """Integration tests for AutoevalsAdapter with real Autoevals scorers.""" + + @pytest.fixture(autouse=True) + def check_autoevals(self): + """Skip if autoevals is not installed.""" + pytest.importorskip("autoevals") + + def test_factuality_scorer(self): + from autoevals import Factuality + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter + + scorer = Factuality() + adapter = AutoevalsAdapter(scorer=scorer) + + evaluator_input = _make_agent_evaluator_input() + evaluator_input.session_spans[0]["span_events"][0]["body"]["output"]["messages"] = [ + {"role": "assistant", "content": "The capital of France is Paris."} + ] + + result = adapter(evaluator_input) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + assert result.label in ("Pass", "Fail") + + def test_custom_threshold(self): + from autoevals import Factuality + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter + + scorer = Factuality() + adapter = AutoevalsAdapter(scorer=scorer, threshold=0.9) + + result = adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None + + def test_with_field_mapper(self): + from autoevals import Factuality + + from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.autoevals import AutoevalsAdapter + + scorer = Factuality() + adapter = AutoevalsAdapter( + scorer=scorer, + field_mapper=lambda ev: { + "input": "What is 2+2?", + "actual_output": "4", + "expected_output": "4", + }, + ) + + result = adapter(_make_agent_evaluator_input()) + + assert isinstance(result, EvaluatorOutput) + assert result.value is not None From 9a9b6a75053fb7de6016a54eed9835291ce59fcb Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Tue, 30 Jun 2026 11:53:47 -0700 Subject: [PATCH 12/13] Fix review items: add reference_inputs to model, tighten error detection, add validate_fields to DeepEvalAdapter --- .../custom_code_based_evaluators/models.py | 1 + .../third_party/autoevals/adapter.py | 2 +- .../third_party/base.py | 11 ++-- .../third_party/deepeval/adapter.py | 19 ++++-- .../third_party/span_parsers/common.py | 1 + .../third_party/deepeval/test_adapter.py | 60 ++++++++++++++++++- 6 files changed, 80 insertions(+), 14 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py index c876b145..5ff8fafa 100644 --- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py @@ -51,6 +51,7 @@ class EvaluatorInput(BaseModel): session_spans: List[Dict] target_trace_id: Optional[str] = None target_span_id: Optional[str] = None + reference_inputs: Optional[List[Dict]] = None schema_version: str = "1.0" evaluator_id: Optional[str] = None evaluator_name: Optional[str] = None diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py index fa2acba3..ae96a5b5 100644 --- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/autoevals/adapter.py @@ -40,7 +40,7 @@ def __init__( self.threshold = threshold def validate_fields(self, fields: Dict[str, Any]) -> None: - """Validate that input and actual_output are present.""" + """Validate minimum required fields; scorer raises on additional missing params.""" missing = [] if not fields.get("input"): missing.append("input") diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py index 1f28d2a5..3dfd545e 100644 --- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/base.py @@ -2,11 +2,10 @@ import abc import logging -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, Optional from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import EvaluatorInput, EvaluatorOutput from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import ( - SpanParseResult, parse_spans, ) @@ -31,8 +30,9 @@ def __init__( Args: field_mapper: Optional callable that receives the EvaluatorInput and - returns a dict of field values. Bypasses default span parsing - when provided. + returns a dict with keys: 'input', 'actual_output', and optionally + 'expected_output', 'context', 'retrieval_context'. Bypasses default + span parsing when provided. """ self.field_mapper = field_mapper @@ -81,8 +81,7 @@ def _extract_fields(self, evaluator_input: EvaluatorInput) -> Dict[str, Any]: if self.field_mapper is not None: return self.field_mapper(evaluator_input) - reference_inputs = getattr(evaluator_input, "reference_inputs", None) - result = parse_spans(evaluator_input.session_spans, reference_inputs) + result = parse_spans(evaluator_input.session_spans, evaluator_input.reference_inputs) return result.to_dict() @abc.abstractmethod diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py index 725584ef..9c7de325 100644 --- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/adapter.py @@ -45,7 +45,18 @@ def __init__( self.metric.model = model def validate_fields(self, fields: Dict[str, Any]) -> None: - """No pre-validation; let DeepEval raise on missing params.""" + """Validate that input and actual_output are present.""" + missing = [] + if not fields.get("input"): + missing.append("input") + if not fields.get("actual_output"): + missing.append("actual_output") + if missing: + metric_name = type(self.metric).__name__ + raise ValueError( + f"Field(s) {missing} required by {metric_name} but not found in evaluation event. " + f"Provide a field_mapper or ensure spans contain the necessary data." + ) def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput: """Run the DeepEval metric and return formatted results.""" @@ -60,12 +71,12 @@ def execute(self, fields: Dict[str, Any]) -> EvaluatorOutput: try: self.metric.measure(test_case) except Exception as e: - error_type = type(e).__name__ - if "MissingTestCaseParams" in error_type or "missing" in str(e).lower(): + if type(e).__name__ == "MissingTestCaseParamsError": return EvaluatorOutput( label="Error", errorCode="MISSING_REQUIRED_FIELD", - errorMessage=f"{type(self.metric).__name__} requires fields not available: {e}", + errorMessage=f"{type(self.metric).__name__} requires fields not extracted from spans: {e}. " + f"Provide a field_mapper to supply custom fields from your trace data.", ) raise diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py index 6d69dbc6..0619be8c 100644 --- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/common.py @@ -94,6 +94,7 @@ def extract_from_agent_span_events( for span in session_spans: attributes = span.get("attributes", {}) operation_name = attributes.get("gen_ai.operation.name") + # Phase 1: only invoke_agent spans supported; others fall through to field_mapper if operation_name != "invoke_agent": continue diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py index 3c8a3d39..55e40cee 100644 --- a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py +++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py @@ -107,6 +107,36 @@ def test_custom_field_mapper(self): assert test_case.input == "mapped input" assert test_case.actual_output == "mapped output" + def test_reference_inputs_populates_expected_output(self): + metric = _mock_metric() + adapter = DeepEvalAdapter(metric=metric) + + evaluator_input = EvaluatorInput( + evaluation_level="TRACE", + session_spans=[ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "input": {"messages": [{"role": "user", "content": "What is AI?"}]}, + "output": {"messages": [{"role": "assistant", "content": "AI is artificial intelligence."}]}, + } + } + ], + } + ], + target_trace_id="t1", + reference_inputs=[{"expectedResponse": "AI stands for artificial intelligence."}], + ) + + result = adapter(evaluator_input) + + test_case = metric.measure.call_args[0][0] + assert test_case.expected_output == "AI stands for artificial intelligence." + def test_model_override_sets_metric_model(self): metric = _mock_metric() DeepEvalAdapter(metric=metric, model="bedrock/anthropic.claude-3") @@ -153,6 +183,31 @@ def test_no_agent_spans_returns_error(self): assert result.errorCode == "FIELD_EXTRACTION_ERROR" assert result.label == "Error" + def test_missing_input_returns_error(self): + spans = [ + { + "traceId": "t1", + "spanId": "s1", + "attributes": {"gen_ai.operation.name": "invoke_agent"}, + "span_events": [ + { + "body": { + "output": {"messages": [{"role": "assistant", "content": "answer"}]}, + } + } + ], + } + ] + metric = _mock_metric() + adapter = DeepEvalAdapter(metric=metric) + + result = adapter(_make_evaluator_input(spans=spans)) + + assert result.errorCode == "MISSING_REQUIRED_FIELD" + assert "input" in result.errorMessage + assert "field_mapper" in result.errorMessage + metric.measure.assert_not_called() + def test_metric_measure_exception_returns_error(self): metric = _mock_metric() metric.measure = MagicMock(side_effect=RuntimeError("LLM timeout")) @@ -166,9 +221,7 @@ def test_metric_measure_exception_returns_error(self): def test_missing_params_error_caught(self): metric = _mock_metric() - class MissingTestCaseParamsError(Exception): - pass - + MissingTestCaseParamsError = type("MissingTestCaseParamsError", (Exception,), {}) metric.measure = MagicMock( side_effect=MissingTestCaseParamsError("retrieval_context is required") ) @@ -178,6 +231,7 @@ class MissingTestCaseParamsError(Exception): assert result.errorCode == "MISSING_REQUIRED_FIELD" assert "retrieval_context" in result.errorMessage + assert "field_mapper" in result.errorMessage def test_never_raises(self): metric = _mock_metric() From 0499d4b31377ef38075930fd4af5bce7a18e4a80 Mon Sep 17 00:00:00 2001 From: Haomiao Shi Date: Tue, 30 Jun 2026 12:01:29 -0700 Subject: [PATCH 13/13] Adapt to upstream ReferenceInput model: remove duplicate field, use expected_response_text property --- .../evaluation/custom_code_based_evaluators/models.py | 1 - .../third_party/span_parsers/base.py | 7 ++++--- .../third_party/deepeval/test_adapter.py | 2 +- .../third_party/span_parsers/test_span_parsers.py | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py index 5ff8fafa..c876b145 100644 --- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/models.py @@ -51,7 +51,6 @@ class EvaluatorInput(BaseModel): session_spans: List[Dict] target_trace_id: Optional[str] = None target_span_id: Optional[str] = None - reference_inputs: Optional[List[Dict]] = None schema_version: str = "1.0" evaluator_id: Optional[str] = None evaluator_name: Optional[str] = None diff --git a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py index 3b88ff11..9869eab7 100644 --- a/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py +++ b/src/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/base.py @@ -28,7 +28,7 @@ def parse_spans( session_spans: List[Dict[str, Any]], - reference_inputs: Optional[List[Dict[str, Any]]] = None, + reference_inputs: Optional[List[Any]] = None, ) -> SpanParseResult: """Parse session spans using the first matching agent-level parser. @@ -38,7 +38,7 @@ def parse_spans( Args: session_spans: Raw ADOT span dicts from the evaluation service. - reference_inputs: Optional reference inputs for expected_output. + reference_inputs: Optional ReferenceInput list for expected_output. Returns: SpanParseResult with extracted fields. @@ -50,7 +50,8 @@ def parse_spans( result = parser(session_spans) if result is not None: if reference_inputs: - expected = reference_inputs[0].get("expectedResponse") + ref = reference_inputs[0] + expected = getattr(ref, "expected_response_text", None) if expected: result.expected_output = expected return result diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py index 55e40cee..e7efef2a 100644 --- a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py +++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/deepeval/test_adapter.py @@ -129,7 +129,7 @@ def test_reference_inputs_populates_expected_output(self): } ], target_trace_id="t1", - reference_inputs=[{"expectedResponse": "AI stands for artificial intelligence."}], + reference_inputs=[{"expectedResponse": {"text": "AI stands for artificial intelligence."}}], ) result = adapter(evaluator_input) diff --git a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py index de2a1bb5..9669e5e5 100644 --- a/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py +++ b/tests/bedrock_agentcore/evaluation/custom_code_based_evaluators/third_party/span_parsers/test_span_parsers.py @@ -2,6 +2,7 @@ import pytest +from bedrock_agentcore.evaluation.custom_code_based_evaluators.models import ReferenceInput from bedrock_agentcore.evaluation.custom_code_based_evaluators.third_party.span_parsers import ( SpanParseResult, parse_spans, @@ -96,7 +97,7 @@ def test_expected_output_from_reference_inputs(self): output_messages=[{"role": "assistant", "content": "a"}], ) ] - refs = [{"expectedResponse": "expected answer"}] + refs = [ReferenceInput(expectedResponse={"text": "expected answer"})] result = parse_spans(spans, reference_inputs=refs)