Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,13 @@ def _before_notify(event: bugsnag_event.Event) -> None:
if context:
event.add_tab("tangle_context", context)
if _CUSTOM_GROUPING_KEY and event.original_error:
normalized = error_normalization.normalize_error_message(
# Use the full chain for grouping so that "LauncherError <- TimeoutError"
# and "LauncherError <- ApiException" land in separate, stable groups.
chain = error_normalization.normalize_error_chain(
exception=event.original_error
)
prefix = (event.metadata.get("extra") or {}).get("grouping_prefix")
key_value = f"{prefix}: {normalized}" if prefix else normalized
key_value = f"{prefix}: {chain}" if prefix else chain
event.add_tab("custom", {_CUSTOM_GROUPING_KEY: key_value})
if prefix and event.errors:
try:
Expand All @@ -57,6 +59,18 @@ def _before_notify(event: bugsnag_event.Event) -> None:
_logger.debug(
"Could not prepend grouping prefix to errorClass", exc_info=True
)
# For chained exceptions, override the root-cause title (what Bugsnag
# displays in the error list) with the full chain so reviewers can see
# the complete raise path without opening the stacktrace.
chain_title = error_normalization.build_chain_title(
exception=event.original_error
)
if chain_title and event.errors:
try:
title = f"{prefix}: {chain_title}" if prefix else chain_title
event.errors[-1].error_class = title
except Exception:
_logger.debug("Could not set chain title on errorClass", exc_info=True)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 This is an AI-generated code review comment.

DEBUG-level logging hides SDK API drift in the title override.

except Exception:
    _logger.debug("Could not set chain title on errorClass", exc_info=True)

The code is built on a load-bearing assumption that event.errors[-1].error_class is the field Bugsnag displays as the error-list title. If a future SDK changes that shape, the assignment will throw AttributeError, the chain title silently disappears from Bugsnag, and the DEBUG log will be invisible in any production logging config. This is the kind of integration drift that doesn't break unit tests but quietly degrades the product.

Fix: log at warning (or error) level. Still non-fatal — the original error still reports — but the next time the SDK shape drifts, you find out instead of guessing why the error list looks the way it used to.

Note: the same DEBUG pattern at the prefix-prepending block above (~lines 55-59) has the same issue; worth bumping that too, although it's pre-existing.



def setup(*, service_name: str | None = None) -> None:
Expand Down
45 changes: 44 additions & 1 deletion cloud_pipelines_backend/instrumentation/error_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def _normalize_launcher_error(*, exception: BaseException) -> str | None:


def normalize_error_message(*, exception: BaseException) -> str:
"""Return a stable normalized string for error grouping."""
"""Return a stable normalized string for a single exception (no chain traversal)."""
for normalizer in (
_normalize_k8s_api_exception,
_normalize_max_retry_error,
Expand All @@ -117,3 +117,46 @@ def normalize_error_message(*, exception: BaseException) -> str:
return result

return f"{type(exception).__name__}: {_strip_generic(message=str(exception))}"


_CHAIN_PART_MAX_LEN = 80


def normalize_error_chain(*, exception: BaseException) -> str:
"""Return a stable normalized string covering the full exception chain.

Walks ``__cause__`` (and ``__context__`` when not suppressed) and joins
each level with `` <- ``. Use this for grouping keys so that chained
exceptions like ``LauncherError <- TimeoutError`` produce one stable group
rather than one per root cause.
"""
parts: list[str] = []
seen: set[int] = set()
exc: BaseException | None = exception
while exc is not None and id(exc) not in seen and len(parts) < 4:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 This is an AI-generated code review comment.

Grouping key produced by normalize_error_chain is length-unbounded — Bugsnag may silently truncate.

parts.append(normalize_error_message(exception=exc))  # no per-part cap

This function is used as the Bugsnag custom grouping key. Each part has no length bound — only build_chain_title caps at _CHAIN_PART_MAX_LEN. With 4 chained exceptions each carrying a long _strip_generic-processed message, the grouping key can run into the thousands of characters. If Bugsnag silently truncates custom grouping keys beyond some limit, two chains that differ only past the truncation point group together — exactly the opposite of this PR's intent.

Fix: either (a) verify Bugsnag's actual custom-grouping-key length limit and document it in a comment, or (b) apply a generous per-part cap inside normalize_error_chain too — e.g. 200 chars, separate from the display-oriented 80-char cap. The cap is stable across chain orderings and protects against accidental key blow-up.

seen.add(id(exc))
parts.append(normalize_error_message(exception=exc))
exc = exc.__cause__ or (None if exc.__suppress_context__ else exc.__context__)
return " <- ".join(parts)


def build_chain_title(*, exception: BaseException) -> str | None:
"""Return a human-readable chain title for display, or ``None`` for single exceptions.

Like ``normalize_error_chain`` but truncates each level so the result fits
in a Bugsnag error list title. Returns ``None`` when there is no
chain so callers can skip overriding the default title.
"""
parts: list[str] = []
seen: set[int] = set()
exc: BaseException | None = exception

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 This is an AI-generated code review comment.

Duplicated chain-walking loop between normalize_error_chain and build_chain_title.

Both functions share identical traversal logic — same seen set, same len(parts) < 4 cap, same __cause__ or __context__ walk. The only divergence is the per-part truncation step and the None return for single exceptions.

Two places that have to stay in lockstep on every future change (max depth, BaseExceptionGroup support, additional cycle detection, etc.). Easy to drift.

Fix: extract one private helper:

def _walk_chain(exception: BaseException, max_depth: int = 4) -> list[BaseException]:
    excs: list[BaseException] = []
    seen: set[int] = set()
    exc: BaseException | None = exception
    while exc is not None and id(exc) not in seen and len(excs) < max_depth:
        seen.add(id(exc))
        excs.append(exc)
        exc = exc.__cause__ or (None if exc.__suppress_context__ else exc.__context__)
    return excs

Then normalize_error_chain is one comprehension and build_chain_title is one comprehension with the truncation step.

while exc is not None and id(exc) not in seen and len(parts) < 4:
seen.add(id(exc))
part = normalize_error_message(exception=exc)
if len(part) > _CHAIN_PART_MAX_LEN:
part = part[:_CHAIN_PART_MAX_LEN].rstrip() + "..."
parts.append(part)
exc = exc.__cause__ or (None if exc.__suppress_context__ else exc.__context__)
if len(parts) <= 1:
return None
return " <- ".join(parts)
73 changes: 73 additions & 0 deletions tests/instrumentation/test_error_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,76 @@ def test_strips_json_object_double_quotes(self):
exc = RuntimeError('operation failed: {"key": "value"}')
result = error_normalization.normalize_error_message(exception=exc)
assert result == "RuntimeError: operation failed: {...}"


class TestNormalizeErrorChain:
def _chained(
self, outer_msg: str, inner: BaseException, outer_cls: type = RuntimeError
) -> BaseException:
try:
raise outer_cls(outer_msg) from inner
except outer_cls as exc:
return exc

def test_single_exception_no_arrow(self):
exc = ValueError("something went wrong")
result = error_normalization.normalize_error_chain(exception=exc)
assert result == "ValueError: something went wrong"
assert " <- " not in result

def test_two_level_chain(self):
inner = TimeoutError("The read operation timed out")
outer = self._chained("Failed to create pod: {'apiVersion': 'v1'}", inner)
result = error_normalization.normalize_error_chain(exception=outer)
assert result == (
"RuntimeError: Failed to create pod: {...} <- TimeoutError: The read operation timed out"
)

def test_launcher_error_chain(self):
try:
from cloud_pipelines_backend.launchers.interfaces import LauncherError
except ImportError:
pytest.skip("LauncherError not importable")
inner = TimeoutError("The read operation timed out")
try:
raise LauncherError("Failed to create pod: {spec}") from inner
except LauncherError as outer:
result = error_normalization.normalize_error_chain(exception=outer)
assert result == (
"LauncherError: Failed to create pod <- TimeoutError: The read operation timed out"
)

def test_caps_at_four_levels(self):
exc: BaseException = ValueError("level 4")
for i in range(3, 0, -1):
exc = self._chained(f"level {i}", exc)
# Chain is 4 deep; add a 5th
exc = self._chained("level 0", exc)
result = error_normalization.normalize_error_chain(exception=exc)
assert result.count(" <- ") == 3 # 4 parts max


class TestBuildChainTitle:
def test_single_exception_returns_none(self):
exc = ValueError("nothing to chain")
assert error_normalization.build_chain_title(exception=exc) is None

def test_two_level_chain_returns_string(self):
inner = TimeoutError("The read operation timed out")
try:
raise RuntimeError("outer problem") from inner
except RuntimeError as outer:
result = error_normalization.build_chain_title(exception=outer)
assert result == (
"RuntimeError: outer problem <- TimeoutError: The read operation timed out"
)

def test_truncates_long_parts(self):
inner = ValueError("x" * 200)
try:
raise RuntimeError("outer") from inner
except RuntimeError as outer:
result = error_normalization.build_chain_title(exception=outer)
assert result is not None
inner_part = result.split(" <- ")[1]
assert len(inner_part) <= 83 # 80 + "..."