Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions cloud_pipelines_backend/instrumentation/execution_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from opentelemetry.trace import StatusCode

from .. import backend_types_sql as bts
from ..launchers import kubernetes_launchers

_logger = logging.getLogger(__name__)
_tracer = trace.get_tracer("tangle.orchestrator")
Expand Down Expand Up @@ -144,6 +145,23 @@ def _pipeline_attrs(*, execution: bts.ExecutionNode) -> dict[str, object]:
return {"execution.parent_id": execution.parent_execution_id}


def _resource_attrs(*, execution: bts.ExecutionNode, status: str) -> dict[str, object]:
    """Return CPU, memory, and accelerator requests for the PENDING span.

    Args:
        execution: Node whose ``task_spec`` annotations are read.
        status: Status of the span currently being built.

    Returns:
        One ``execution.resources.*`` attribute per resource annotation that
        is set to a non-empty value. Empty for any status other than PENDING.
    """
    if status != bts.ContainerExecutionStatus.PENDING:
        return {}
    # ``or {}`` instead of a ``.get`` default: the default only applies when
    # the key is missing, so an explicit ``"annotations": None`` would
    # otherwise reach ``.get`` below as ``None`` and raise AttributeError.
    annotations: dict = (execution.task_spec or {}).get("annotations") or {}
    annotation_key_by_attr = {
        "execution.resources.cpu": kubernetes_launchers.RESOURCES_CPU_ANNOTATION_KEY,
        "execution.resources.memory": kubernetes_launchers.RESOURCES_MEMORY_ANNOTATION_KEY,
        "execution.resources.accelerators": (
            kubernetes_launchers.RESOURCES_ACCELERATORS_ANNOTATION_KEY
        ),
    }
    # Missing or empty annotation values are skipped rather than emitted.
    return {
        attr: value
        for attr, annotation_key in annotation_key_by_attr.items()
        if (value := annotations.get(annotation_key))
    }


def _ns(*, dt: datetime.datetime) -> int:
"""Return *dt* as nanoseconds since the Unix epoch (required by OTel SDK)."""
if dt.tzinfo is None:
Expand Down Expand Up @@ -189,13 +207,18 @@ def emit_execution_trace(*, execution: bts.ExecutionNode) -> None:
"execution.status": entry["status"],
**_error_attrs(execution=execution, status=entry["status"]),
**_launcher_pod_attrs(execution=execution, status=entry["status"]),
**_resource_attrs(execution=execution, status=entry["status"]),
}
start_ns = _ns(dt=t_start)
end_ns = _ns(dt=t_end)
if end_ns <= start_ns:
end_ns = start_ns + 1
_tracer.start_span(
f"execution.status {entry['status']}",
context=root_ctx,
attributes=attrs,
start_time=_ns(dt=t_start),
).end(end_time=_ns(dt=t_end))
start_time=start_ns,
).end(end_time=end_ns)

if history[-1]["status"] in _ERROR_TERMINAL_STATUSES:
root.set_status(status=StatusCode.ERROR)
Expand Down
2 changes: 1 addition & 1 deletion cloud_pipelines_backend/instrumentation/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,4 @@ def _handle_before_commit(session: orm.Session) -> None:
exc_info=True,
)
obj._status_changed = False
execution_tracing.try_emit_execution_trace(execution=obj)
execution_tracing.emit_execution_trace(execution=obj)
65 changes: 65 additions & 0 deletions tests/instrumentation/test_execution_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,3 +396,68 @@ def test_root_execution_omits_parent_id_when_absent(
s for s in span_exporter.get_finished_spans() if s.name == "execution"
)
assert "execution.parent_id" not in (root.attributes or {})


class TestResourceAttrs:
    """Resource-request attributes appear on the PENDING status span only."""

    @staticmethod
    def _pending_span(span_exporter: InMemorySpanExporter):
        """Return the first finished span whose ``execution.status`` is PENDING."""
        # ``attributes or {}`` mirrors the guard used by the other assertions
        # in this file, so a span without attributes cannot break the lookup.
        return next(
            s
            for s in span_exporter.get_finished_spans()
            if (s.attributes or {}).get("execution.status") == "PENDING"
        )

    def test_pending_span_carries_cpu_and_memory(
        self, span_exporter: InMemorySpanExporter
    ) -> None:
        from cloud_pipelines_backend.launchers import kubernetes_launchers

        execution = _make_execution(
            statuses=["QUEUED", "PENDING", "RUNNING", "SUCCEEDED"]
        )
        execution.task_spec = {
            "annotations": {
                kubernetes_launchers.RESOURCES_CPU_ANNOTATION_KEY: "4",
                kubernetes_launchers.RESOURCES_MEMORY_ANNOTATION_KEY: "16Gi",
            }
        }
        execution_tracing.emit_execution_trace(execution=execution)

        pending_span = self._pending_span(span_exporter)
        assert pending_span.attributes["execution.resources.cpu"] == "4"
        assert pending_span.attributes["execution.resources.memory"] == "16Gi"

    def test_pending_span_carries_accelerators_when_present(
        self, span_exporter: InMemorySpanExporter
    ) -> None:
        from cloud_pipelines_backend.launchers import kubernetes_launchers

        execution = _make_execution(statuses=["QUEUED", "PENDING", "SUCCEEDED"])
        execution.task_spec = {
            "annotations": {
                kubernetes_launchers.RESOURCES_ACCELERATORS_ANNOTATION_KEY: '{"H100": 1}',
            }
        }
        execution_tracing.emit_execution_trace(execution=execution)

        pending_span = self._pending_span(span_exporter)
        assert (
            pending_span.attributes["execution.resources.accelerators"] == '{"H100": 1}'
        )

    def test_non_pending_spans_have_no_resource_attrs(
        self, span_exporter: InMemorySpanExporter
    ) -> None:
        from cloud_pipelines_backend.launchers import kubernetes_launchers

        execution = _make_execution(statuses=["QUEUED", "SUCCEEDED"])
        # Set every resource annotation so each absence check below is
        # meaningful: an attribute can only be missing because no PENDING
        # span exists, not because its annotation was never provided.
        execution.task_spec = {
            "annotations": {
                kubernetes_launchers.RESOURCES_CPU_ANNOTATION_KEY: "4",
                kubernetes_launchers.RESOURCES_MEMORY_ANNOTATION_KEY: "16Gi",
                kubernetes_launchers.RESOURCES_ACCELERATORS_ANNOTATION_KEY: '{"H100": 1}',
            }
        }
        execution_tracing.emit_execution_trace(execution=execution)

        for span in span_exporter.get_finished_spans():
            span_attrs = span.attributes or {}
            assert "execution.resources.cpu" not in span_attrs
            assert "execution.resources.memory" not in span_attrs
            assert "execution.resources.accelerators" not in span_attrs
Loading