diff --git a/lelab/rollout.py b/lelab/rollout.py
index bdffdf1..5e2b013 100644
--- a/lelab/rollout.py
+++ b/lelab/rollout.py
@@ -153,6 +153,101 @@ def _format_cameras_arg(cameras: dict[str, dict[str, Any]]) -> str:
     return "{" + ", ".join(parts) + "}"
 
 
+# Exception lines at the tail of a Python traceback look like
+# "RuntimeError: ..." or "lerobot.errors.DeviceNotConnectedError: ...".
+_EXC_LINE_RE = re.compile(r"^[A-Za-z_][\w.]*(?:Error|Exception|Interrupt|Timeout|Failure)\b")
+
+# A serial-port mention: the word "port" or a Windows COM name such as "com3".
+# The lookbehind keeps fragments of other words ("import", "support") from
+# matching; callers pass lower-cased text.
+_SERIAL_PORT_RE = re.compile(r"(?<![a-z])(?:port|com\d+)")
+
+
+def _extract_error_from_log(log_path: str | None) -> str | None:
+    """Pull the meaningful error out of a failed rollout's log so the UI can
+    show it directly instead of telling the user to open a file in the cache.
+
+    Only the last 50 lines are inspected. The result is the last exception line
+    plus everything after it, or else the last 6 non-empty lines, truncated to
+    500 characters plus an ellipsis. Returns None for a missing path, an
+    unreadable file, or a blank tail."""
+    from collections import deque
+
+    if not log_path:
+        return None
+    try:
+        with open(log_path, encoding="utf-8", errors="replace") as fh:
+            # A bounded deque keeps only the tail in memory however big the log is.
+            tail = [ln.rstrip("\n") for ln in deque(fh, maxlen=50)]
+    except OSError:
+        return None
+    # Prefer the last exception line + everything after it (the message body).
+    exc_idx = next((i for i in range(len(tail) - 1, -1, -1) if _EXC_LINE_RE.match(tail[i])), None)
+    if exc_idx is not None:
+        snippet = "\n".join(tail[exc_idx:]).strip()
+    else:
+        non_empty = [ln for ln in tail if ln.strip()]
+        snippet = "\n".join(non_empty[-6:]).strip()
+    # Collapse blank-line runs so the snippet stays compact.
+    snippet = re.sub(r"\n\s*\n+", "\n", snippet)
+    if len(snippet) > 500:
+        snippet = snippet[:500].rstrip() + "…"
+    return snippet or None
+
+
+def _friendly_hint(error_text: str | None) -> str | None:
+    """A plain-language, actionable headline for the common SO-101 failures.
+
+    Matching is case-insensitive and checked in order; returns None when the
+    text is empty or nothing matches."""
+    if not error_text:
+        return None
+    low = error_text.lower()
+    if "overload" in low or "torque_enable" in low:
+        return (
+            "A motor overloaded — usually the gripper holding an object too hard. Release the object / "
+            "open the gripper and power-cycle the arm before trying again."
+        )
+    if "missing motor ids" in low or "motor check failed" in low:
+        return (
+            "A follower motor isn't responding (often the gripper, id 6). If a skill was holding an object "
+            "it likely overloaded — remove it, power-cycle the arm, then try teleoperation first."
+        )
+    if "could not connect" in low or "failed to connect" in low or "not connected" in low:
+        return "Couldn't connect to the arm — make sure it's plugged in, powered on, and on the right port."
+    if "frame is too old" in low or "no frame" in low or "frame timeout" in low:
+        return (
+            "A camera can't keep up — frames are arriving too slowly. Lower its resolution/FPS, "
+            "set FOURCC=MJPG, and close other heavy apps, then try again."
+        )
+    if "failed to set capture_" in low or "actual_width" in low or "actual_height" in low:
+        return "A camera doesn't support the configured resolution — open camera settings and click Auto."
+    # Require a real port mention; a bare "com" substring also hits "command" etc.
+    if "permission" in low and _SERIAL_PORT_RE.search(low):
+        return "Couldn't open the serial port — close anything else using it, or run `lelab --stop`."
+    return None
+
+
+# Errors that mean the policy actually ran and only shutdown/cleanup tripped —
+# e.g. disabling torque on a gripper still holding an object.
+_CLEANUP_MARKERS = ("overload", "torque_enable", "disconnect", "not connected")
+
+
+def _classify_outcome(rc: int | None, rollout_started: bool, error_text: str | None) -> str:
+    """ok | ran_with_warning | failed.
+
+    A non-zero exit *after* the rollout main loop started, where the error is a
+    torque-disable/overload on shutdown, means the skill ran but a motor (usually
+    the loaded gripper) complained during cleanup — that's a warning, not a
+    failure, so the UI shouldn't call a working run "failed"."""
+    if not rc:
+        return "ok"
+    low = (error_text or "").lower()
+    if rollout_started and any(marker in low for marker in _CLEANUP_MARKERS):
+        return "ran_with_warning"
+    return "failed"
+
+
 def handle_start_inference(request: InferenceRequest) -> dict[str, Any]:
     """Start a one-shot rollout subprocess. Returns a dict — the route layer turns it
     into a JSON response or HTTPException as appropriate."""
@@ -308,10 +403,17 @@ def handle_inference_status() -> dict[str, Any]:
         _inference_started_at = None
         _inference_rollout_started_at = None
         _inference_meta = {}
+        # On failure, surface the real error from the log so the UI doesn't
+        # have to send the user digging through the cache.
+        error = _extract_error_from_log(finished_meta.get("log_path")) if rc else None
+        outcome = _classify_outcome(rc, finished_rollout_started is not None, error)
         return {
             "inference_active": False,
             "exited": True,
             "exit_code": rc,
+            "outcome": outcome,
+            "error": error,
+            "hint": _friendly_hint(error),
             "policy_ref": finished_meta.get("policy_ref"),
             "duration_s": finished_meta.get("duration_s"),
             "log_path": finished_meta.get("log_path"),
diff --git a/tests/test_rollout.py b/tests/test_rollout.py
index 3a061d0..224e465 100644
--- a/tests/test_rollout.py
+++ b/tests/test_rollout.py
@@ -261,3 +261,46 @@ def test_handle_start_inference_blocked_when_already_active(monkeypatch) -> None
     assert result["success"] is False
     assert result["status_code"] == 409
     assert "already active" in result["message"]
+
+
+def test_classify_outcome_ok_warns_and_fails() -> None:
+    from lelab.rollout import _classify_outcome
+
+    # rc 0/None => the run was fine.
+    assert _classify_outcome(0, True, "overload") == "ok"
+    assert _classify_outcome(None, True, None) == "ok"
+    # Non-zero AFTER the rollout started, with a torque-disable/overload on
+    # shutdown => the skill ran; only cleanup tripped.
+    assert _classify_outcome(1, True, "Motor 6 overload, torque_enable failed") == "ran_with_warning"
+    # Never started, or an unrelated error => a real failure.
+    assert _classify_outcome(1, False, "overload") == "failed"
+    assert _classify_outcome(1, True, "could not connect to the arm") == "failed"
+
+
+def test_friendly_hint_maps_common_failures() -> None:
+    from lelab.rollout import _friendly_hint
+
+    assert "gripper" in (_friendly_hint("Motor overload detected") or "").lower()
+    assert "connect" in (_friendly_hint("Failed to connect to the follower") or "").lower()
+    assert _friendly_hint("some unrecognised traceback") is None
+    assert _friendly_hint(None) is None
+    # "port"/"com" must be a real serial-port mention, not a fragment of another word.
+    assert "serial port" in (_friendly_hint("PermissionError: could not open port 'COM3'") or "")
+    assert _friendly_hint("PermissionError: cannot complete the command") is None
+
+
+def test_extract_error_from_log_pulls_exception_tail(tmp_path) -> None:
+    from lelab.rollout import _extract_error_from_log
+
+    log = tmp_path / "rollout.log"
+    log.write_text(
+        "INFO starting rollout\n"
+        "Traceback (most recent call last):\n"
+        ' File "x.py", line 1\n'
+        "RuntimeError: gripper overload during shutdown\n",
+        encoding="utf-8",
+    )
+    out = _extract_error_from_log(str(log))
+    assert out is not None and "RuntimeError: gripper overload during shutdown" in out
+    assert _extract_error_from_log(None) is None
+    assert _extract_error_from_log(str(tmp_path / "missing.log")) is None