Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion libraries/chain/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1095,10 +1095,23 @@ namespace graphene { namespace chain {
return held_us > 0 ? held_us / 1000 : 0;
}

bool database::push_block_appears_stalled() const {
// Progress-aware verdict maintained by the monitor thread (held long AND
// no operation applied in that window). Use THIS, not raw hold time, to
// drive any external restart — a heavy deterministic block holds the lock
// long but keeps making progress, and is identical network-wide.
return _push_block_stalled.load(std::memory_order_acquire);
}

void database::start_push_block_monitor() {
bool expected = false;
if (!_push_block_monitor_run.compare_exchange_strong(expected, true, std::memory_order_acq_rel))
return; // already running
// Clear any stale verdict before the new monitor's first poll. Without
// this, after a close()->open() recovery cycle push_block_appears_stalled()
// could return a stale `true` for ~1s (the poll interval) until the loop
// re-evaluates.
_push_block_stalled.store(false, std::memory_order_release);
_push_block_monitor = std::thread([this]() { push_block_monitor_loop(); });
}

Expand All @@ -1107,6 +1120,9 @@ namespace graphene { namespace chain {
if (_push_block_monitor.joinable())
_push_block_monitor.join();
}
// A stopped monitor reports not-stalled: clear the verdict so a reader
// between stop and the next lazy start never sees a stale `true`.
_push_block_stalled.store(false, std::memory_order_release);
}

void database::push_block_monitor_loop() {
Expand All @@ -1117,7 +1133,14 @@ namespace graphene { namespace chain {
// diagnostic (stderr, so the monitor itself can never block on node locks).
// LOG-ONLY: it never calls std::_Exit and never mutates state — a false
// positive can only produce an extra log line. Auto-restart, if wanted, is
// an external decision driven by push_block_lock_held_ms().
// an external decision driven by push_block_appears_stalled() (the same
// held-AND-frozen verdict this loop publishes), never raw hold time.
// NB: a future hardfork block whose migration runs long in pure C++
// without calling apply_operation could trip the WARNING (and the stalled
// verdict) on every node at that height. It is log-only, so the cost is
// synchronized log noise, not a synchronized restart — provided external
// consumers gate on a SUSTAINED stalled verdict rather than acting on the
// first true reading.
const int64_t warn_us = (int64_t)PUSH_BLOCK_STALL_WARN_SEC * 1000000;
const int64_t relog_us = (int64_t)PUSH_BLOCK_STALL_RELOG_SEC * 1000000;
uint64_t last_progress = _apply_progress.load(std::memory_order_acquire);
Expand All @@ -1132,6 +1155,7 @@ namespace graphene { namespace chain {
if (cur != last_progress) { last_progress = cur; last_progress_us = now_us; }
const int64_t since = _push_block_lock_since_us.load(std::memory_order_acquire);
if (since == 0) {
_push_block_stalled.store(false, std::memory_order_release);
if (warned) {
std::cerr << "[push_block-monitor] write lock released; the node has recovered from the stall."
<< std::endl;
Expand All @@ -1142,6 +1166,9 @@ namespace graphene { namespace chain {
const int64_t held_us = now_us - since;
const int64_t frozen_us = now_us - last_progress_us;
if (held_us > warn_us && frozen_us > warn_us) {
// Held long AND no progress in that window: publish the wedged
// verdict for push_block_appears_stalled() and log (throttled).
_push_block_stalled.store(true, std::memory_order_release);
if (!warned || (now_us - last_log_us) > relog_us) {
const uint32_t num = _push_block_watch_num.load(std::memory_order_acquire);
std::cerr << "[push_block-monitor] WARNING: push_block() has held the chainbase write lock for "
Expand All @@ -1152,6 +1179,10 @@ namespace graphene { namespace chain {
warned = true;
last_log_us = now_us;
}
} else {
// Holding the lock but still under threshold, or progress is
// recent (heavy-but-healthy block): not wedged.
_push_block_stalled.store(false, std::memory_order_release);
}
}
}
Expand Down
33 changes: 30 additions & 3 deletions libraries/chain/include/graphene/chain/database.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,11 +501,27 @@ namespace graphene { namespace chain {

/// Milliseconds the current push_block() has held the chainbase write
/// lock, or 0 if push_block is not currently holding it. Lock-free and
/// safe to call from any thread (e.g. a healthcheck/RPC). A value that
/// stays high (tens of seconds) means the node is wedged under the write
/// lock — see the push_block stall monitor.
/// safe to call from any thread (e.g. a healthcheck/RPC).
///
/// NOTE: a high value alone is NOT a wedge signal — a legitimately heavy
/// block (a hardfork batch, a large reorg) can hold the lock for a long
/// time while making real progress. Such blocks are DETERMINISTIC at the
/// same height on every node, so restarting purely on hold time would
/// restart the whole network at once. Drive restart decisions off
/// push_block_appears_stalled() (held long AND no progress), never off
/// this raw timer.
int64_t push_block_lock_held_ms() const;

/// True only when push_block() has held the write lock past
/// PUSH_BLOCK_STALL_WARN_SEC AND no operation has been applied in that
/// window — i.e. the node is wedged, not just doing heavy work. This is
/// the progress-aware signal an external healthcheck/orchestrator should
/// use to decide whether to restart; it cannot false-positive on a heavy
/// deterministic block the way raw hold time can. Lock-free; safe from any
/// thread. Updated by the stall monitor, so it lags reality by up to its
/// ~1s poll interval (irrelevant at the 60s threshold).
bool push_block_appears_stalled() const;

uint32_t last_non_undoable_block_num() const;
//////////////////// db_init.cpp ////////////////////

Expand Down Expand Up @@ -712,7 +728,18 @@ namespace graphene { namespace chain {
std::atomic<int64_t> _push_block_lock_since_us{0};
std::atomic<uint32_t> _push_block_watch_num{0};
std::atomic<bool> _push_block_monitor_run{false};
// Progress-aware "wedged" verdict published by the monitor each poll:
// true only while (held > WARN_SEC AND no operation applied in that
// window). Backs push_block_appears_stalled() for external consumers so
// they never key a restart off raw hold time. (Maintained solely by the
// monitor thread; read lock-free elsewhere.)
std::atomic<bool> _push_block_stalled{false};
std::thread _push_block_monitor;
// start/stop are expected to be called only from the single block-apply
// path (start, on each push_block) and shutdown (stop, in close()/dtor),
// which are never concurrent. The lazy start CAS + thread assignment is
// not hardened against a concurrent stop; keep that single-caller
// assumption if this is ever reused.
void start_push_block_monitor();
void stop_push_block_monitor();
void push_block_monitor_loop();
Expand Down
Loading