diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index 9b08923d3..abf8589a4 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -654,37 +654,80 @@ void CLG_(setup_bbcc)(BB* bb) csp = CLG_(current_call_stack).sp; - /* A return not matching the top call in our callstack is a jump */ + /* Classify a `ret` and compute popcount_on_return: how many *SP-equal* shadow + * frames it must pop. SP-lower frames (sub-calls the returning function made, + * and on x86 the returning frame itself, since `call` pushes a return address) + * always unwind and are not counted here. The three cases below are keyed on + * where the top (newest, lowest-SP) frame sits relative to the post-return SP, + * preserving the original x86 behaviour exactly: on x86 a return is always the + * SP-lower-top case and never demotes; the SP-equal-top case only arises on + * AArch64/PPC, where the call instruction leaves SP unchanged. */ if ( (jmpkind == jk_Return) && (csp >0)) { - Int csp_up = csp-1; - call_entry* top_ce = &(CLG_(current_call_stack).entry[csp_up]); - - /* We have a real return if - * - the stack pointer (SP) left the current stack frame, or - * - SP has the same value as when reaching the current function - * and the address of this BB is the return address of last call - * (we even allow to leave multiple frames if the SP stays the - * same and we find a matching return address) - * The latter condition is needed because on PPC, SP can stay - * the same over CALL=b(c)l / RET=b(c)lr boundaries - */ - if (sp < top_ce->sp) popcount_on_return = 0; - else if (top_ce->sp == sp) { - while(1) { - if (top_ce->ret_addr == bb_addr(bb)) break; - if (csp_up>0) { - csp_up--; - top_ce = &(CLG_(current_call_stack).entry[csp_up]); - if (top_ce->sp == sp) { - popcount_on_return++; - continue; - } + call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]); + Bool is_jump = False; + + if (top_ce->sp > sp) { + /* Post-return SP is below the top frame's entry SP: SP moved *deeper*, + * so this `ret` did not leave the current frame — treat it as a jump. + * (The top frame has the lowest SP, so every frame is SP-higher here.) + * This is the original `sp < top_ce->sp` case. */ + is_jump = True; + } + else { + Addr ret_target = bb_addr(bb); + /* Skip the SP-lower frames the return vacated. On x86 the returning + * frame itself is SP-lower (its `call` pushed a return address), so + * note whether any skipped frame returns to the target: that marks an + * x86-style return *from an SP-lower frame*, where the frame at the + * return SP is the caller we return into — not part of the chain. */ + Int i = csp-1; + Bool sp_lower_match = False; + while (i >= 0 && CLG_(current_call_stack).entry[i].sp < sp) { + if (CLG_(current_call_stack).entry[i].ret_addr == ret_target) + sp_lower_match = True; + i--; + } + + if (sp_lower_match) { + /* x86-style return: the returning frame was SP-lower and pops on + * its own; the SP-equal frame at the return SP is the caller, which + * must not be popped (it may even share the return address under + * recursion). No SP-equal frames to pop. */ + popcount_on_return = 0; + } + else if (i >= 0 && CLG_(current_call_stack).entry[i].sp == sp) { + /* The returning frame is SP-equal — the case that exists where the + * call instruction leaves SP unchanged (AArch64 bl/blr, PPC b(c)l). + * Pop the SP-equal block down to AND INCLUDING the *deepest* frame + * whose recorded return address is the return target; the frame + * below it is the caller we return into. The deepest match pops a + * whole PLT-stub + callee pair (same return address) or a tail-call + * chain a->b->c (c's `ret` lands past b and a) in one go, instead of + * leaving the stub/tail frames stuck — which would re-root the + * caller's continuation under the callee (inverted edges) and + * fabricate '2 recursion clones. */ + Int count = 0; + popcount_on_return = 0; + for (Int j = i; j >= 0 && CLG_(current_call_stack).entry[j].sp == sp; j--) { + count++; + if (CLG_(current_call_stack).entry[j].ret_addr == ret_target) + popcount_on_return = count; } + /* Top frame is SP-equal and nothing in the block returns to the + * target: a `ret` that does not match our call stack — a jump (the + * original SP-equal-top behaviour). */ + if (popcount_on_return == 0 && i == csp-1) + is_jump = True; + } + else { + /* An SP-higher caller reached after skipping SP-lower frames (the + * common x86 return), or every frame was SP-lower: no SP-equal + * frames to pop. */ popcount_on_return = 0; - break; } } - if (popcount_on_return == 0) { + + if (is_jump) { jmpkind = jk_Jump; ret_without_call = True; } @@ -789,7 +832,12 @@ void CLG_(setup_bbcc)(BB* bb) CLG_(pop_call_stack)(); } else { - CLG_ASSERT(popcount_on_return >0); + /* popcount_on_return is the number of *SP-equal* frames this return + * pops; it may legitimately be 0 when the return vacates only SP-lower + * frames (every return on x86, where `call` pushes a return address, and + * AArch64/PPC returns whose SP-equal frame is the caller we stop at). + * unwind_call_stack still pops the SP-lower frames in that case. */ + CLG_ASSERT(popcount_on_return >= 0); CLG_(unwind_call_stack)(sp, popcount_on_return); } } diff --git a/callgrind/callstack.c b/callgrind/callstack.c index 8951639d7..86c23bdbd 100644 --- a/callgrind/callstack.c +++ b/callgrind/callstack.c @@ -431,13 +431,33 @@ Int CLG_(unwind_call_stack)(Addr sp, Int minpops) while( (csp=CLG_(current_call_stack).sp) >0) { call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]); - if ((top_ce->sp < sp) || - ((top_ce->sp == sp) && minpops>0)) { - + /* A frame whose entry SP lies strictly below the post-return SP has + * been vacated by this return and is unwound unconditionally. + * + * `minpops` only bounds how many *SP-equal* frames a single return may + * pop (the count setup_bbcc proved belong to this return by matching the + * return target against recorded return addresses). SP-equal entry frames + * exist on targets whose call instruction leaves SP unchanged — AArch64 + * bl/blr (return address in the link register) and PPC b(c)l — so a + * callee's own entry frame records the caller's SP and ends up *beneath* + * the SP-lower frames of any sub-calls the callee made (e.g. a libc/libm + * function reached via the PLT that itself calls other functions). + * + * Therefore SP-lower pops must not consume the budget: if they did, the + * still-open sub-call frames would exhaust it before the callee's + * SP-equal entry frame is reached, leaving that frame stuck. A stuck + * frame keeps the callee's context active (so the caller's continuation + * is mis-attributed to the callee — inverted edges) and never decrements + * the callee's recursion depth (fabricating spurious '2 clones). */ + if (top_ce->sp < sp) { + unwind_count++; + CLG_(pop_call_stack)(); + continue; + } + if ((top_ce->sp == sp) && minpops>0) { minpops--; unwind_count++; CLG_(pop_call_stack)(); - csp=CLG_(current_call_stack).sp; continue; } break; @@ -521,7 +541,18 @@ void CLG_(reconstruct_call_stack_from_native)(ThreadId tid) * SP, sps[frame+1]; the outermost frame keeps its own SP as nothing * returns past it during measurement. */ ce->sp = (frame + 1 < (Int)n) ? sps[frame + 1] : sps[frame]; - ce->ret_addr = (frame + 1 < (Int)n) ? ips[frame + 1] : 0; + /* setup_bbcc classifies a return by matching the top frame's ret_addr + * against bb_addr(return-target) — the return PC, i.e. the instruction + * after the call — and push_call_stack stores exactly that for real + * calls. VG_(get_StackTrace), however, reports each caller frame at the + * last byte of its call instruction (m_stacktrace.c: `ips[i] = pc - 1`), + * i.e. return_PC - 1. Seeding ret_addr straight from ips[frame+1] would + * therefore be one byte low. That matters on AArch64/PPC, where a `ret` + * lands at SP *equal* to the seeded entry SP so the classifier falls + * back on ret_addr: the off-by-one fails the match, the return is + * demoted to a jump and re-promoted to a call, inverting the edge across + * the seeded frame. Normalize to the return PC with +1. */ + ce->ret_addr = (frame + 1 < (Int)n) ? ips[frame + 1] + 1 : 0; cs->sp++; ensure_stack_size(cs->sp + 1); cs->entry[cs->sp].cxt = 0;