Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 75 additions & 27 deletions callgrind/bbcc.c
Original file line number Diff line number Diff line change
Expand Up @@ -654,37 +654,80 @@ void CLG_(setup_bbcc)(BB* bb)

csp = CLG_(current_call_stack).sp;

/* A return not matching the top call in our callstack is a jump */
/* Classify a `ret` and compute popcount_on_return: how many *SP-equal* shadow
* frames it must pop. SP-lower frames (sub-calls the returning function made,
* and on x86 the returning frame itself, since `call` pushes a return address)
* always unwind and are not counted here. The three cases below are keyed on
* where the top (newest, lowest-SP) frame sits relative to the post-return SP,
* preserving the original x86 behaviour exactly: on x86 a return is always the
* SP-lower-top case and never demotes; the SP-equal-top case only arises on
* AArch64/PPC, where the call instruction leaves SP unchanged. */
if ( (jmpkind == jk_Return) && (csp >0)) {
Int csp_up = csp-1;
call_entry* top_ce = &(CLG_(current_call_stack).entry[csp_up]);

/* We have a real return if
* - the stack pointer (SP) left the current stack frame, or
* - SP has the same value as when reaching the current function
* and the address of this BB is the return address of last call
* (we even allow to leave multiple frames if the SP stays the
* same and we find a matching return address)
* The latter condition is needed because on PPC, SP can stay
* the same over CALL=b(c)l / RET=b(c)lr boundaries
*/
if (sp < top_ce->sp) popcount_on_return = 0;
else if (top_ce->sp == sp) {
while(1) {
if (top_ce->ret_addr == bb_addr(bb)) break;
if (csp_up>0) {
csp_up--;
top_ce = &(CLG_(current_call_stack).entry[csp_up]);
if (top_ce->sp == sp) {
popcount_on_return++;
continue;
}
call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]);
Bool is_jump = False;

if (top_ce->sp > sp) {
/* Post-return SP is below the top frame's entry SP: SP moved *deeper*,
* so this `ret` did not leave the current frame — treat it as a jump.
* (The top frame has the lowest SP, so every frame is SP-higher here.)
* This is the original `sp < top_ce->sp` case. */
is_jump = True;
}
else {
Addr ret_target = bb_addr(bb);
/* Skip the SP-lower frames the return vacated. On x86 the returning
* frame itself is SP-lower (its `call` pushed a return address), so
* note whether any skipped frame returns to the target: that marks an
* x86-style return *from an SP-lower frame*, where the frame at the
* return SP is the caller we return into — not part of the chain. */
Int i = csp-1;
Bool sp_lower_match = False;
while (i >= 0 && CLG_(current_call_stack).entry[i].sp < sp) {
if (CLG_(current_call_stack).entry[i].ret_addr == ret_target)
sp_lower_match = True;
i--;
}

if (sp_lower_match) {
/* x86-style return: the returning frame was SP-lower and pops on
* its own; the SP-equal frame at the return SP is the caller, which
* must not be popped (it may even share the return address under
* recursion). No SP-equal frames to pop. */
popcount_on_return = 0;
}
else if (i >= 0 && CLG_(current_call_stack).entry[i].sp == sp) {
/* The returning frame is SP-equal — the case that exists where the
* call instruction leaves SP unchanged (AArch64 bl/blr, PPC b(c)l).
* Pop the SP-equal block down to AND INCLUDING the *deepest* frame
* whose recorded return address is the return target; the frame
* below it is the caller we return into. The deepest match pops a
* whole PLT-stub + callee pair (same return address) or a tail-call
* chain a->b->c (c's `ret` lands past b and a) in one go, instead of
* leaving the stub/tail frames stuck — which would re-root the
* caller's continuation under the callee (inverted edges) and
* fabricate '2 recursion clones. */
Int count = 0;
popcount_on_return = 0;
for (Int j = i; j >= 0 && CLG_(current_call_stack).entry[j].sp == sp; j--) {
count++;
if (CLG_(current_call_stack).entry[j].ret_addr == ret_target)
popcount_on_return = count;
}
/* Top frame is SP-equal and nothing in the block returns to the
* target: a `ret` that does not match our call stack — a jump (the
* original SP-equal-top behaviour). */
if (popcount_on_return == 0 && i == csp-1)
is_jump = True;
}
else {
/* An SP-higher caller reached after skipping SP-lower frames (the
* common x86 return), or every frame was SP-lower: no SP-equal
* frames to pop. */
popcount_on_return = 0;
break;
}
}
if (popcount_on_return == 0) {

if (is_jump) {
jmpkind = jk_Jump;
ret_without_call = True;
}
Expand Down Expand Up @@ -789,7 +832,12 @@ void CLG_(setup_bbcc)(BB* bb)
CLG_(pop_call_stack)();
}
else {
CLG_ASSERT(popcount_on_return >0);
/* popcount_on_return is the number of *SP-equal* frames this return
* pops; it may legitimately be 0 when the return vacates only SP-lower
* frames (every return on x86, where `call` pushes a return address, and
* AArch64/PPC returns whose SP-equal frame is the caller we stop at).
* unwind_call_stack still pops the SP-lower frames in that case. */
CLG_ASSERT(popcount_on_return >= 0);
CLG_(unwind_call_stack)(sp, popcount_on_return);
}
}
Expand Down
41 changes: 36 additions & 5 deletions callgrind/callstack.c
Original file line number Diff line number Diff line change
Expand Up @@ -431,13 +431,33 @@ Int CLG_(unwind_call_stack)(Addr sp, Int minpops)
while( (csp=CLG_(current_call_stack).sp) >0) {
call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]);

if ((top_ce->sp < sp) ||
((top_ce->sp == sp) && minpops>0)) {

/* A frame whose entry SP lies strictly below the post-return SP has
* been vacated by this return and is unwound unconditionally.
*
* `minpops` only bounds how many *SP-equal* frames a single return may
* pop (the count setup_bbcc proved belong to this return by matching the
* return target against recorded return addresses). SP-equal entry frames
* exist on targets whose call instruction leaves SP unchanged — AArch64
* bl/blr (return address in the link register) and PPC b(c)l — so a
* callee's own entry frame records the caller's SP and ends up *beneath*
* the SP-lower frames of any sub-calls the callee made (e.g. a libc/libm
* function reached via the PLT that itself calls other functions).
*
* Therefore SP-lower pops must not consume the budget: if they did, the
* still-open sub-call frames would exhaust it before the callee's
* SP-equal entry frame is reached, leaving that frame stuck. A stuck
* frame keeps the callee's context active (so the caller's continuation
* is mis-attributed to the callee — inverted edges) and never decrements
* the callee's recursion depth (fabricating spurious '2 clones). */
if (top_ce->sp < sp) {
unwind_count++;
CLG_(pop_call_stack)();
continue;
}
if ((top_ce->sp == sp) && minpops>0) {
minpops--;
unwind_count++;
CLG_(pop_call_stack)();
csp=CLG_(current_call_stack).sp;
continue;
}
break;
Expand Down Expand Up @@ -521,7 +541,18 @@ void CLG_(reconstruct_call_stack_from_native)(ThreadId tid)
* SP, sps[frame+1]; the outermost frame keeps its own SP as nothing
* returns past it during measurement. */
ce->sp = (frame + 1 < (Int)n) ? sps[frame + 1] : sps[frame];
ce->ret_addr = (frame + 1 < (Int)n) ? ips[frame + 1] : 0;
/* setup_bbcc classifies a return by matching the top frame's ret_addr
* against bb_addr(return-target) — the return PC, i.e. the instruction
* after the call — and push_call_stack stores exactly that for real
* calls. VG_(get_StackTrace), however, reports each caller frame at the
* last byte of its call instruction (m_stacktrace.c: `ips[i] = pc - 1`),
* i.e. return_PC - 1. Seeding ret_addr straight from ips[frame+1] would
* therefore be one byte low. That matters on AArch64/PPC, where a `ret`
* lands at SP *equal* to the seeded entry SP so the classifier falls
* back on ret_addr: the off-by-one fails the match, the return is
* demoted to a jump and re-promoted to a call, inverting the edge across
* the seeded frame. Normalize to the return PC with +1. */
ce->ret_addr = (frame + 1 < (Int)n) ? ips[frame + 1] + 1 : 0;
cs->sp++;
ensure_stack_size(cs->sp + 1);
cs->entry[cs->sp].cxt = 0;
Expand Down
Loading