
Commit f14445e

ahunter6 authored and acmel committed
perf intel-pt: Support generating branch stack
Add support for generating branch stack context for PT samples.

The decoder reports a configurable number of branches as branch context for
each sample. Internally it keeps track of them by using a simple sliding
window. We also flush the last branch buffer on each sample to avoid
overlapping intervals.

This is useful for:

- Reporting accurate basic block edge frequencies through the perf report
  branch view
- Using with --branch-history to get the wider context of samples
- Other users of LBRs

Also the Documentation is updated.

Examples:

Record with Intel PT:

	perf record -e intel_pt//u ls

Branch stacks are used by default if synthesized, so:

	perf report --itrace=ile

is the same as:

	perf report --itrace=ile -b

Branch history can be requested also:

	perf report --itrace=igle --branch-history

Based-on-patch-by: Andi Kleen <[email protected]>
Signed-off-by: Adrian Hunter <[email protected]>
Cc: Jiri Olsa <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
1 parent 385e330 commit f14445e
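The sliding window mentioned in the commit message is implemented as a small per-queue ring buffer: intel_pt_update_last_branch_rb() writes each new branch one slot backwards so the newest entry is always at last_branch_pos, intel_pt_copy_last_branch_rb() copies the entries out newest-first when a sample is synthesized, and intel_pt_reset_last_branch_rb() flushes the buffer afterwards. The stand-alone sketch below models that behaviour with simplified, hypothetical types (demo_entry instead of perf's struct branch_stack/branch_entry); it is an illustration of the idea, not code from the patch.

/* Minimal model of the per-queue last-branch ring buffer (illustrative only). */
#include <stdio.h>
#include <string.h>

#define LAST_BRANCH_SZ 4                        /* stands in for synth_opts.last_branch_sz */

struct demo_entry { unsigned long long from, to; };

static struct demo_entry rb[LAST_BRANCH_SZ];    /* like ptq->last_branch_rb */
static size_t rb_pos;                           /* index of the newest entry */
static size_t rb_nr;                            /* valid entries, capped at LAST_BRANCH_SZ */

/* Record one branch: the window grows backwards, so rb[rb_pos] is newest. */
static void demo_update(unsigned long long from, unsigned long long to)
{
        if (!rb_pos)
                rb_pos = LAST_BRANCH_SZ;
        rb_pos--;
        rb[rb_pos].from = from;
        rb[rb_pos].to = to;
        if (rb_nr < LAST_BRANCH_SZ)
                rb_nr++;
}

/* Copy out newest-first for a sample, like intel_pt_copy_last_branch_rb(). */
static size_t demo_copy(struct demo_entry *dst)
{
        size_t nr = LAST_BRANCH_SZ - rb_pos;

        if (!rb_nr)
                return 0;
        memcpy(dst, &rb[rb_pos], nr * sizeof(*dst));            /* newest .. end of array */
        if (rb_nr >= LAST_BRANCH_SZ)                            /* wrapped: append the older half */
                memcpy(dst + nr, rb, rb_pos * sizeof(*dst));
        return rb_nr;
}

int main(void)
{
        struct demo_entry out[LAST_BRANCH_SZ];
        size_t i, n;

        /* Six branches into a four-entry window: the two oldest fall out. */
        for (i = 1; i <= 6; i++)
                demo_update(i * 0x100, i * 0x100 + 4);

        n = demo_copy(out);
        for (i = 0; i < n; i++)
                printf("%zu: %#llx -> %#llx\n", i, out[i].from, out[i].to);

        /* Flush after the sample so consecutive samples never overlap. */
        rb_pos = 0;
        rb_nr = 0;
        return 0;
}

Running it prints 0x600->0x604, 0x500->0x504, 0x400->0x404, 0x300->0x304, i.e. the four most recent branches in newest-first order, which is the shape the synthesized PERF_SAMPLE_BRANCH_STACK payload takes.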

File tree

2 files changed, +125 -0 lines changed


tools/perf/Documentation/intel-pt.txt

+10
@@ -671,6 +671,7 @@ The letters are:
 		e	synthesize tracing error events
 		d	create a debug log
 		g	synthesize a call chain (use with i or x)
+		l	synthesize last branch entries (use with i or x)
 
 "Instructions" events look like they were recorded by "perf record -e
 instructions".
@@ -718,6 +719,15 @@ transactions events can be specified. e.g.
 	--itrace=ig32
 	--itrace=xg32
 
+Also the number of last branch entries (default 64, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+	--itrace=il10
+	--itrace=xl10
+
+Note that last branch entries are cleared for each sample, so there is no overlap
+from one sample to the next.
+
 To disable trace decoding entirely, use the option --no-itrace.
 
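The documentation above describes the option syntax only: a letter selects what to synthesize, and 'l' may be followed by a last-branch entry count (default 64, max. 1024). The sketch below is a hypothetical, simplified parser written to make that syntax concrete; it is not perf's actual itrace option code, and the names demo_synth_opts and demo_parse_itrace are invented for the illustration.

/* Hypothetical sketch of interpreting an --itrace string such as "il10". */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_synth_opts {
        bool instructions;
        bool last_branch;
        unsigned int last_branch_sz;
};

static void demo_parse_itrace(const char *s, struct demo_synth_opts *opts)
{
        opts->last_branch_sz = 64;              /* documented default */

        for (; *s; s++) {
                switch (*s) {
                case 'i':
                        opts->instructions = true;
                        break;
                case 'l':
                        opts->last_branch = true;
                        if (isdigit((unsigned char)s[1])) {
                                char *end;
                                unsigned long n = strtoul(s + 1, &end, 10);

                                if (n >= 1 && n <= 1024) /* documented maximum */
                                        opts->last_branch_sz = n;
                                s = end - 1;    /* resume after the number */
                        }
                        break;
                default:                        /* other letters ignored in this sketch */
                        break;
                }
        }
}

int main(void)
{
        struct demo_synth_opts opts = { 0 };

        demo_parse_itrace("il10", &opts);
        printf("instructions=%d last_branch=%d entries=%u\n",
               opts.instructions, opts.last_branch, opts.last_branch_sz);
        return 0;
}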
tools/perf/util/intel-pt.c

+115
@@ -22,6 +22,7 @@
 #include "../perf.h"
 #include "session.h"
 #include "machine.h"
+#include "sort.h"
 #include "tool.h"
 #include "event.h"
 #include "evlist.h"
@@ -115,6 +116,9 @@ struct intel_pt_queue {
 	void *decoder;
 	const struct intel_pt_state *state;
 	struct ip_callchain *chain;
+	struct branch_stack *last_branch;
+	struct branch_stack *last_branch_rb;
+	size_t last_branch_pos;
 	union perf_event *event_buf;
 	bool on_heap;
 	bool stop;
@@ -675,6 +679,19 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
 			goto out_free;
 	}
 
+	if (pt->synth_opts.last_branch) {
+		size_t sz = sizeof(struct branch_stack);
+
+		sz += pt->synth_opts.last_branch_sz *
+		      sizeof(struct branch_entry);
+		ptq->last_branch = zalloc(sz);
+		if (!ptq->last_branch)
+			goto out_free;
+		ptq->last_branch_rb = zalloc(sz);
+		if (!ptq->last_branch_rb)
+			goto out_free;
+	}
+
 	ptq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
 	if (!ptq->event_buf)
 		goto out_free;
@@ -732,6 +749,8 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
 
 out_free:
 	zfree(&ptq->event_buf);
+	zfree(&ptq->last_branch);
+	zfree(&ptq->last_branch_rb);
 	zfree(&ptq->chain);
 	free(ptq);
 	return NULL;
@@ -746,6 +765,8 @@ static void intel_pt_free_queue(void *priv)
 	thread__zput(ptq->thread);
 	intel_pt_decoder_free(ptq->decoder);
 	zfree(&ptq->event_buf);
+	zfree(&ptq->last_branch);
+	zfree(&ptq->last_branch_rb);
 	zfree(&ptq->chain);
 	free(ptq);
 }
@@ -876,6 +897,57 @@ static int intel_pt_setup_queues(struct intel_pt *pt)
 	return 0;
 }
 
+static inline void intel_pt_copy_last_branch_rb(struct intel_pt_queue *ptq)
+{
+	struct branch_stack *bs_src = ptq->last_branch_rb;
+	struct branch_stack *bs_dst = ptq->last_branch;
+	size_t nr = 0;
+
+	bs_dst->nr = bs_src->nr;
+
+	if (!bs_src->nr)
+		return;
+
+	nr = ptq->pt->synth_opts.last_branch_sz - ptq->last_branch_pos;
+	memcpy(&bs_dst->entries[0],
+	       &bs_src->entries[ptq->last_branch_pos],
+	       sizeof(struct branch_entry) * nr);
+
+	if (bs_src->nr >= ptq->pt->synth_opts.last_branch_sz) {
+		memcpy(&bs_dst->entries[nr],
+		       &bs_src->entries[0],
+		       sizeof(struct branch_entry) * ptq->last_branch_pos);
+	}
+}
+
+static inline void intel_pt_reset_last_branch_rb(struct intel_pt_queue *ptq)
+{
+	ptq->last_branch_pos = 0;
+	ptq->last_branch_rb->nr = 0;
+}
+
+static void intel_pt_update_last_branch_rb(struct intel_pt_queue *ptq)
+{
+	const struct intel_pt_state *state = ptq->state;
+	struct branch_stack *bs = ptq->last_branch_rb;
+	struct branch_entry *be;
+
+	if (!ptq->last_branch_pos)
+		ptq->last_branch_pos = ptq->pt->synth_opts.last_branch_sz;
+
+	ptq->last_branch_pos -= 1;
+
+	be = &bs->entries[ptq->last_branch_pos];
+	be->from = state->from_ip;
+	be->to = state->to_ip;
+	be->flags.abort = !!(state->flags & INTEL_PT_ABORT_TX);
+	be->flags.in_tx = !!(state->flags & INTEL_PT_IN_TX);
+	/* No support for mispredict */
+
+	if (bs->nr < ptq->pt->synth_opts.last_branch_sz)
+		bs->nr += 1;
+}
+
 static int intel_pt_inject_event(union perf_event *event,
 				 struct perf_sample *sample, u64 type,
 				 bool swapped)
@@ -890,6 +962,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 	struct intel_pt *pt = ptq->pt;
 	union perf_event *event = ptq->event_buf;
 	struct perf_sample sample = { .ip = 0, };
+	struct dummy_branch_stack {
+		u64			nr;
+		struct branch_entry	entries;
+	} dummy_bs;
 
 	if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
 		return 0;
@@ -912,6 +988,21 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 	sample.flags = ptq->flags;
 	sample.insn_len = ptq->insn_len;
 
+	/*
+	 * perf report cannot handle events without a branch stack when using
+	 * SORT_MODE__BRANCH so make a dummy one.
+	 */
+	if (pt->synth_opts.last_branch && sort__mode == SORT_MODE__BRANCH) {
+		dummy_bs = (struct dummy_branch_stack){
+			.nr = 1,
+			.entries = {
+				.from = sample.ip,
+				.to = sample.addr,
+			},
+		};
+		sample.branch_stack = (struct branch_stack *)&dummy_bs;
+	}
+
 	if (pt->synth_opts.inject) {
 		ret = intel_pt_inject_event(event, &sample,
 					    pt->branches_sample_type,
@@ -961,6 +1052,11 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
 		sample.callchain = ptq->chain;
 	}
 
+	if (pt->synth_opts.last_branch) {
+		intel_pt_copy_last_branch_rb(ptq);
+		sample.branch_stack = ptq->last_branch;
+	}
+
 	if (pt->synth_opts.inject) {
 		ret = intel_pt_inject_event(event, &sample,
 					    pt->instructions_sample_type,
@@ -974,6 +1070,9 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
 		pr_err("Intel Processor Trace: failed to deliver instruction event, error %d\n",
 		       ret);
 
+	if (pt->synth_opts.last_branch)
+		intel_pt_reset_last_branch_rb(ptq);
+
 	return ret;
 }
 
@@ -1008,6 +1107,11 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
 		sample.callchain = ptq->chain;
 	}
 
+	if (pt->synth_opts.last_branch) {
+		intel_pt_copy_last_branch_rb(ptq);
+		sample.branch_stack = ptq->last_branch;
+	}
+
 	if (pt->synth_opts.inject) {
 		ret = intel_pt_inject_event(event, &sample,
 					    pt->transactions_sample_type,
@@ -1021,6 +1125,9 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
 		pr_err("Intel Processor Trace: failed to deliver transaction event, error %d\n",
 		       ret);
 
+	if (pt->synth_opts.callchain)
+		intel_pt_reset_last_branch_rb(ptq);
+
 	return ret;
 }
 
@@ -1116,6 +1223,9 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
 		return err;
 	}
 
+	if (pt->synth_opts.last_branch)
+		intel_pt_update_last_branch_rb(ptq);
+
 	if (!pt->sync_switch)
 		return 0;
 
@@ -1763,6 +1873,8 @@ static int intel_pt_synth_events(struct intel_pt *pt,
 		pt->instructions_sample_period = attr.sample_period;
 		if (pt->synth_opts.callchain)
 			attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+		if (pt->synth_opts.last_branch)
+			attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
 		pr_debug("Synthesizing 'instructions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
 			 id, (u64)attr.sample_type);
 		err = intel_pt_synth_event(session, &attr, id);
@@ -1782,6 +1894,8 @@ static int intel_pt_synth_events(struct intel_pt *pt,
 		attr.sample_period = 1;
 		if (pt->synth_opts.callchain)
 			attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+		if (pt->synth_opts.last_branch)
+			attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
 		pr_debug("Synthesizing 'transactions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
 			 id, (u64)attr.sample_type);
 		err = intel_pt_synth_event(session, &attr, id);
@@ -1808,6 +1922,7 @@ static int intel_pt_synth_events(struct intel_pt *pt,
 		attr.sample_period = 1;
 		attr.sample_type |= PERF_SAMPLE_ADDR;
 		attr.sample_type &= ~(u64)PERF_SAMPLE_CALLCHAIN;
+		attr.sample_type &= ~(u64)PERF_SAMPLE_BRANCH_STACK;
 		pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
 			 id, (u64)attr.sample_type);
 		err = intel_pt_synth_event(session, &attr, id);
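One detail worth calling out in intel_pt_synth_branch_sample() above is the dummy branch stack: struct branch_stack ends in a flexible array of entries, so a local struct holding a count followed by exactly one entry is layout-compatible and can be handed over via a cast when perf report sorts in branch mode. The stand-alone sketch below illustrates that layout trick with simplified stand-in types (demo_branch_stack/demo_branch_entry are invented for the illustration, not perf's real definitions).

/* Illustrative only: simplified stand-ins for struct branch_stack/branch_entry. */
#include <stdio.h>

struct demo_branch_entry {
        unsigned long long from, to;
};

struct demo_branch_stack {
        unsigned long long	 nr;
        struct demo_branch_entry entries[];     /* flexible array member */
};

int main(void)
{
        /* A count plus exactly one entry has the same layout as a
         * demo_branch_stack holding a single element, so a pointer cast
         * is enough to pass it to code expecting the generic type. */
        struct dummy_branch_stack {
                unsigned long long	 nr;
                struct demo_branch_entry entries;
        } dummy_bs = {
                .nr = 1,
                .entries = { .from = 0x401000, .to = 0x401080 },
        };
        struct demo_branch_stack *bs = (struct demo_branch_stack *)&dummy_bs;

        printf("nr=%llu  %#llx -> %#llx\n",
               bs->nr, bs->entries[0].from, bs->entries[0].to);
        return 0;
}

This is the same shape the patch builds on the stack when pt->synth_opts.last_branch is set and sort__mode == SORT_MODE__BRANCH, so that perf report's branch view always sees at least one from/to pair per synthesized branch event.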
