Recording call-graphs with perf

perf record -g g++ -w -Ofast tramp3d-v4.cpp

perf report -g "graph,0.5,caller"

Samples: 121K of event 'cycles', Event count (approx.): 97086533769
  Children      Self  Command   Shared Object               Symbol
+   89.66%     0.00%  cc1plus   cc1plus                     [.] toplev::main
+   89.66%     0.00%  cc1plus   cc1plus                     [.] compile_file
+   89.66%     0.00%  cc1plus   cc1plus                     [.] main
+   89.65%     0.00%  cc1plus   libc-2.22.90.so             [.] __libc_start_main
+   89.64%     0.00%  cc1plus   [unknown]                   [.] 0x4c5441554100647c
+   74.91%     0.00%  cc1plus   cc1plus                     [.] symbol_table::finalize_compilation_unit
+   71.82%     0.00%  cc1plus   cc1plus                     [.] symbol_table::compile
+   70.09%     0.09%  cc1plus   cc1plus                     [.] execute_one_pass
+   68.68%     0.01%  cc1plus   cc1plus                     [.] execute_pass_list
+   68.67%     0.02%  cc1plus   cc1plus                     [.] execute_pass_list_1
+   56.39%     0.00%  cc1plus   cc1plus                     [.] cgraph_node::expand
+   14.22%     0.00%  cc1plus   cc1plus                     [.] execute_ipa_pass_list
+   12.58%     0.01%  cc1plus   cc1plus                     [.] do_per_function_toporder
+    7.89%     0.03%  cc1plus   cc1plus                     [.] instantiate_decl
+    7.88%     0.02%  cc1plus   cc1plus                     [.] c_parse_final_cleanups
+    7.81%     0.02%  cc1plus   cc1plus                     [.] instantiate_pending_templates
+    7.19%     0.10%  cc1plus   cc1plus                     [.] dom_walker::walk
+    6.87%     0.00%  cc1plus   cc1plus                     [.] c_common_parse_file
+    6.86%     0.01%  cc1plus   cc1plus                     [.] c_parse_file

You can navigate the list with the cursor keys. The + symbols on the far left mark call-graph nodes that you can expand (and collapse again) by pressing Enter.

Samples: 121K of event 'cycles', Event count (approx.): 97086533769
  Children      Self  Command   Shared Object               Symbol
-   89.66%     0.00%  cc1plus   cc1plus                     [.] toplev::main
   - toplev::main
      - 89.65% compile_file
         - 74.91% symbol_table::finalize_compilation_unit
            - 71.82% symbol_table::compile
               - 56.39% cgraph_node::expand
                  - 54.99% execute_pass_list
                     - execute_pass_list_1
                        + 52.90% execute_pass_list_1
                        + 2.07% execute_one_pass
                          0.00% (anonymous namespace)::pass_expand::execute
                          0.00% (anonymous namespace)::pass_lower_eh_dispatch::gate
                          0.00% (anonymous namespace)::pass_lower_vector::gate
                          0.00% (anonymous namespace)::pass_rest_of_compilation::gate
                          0.00% (anonymous namespace)::pass_tsan_O0::gate
                          0.00% (anonymous namespace)::pass_vtable_verify::gate
                          0.00% execute_todo
                          0.00% ggc_collect 
                          0.00% (anonymous namespace)::pass_lower_resx::execute
                          0.00% (anonymous namespace)::pass_lower_vaarg::gate
                          0.00% opt_pass::gate
                  + 1.36% execute_all_ipa_transforms
                  + 0.02% init_function_start
                  + 0.01% cgraph_node::assemble_thunks_and_aliases
                    0.00% invoke_set_current_function_hook
               + 14.22% execute_ipa_pass_list
               + 1.12% execute_ipa_summary_passes
               + 0.05% symbol_table::materialize_all_clones 
               + 0.03% symbol_table::remove_unreachable_nodes 
               + 0.01% symbol_table::output_variables
                 0.01% ipa_reverse_postorder 
                 0.00% output_in_order 
                 0.00% announce_function 
                 0.00% gimple_set_body 
                 0.00% cgraph_node::release_body
                 0.00% execute_pass_list 
                 0.00% type_in_anonymous_namespace_p   
            + 2.46% analyze_functions 
            + 0.62% handle_alias_pairs 
              0.00% gimple_has_body_p 
              0.00% decl_function_context
         + 7.86% c_parse_final_cleanups 
         + 6.87% c_common_parse_file   
           0.00% decl_needed_p        
           0.00% emit_tinfo_decl     
      + 0.01% cxx_init              
      + 0.00% init_ttree           
      + 0.00% gcc::context::context 
      + 0.00% init_emit_regs       
        0.00% init_reg_sets_1 

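To see where the time is spent inside a function, select its symbol and press a to annotate it. The annotated view interleaves the source with the disassembly and shows the percentage of samples attributed to each instruction in the left column:

...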
       │      static inline void
       │      gsi_prev (gimple_stmt_iterator *i)
       │      {
       │        gimple *prev = i->ptr->prev; 
       │ a58:┌─→mov    0x20(%rbx),%rbx  
       │     │  if (prev->next)
  5.26 │     │  cmpq   $0x0,0x18(%rbx)
       │     │↓ je     af0
       │     │    i->ptr = prev;
       │ a67:│  mov    %rbx,-0x50(%rbp)
       │ a6b:│  test   %rbx,%rbx
       │     │↓ je     af8
       │     │        {
       │     │          stmt = gsi_stmt (gsi);
       │     │          if (gimple_code (stmt) != GIMPLE_CALL)
       │     │  cmpb   $0x8,(%rbx)
  5.26 │     │↑ jne    a58
       │     │            continue;
       │     │          if (!gimple_call_internal_p (stmt)
       │     │              || gimple_call_internal_fn (stmt) != IFN_ANNOTATE)
       │     │  testb  $0x40,0x2(%rbx)
 10.53 │     └──je     a58
       │        cmpl   $0x8,0x60(%rbx)
       │      ↑ jne    a58
...

See man perf-report for further options.

Kudos to Chandler Carruth's CppCon 2015 talk "Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My!"

None: Perf_Callgraph (last edited 2015-09-29 10:53:54 by MarkusTrippelsdorf)