[RFC] costs and it's use in assign_reg_parm

Ramana Radhakrishnan ramana.radhakrishnan@arm.com
Thu Oct 2 08:31:00 GMT 2014


Hi,

    I've been digging into why on AArch64 we generate pretty bad code 
for the following testcase.

void g2(float, float, float, float, float, float, float, float);

void f2a(void)
{
    float x0 = 1.0, x1 = 2.0, x2 = 3.0, x3 = 4.0, x4 = 5.0, x5 = 6.0, x6 
= 7.0, x7 = 8.0;
    float x8 = 0.5, x9 = 1.5, x10 = 2.5, x11 = 0.25, x12 = 0.125, x13 = 
3.5, x14 = 0.75, x15 = 1.25;

    g2(x0, x1, x2, x3, x4, x5, x6, x7);
    g2(x8, x9, x10, x11, x12, x13, x14, x15);
    g2(x0, x1, x2, x3, x4, x5, x6, x7);
    g2(x8, x9, x10, x11, x12, x13, x14, x15);
}

And a couple of items caught my attention.

For one the backend doesn't set the costs of a reg-reg move to be the 
same as a reg-const move. In the AArch64 backend the approach appears to 
be in line with the documentation which is to set the costs of various 
instructions relative to a fast integer instruction. The hack to 
aarch64.c in the attached patch is for setting the cost properly for a 
reg-reg move of the appropriate mode and is only for demonstration 
purposes. I expect this to be replaced by an equivalent bit of code in 
the backend to achieve the same thing.

However the code in precompute_register_parameters assumes that the 
value is forced into a register if the set_src_cost of a constant is  > 
COSTS_N_INSNS(1). Now this appears to be checking the cost of a set of 
an FP immediate constant to a register and the backend not unreasonably 
sets it to an appropriate cost. Now to assume that this number should 
always be less than 1 is really not appropriate.

The same can be achieved with removing the fpconst case in 
aarch64.c:rtx_costs but ...

Instead of putting in what's effectively a lie in the backend, should we 
just be moving the midend to a world where all such numbers are compared 
to costs from the backend rather than relying on magic numbers. The 
costs comparison logic is similar to whats used in lower-subreg. The 
thought was to move this to a common area (Richard Sandiford suggested 
expmed.h in a private conversation) so that we had common APIs to check 
the cost of a SET in this form rather than relying on the rtx_cost 
interface.

Some of you might ask what's the impact on other backends, I still need 
to do the due diligence there with various test programs but my 
expectation based on reading the code is that a sample of backends (x86, 
mips, aarch64 and powerpc) handle the reg-reg move cost with a "set" 
already and I would expect the same to be in line with other costs. 
AArch32 does not but I'll take care of that in a follow up.

Longer term should we move towards cleaning up such magic numbers from 
the mid-end and that would make for a more maintainable compiler IMHO.

Thoughts ?

Lightly tested with the testcase and nothing more.


regards
Ramana



2a:
         fmov    s23, 8.0e+0
         fmov    s22, 7.0e+0
         fmov    s21, 6.0e+0
         fmov    s20, 5.0e+0
         fmov    s19, 4.0e+0
         fmov    s18, 3.0e+0
         fmov    s17, 2.0e+0
         fmov    s16, 1.0e+0
         stp     x29, x30, [sp, -112]!
         fmov    s7, s23
         fmov    s6, s22
         add     x29, sp, 0
         fmov    s5, s21
         fmov    s4, s20
         stp     d8, d9, [sp, 16]
         fmov    s3, s19
         stp     d10, d11, [sp, 32]
         fmov    s2, s18
         stp     d12, d13, [sp, 48]
         fmov    s1, s17
         stp     d14, d15, [sp, 64]
         fmov    s0, s16
         fmov    s15, 1.25e+0
         fmov    s14, 7.5e-1
         fmov    s13, 3.5e+0
         fmov    s12, 1.25e-1
         fmov    s11, 2.5e-1
         fmov    s10, 2.5e+0
         fmov    s9, 1.5e+0
         fmov    s8, 5.0e-1
         str     s23, [x29, 80]
         str     s22, [x29, 84]
         str     s21, [x29, 88]
         str     s20, [x29, 92]
         str     s19, [x29, 96]
         str     s18, [x29, 100]
         str     s17, [x29, 104]
         str     s16, [x29, 108]
         bl      g2
         fmov    s7, s15
         fmov    s6, s14
         fmov    s5, s13
         fmov    s4, s12
         fmov    s3, s11
         fmov    s2, s10
         fmov    s1, s9
         fmov    s0, s8
         bl      g2
         ldr     s23, [x29, 80]




TO

f2a:
         stp     x29, x30, [sp, -16]!
         fmov    s7, 8.0e+0
         add     x29, sp, 0
         fmov    s6, 7.0e+0
         fmov    s5, 6.0e+0
         fmov    s4, 5.0e+0
         fmov    s3, 4.0e+0
         fmov    s2, 3.0e+0
         fmov    s1, 2.0e+0
         fmov    s0, 1.0e+0
         bl      g2
         fmov    s7, 1.25e+0
         fmov    s6, 7.5e-1
         fmov    s5, 3.5e+0
         fmov    s4, 1.25e-1
         fmov    s3, 2.5e-1
         fmov    s2, 2.5e+0
         fmov    s1, 1.5e+0
         fmov    s0, 5.0e-1
         bl      g2
         fmov    s7, 8.0e+0
         fmov    s6, 7.0e+0
         fmov    s5, 6.0e+0
         fmov    s4, 5.0e+0
         fmov    s3, 4.0e+0
         fmov    s2, 3.0e+0
         fmov    s1, 2.0e+0
         fmov    s0, 1.0e+0
         bl      g2
         ldp     x29, x30, [sp], 16
         fmov    s7, 1.25e+0
         fmov    s6, 7.5e-1
         fmov    s5, 3.5e+0
         fmov    s4, 1.25e-1
         fmov    s3, 2.5e-1
         fmov    s2, 2.5e+0
         fmov    s1, 1.5e+0
         fmov    s0, 5.0e-1
         b       g2

-------------- next part --------------
diff --git a/gcc/calls.c b/gcc/calls.c
index 345331f..a34da07 100644
--- a/gcc/calls.c
+++ b/gcc/calls.c
@@ -813,11 +813,26 @@ call_expr_flags (const_tree t)
 
    Set REG_PARM_SEEN if we encounter a register parameter.  */
 
+/* RTXes used while computing costs.  */
+struct cost_rtxes {
+  /* Source and target registers.  */
+  rtx source;
+  rtx target;
+  /* A SET of TARGET.  */
+  rtx set;
+};
+
+
 static void
 precompute_register_parameters (int num_actuals, struct arg_data *args,
 				int *reg_parm_seen)
 {
   int i;
+  static struct cost_rtxes cost_rtx;
+
+  cost_rtx.target = gen_rtx_REG (word_mode, FIRST_PSEUDO_REGISTER);
+  cost_rtx.source = gen_rtx_REG (word_mode, FIRST_PSEUDO_REGISTER + 1);
+  cost_rtx.set = gen_rtx_SET (VOIDmode, cost_rtx.target, cost_rtx.source);
 
   *reg_parm_seen = 0;
 
@@ -825,6 +840,8 @@ precompute_register_parameters (int num_actuals, struct arg_data *args,
     if (args[i].reg != 0 && ! args[i].pass_on_stack)
       {
 	*reg_parm_seen = 1;
+	PUT_MODE (cost_rtx.target, args[i].mode);
+	PUT_MODE (cost_rtx.source, args[i].mode);
 
 	if (args[i].value == 0)
 	  {
@@ -872,8 +889,13 @@ precompute_register_parameters (int num_actuals, struct arg_data *args,
 		     || (GET_CODE (args[i].value) == SUBREG
 			 && REG_P (SUBREG_REG (args[i].value)))))
 		 && args[i].mode != BLKmode
-		 && set_src_cost (args[i].value, optimize_insn_for_speed_p ())
-		    > COSTS_N_INSNS (1)
+		 && (args[i].mode != VOIDmode
+		     && optimize
+		     && CONSTANT_P (args[i].value)
+		     && (set_src_cost (args[i].value, 
+				       optimize_insn_for_speed_p ())
+			 > set_src_cost (cost_rtx.set, 
+					 optimize_insn_for_speed_p ())))
 		 && ((*reg_parm_seen
 		      && targetm.small_register_classes_for_mode_p (args[i].mode))
 		     || optimize))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4e0cba8..91ece7b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -5083,6 +5083,7 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
 			      / UNITS_PER_WORD;
               *cost = COSTS_N_INSNS (n_minus_1 + 1);
+	      *cost = COSTS_N_INSNS (3);
             }
           else
 	    /* Cost is just the cost of the RHS of the set.  */


More information about the Gcc-patches mailing list