This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

short integer multiplication problem on IA64


Hello,

	I am trying to solve a performance problem on
IA64 when two short integers are multiplied together.
This was a problem noticed by Sverre Jarp when he
compared different IA64 compilers.  On IA64,
gcc generates the pmpy instruction for short integer 
multiplication.  The problem is that gcc does not insert the
appropriate delay before issuing the next instruction that uses
the result of the pmpy instruction.  On IA64, when
you try to use the result of a pmpy instruction too early, a
pipeline flush occurs, which costs a 16-cycle delay.  This is why
for short integer multiplication, gcc generated code has
a performance hit which does not occur when the same
code is compiled by other IA64 compilers.

	To demonstrate the problem, I have the following
code fragment:


	short func ( short a, short b) 
	{
	        return a * b;
	}

On IA64, gcc generates the following code:

	

        pmpy2.r r14 = r15, r14  ==> no delay after pmpy instruction
				    before r14 is used.
        ;;
        sxt2 r14 = r14
        ;;
        mov r8 = r14
        .restore sp


	According to Sverre, what gcc really should have generated is 
the following:

        pmpy2.r r14 = r15, r14  ==> three instruction groups of delay
                                    inserted before r14 is used.
        ;;
	nop 0
	;;
	nop 0
	;;
	nop 0
	;;
        sxt2 r14 = r14
        ;;


	I attempted to fix this problem by creating a routine called
ia64_add_mmmul_delay() in ia64.c which is called from ia64_emit_nops()
when optimization is present and emit_all_insn_group_barriers() when
optimization is not being used.  The routine is called when we find
an instruction of class ITANIUM_CLASS_MMMUL.  The routine 
checks for an instruction which uses the result of the pmpy instruction
and is not at least three instruction groups away.  If such an 
instruction is found, the routine generates the appropriate nops and 
stop bits to add enough delay so that a pipeline flush will
not occur.   

	In some limited testing that I have done so far, I
am seeing a performance improvement when the delay is inserted.

	The patch is not yet ready for prime time as a lot 
more testing needs to be done, but I wanted to send it out to
see if I'm on the right track.  A similar thing will need to be
done for variable shifts where the shift count is kept in a register.
I do have a couple of questions:

	1.  Is this the right approach?  Am I doing the
            right thing in ia64_add_mmmul_delay() and is
	    it being called from the right places?

	2.  Will I see stop bits in the instruction stream
            when this function is called?  I'm checking for
            them, but so far in my testing, I haven't seen any.

	3.  Is calling this routine for all instructions of
	    class ITANIUM_CLASS_MMMUL the right thing to do?
	    There are other instructions of this class so maybe
	    this is too general.


Any help in this matter is greatly appreciated!

Reva Cuthbertson
reva@cup.hp.com

*** ia64.c@@/main/hp/LATEST	Thu Feb 21 09:46:44 2002
--- ia64.c	Mon Feb 25 18:30:43 2002
*************** static void ia64_mark_machine_status PAR
*** 125,130 ****
--- 125,131 ----
  static void ia64_free_machine_status PARAMS ((struct function *));
  static void emit_insn_group_barriers PARAMS ((FILE *, rtx));
  static void emit_all_insn_group_barriers PARAMS ((FILE *, rtx));
+ static void ia64_add_mmmul_delay PARAMS ((rtx));
  static void emit_predicate_relation_info PARAMS ((void));
  static void process_epilogue PARAMS ((void));
  static int process_set PARAMS ((FILE *, rtx));
*************** emit_all_insn_group_barriers (dump, insn
*** 4950,4955 ****
--- 4951,4957 ----
       rtx insns;
  {
    rtx insn;
+   enum attr_itanium_class class;
  
    init_insn_group_barriers ();
  
*************** emit_all_insn_group_barriers (dump, insn
*** 4976,4981 ****
--- 4978,4988 ----
  	  else if (group_barrier_needed_p (insn))
  	    {
  	      emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), insn);
+ 
+ 	      class = ia64_safe_itanium_class (insn);
+ 	      if (class == ITANIUM_CLASS_MMMUL)
+ 		ia64_add_mmmul_delay(insn);
+ 
  	      init_insn_group_barriers ();
  	      group_barrier_needed_p (insn);
  	    }
*************** ia64_emit_nops ()
*** 6721,6726 ****
--- 6728,6742 ----
      {
        rtx pat;
        enum attr_type t;
+       enum attr_itanium_class class;
+ 
+       if (INSN_P (insn))
+ 	class = ia64_safe_itanium_class (insn);
+       if (class == ITANIUM_CLASS_MMMUL)
+ 	{
+ 	  ia64_add_mmmul_delay(insn);
+ 	  continue;
+ 	}
        pat = INSN_P (insn) ? PATTERN (insn) : const0_rtx;
        if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER)
  	continue;
*************** ia64_hpux_function_arg_padding (mode, ty
*** 7884,7886 ****
--- 7900,7987 ----
         : GET_MODE_BITSIZE (mode) < PARM_BOUNDARY)
        ? downward : upward);
  }
+ 
+ /*  This function adds a delay between a parallel multiply 
+     instruction and the first instruction which tries to use
+     the result of the parallel multiply.  If the result of
+     a parallel multiply is used too quickly, the pipeline is
+     flushed causing a 16 cycle delay.  This function makes sure
+     that an appropriate delay is added after the parallel multiply
+     if it is needed. */
+ 
+ static void
+ ia64_add_mmmul_delay (insn)
+      rtx insn;
+ {
+   rtx next;
+   enum attr_itanium_class class;
+   int insn_groups_needed = 3;
+   int insn_groups_found = 0;
+   rtx dest;
+       
+   /* Get the destination for the parallel multiply instruction */
+ 
+   dest = SET_DEST (PATTERN(insn));
+ 
+   /* Find the first instruction after the parallel multiply */
+ 
+   next = NEXT_INSN (insn);
+ 
+   class = ia64_safe_itanium_class (next);
+ 
+   /* Add a stop bit if the next instruction is not a stop bit */
+ 
+   if (class != ITANIUM_CLASS_STOP_BIT) 
+     {
+       emit_insn_after (gen_insn_group_barrier (GEN_INT (3)), insn);
+     }
+ 
+   /* Loop through the instructions looking for any instruction
+      that uses the result of a parallel multiply.  The loop
+      will terminate when there are no more valid instructions,
+      3 instruction groups have passed without seeing an 
+      instruction depending on the parallel multiply, or an
+      instruction that depends on the parallel multiply is found. */
+ 
+   while (INSN_P (next)) {
+ 
+     /* Did we already process three instruction groups?  If so,
+        no delay is needed and we can return */
+ 
+     if (insn_groups_found == insn_groups_needed)
+       break;
+ 
+     else if (reg_mentioned_p (dest, next))
+       {
+ 
+ 	/* Did we find an instruction dependent on
+            the parallel multiply?  If so, add a delay
+ 	   so that 3 instruction groups go by before we
+ 	   use the result of the parallel multiply */
+ 
+ 	while (insn_groups_found < insn_groups_needed)
+ 	  {
+ 	    emit_insn_before (gen_nop (), next);
+ 	    emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), next);
+ 	    insn_groups_found++;
+ 	  }
+ 	break;
+       }
+     next = NEXT_INSN (next);
+     
+     if (INSN_P (next)) 
+       {
+ 	class = ia64_safe_itanium_class (next);
+ 
+ 	/* Did we see a stop bit?  If so, mark it */
+ 	if (class == ITANIUM_CLASS_STOP_BIT)
+ 	  insn_groups_found++;
+       }
+ 
+   }
+ 
+ }
+ 
+ 
+ 
+ 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]