This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Re: [Patch 001] [x86 backend] Define march/mtune for upcoming AMD Bulldozer processor.


Hello!

> This patch defines the -march=bdver1 and -mtune=bdver1 flags for the upcoming
> AMD Bulldozer processor.

> For example, in the ac.f90 benchmark from the Polyhedron benchmark suite,
> different register allocation between the Barcelona and Bulldozer binaries
> results in redundant saving and restoring of xmm registers for Bulldozer.
> Any suggestions on how to fix this would be helpful.
> 
> See inline attached reg-alloc-problems at the end of this mail.
> 
> make check of the i386 tests passes. I will update the list with the
> bootstrap results when it's done.
> 
> Ok to check in?

Please see comments in the code below.

> Index: config/i386/sse.md
> ===================================================================
> --- config/i386/sse.md	(revision 158653)
> +++ config/i386/sse.md	(working copy)
> @@ -204,7 +204,20 @@
>  }
>    [(set_attr "type" "sselog1,ssemov,ssemov")
>     (set_attr "prefix" "vex")
> -   (set_attr "mode" "<avxvecmode>")])
> +   (set (attr "mode")
> +        (cond [(and (ior (eq (const_string "<MODE>mode") (const_string "OImode"))
> +                         (eq (const_string "<MODE>mode") (const_string "V4DFmode")))
> +                    (ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0)))
> +                 (const_string "V8SF")
> +               (eq (const_string "<MODE>mode") (const_string "V8SFmode"))
> +                 (const_string "V8SF")
> +               (and (and (and (ne (const_string "<MODE>mode") (const_string "OImode"))
> +                              (ne (const_string "<MODE>mode") (const_string "V4DFmode")))
> +                         (ne (const_string "<MODE>mode") (const_string "V8SFmode")))
> +                    (ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0)))
> +                 (const_string "V4SF")
> +	      ]
> +	      (const_string "<avxvecmode>")))])

Uh, shouldn't we just split the pattern instead of comparing the macroized
<MODE>mode against various modes in the attribute?
[This happens in a couple of places in the code below.]
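
If the 256-bit modes get a pattern of their own, something like this should be
enough for its mode attribute (just a sketch, untested):

   (set (attr "mode")
        (if_then_else
          (ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
              (const_int 0))
          (const_string "V8SF")
          (const_string "<avxvecmode>")))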

>  
>  ;; All of these patterns are enabled for SSE1 as well as SSE2.
>  ;; This is essential for maintaining stable calling conventions.
> @@ -246,8 +259,9 @@
>  }
>    [(set_attr "type" "sselog1,ssemov,ssemov")
>     (set (attr "mode")
> -	(cond [(ior (ior (ne (symbol_ref "optimize_function_for_size_p (cfun)") (const_int 0))
> -			 (eq (symbol_ref "TARGET_SSE2") (const_int 0)))
> +	(cond [(ior (ior (ior (ne (symbol_ref "optimize_function_for_size_p (cfun)") (const_int 0))
> +	                      (eq (symbol_ref "TARGET_SSE2") (const_int 0)))
> +	                 (ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0)))
>  		    (and (eq_attr "alternative" "2")
>  			 (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES")
>  			     (const_int 0))))
> @@ -1597,10 +1611,33 @@
>  	  (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))]
>    "AVX_VEC_FLOAT_MODE_P (<MODE>mode)
>     && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> -  "v<logic>p<avxmodesuffixf2c>\t{%2, %1, %0|%0, %1, %2}"
> +{
> +  switch (get_attr_mode (insn))
> +    {
> +      case MODE_V8SF:
> +      case MODE_V4SF:
> +        return "v<logic>ps\t{%2, %1, %0|%0, %1, %2}";
> +      case MODE_V4DF:
> +      case MODE_V2DF:
> +        return "v<logic>pd\t{%2, %1, %0|%0, %1, %2}";
> +      default:
> +        return "v<logic>ps\t{%2, %1, %0|%0, %1, %2}";
> +    }
> +}

I guess that the "default:" label will handle MODE_V8SF and MODE_V4SF just fine.
[This happens in a couple of places in the code below.]
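
I.e. something like this should do (just a sketch):

  switch (get_attr_mode (insn))
    {
    case MODE_V4DF:
    case MODE_V2DF:
      return "v<logic>pd\t{%2, %1, %0|%0, %1, %2}";
    default:
      return "v<logic>ps\t{%2, %1, %0|%0, %1, %2}";
    }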

>    [(set_attr "type" "sselog")
>     (set_attr "prefix" "vex")
> -   (set_attr "mode" "<avxvecmode>")])
> +   (set (attr "mode")
> +        (cond [(and (eq (const_string "<MODE>mode") (const_string "V4DFmode"))
> +                    (ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0)))
> +                 (const_string "V8SF")
> +               (eq (const_string "<MODE>mode") (const_string "V8SFmode"))
> +                 (const_string "V8SF")
> +               (and (and (ne (const_string "<MODE>mode") (const_string "V4DFmode"))
> +                         (ne (const_string "<MODE>mode") (const_string "V8SFmode")))
> +                    (ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0)))
> +                 (const_string "V4SF")
> +	      ]
> +	      (const_string "<avxvecmode>")))])
>  
>  (define_expand "<code><mode>3"
>    [(set (match_operand:SSEMODEF2P 0 "register_operand" "")
> @@ -1617,9 +1654,23 @@
>  	  (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))]
>    "SSE_VEC_FLOAT_MODE_P (<MODE>mode)
>     && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> -  "<logic>p<ssemodesuffixf2c>\t{%2, %0|%0, %2}"
> +{
> +  switch (get_attr_mode (insn))
> +    {
> +      case MODE_V4SF:
> +        return "<logic>ps\t{%2, %0|%0, %2}";
> +      case MODE_V2DF:
> +        return "<logic>pd\t{%2, %0|%0, %2}";
> +      default:
> +        return "<logic>ps\t{%2, %0|%0, %2}";
> +    }
> +}
>    [(set_attr "type" "sselog")
> -   (set_attr "mode" "<MODE>")])
> +   (set (attr "mode")
> +        (cond [(ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0))
> +                   (const_string "V4SF")
> +              ]
> +              (const_string "<MODE>")))])
>  
>  (define_expand "copysign<mode>3"
>    [(set (match_dup 4)
> @@ -1673,10 +1724,24 @@
>  	  (match_operand:MODEF 1 "register_operand" "x")
>  	  (match_operand:MODEF 2 "register_operand" "x")))]
>    "AVX_FLOAT_MODE_P (<MODE>mode)"
> -  "v<logic>p<ssemodefsuffix>\t{%2, %1, %0|%0, %1, %2}"
> +{
> +  switch (get_attr_mode (insn))
> +    {
> +      case MODE_SF:
> +        return "v<logic>ps\t{%2, %1, %0|%0, %1, %2}";
> +      case MODE_DF:
> +        return "v<logic>pd\t{%2, %1, %0|%0, %1, %2}";
> +      default:
> +        return "v<logic>ps\t{%2, %1, %0|%0, %1, %2}";
> +    }
> +}
>    [(set_attr "type" "sselog")
>     (set_attr "prefix" "vex")
> -   (set_attr "mode" "<ssevecmode>")])
> +   (set (attr "mode")
> +        (cond [(ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0))
> +               (const_string "SF")
> +              ]
> +              (const_string "<ssevecmode>")))])
>  
>  (define_insn "*<code><mode>3"
>    [(set (match_operand:MODEF 0 "register_operand" "=x")
> @@ -1684,9 +1749,23 @@
>  	  (match_operand:MODEF 1 "register_operand" "0")
>  	  (match_operand:MODEF 2 "register_operand" "x")))]
>    "SSE_FLOAT_MODE_P (<MODE>mode)"
> -  "<logic>p<ssemodefsuffix>\t{%2, %0|%0, %2}"
> +{
> +  switch (get_attr_mode (insn))
> +    {
> +      case MODE_SF:
> +        return "<logic>ps\t{%2, %0|%0, %2}";
> +      case MODE_DF:
> +        return "<logic>pd\t{%2, %0|%0, %2}";
> +      default:
> +        return "<logic>ps\t{%2, %0|%0, %2}";
> +    }
> +}
>    [(set_attr "type" "sselog")
> -   (set_attr "mode" "<ssevecmode>")])
> +   (set (attr "mode")
> +        (cond [(ne (symbol_ref "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") (const_int 0))
> +                   (const_string "SF")
> +              ]
> +              (const_string "<ssevecmode>")))])
>  
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>  ;;

BTW: there are many whitespace errors (tabs vs. spaces) in the *.md code. IMO,
the easiest way to fix them is with emacs: temporarily add the following lines
to the end of the *.md file, and the source will be re-aligned automatically
when you press the Tab key:

;; Local Variables:
;; mode: lisp
;; indent-tabs-mode: t
;; End:

(This could probably be added to some config file instead; I didn't
investigate that possibility.)

> Index: config/i386/i386-c.c
> ===================================================================
> --- config/i386/i386-c.c	(revision 158653)
> +++ config/i386/i386-c.c	(working copy)
> @@ -107,6 +107,10 @@ ix86_target_macros_internal (int isa_fla
>        def_or_undef (parse_in, "__amdfam10");
>        def_or_undef (parse_in, "__amdfam10__");
>        break;
> +    case PROCESSOR_BDVER1:
> +      def_or_undef (parse_in, "__bdver1");
> +      def_or_undef (parse_in, "__bdver1__");
> +      break;
>      case PROCESSOR_PENTIUM4:
>        def_or_undef (parse_in, "__pentium4");
>        def_or_undef (parse_in, "__pentium4__");
> @@ -182,6 +186,9 @@ ix86_target_macros_internal (int isa_fla
>      case PROCESSOR_AMDFAM10:
>        def_or_undef (parse_in, "__tune_amdfam10__");
>        break;
> +    case PROCESSOR_BDVER1:
> +      def_or_undef (parse_in, "__tune_bdver1__");
> +      break;
>      case PROCESSOR_PENTIUM4:
>        def_or_undef (parse_in, "__tune_pentium4__");
>        break;
> Index: config/i386/driver-i386.c
> ===================================================================
> --- config/i386/driver-i386.c	(revision 158653)
> +++ config/i386/driver-i386.c	(working copy)
> @@ -396,6 +396,7 @@ const char *host_detect_local_cpu (int a
>    unsigned int has_movbe = 0, has_sse4_1 = 0, has_sse4_2 = 0;
>    unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0;
>    unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
> +  unsigned int has_fma4 = 0, has_xop = 0;
>  
>    bool arch;
>  
> @@ -460,6 +461,8 @@ const char *host_detect_local_cpu (int a
>        has_sse4a = ecx & bit_SSE4a;
>        has_abm = ecx & bit_ABM;
>        has_lwp = ecx & bit_LWP;
> +      has_fma4 = ecx & bit_FMA4;
> +      has_xop = ecx & bit_XOP;
>  
>        has_longmode = edx & bit_LM;
>        has_3dnowp = edx & bit_3DNOWP;
> @@ -490,6 +493,8 @@ const char *host_detect_local_cpu (int a
>  
>        if (name == SIG_GEODE)
>  	processor = PROCESSOR_GEODE;
> +      else if (has_xop)
> +	processor = PROCESSOR_BDVER1;
>        else if (has_sse4a)
>  	processor = PROCESSOR_AMDFAM10;
>        else if (has_sse2 || has_longmode)
> @@ -602,6 +607,9 @@ const char *host_detect_local_cpu (int a
>      case PROCESSOR_AMDFAM10:
>        cpu = "amdfam10";
>        break;
> +    case PROCESSOR_BDVER1:
> +      cpu = "bdver1";
> +      break;
>  
>      default:
>        /* Use something reasonable.  */
> @@ -647,6 +655,10 @@ const char *host_detect_local_cpu (int a
>  	options = concat (options, " -mabm", NULL);
>        if (has_lwp)
>  	options = concat (options, " -mlwp", NULL);
> +      if (has_fma4)
> +	options = concat (options, " -mfma4", NULL);
> +      if (has_xop)
> +	options = concat (options, " -mxop", NULL);
>  
>        if (has_avx)
>  	options = concat (options, " -mavx", NULL);
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c	(revision 158653)
> +++ config/i386/i386.c	(working copy)
> @@ -819,6 +819,93 @@ struct processor_costs amdfam10_cost = {
>    1,                                    /* cond_not_taken_branch_cost.  */
>  };
>  
> +struct processor_costs bdver1_cost = {
> +  COSTS_N_INSNS (1),                    /* cost of an add instruction */
> +  COSTS_N_INSNS (2),                    /* cost of a lea instruction */
> +  COSTS_N_INSNS (1),                    /* variable shift costs */
> +  COSTS_N_INSNS (1),                    /* constant shift costs */
> +  {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
> +   COSTS_N_INSNS (4),                   /*                               HI */
> +   COSTS_N_INSNS (3),                   /*                               SI */
> +   COSTS_N_INSNS (4),                   /*                               DI */
> +   COSTS_N_INSNS (5)},                  /*                               other */
> +  0,                                    /* cost of multiply per each bit set */
> +  {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
> +   COSTS_N_INSNS (35),                  /*                          HI */
> +   COSTS_N_INSNS (51),                  /*                          SI */
> +   COSTS_N_INSNS (83),                  /*                          DI */
> +   COSTS_N_INSNS (83)},                 /*                          other */
> +  COSTS_N_INSNS (1),			/* cost of movsx */
> +  COSTS_N_INSNS (1),			/* cost of movzx */
> +  8,					/* "large" insn */
> +  9,					/* MOVE_RATIO */
> +  4,					/* cost for loading QImode using movzbl */
> +  {3, 4, 3},				/* cost of loading integer registers
> +					   in QImode, HImode and SImode.
> +					   Relative to reg-reg move (2).  */
> +  {3, 4, 3},				/* cost of storing integer registers */
> +  4,					/* cost of reg,reg fld/fst */
> +  {4, 4, 12},				/* cost of loading fp registers
> +		   			   in SFmode, DFmode and XFmode */
> +  {6, 6, 8},				/* cost of storing fp registers
> + 		   			   in SFmode, DFmode and XFmode */
> +  2,					/* cost of moving MMX register */
> +  {3, 3},				/* cost of loading MMX registers
> +					   in SImode and DImode */
> +  {4, 4},				/* cost of storing MMX registers
> +					   in SImode and DImode */
> +  2,					/* cost of moving SSE register */
> +  {4, 4, 3},				/* cost of loading SSE registers
> +					   in SImode, DImode and TImode */
> +  {4, 4, 5},				/* cost of storing SSE registers
> +					   in SImode, DImode and TImode */
> +  3,					/* MMX or SSE register to integer */
> +  					/* On K8
> +  					    MOVD reg64, xmmreg 	Double	FSTORE 4
> +					    MOVD reg32, xmmreg 	Double	FSTORE 4
> +					   On AMDFAM10
> +					    MOVD reg64, xmmreg 	Double	FADD 3
> +                                                                1/1  1/1
> +					    MOVD reg32, xmmreg 	Double	FADD 3
> +                                                                1/1  1/1 */
> +  64,					/* size of l1 cache.  */
> +  1024,					/* size of l2 cache.  */
> +  64,					/* size of prefetch block */
> +  /* New AMD processors never drop prefetches; if they cannot be performed
> +     immediately, they are queued.  We set number of simultaneous prefetches
> +     to a large constant to reflect this (it probably is not a good idea not
> +     to limit number of prefetches at all, as their execution also takes some
> +     time).  */
> +  100,					/* number of parallel prefetches */
> +  2,					/* Branch cost */
> +  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
> +  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
> +  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
> +  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
> +  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
> +  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
> +
> +  /*  BDVER1 has optimized REP instruction for medium sized blocks, but for
> +      very small blocks it is better to use loop. For large blocks, libcall can
> +      do nontemporary accesses and beat inline considerably.  */
> +  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
> +   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
> +  {{libcall, {{8, loop}, {24, unrolled_loop},
> +	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
> +   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
> +  4,                                    /* scalar_stmt_cost.  */
> +  2,                                    /* scalar load_cost.  */
> +  2,                                    /* scalar_store_cost.  */
> +  6,                                    /* vec_stmt_cost.  */
> +  0,                                    /* vec_to_scalar_cost.  */
> +  2,                                    /* scalar_to_vec_cost.  */
> +  2,                                    /* vec_align_load_cost.  */
> +  2,                                    /* vec_unalign_load_cost.  */
> +  2,                                    /* vec_store_cost.  */
> +  2,                                    /* cond_taken_branch_cost.  */
> +  1,                                    /* cond_not_taken_branch_cost.  */
> +};
> +
>  static const
>  struct processor_costs pentium4_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
> @@ -1276,7 +1363,8 @@ const struct processor_costs *ix86_cost 
>  #define m_ATHLON  (1<<PROCESSOR_ATHLON)
>  #define m_ATHLON_K8  (m_K8 | m_ATHLON)
>  #define m_AMDFAM10  (1<<PROCESSOR_AMDFAM10)
> -#define m_AMD_MULTIPLE  (m_K8 | m_ATHLON | m_AMDFAM10)
> +#define m_BDVER1  (1<<PROCESSOR_BDVER1)
> +#define m_AMD_MULTIPLE  (m_K8 | m_ATHLON | m_AMDFAM10 | m_BDVER1)
>  
>  #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
>  #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
> @@ -1321,7 +1409,7 @@ static unsigned int initial_ix86_tune_fe
>    ~m_386,
>  
>    /* X86_TUNE_USE_SAHF */
> -  m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
> +  m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER1 | m_PENT4
>    | m_NOCONA | m_CORE2 | m_GENERIC,
>  
>    /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
> @@ -1425,10 +1513,16 @@ static unsigned int initial_ix86_tune_fe
>       while enabling it on K8 brings roughly 2.4% regression that can be partly
>       masked by careful scheduling of moves.  */
>    m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
> -  | m_AMDFAM10,
> +  | m_AMDFAM10 | m_BDVER1,
>  
> -  /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
> -  m_AMDFAM10,
> +  /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
> +  m_AMDFAM10 | m_BDVER1,
> +
> +  /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
> +  m_BDVER1,
> +
> +  /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
> +  m_BDVER1,
>  
>    /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
>       are resolved on SSE register parts instead of whole registers, so we may
> @@ -1461,7 +1555,7 @@ static unsigned int initial_ix86_tune_fe
>    ~(m_AMD_MULTIPLE | m_GENERIC),
>  
>    /* X86_TUNE_INTER_UNIT_CONVERSIONS */
> -  ~(m_AMDFAM10),
> +  ~(m_AMDFAM10 | m_BDVER1),
>  
>    /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
>       than 4 branch instructions in the 16 byte window.  */
> @@ -1497,11 +1591,11 @@ static unsigned int initial_ix86_tune_fe
>  
>    /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
>       vector path on AMD machines.  */
> -  m_K8 | m_GENERIC64 | m_AMDFAM10,
> +  m_K8 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1,
>  
>    /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
>       machines.  */
> -  m_K8 | m_GENERIC64 | m_AMDFAM10,
> +  m_K8 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1,
>  
>    /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
>       than a MOV.  */
> @@ -1527,7 +1621,7 @@ static unsigned int initial_ix86_tune_fe
>    /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
>       with a subsequent conditional jump instruction into a single
>       compare-and-branch uop.  */
> -  m_CORE2,
> +  m_CORE2 | m_BDVER1,
>  
>    /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>       will impact LEA instruction selection. */
> @@ -2066,6 +2160,7 @@ static const struct ptt processor_target
>    {&generic32_cost, 16, 7, 16, 7, 16},
>    {&generic64_cost, 16, 10, 16, 10, 16},
>    {&amdfam10_cost, 32, 24, 32, 7, 32},
> +  {&bdver1_cost, 32, 24, 32, 7, 32},
>    {&atom_cost, 16, 7, 16, 7, 16}
>  };
>  
> @@ -2092,7 +2187,8 @@ static const char *const cpu_names[TARGE
>    "athlon",
>    "athlon-4",
>    "k8",
> -  "amdfam10"
> +  "amdfam10",
> +  "bdver1"
>  };
>  
>  /* Implement TARGET_HANDLE_OPTION.  */
> @@ -2750,6 +2846,11 @@ override_options (bool main_args_p)
>        {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
>  	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
>  	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
> +      {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
> +	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
> +	| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM
> +	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES 
> +	| PTA_PCLMUL | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP},
>        {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
>  	0 /* flags are only used for -march switch.  */ },
>        {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
> @@ -7464,15 +7565,27 @@ standard_sse_constant_opcode (rtx insn, 
>  	case MODE_V4SF:
>  	  return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";
>  	case MODE_V2DF:
> -	  return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0";
> +	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> +	    return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0";
> +	  else
> +	    return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";	    
>  	case MODE_TI:
> -	  return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0";
> +	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> +	    return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0";
> +	  else
> +	    return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";
>  	case MODE_V8SF:
>  	  return "vxorps\t%x0, %x0, %x0";
>  	case MODE_V4DF:
> -	  return "vxorpd\t%x0, %x0, %x0";
> +	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> +	    return "vxorpd\t%x0, %x0, %x0";
> +	  else
> +	    return "vxorps\t%x0, %x0, %x0";
>  	case MODE_OI:
> -	  return "vpxor\t%x0, %x0, %x0";
> +	  if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> +	    return "vpxor\t%x0, %x0, %x0";
> +	  else
> +	    return "vxorps\t%x0, %x0, %x0";
>  	default:
>  	  break;

Please reverse the arms of the conditionals to lose the "!" in the code above.
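
I.e., for the MODE_V2DF case (the same applies to the other cases):

	case MODE_V2DF:
	  if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
	    return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";
	  else
	    return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0";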

>  	}
> @@ -13222,6 +13335,14 @@ ix86_expand_vector_move_misalign (enum m
>  	  switch (GET_MODE_SIZE (mode))
>  	    {
>  	    case 16:
> +	      /*  If we're optimizing for size, movups is the smallest.  */
> +	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> +		{
> +		  op0 = gen_lowpart (V4SFmode, op0);
> +		  op1 = gen_lowpart (V4SFmode, op1);
> +		  emit_insn (gen_avx_movups (op0, op1));
> +		  return;
> +		}
>  	      op0 = gen_lowpart (V16QImode, op0);
>  	      op1 = gen_lowpart (V16QImode, op1);
>  	      emit_insn (gen_avx_movdqu (op0, op1));
> @@ -13248,6 +13369,13 @@ ix86_expand_vector_move_misalign (enum m
>  	      emit_insn (gen_avx_movups256 (op0, op1));
>  	      break;
>  	    case V2DFmode:
> +	      if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
> +		{
> +		  op0 = gen_lowpart (V4SFmode, op0);
> +		  op1 = gen_lowpart (V4SFmode, op1);
> +		  emit_insn (gen_avx_movups (op0, op1));
> +		  return;
> +		}
>  	      emit_insn (gen_avx_movupd (op0, op1));
>  	      break;
>  	    case V4DFmode:
> @@ -13268,7 +13396,8 @@ ix86_expand_vector_move_misalign (enum m
>    if (MEM_P (op1))
>      {
>        /* If we're optimizing for size, movups is the smallest.  */
> -      if (optimize_insn_for_size_p ())
> +      if (optimize_insn_for_size_p () 
> +	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
>  	{
>  	  op0 = gen_lowpart (V4SFmode, op0);
>  	  op1 = gen_lowpart (V4SFmode, op1);
> @@ -13291,7 +13420,7 @@ ix86_expand_vector_move_misalign (enum m
>          {
>            rtx zero;
>  
> -          if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
> +          if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
>              {
>                op0 = gen_lowpart (V2DFmode, op0);
>                op1 = gen_lowpart (V2DFmode, op1);
> @@ -13326,7 +13455,7 @@ ix86_expand_vector_move_misalign (enum m
>  	}
>        else
>          {
> -          if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
> +          if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
>              {
>                op0 = gen_lowpart (V4SFmode, op0);
>                op1 = gen_lowpart (V4SFmode, op1);
> @@ -13350,7 +13479,8 @@ ix86_expand_vector_move_misalign (enum m
>    else if (MEM_P (op0))
>      {
>        /* If we're optimizing for size, movups is the smallest.  */
> -      if (optimize_insn_for_size_p ())
> +      if (optimize_insn_for_size_p ()
> +	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
>  	{
>  	  op0 = gen_lowpart (V4SFmode, op0);
>  	  op1 = gen_lowpart (V4SFmode, op1);
> @@ -13371,19 +13501,37 @@ ix86_expand_vector_move_misalign (enum m
>  
>        if (TARGET_SSE2 && mode == V2DFmode)
>  	{
> -	  m = adjust_address (op0, DFmode, 0);
> -	  emit_insn (gen_sse2_storelpd (m, op1));
> -	  m = adjust_address (op0, DFmode, 8);
> -	  emit_insn (gen_sse2_storehpd (m, op1));
> +	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
> +	    {
> +	      op0 = gen_lowpart (V2DFmode, op0);
> +	      op1 = gen_lowpart (V2DFmode, op1);
> +	      emit_insn (gen_sse2_movupd (op0, op1));	      
> +	    }
> +	  else
> +	    {
> +	      m = adjust_address (op0, DFmode, 0);
> +	      emit_insn (gen_sse2_storelpd (m, op1));
> +	      m = adjust_address (op0, DFmode, 8);
> +	      emit_insn (gen_sse2_storehpd (m, op1));
> +	    }
>  	}
>        else
>  	{
>  	  if (mode != V4SFmode)
>  	    op1 = gen_lowpart (V4SFmode, op1);
> -	  m = adjust_address (op0, V2SFmode, 0);
> -	  emit_insn (gen_sse_storelps (m, op1));
> -	  m = adjust_address (op0, V2SFmode, 8);
> -	  emit_insn (gen_sse_storehps (m, op1));
> +
> +	  if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
> +	    {
> +	      op0 = gen_lowpart (V4SFmode, op0);
> +	      emit_insn (gen_sse_movups (op0, op1));	      
> +	    }
> +	  else
> +	    {
> +	      m = adjust_address (op0, V2SFmode, 0);
> +	      emit_insn (gen_sse_storelps (m, op1));
> +	      m = adjust_address (op0, V2SFmode, 8);
> +	      emit_insn (gen_sse_storehps (m, op1));
> +	    }
>  	}
>      }
>    else
> @@ -19703,6 +19851,7 @@ ix86_issue_rate (void)
>      case PROCESSOR_NOCONA:
>      case PROCESSOR_GENERIC32:
>      case PROCESSOR_GENERIC64:
> +    case PROCESSOR_BDVER1:
>        return 3;
>  
>      case PROCESSOR_CORE2:
> @@ -19892,6 +20041,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
>      case PROCESSOR_ATHLON:
>      case PROCESSOR_K8:
>      case PROCESSOR_AMDFAM10:
> +    case PROCESSOR_BDVER1:
>      case PROCESSOR_ATOM:
>      case PROCESSOR_GENERIC32:
>      case PROCESSOR_GENERIC64:
> 
> 
> Register allocation problems.
> -----------------------------
> Excerpt from the diff between the assembly listings for the amdfam10 and
> bdver1 binaries.
> 
> ac.f90
> ----
> Example 1:
> 
> Redundant save and restore of xmm15 is occurring.
> 
>  	vmovaps	%xmm15, 256(%rsp)
> +	vmovaps	256(%rsp), %xmm15
>  	vdivpd	32192(%rsp), %xmm15, %xmm15
> 
>  	vmovaps	%xmm15, 288(%rsp)
> +	vmovaps	288(%rsp), %xmm15
>  	vdivpd	32208(%rsp), %xmm15, %xmm15
> 
>  	vmovaps	%xmm15, 320(%rsp)
> +	vmovaps	320(%rsp), %xmm15
>  	vdivpd	32224(%rsp), %xmm15, %xmm15
> 
>  	vmovaps	%xmm15, 208(%rsp)
> +	vmovaps	208(%rsp), %xmm15
>  	vdivpd	32384(%rsp), %xmm15, %xmm15
> 
>  	vmovaps	%xmm15, 240(%rsp)
> +	vmovaps	240(%rsp), %xmm15
>  	vdivpd	32400(%rsp), %xmm15, %xmm15
> 
>  	vmovaps	%xmm15, 272(%rsp)
> +	vmovaps	272(%rsp), %xmm15
>  	vdivpd	32416(%rsp), %xmm15, %xmm15
> 
>  	vmovaps	%xmm15, 304(%rsp)
> +	vmovaps	304(%rsp), %xmm15
>  	vdivpd	32432(%rsp), %xmm15, %xmm15

It is hard to tell without the info from a detailed asm dump. Please add
-dP to the compile flags; this will add the RTX from which each instruction
was generated to the asm dump. I suspect that the RA doesn't like the
conversion from VxSFmode to VxDFmode and performs the conversion through
memory. Maybe something like PR34283.
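
For example (the exact options used for the polyhedron run may differ):

  gfortran -O3 -march=bdver1 -dP -S ac.f90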

OTOH, the RA can produce quite suboptimal code when subregs are involved.
Perhaps you can use DFmode moves that are only emitted as vmovaps, but
treated as DFmode throughout the compilation? This would make more sense,
since the moved value is in fact DFmode.

Uros.

