This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH, x86] merge movsd/movhpd pair in peephole
- From: Xinliang David Li <davidxl at google dot com>
- To: "Bin.Cheng" <amker dot cheng at gmail dot com>
- Cc: Wei Mi <wmi at google dot com>, GCC Patches <gcc-patches at gcc dot gnu dot org>, Uros Bizjak <ubizjak at gmail dot com>, "H.J. Lu" <hjl dot tools at gmail dot com>
- Date: Mon, 21 Apr 2014 11:59:36 -0700
- Subject: Re: [PATCH, x86] merge movsd/movhpd pair in peephole
- Authentication-results: sourceware.org; auth=none
- References: <CA+4CFy4oGjcv36T3j_pmAe9zEMdZCDG7UTez1p+=Yz2eiKfQPw at mail dot gmail dot com> <CAHFci28JaB_45BishBAXsoRBp_c=4Y1yVeCutYt6=y_ivApfyg at mail dot gmail dot com>
Bin, when will the patch for the generic pass be available for review?
David
On Wed, Apr 9, 2014 at 7:27 PM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> On Thu, Apr 10, 2014 at 8:18 AM, Wei Mi <wmi@google.com> wrote:
>> Hi,
>>
>> For the testcase 1.c
>>
>> #include <emmintrin.h>
>>
>> double a[1000];
>>
>> __m128d foo1() {
>> __m128d res;
>> res = _mm_load_sd(&a[1]);
>> res = _mm_loadh_pd(res, &a[2]);
>> return res;
>> }
>>
>> llvm will merge movsd/movhpd to movupd while gcc will not. The merge
>> is beneficial on x86 machines starting from Nehalem.
>>
>> The patch is to add the merging in peephole.
>> Bootstrap and regression tests pass. Is it OK for stage1?
>>
>> Thanks,
>> Wei.
>>
>> gcc/ChangeLog:
>>
>> 2014-04-09 Wei Mi <wmi@google.com>
>>
>> * config/i386/i386.c (get_memref_parts): New function.
>> (adjacent_mem_locations): Ditto.
>> * config/i386/i386-protos.h: Add decl for adjacent_mem_locations.
>> * config/i386/sse.md: Add define_peephole rule.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2014-04-09 Wei Mi <wmi@google.com>
>>
>> * gcc.target/i386/sse2-unaligned-mov.c: New test.
>>
>> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
>> index 6e32978..3ae0d6d 100644
>> --- a/gcc/config/i386/i386-protos.h
>> +++ b/gcc/config/i386/i386-protos.h
>> @@ -312,6 +312,7 @@ extern enum attr_cpu ix86_schedule;
>> #endif
>>
>> extern const char * ix86_output_call_insn (rtx insn, rtx call_op);
>> +extern bool adjacent_mem_locations (rtx mem1, rtx mem2);
>>
>> #ifdef RTX_CODE
>> /* Target data for multipass lookahead scheduling.
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index 3eefe4a..a330e84 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -46737,6 +46737,70 @@ ix86_atomic_assign_expand_fenv (tree *hold,
>> tree *clear, tree *update)
>> atomic_feraiseexcept_call);
>> }
>>
>> +/* Try to determine BASE/OFFSET/SIZE parts of the given MEM.
>> + Return true if successful, false if all the values couldn't
>> + be determined.
>> +
>> + This function only looks for REG/SYMBOL or REG/SYMBOL+CONST
>> + address forms. */
>> +
>> +static bool
>> +get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
>> + HOST_WIDE_INT *size)
>> +{
>> + rtx addr_rtx;
>> + if MEM_SIZE_KNOWN_P (mem)
>> + *size = MEM_SIZE (mem);
>> + else
>> + return false;
>> +
>> + if (GET_CODE (XEXP (mem, 0)) == CONST)
>> + addr_rtx = XEXP (XEXP (mem, 0), 0);
>> + else
>> + addr_rtx = (XEXP (mem, 0));
>> +
>> + if (GET_CODE (addr_rtx) == REG
>> + || GET_CODE (addr_rtx) == SYMBOL_REF)
>> + {
>> + *base = addr_rtx;
>> + *offset = 0;
>> + }
>> + else if (GET_CODE (addr_rtx) == PLUS
>> + && CONST_INT_P (XEXP (addr_rtx, 1)))
>> + {
>> + *base = XEXP (addr_rtx, 0);
>> + *offset = INTVAL (XEXP (addr_rtx, 1));
>> + }
>> + else
>> + return false;
>> +
>> + return true;
>> +}
>> +
>> +/* If MEM1 is adjacent to MEM2 and MEM1 has lower address,
>> + return true. */
>> +
>> +extern bool
>> +adjacent_mem_locations (rtx mem1, rtx mem2)
>> +{
>> + rtx base1, base2;
>> + HOST_WIDE_INT off1, size1, off2, size2;
>> +
>> + if (get_memref_parts (mem1, &base1, &off1, &size1)
>> + && get_memref_parts (mem2, &base2, &off2, &size2))
>> + {
>> + if (GET_CODE (base1) == SYMBOL_REF
>> + && GET_CODE (base2) == SYMBOL_REF
>> + && SYMBOL_REF_DECL (base1) == SYMBOL_REF_DECL (base2))
>> + return (off1 + size1 == off2);
>> + else if (REG_P (base1)
>> + && REG_P (base2)
>> + && REGNO (base1) == REGNO (base2))
>> + return (off1 + size1 == off2);
>> + }
>> + return false;
>> +}
>> +
>> /* Initialize the GCC target structure. */
>> #undef TARGET_RETURN_IN_MEMORY
>> #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> index 72a4d6d..4bf8461 100644
>> --- a/gcc/config/i386/sse.md
>> +++ b/gcc/config/i386/sse.md
>> @@ -15606,3 +15606,37 @@
>> [(set_attr "type" "sselog1")
>> (set_attr "length_immediate" "1")
>> (set_attr "mode" "TI")])
>> +
>> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
>> +;; is true.
>> +(define_peephole2
>> + [(set (match_operand:DF 0 "register_operand")
>> + (match_operand:DF 1 "memory_operand"))
>> + (set (match_operand:V2DF 2 "register_operand")
>> + (vec_concat:V2DF (match_dup 0)
>> + (match_operand:DF 3 "memory_operand")))]
>> + "TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
>> + && REGNO (operands[0]) == REGNO (operands[2])
>> + && adjacent_mem_locations (operands[1], operands[3])"
>> + [(set (match_dup 2)
>> + (unspec:V2DF [(match_dup 4)] UNSPEC_LOADU))]
>> +{
>> + operands[4] = gen_rtx_MEM (V2DFmode, XEXP(operands[1], 0));
>> +})
>> +
>> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_STORE_OPTIMAL
>> +;; is true.
>> +(define_peephole2
>> + [(set (match_operand:DF 0 "memory_operand")
>> + (vec_select:DF (match_operand:V2DF 1 "register_operand")
>> + (parallel [(const_int 0)])))
>> + (set (match_operand:DF 2 "memory_operand")
>> + (vec_select:DF (match_dup 1)
>> + (parallel [(const_int 1)])))]
>> + "TARGET_SSE_UNALIGNED_STORE_OPTIMAL
>> + && adjacent_mem_locations (operands[0], operands[2])"
>> + [(set (match_dup 3)
>> + (unspec:V2DF [(match_dup 1)] UNSPEC_STOREU))]
>> +{
>> + operands[3] = gen_rtx_MEM (V2DFmode, XEXP(operands[0], 0));
>> +})
>> diff --git a/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> new file mode 100644
>> index 0000000..28470ce
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> @@ -0,0 +1,20 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-mtune=corei7 -O2" } */
>> +
>> +#include <emmintrin.h>
>> +
>> +double a[1000];
>> +
>> +__m128d foo1() {
>> + __m128d res;
>> + res = _mm_load_sd(&a[1]);
>> + res = _mm_loadh_pd(res, &a[2]);
>> + return res;
>> +}
>> +
>> +void foo2(__m128d res) {
>> + _mm_store_sd(&a[1], res);
>> + _mm_storeh_pd(&a[2], res);
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "movup" 2 } } */
>
> Hi Wei,
> Just FYI, though I am not sure whether it can help. I am working on
> load store pairing on ARM too. On ARM (maybe other machines like
> mips), the problem is more general because we can pair memory accesses
> with respect to both core register and fpu register. The current
> strategy taken by GCC is to use peephole2, but it's too weak; for
> example, it can't handle pairs that are separated by other
> instructions. Right now I am working on a new pass in GCC to do that
> work, and have already worked out a draft patch. The instrumentation
> data looks promising, with many opportunities captured beyond those of
> the original peephole2 pass. Furthermore, the result could be even better
> if register renaming is enabled. Hopefully I will try to bring back
> register renaming for this purpose later.
>
> Thanks,
> bin