Bug 97387 - we are near 2021, add carry intrinsic still does the wrong thing and generates silly code.
Summary: we are near 2021, add carry intrinsic still does the wrong thing and generate...
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 11.0
: P3 normal
Target Milestone: ---
Assignee: Jakub Jelinek
URL:
Keywords: missed-optimization
Depends on: 93990
Blocks:
  Show dependency treegraph
 
Reported: 2020-10-12 15:33 UTC by fdlbxtqi
Modified: 2023-06-04 00:16 UTC (History)
9 users (show)

See Also:
Host:
Target: x86_64-*-* i?86-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2020-10-13 00:00:00


Attachments
gcc11-pr97387.patch (1.64 KB, patch)
2020-10-13 12:01 UTC, Jakub Jelinek
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description fdlbxtqi 2020-10-12 15:33:56 UTC
#include <stdint.h>
#include <x86intrin.h>

void add256(uint64_t a[4], uint64_t b[4]){
  uint8_t carry = 0;
  for (int i = 0; i < 4; ++i)
    carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
}

People have reported the issue for many many times but the carry is still buggy.

https://gcc.godbolt.org/z/TnM8cv
Comment 1 fdlbxtqi 2020-10-13 02:45:45 UTC
All these instructions generated by GCC are very very wrong.

https://gcc.godbolt.org/z/asMrKv
Comment 2 Hongtao.liu 2020-10-13 03:23:46 UTC
Same issue as PR93990?
Comment 3 fdlbxtqi 2020-10-13 03:25:02 UTC
(In reply to Hongtao.liu from comment #2)
> Same issue as PR93990?

many of them. It has been reported similar bugs since 2015.
Comment 4 Richard Biener 2020-10-13 06:20:00 UTC
I guess the best way forward is to create sth like .ADD_OVERFLOW for add-with-carry and open-code those intrinsics in a form we can recognize.

Obviously there seems to be pieces on RTL missing since the x86 builtin
should better expand to (generic!) RTL that can be recognized by adc patterns.
Comment 5 Jakub Jelinek 2020-10-13 09:32:12 UTC
I actually think that what we emit for these builtins is right, the problem is that combiner is not able to optimize
(insn 10 9 11 2 (set (reg:QI 88 [ _31 ])
        (ltu:QI (reg:CCC 17 flags)
            (const_int 0 [0]))) "include/adxintrin.h":69:10 784 {*setcc_qi}
     (expr_list:REG_DEAD (reg:CCC 17 flags)
        (nil)))
followed (with instructions that don't clobber flags in between) by:
(insn 17 15 18 2 (parallel [
            (set (reg:CCC 17 flags)
                (compare:CCC (plus:QI (reg:QI 88 [ _31 ])
                        (const_int -1 [0xffffffffffffffff]))
                    (reg:QI 88 [ _31 ])))
            (clobber (scratch:QI))
        ]) "include/adxintrin.h":69:10 349 {*addqi3_cconly_overflow_1}
     (expr_list:REG_DEAD (reg:QI 88 [ _31 ])
        (nil)))
into nothing.  It tries that:
Trying 10 -> 17:
   10: r88:QI=ltu(flags:CCC,0)
      REG_DEAD flags:CCC
   17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;}
      REG_DEAD r88:QI
Failed to match this instruction:
(parallel [
        (set (reg:CC 17 flags)
            (compare:CC (neg:QI (geu:QI (reg:CCC 17 flags)
                        (const_int 0 [0])))
                (ltu:QI (reg:CCC 17 flags)
                    (const_int 0 [0]))))
        (clobber (scratch:QI))
    ])
Failed to match this instruction:
(set (reg:CC 17 flags)
    (compare:CC (neg:QI (geu:QI (reg:CCC 17 flags)
                (const_int 0 [0])))
        (ltu:QI (reg:CCC 17 flags)
            (const_int 0 [0]))))

Similarly, the
Trying 10, 17 -> 18:
   10: r88:QI=ltu(flags:CCC,0)
      REG_DEAD flags:CCC
   17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;}
      REG_DEAD r88:QI
   18: {flags:CCC=cmp(zero_extend(ltu(flags:CCC,0)+r106:DI+r107:DI),zero_extend(r107:DI)+ltu(flags:CCC,0));r109:DI=ltu(flags:CCC,0)+r106:DI+r107:DI
;}
      REG_DEAD r107:DI
      REG_DEAD r106:DI
Can't combine i1 into i3

fails because it would want to set flags multiple times and punts because of that.
The 10 -> 17 combination seems more promissing, though I'm not sure the CCmode rather than CCCmode in that case is desirable.
Comment 6 Jakub Jelinek 2020-10-13 09:32:26 UTC
Trying 10, 17 -> 18:
   10: r88:QI=ltu(flags:CCC,0)
      REG_DEAD flags:CCC
   17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;}
      REG_DEAD r88:QI
   18: {flags:CCC=cmp(zero_extend(ltu(flags:CCC,0)+r106:DI+r107:DI),zero_extend(r107:DI)+ltu(flags:CCC,0));r109:DI=ltu(flags:CCC,0)+r106:DI+r107:DI
;}
      REG_DEAD r107:DI
      REG_DEAD r106:DI
Can't combine i1 into i3
Comment 7 fdlbxtqi 2020-10-13 09:43:56 UTC
(In reply to Jakub Jelinek from comment #6)
> Trying 10, 17 -> 18:
>    10: r88:QI=ltu(flags:CCC,0)
>       REG_DEAD flags:CCC
>    17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;}
>       REG_DEAD r88:QI
>    18:
> {flags:CCC=cmp(zero_extend(ltu(flags:CCC,0)+r106:DI+r107:DI),
> zero_extend(r107:DI)+ltu(flags:CCC,0));r109:DI=ltu(flags:CCC,0)+r106:DI+r107:
> DI
> ;}
>       REG_DEAD r107:DI
>       REG_DEAD r106:DI
> Can't combine i1 into i3


BTW.

clang does not remove the
movl    $0, %esi
before
sbbq    %rsi, %rsi

sbbq itself does not matter what the value itself is.

https://godbolt.org/z/szYb55

I do not know whether GCC would remove this after patch. If you can optimize this away, I would thank you.
Comment 8 Jakub Jelinek 2020-10-13 10:24:34 UTC
I've tried:
--- gcc/config/i386/i386.md.jj	2020-10-01 10:40:09.955758167 +0200
+++ gcc/config/i386/i386.md	2020-10-13 11:59:58.924599503 +0200
@@ -7039,6 +7039,20 @@
       (set (match_operand:SWI48 0 "register_operand")
 	   (minus:SWI48 (match_dup 1) (match_dup 2)))])]
   "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)")
+
+;; Pre-reload splitter to optimize
+;; *setcc_qi followed by *addqi3_cconly_overflow_1 with the same QI
+;; operand and no intervening flags modifications into nothing.
+(define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1"
+  [(parallel
+     [(set (reg:CC FLAGS_REG)
+	   (compare:CC (neg:QI (geu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		       (ltu:QI (reg:CCC FLAGS_REG) (const_int 0))))
+      (clobber (scratch:QI))])]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
 

 ;; Overflow setting add instructions
but unfortunately that doesn't really work:
Trying 10 -> 17:
   10: r88:QI=ltu(flags:CCC,0)
      REG_DEAD flags:CCC
   17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;}
      REG_DEAD r88:QI
Successfully matched this instruction:
(parallel [
        (set (reg:CC 17 flags)
            (compare:CC (neg:QI (geu:QI (reg:CCC 17 flags)
                        (const_int 0 [0])))
                (ltu:QI (reg:CCC 17 flags)
                    (const_int 0 [0]))))
        (clobber (scratch:QI))
    ])
Failed to match this instruction:
(parallel [
        (set (reg:CCC 17 flags)
            (compare:CCC (zero_extend:TI (plus:DI (plus:DI (ltu:DI (reg:CCC 17 flags)
                                (const_int 0 [0]))
                            (reg:DI 106 [ MEM[(long long unsigned int *)a_12(D) + 8B] ]))
                        (reg:DI 107 [ MEM[(long long unsigned int *)b_13(D) + 8B] ])))
                (plus:TI (zero_extend:TI (reg:DI 107 [ MEM[(long long unsigned int *)b_13(D) + 8B] ]))
                    (ltu:TI (reg:CCC 17 flags)
                        (const_int 0 [0])))))
        (set (reg:DI 109)
            (plus:DI (plus:DI (ltu:DI (reg:CC 17 flags)
                        (const_int 0 [0]))
                    (reg:DI 106 [ MEM[(long long unsigned int *)a_12(D) + 8B] ]))
                (reg:DI 107 [ MEM[(long long unsigned int *)b_13(D) + 8B] ])))
    ])
I guess that is caused by the CCmode vs. CCCmode difference.
substing produces the desired:
(set (reg:CCC 17 flags)
    (compare:CCC (neg:QI (geu:QI (reg:CCC 17 flags)
                (const_int 0 [0])))
        (ltu:QI (reg:CCC 17 flags)
            (const_int 0 [0]))))
but unfortunately combine_simplify_rtx -> simplify_set breaks that.
Comment 9 Jakub Jelinek 2020-10-13 10:56:42 UTC
--- gcc/config/i386/i386.md.jj	2020-10-01 10:40:09.955758167 +0200
+++ gcc/config/i386/i386.md	2020-10-13 12:30:13.438172411 +0200
@@ -7039,6 +7039,18 @@ (define_expand "subborrow<mode>_0"
       (set (match_operand:SWI48 0 "register_operand")
 	   (minus:SWI48 (match_dup 1) (match_dup 2)))])]
   "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)")
+
+;; Pre-reload splitter to optimize
+;; *setcc_qi followed by *addqi3_cconly_overflow_1 with the same QI
+;; operand and no intervening flags modifications into nothing.
+(define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC (neg:QI (geu:QI (reg:CCC FLAGS_REG) (const_int 0)))
+		     (ltu:QI (reg:CCC FLAGS_REG) (const_int 0))))]
+  "ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)])
 

 ;; Overflow setting add instructions
 
--- gcc/config/i386/i386.c.jj	2020-10-01 10:40:09.951758225 +0200
+++ gcc/config/i386/i386.c	2020-10-13 12:45:33.209848289 +0200
@@ -15136,6 +15136,20 @@ ix86_cc_mode (enum rtx_code code, rtx op
 	  && (rtx_equal_p (op1, XEXP (op0, 0))
 	      || rtx_equal_p (op1, XEXP (op0, 1))))
 	return CCCmode;
+      /* Similarly for *setcc_qi_addqi3_cconly_overflow_1 pattern.  */
+      else if (code == LTU
+	       && GET_CODE (op0) == NEG
+	       && GET_CODE (XEXP (op0, 0)) == GEU
+	       && REG_P (XEXP (XEXP (op0, 0), 0))
+	       && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
+	       && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG
+	       && XEXP (XEXP (op0, 0), 1) == const0_rtx
+	       && GET_CODE (op1) == LTU
+	       && REG_P (XEXP (op1, 0))
+	       && GET_MODE (XEXP (op1, 0)) == CCCmode
+	       && REGNO (XEXP (op1, 0)) == FLAGS_REG
+	       && XEXP (op1, 1) == const0_rtx)
+	return CCCmode;
       else
 	return CCmode;
     case GTU:			/* CF=0 & ZF=0 */
@@ -19773,6 +19787,24 @@ ix86_rtx_costs (rtx x, machine_mode mode
 	  return true;
 	}
 
+      if (mode == CCCmode
+	  && GET_CODE (XEXP (x, 0)) == NEG
+	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == GEU
+	  && REG_P (XEXP (XEXP (XEXP (x, 0), 0), 0))
+	  && GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == CCCmode
+	  && REGNO (XEXP (XEXP (XEXP (x, 0), 0), 0)) == FLAGS_REG
+	  && XEXP (XEXP (XEXP (x, 0), 0), 1) == const0_rtx
+	  && GET_CODE (XEXP (x, 1)) == LTU
+	  && REG_P (XEXP (XEXP (x, 1), 0))
+	  && GET_MODE (XEXP (XEXP (x, 1), 0)) == CCCmode
+	  && REGNO (XEXP (XEXP (x, 1), 0)) == FLAGS_REG
+	  && XEXP (XEXP (x, 1), 1) == const0_rtx)
+	{
+	  /* This is *setcc_qi_addqi3_cconly_overflow_1 pattern, a nop.  */
+	  *total = 0;
+	  return true;
+	}
+
       /* The embedded comparison operand is completely free.  */
       if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
 	  && XEXP (x, 1) == const0_rtx)

seems to work.
Comment 10 Jakub Jelinek 2020-10-13 12:01:46 UTC
Created attachment 49365 [details]
gcc11-pr97387.patch

Full untested patch.
Comment 11 GCC Commits 2020-10-14 15:29:44 UTC
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>:

https://gcc.gnu.org/g:06bec55e80d98419121f3998d98d969990a75b0b

commit r11-3882-g06bec55e80d98419121f3998d98d969990a75b0b
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Wed Oct 14 17:14:47 2020 +0200

    i386: Improve chaining of _{addcarry,subborrow}_u{32,64} [PR97387]
    
    These builtins have two known issues and this patch fixes one of them.
    
    One issue is that the builtins effectively return two results and
    they make the destination addressable until expansion, which means
    a stack slot is allocated for them and e.g. with -fstack-protector*
    DSE isn't able to optimize that away.  I think for that we want to use
    the technique of returning complex value; the patch doesn't handle that
    though.  See PR93990 for that.
    
    The other problem is optimization of successive uses of the builtin
    e.g. for arbitrary precision arithmetic additions/subtractions.
    As shown PR93990, combine is able to optimize the case when the first
    argument to these builtins is 0 (the first instance when several are used
    together), and also the last one if the last one ignores its result (i.e.
    the carry/borrow is dead and thrown away in that case).
    As shown in this PR, combiner refuses to optimize the rest, where it sees:
    (insn 10 9 11 2 (set (reg:QI 88 [ _31 ])
            (ltu:QI (reg:CCC 17 flags)
                (const_int 0 [0]))) "include/adxintrin.h":69:10 785 {*setcc_qi}
         (expr_list:REG_DEAD (reg:CCC 17 flags)
            (nil)))
    - set pseudo 88 to CF from flags, then some uninteresting insns that
    don't modify flags, and finally:
    (insn 17 15 18 2 (parallel [
                (set (reg:CCC 17 flags)
                    (compare:CCC (plus:QI (reg:QI 88 [ _31 ])
                            (const_int -1 [0xffffffffffffffff]))
                        (reg:QI 88 [ _31 ])))
                (clobber (scratch:QI))
            ]) "include/adxintrin.h":69:10 350 {*addqi3_cconly_overflow_1}
         (expr_list:REG_DEAD (reg:QI 88 [ _31 ])
            (nil)))
    to set CF in flags back to what we saved earlier.  The combiner just punts
    trying to combine the 10, 17 and following addcarrydi (etc.) instruction,
    because
      if (i1 && !can_combine_p (i1, i3, i0, NULL, i2, NULL, &i1dest, &i1src))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "Can't combine i1 into i3\n");
          undo_all ();
          return 0;
        }
    fails - the 3 insns aren't all adjacent and
          || (! all_adjacent
              && (((!MEM_P (src)
                    || ! find_reg_note (insn, REG_EQUIV, src))
                   && modified_between_p (src, insn, i3))
    src (flags hard register) is modified between the first and third insn - in
    the second insn.
    
    The following patch optimizes this by optimizing just the two insns,
    10 and 17 above, i.e. save CF into pseudo, set CF from that pseudo, into
    a nop.  The new define_insn_and_split matches how combine simplifies those
    two together (except without the ix86_cc_mode change it was choosing CCmode
    for the destination instead of CCCmode, so had to change that function too,
    and also adjust costs so that combiner understand it is beneficial).
    
    With this, all the testcases are optimized, so that the:
            setc    %dl
    ...
            addb    $-1, %dl
    insns in between the ad[dc][lq] or s[ub]b[lq] instructions are all optimized
    away (sure, if something would clobber flags in between they wouldn't, but
    there is nothing that can be done about that).
    
    2020-10-14  Jakub Jelinek  <jakub@redhat.com>
    
            PR target/97387
            * config/i386/i386.md (CC_CCC): New mode iterator.
            (*setcc_qi_addqi3_cconly_overflow_1_<mode>): New
            define_insn_and_split.
            * config/i386/i386.c (ix86_cc_mode): Return CCCmode
            for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern operands.
            (ix86_rtx_costs): Return true and *total = 0;
            for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern.  Use op0 and
            op1 temporaries to simplify COMPARE checks.
    
            * gcc.target/i386/pr97387-1.c: New test.
            * gcc.target/i386/pr97387-2.c: New test.
Comment 12 fdlbxtqi 2020-10-14 16:19:42 UTC
(In reply to CVS Commits from comment #11)
> The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>:
> 
> https://gcc.gnu.org/g:06bec55e80d98419121f3998d98d969990a75b0b
> 
> commit r11-3882-g06bec55e80d98419121f3998d98d969990a75b0b
> Author: Jakub Jelinek <jakub@redhat.com>
> Date:   Wed Oct 14 17:14:47 2020 +0200
> 
>     i386: Improve chaining of _{addcarry,subborrow}_u{32,64} [PR97387]
>     
>     These builtins have two known issues and this patch fixes one of them.
>     
>     One issue is that the builtins effectively return two results and
>     they make the destination addressable until expansion, which means
>     a stack slot is allocated for them and e.g. with -fstack-protector*
>     DSE isn't able to optimize that away.  I think for that we want to use
>     the technique of returning complex value; the patch doesn't handle that
>     though.  See PR93990 for that.
>     
>     The other problem is optimization of successive uses of the builtin
>     e.g. for arbitrary precision arithmetic additions/subtractions.
>     As shown PR93990, combine is able to optimize the case when the first
>     argument to these builtins is 0 (the first instance when several are used
>     together), and also the last one if the last one ignores its result (i.e.
>     the carry/borrow is dead and thrown away in that case).
>     As shown in this PR, combiner refuses to optimize the rest, where it
> sees:
>     (insn 10 9 11 2 (set (reg:QI 88 [ _31 ])
>             (ltu:QI (reg:CCC 17 flags)
>                 (const_int 0 [0]))) "include/adxintrin.h":69:10 785
> {*setcc_qi}
>          (expr_list:REG_DEAD (reg:CCC 17 flags)
>             (nil)))
>     - set pseudo 88 to CF from flags, then some uninteresting insns that
>     don't modify flags, and finally:
>     (insn 17 15 18 2 (parallel [
>                 (set (reg:CCC 17 flags)
>                     (compare:CCC (plus:QI (reg:QI 88 [ _31 ])
>                             (const_int -1 [0xffffffffffffffff]))
>                         (reg:QI 88 [ _31 ])))
>                 (clobber (scratch:QI))
>             ]) "include/adxintrin.h":69:10 350 {*addqi3_cconly_overflow_1}
>          (expr_list:REG_DEAD (reg:QI 88 [ _31 ])
>             (nil)))
>     to set CF in flags back to what we saved earlier.  The combiner just
> punts
>     trying to combine the 10, 17 and following addcarrydi (etc.) instruction,
>     because
>       if (i1 && !can_combine_p (i1, i3, i0, NULL, i2, NULL, &i1dest, &i1src))
>         {
>           if (dump_file && (dump_flags & TDF_DETAILS))
>             fprintf (dump_file, "Can't combine i1 into i3\n");
>           undo_all ();
>           return 0;
>         }
>     fails - the 3 insns aren't all adjacent and
>           || (! all_adjacent
>               && (((!MEM_P (src)
>                     || ! find_reg_note (insn, REG_EQUIV, src))
>                    && modified_between_p (src, insn, i3))
>     src (flags hard register) is modified between the first and third insn -
> in
>     the second insn.
>     
>     The following patch optimizes this by optimizing just the two insns,
>     10 and 17 above, i.e. save CF into pseudo, set CF from that pseudo, into
>     a nop.  The new define_insn_and_split matches how combine simplifies
> those
>     two together (except without the ix86_cc_mode change it was choosing
> CCmode
>     for the destination instead of CCCmode, so had to change that function
> too,
>     and also adjust costs so that combiner understand it is beneficial).
>     
>     With this, all the testcases are optimized, so that the:
>             setc    %dl
>     ...
>             addb    $-1, %dl
>     insns in between the ad[dc][lq] or s[ub]b[lq] instructions are all
> optimized
>     away (sure, if something would clobber flags in between they wouldn't,
> but
>     there is nothing that can be done about that).
>     
>     2020-10-14  Jakub Jelinek  <jakub@redhat.com>
>     
>             PR target/97387
>             * config/i386/i386.md (CC_CCC): New mode iterator.
>             (*setcc_qi_addqi3_cconly_overflow_1_<mode>): New
>             define_insn_and_split.
>             * config/i386/i386.c (ix86_cc_mode): Return CCCmode
>             for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern operands.
>             (ix86_rtx_costs): Return true and *total = 0;
>             for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern.  Use op0
> and
>             op1 temporaries to simplify COMPARE checks.
>     
>             * gcc.target/i386/pr97387-1.c: New test.
>             * gcc.target/i386/pr97387-2.c: New test.

awesome
Comment 13 fdlbxtqi 2020-10-14 21:56:59 UTC
https://godbolt.org/z/fqGrz1

After this patch, the assembly generated is much better now. However, it still contains many optimization problems.

The problem is the code like this.

Let's just walk through the assembly and see the problems here.

field_number operator-(field_number const& x,field_number const& y) noexcept
{
	using namespace intrinsics;
	using unsigned_type = field_number::value_type;
	constexpr unsigned_type zero{};
    field_number f;
    bool borrow{sub_borrow(false,x[0],y[0],f[0])};
    borrow=sub_borrow(borrow,x[1],y[1],f[1]);
    borrow=sub_borrow(borrow,x[2],y[2],f[2]);
    borrow=sub_borrow(borrow,x[3],y[3],f[3]);
    unsigned_type v{};
    sub_borrow(borrow,v,v,v);
    v&=static_cast<unsigned_type>(38);
    borrow=sub_borrow(false,f[0],v,f[0]);
    borrow=sub_borrow(borrow,f[1],zero,f[1]);
    borrow=sub_borrow(borrow,f[2],zero,f[2]);
    borrow=sub_borrow(borrow,f[3],zero,f[3]);
    sub_borrow(borrow,v,v,v);
    v&=static_cast<unsigned_type>(38);
    borrow=sub_borrow(false,f[0],v,f[0]);
    borrow=sub_borrow(borrow,f[1],zero,f[1]);
    borrow=sub_borrow(borrow,f[2],zero,f[2]);
    borrow=sub_borrow(borrow,f[2],zero,f[3]);
    return f;
}


_ZN7fast_io10curve25519miERKNS0_12field_numberES3_:
.LFB5431:
	.cfi_startproc
	.cfi_personality 0x3,__gxx_personality_v0
	movq	%rsi, %rcx
	movq	%rdx, %rax
	movq	%rdi, %r8
	movq	(%rsi), %rdi
	movq	24(%rcx), %r9
	subq	(%rdx), %rdi
	movq	8(%rsi), %rsi
	sbbq	8(%rdx), %rsi
	movq	16(%rcx), %rdx
	sbbq	16(%rax), %rdx
	movq	24(%rax), %rax
	sbbq	%rax, %r9

	movl	$0, %eax
	movq	%rax, %rcx
//The value of sbbq to itself does not matter I should be sbbq %r9 %r9 and you are done.

	sbbq	%rax, %rcx
	andl	$38, %ecx
	subq	%rcx, %rdi

//The 2nd problems are these %rax stuffs. The should be just imm 0
// sbbq $0,%rsi
	sbbq	%rax, %rsi
	sbbq	%rax, %rdx
	sbbq	%rax, %r9
	sbbq	%rcx, %rcx
	andl	$38, %ecx
	subq	%rcx, %rdi
	sbbq	%rax, %rsi
	movq	%rdi, -40(%rsp)
	sbbq	%rax, %rdx
	movq	%rsi, -32(%rsp)
	movdqa	-40(%rsp), %xmm0
	movq	%rdx, -24(%rsp)
	sbbq	%rax, %rdx
	movq	%r8, %rax
	movq	%rdx, -16(%rsp)
	movdqa	-24(%rsp), %xmm1
	movups	%xmm0, (%r8)
	movups	%xmm1, 16(%r8)
	ret

https://godbolt.org/z/nKPWx3
Comment 14 fdlbxtqi 2020-10-14 21:59:48 UTC
(In reply to fdlbxtqi from comment #13)
> https://godbolt.org/z/fqGrz1
> 
> After this patch, the assembly generated is much better now. However, it
> still contains many optimization problems.
> 
> The problem is the code like this.
> 
> Let's just walk through the assembly and see the problems here.
> 
> field_number operator-(field_number const& x,field_number const& y) noexcept
> {
> 	using namespace intrinsics;
> 	using unsigned_type = field_number::value_type;
> 	constexpr unsigned_type zero{};
>     field_number f;
>     bool borrow{sub_borrow(false,x[0],y[0],f[0])};
>     borrow=sub_borrow(borrow,x[1],y[1],f[1]);
>     borrow=sub_borrow(borrow,x[2],y[2],f[2]);
>     borrow=sub_borrow(borrow,x[3],y[3],f[3]);
>     unsigned_type v{};
>     sub_borrow(borrow,v,v,v);
>     v&=static_cast<unsigned_type>(38);
>     borrow=sub_borrow(false,f[0],v,f[0]);
>     borrow=sub_borrow(borrow,f[1],zero,f[1]);
>     borrow=sub_borrow(borrow,f[2],zero,f[2]);
>     borrow=sub_borrow(borrow,f[3],zero,f[3]);
>     sub_borrow(borrow,v,v,v);
>     v&=static_cast<unsigned_type>(38);
>     borrow=sub_borrow(false,f[0],v,f[0]);
>     borrow=sub_borrow(borrow,f[1],zero,f[1]);
>     borrow=sub_borrow(borrow,f[2],zero,f[2]);
>     borrow=sub_borrow(borrow,f[2],zero,f[3]);
>     return f;
> }
> 
> 
> _ZN7fast_io10curve25519miERKNS0_12field_numberES3_:
> .LFB5431:
> 	.cfi_startproc
> 	.cfi_personality 0x3,__gxx_personality_v0
> 	movq	%rsi, %rcx
> 	movq	%rdx, %rax
> 	movq	%rdi, %r8
> 	movq	(%rsi), %rdi
> 	movq	24(%rcx), %r9
> 	subq	(%rdx), %rdi
> 	movq	8(%rsi), %rsi
> 	sbbq	8(%rdx), %rsi
> 	movq	16(%rcx), %rdx
> 	sbbq	16(%rax), %rdx
> 	movq	24(%rax), %rax
> 	sbbq	%rax, %r9
> 
> 	movl	$0, %eax
> 	movq	%rax, %rcx
> //The value of sbbq to itself does not matter I should be sbbq %r9 %r9 and
> you are done.
> 
> 	sbbq	%rax, %rcx
> 	andl	$38, %ecx
> 	subq	%rcx, %rdi
> 
> //The 2nd problems are these %rax stuffs. The should be just imm 0
> // sbbq $0,%rsi
> 	sbbq	%rax, %rsi
> 	sbbq	%rax, %rdx
> 	sbbq	%rax, %r9
> 	sbbq	%rcx, %rcx
> 	andl	$38, %ecx
> 	subq	%rcx, %rdi
> 	sbbq	%rax, %rsi
> 	movq	%rdi, -40(%rsp)
> 	sbbq	%rax, %rdx
> 	movq	%rsi, -32(%rsp)
> 	movdqa	-40(%rsp), %xmm0
> 	movq	%rdx, -24(%rsp)
> 	sbbq	%rax, %rdx
> 	movq	%r8, %rax
> 	movq	%rdx, -16(%rsp)
> 	movdqa	-24(%rsp), %xmm1
> 	movups	%xmm0, (%r8)
> 	movups	%xmm1, 16(%r8)
> 	ret
> 
> https://godbolt.org/z/nKPWx3

I think the optimal instruction number should be 26.
Comment 15 Andrew Pinski 2021-09-09 06:49:50 UTC
PR 93990 is another related issue here.