#include <stdint.h> #include <x86intrin.h> void add256(uint64_t a[4], uint64_t b[4]){ uint8_t carry = 0; for (int i = 0; i < 4; ++i) carry = _addcarry_u64(carry, a[i], b[i], &a[i]); } People have reported the issue for many many times but the carry is still buggy. https://gcc.godbolt.org/z/TnM8cv
All these instructions generated by GCC are very very wrong. https://gcc.godbolt.org/z/asMrKv
Same issue as PR93990?
(In reply to Hongtao.liu from comment #2) > Same issue as PR93990? many of them. It has been reported similar bugs since 2015.
I guess the best way forward is to create sth like .ADD_OVERFLOW for add-with-carry and open-code those intrinsics in a form we can recognize. Obviously there seems to be pieces on RTL missing since the x86 builtin should better expand to (generic!) RTL that can be recognized by adc patterns.
I actually think that what we emit for these builtins is right, the problem is that combiner is not able to optimize (insn 10 9 11 2 (set (reg:QI 88 [ _31 ]) (ltu:QI (reg:CCC 17 flags) (const_int 0 [0]))) "include/adxintrin.h":69:10 784 {*setcc_qi} (expr_list:REG_DEAD (reg:CCC 17 flags) (nil))) followed (with instructions that don't clobber flags in between) by: (insn 17 15 18 2 (parallel [ (set (reg:CCC 17 flags) (compare:CCC (plus:QI (reg:QI 88 [ _31 ]) (const_int -1 [0xffffffffffffffff])) (reg:QI 88 [ _31 ]))) (clobber (scratch:QI)) ]) "include/adxintrin.h":69:10 349 {*addqi3_cconly_overflow_1} (expr_list:REG_DEAD (reg:QI 88 [ _31 ]) (nil))) into nothing. It tries that: Trying 10 -> 17: 10: r88:QI=ltu(flags:CCC,0) REG_DEAD flags:CCC 17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;} REG_DEAD r88:QI Failed to match this instruction: (parallel [ (set (reg:CC 17 flags) (compare:CC (neg:QI (geu:QI (reg:CCC 17 flags) (const_int 0 [0]))) (ltu:QI (reg:CCC 17 flags) (const_int 0 [0])))) (clobber (scratch:QI)) ]) Failed to match this instruction: (set (reg:CC 17 flags) (compare:CC (neg:QI (geu:QI (reg:CCC 17 flags) (const_int 0 [0]))) (ltu:QI (reg:CCC 17 flags) (const_int 0 [0])))) Similarly, the Trying 10, 17 -> 18: 10: r88:QI=ltu(flags:CCC,0) REG_DEAD flags:CCC 17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;} REG_DEAD r88:QI 18: {flags:CCC=cmp(zero_extend(ltu(flags:CCC,0)+r106:DI+r107:DI),zero_extend(r107:DI)+ltu(flags:CCC,0));r109:DI=ltu(flags:CCC,0)+r106:DI+r107:DI ;} REG_DEAD r107:DI REG_DEAD r106:DI Can't combine i1 into i3 fails because it would want to set flags multiple times and punts because of that. The 10 -> 17 combination seems more promissing, though I'm not sure the CCmode rather than CCCmode in that case is desirable.
Trying 10, 17 -> 18: 10: r88:QI=ltu(flags:CCC,0) REG_DEAD flags:CCC 17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;} REG_DEAD r88:QI 18: {flags:CCC=cmp(zero_extend(ltu(flags:CCC,0)+r106:DI+r107:DI),zero_extend(r107:DI)+ltu(flags:CCC,0));r109:DI=ltu(flags:CCC,0)+r106:DI+r107:DI ;} REG_DEAD r107:DI REG_DEAD r106:DI Can't combine i1 into i3
(In reply to Jakub Jelinek from comment #6) > Trying 10, 17 -> 18: > 10: r88:QI=ltu(flags:CCC,0) > REG_DEAD flags:CCC > 17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;} > REG_DEAD r88:QI > 18: > {flags:CCC=cmp(zero_extend(ltu(flags:CCC,0)+r106:DI+r107:DI), > zero_extend(r107:DI)+ltu(flags:CCC,0));r109:DI=ltu(flags:CCC,0)+r106:DI+r107: > DI > ;} > REG_DEAD r107:DI > REG_DEAD r106:DI > Can't combine i1 into i3 BTW. clang does not remove the movl $0, %esi before sbbq %rsi, %rsi sbbq itself does not matter what the value itself is. https://godbolt.org/z/szYb55 I do not know whether GCC would remove this after patch. If you can optimize this away, I would thank you.
I've tried: --- gcc/config/i386/i386.md.jj 2020-10-01 10:40:09.955758167 +0200 +++ gcc/config/i386/i386.md 2020-10-13 11:59:58.924599503 +0200 @@ -7039,6 +7039,20 @@ (set (match_operand:SWI48 0 "register_operand") (minus:SWI48 (match_dup 1) (match_dup 2)))])] "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)") + +;; Pre-reload splitter to optimize +;; *setcc_qi followed by *addqi3_cconly_overflow_1 with the same QI +;; operand and no intervening flags modifications into nothing. +(define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1" + [(parallel + [(set (reg:CC FLAGS_REG) + (compare:CC (neg:QI (geu:QI (reg:CCC FLAGS_REG) (const_int 0))) + (ltu:QI (reg:CCC FLAGS_REG) (const_int 0)))) + (clobber (scratch:QI))])] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)]) ;; Overflow setting add instructions but unfortunately that doesn't really work: Trying 10 -> 17: 10: r88:QI=ltu(flags:CCC,0) REG_DEAD flags:CCC 17: {flags:CCC=cmp(r88:QI-0x1,r88:QI);clobber scratch;} REG_DEAD r88:QI Successfully matched this instruction: (parallel [ (set (reg:CC 17 flags) (compare:CC (neg:QI (geu:QI (reg:CCC 17 flags) (const_int 0 [0]))) (ltu:QI (reg:CCC 17 flags) (const_int 0 [0])))) (clobber (scratch:QI)) ]) Failed to match this instruction: (parallel [ (set (reg:CCC 17 flags) (compare:CCC (zero_extend:TI (plus:DI (plus:DI (ltu:DI (reg:CCC 17 flags) (const_int 0 [0])) (reg:DI 106 [ MEM[(long long unsigned int *)a_12(D) + 8B] ])) (reg:DI 107 [ MEM[(long long unsigned int *)b_13(D) + 8B] ]))) (plus:TI (zero_extend:TI (reg:DI 107 [ MEM[(long long unsigned int *)b_13(D) + 8B] ])) (ltu:TI (reg:CCC 17 flags) (const_int 0 [0]))))) (set (reg:DI 109) (plus:DI (plus:DI (ltu:DI (reg:CC 17 flags) (const_int 0 [0])) (reg:DI 106 [ MEM[(long long unsigned int *)a_12(D) + 8B] ])) (reg:DI 107 [ MEM[(long long unsigned int *)b_13(D) + 8B] ]))) ]) I guess that is caused by the CCmode vs. CCCmode difference. substing produces the desired: (set (reg:CCC 17 flags) (compare:CCC (neg:QI (geu:QI (reg:CCC 17 flags) (const_int 0 [0]))) (ltu:QI (reg:CCC 17 flags) (const_int 0 [0])))) but unfortunately combine_simplify_rtx -> simplify_set breaks that.
--- gcc/config/i386/i386.md.jj 2020-10-01 10:40:09.955758167 +0200 +++ gcc/config/i386/i386.md 2020-10-13 12:30:13.438172411 +0200 @@ -7039,6 +7039,18 @@ (define_expand "subborrow<mode>_0" (set (match_operand:SWI48 0 "register_operand") (minus:SWI48 (match_dup 1) (match_dup 2)))])] "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)") + +;; Pre-reload splitter to optimize +;; *setcc_qi followed by *addqi3_cconly_overflow_1 with the same QI +;; operand and no intervening flags modifications into nothing. +(define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1" + [(set (reg:CCC FLAGS_REG) + (compare:CCC (neg:QI (geu:QI (reg:CCC FLAGS_REG) (const_int 0))) + (ltu:QI (reg:CCC FLAGS_REG) (const_int 0))))] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(const_int 0)]) ;; Overflow setting add instructions --- gcc/config/i386/i386.c.jj 2020-10-01 10:40:09.951758225 +0200 +++ gcc/config/i386/i386.c 2020-10-13 12:45:33.209848289 +0200 @@ -15136,6 +15136,20 @@ ix86_cc_mode (enum rtx_code code, rtx op && (rtx_equal_p (op1, XEXP (op0, 0)) || rtx_equal_p (op1, XEXP (op0, 1)))) return CCCmode; + /* Similarly for *setcc_qi_addqi3_cconly_overflow_1 pattern. */ + else if (code == LTU + && GET_CODE (op0) == NEG + && GET_CODE (XEXP (op0, 0)) == GEU + && REG_P (XEXP (XEXP (op0, 0), 0)) + && GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode + && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG + && XEXP (XEXP (op0, 0), 1) == const0_rtx + && GET_CODE (op1) == LTU + && REG_P (XEXP (op1, 0)) + && GET_MODE (XEXP (op1, 0)) == CCCmode + && REGNO (XEXP (op1, 0)) == FLAGS_REG + && XEXP (op1, 1) == const0_rtx) + return CCCmode; else return CCmode; case GTU: /* CF=0 & ZF=0 */ @@ -19773,6 +19787,24 @@ ix86_rtx_costs (rtx x, machine_mode mode return true; } + if (mode == CCCmode + && GET_CODE (XEXP (x, 0)) == NEG + && GET_CODE (XEXP (XEXP (x, 0), 0)) == GEU + && REG_P (XEXP (XEXP (XEXP (x, 0), 0), 0)) + && GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == CCCmode + && REGNO (XEXP (XEXP (XEXP (x, 0), 0), 0)) == FLAGS_REG + && XEXP (XEXP (XEXP (x, 0), 0), 1) == const0_rtx + && GET_CODE (XEXP (x, 1)) == LTU + && REG_P (XEXP (XEXP (x, 1), 0)) + && GET_MODE (XEXP (XEXP (x, 1), 0)) == CCCmode + && REGNO (XEXP (XEXP (x, 1), 0)) == FLAGS_REG + && XEXP (XEXP (x, 1), 1) == const0_rtx) + { + /* This is *setcc_qi_addqi3_cconly_overflow_1 pattern, a nop. */ + *total = 0; + return true; + } + /* The embedded comparison operand is completely free. */ if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) && XEXP (x, 1) == const0_rtx) seems to work.
Created attachment 49365 [details] gcc11-pr97387.patch Full untested patch.
The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: https://gcc.gnu.org/g:06bec55e80d98419121f3998d98d969990a75b0b commit r11-3882-g06bec55e80d98419121f3998d98d969990a75b0b Author: Jakub Jelinek <jakub@redhat.com> Date: Wed Oct 14 17:14:47 2020 +0200 i386: Improve chaining of _{addcarry,subborrow}_u{32,64} [PR97387] These builtins have two known issues and this patch fixes one of them. One issue is that the builtins effectively return two results and they make the destination addressable until expansion, which means a stack slot is allocated for them and e.g. with -fstack-protector* DSE isn't able to optimize that away. I think for that we want to use the technique of returning complex value; the patch doesn't handle that though. See PR93990 for that. The other problem is optimization of successive uses of the builtin e.g. for arbitrary precision arithmetic additions/subtractions. As shown PR93990, combine is able to optimize the case when the first argument to these builtins is 0 (the first instance when several are used together), and also the last one if the last one ignores its result (i.e. the carry/borrow is dead and thrown away in that case). As shown in this PR, combiner refuses to optimize the rest, where it sees: (insn 10 9 11 2 (set (reg:QI 88 [ _31 ]) (ltu:QI (reg:CCC 17 flags) (const_int 0 [0]))) "include/adxintrin.h":69:10 785 {*setcc_qi} (expr_list:REG_DEAD (reg:CCC 17 flags) (nil))) - set pseudo 88 to CF from flags, then some uninteresting insns that don't modify flags, and finally: (insn 17 15 18 2 (parallel [ (set (reg:CCC 17 flags) (compare:CCC (plus:QI (reg:QI 88 [ _31 ]) (const_int -1 [0xffffffffffffffff])) (reg:QI 88 [ _31 ]))) (clobber (scratch:QI)) ]) "include/adxintrin.h":69:10 350 {*addqi3_cconly_overflow_1} (expr_list:REG_DEAD (reg:QI 88 [ _31 ]) (nil))) to set CF in flags back to what we saved earlier. The combiner just punts trying to combine the 10, 17 and following addcarrydi (etc.) instruction, because if (i1 && !can_combine_p (i1, i3, i0, NULL, i2, NULL, &i1dest, &i1src)) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "Can't combine i1 into i3\n"); undo_all (); return 0; } fails - the 3 insns aren't all adjacent and || (! all_adjacent && (((!MEM_P (src) || ! find_reg_note (insn, REG_EQUIV, src)) && modified_between_p (src, insn, i3)) src (flags hard register) is modified between the first and third insn - in the second insn. The following patch optimizes this by optimizing just the two insns, 10 and 17 above, i.e. save CF into pseudo, set CF from that pseudo, into a nop. The new define_insn_and_split matches how combine simplifies those two together (except without the ix86_cc_mode change it was choosing CCmode for the destination instead of CCCmode, so had to change that function too, and also adjust costs so that combiner understand it is beneficial). With this, all the testcases are optimized, so that the: setc %dl ... addb $-1, %dl insns in between the ad[dc][lq] or s[ub]b[lq] instructions are all optimized away (sure, if something would clobber flags in between they wouldn't, but there is nothing that can be done about that). 2020-10-14 Jakub Jelinek <jakub@redhat.com> PR target/97387 * config/i386/i386.md (CC_CCC): New mode iterator. (*setcc_qi_addqi3_cconly_overflow_1_<mode>): New define_insn_and_split. * config/i386/i386.c (ix86_cc_mode): Return CCCmode for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern operands. (ix86_rtx_costs): Return true and *total = 0; for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern. Use op0 and op1 temporaries to simplify COMPARE checks. * gcc.target/i386/pr97387-1.c: New test. * gcc.target/i386/pr97387-2.c: New test.
(In reply to CVS Commits from comment #11) > The master branch has been updated by Jakub Jelinek <jakub@gcc.gnu.org>: > > https://gcc.gnu.org/g:06bec55e80d98419121f3998d98d969990a75b0b > > commit r11-3882-g06bec55e80d98419121f3998d98d969990a75b0b > Author: Jakub Jelinek <jakub@redhat.com> > Date: Wed Oct 14 17:14:47 2020 +0200 > > i386: Improve chaining of _{addcarry,subborrow}_u{32,64} [PR97387] > > These builtins have two known issues and this patch fixes one of them. > > One issue is that the builtins effectively return two results and > they make the destination addressable until expansion, which means > a stack slot is allocated for them and e.g. with -fstack-protector* > DSE isn't able to optimize that away. I think for that we want to use > the technique of returning complex value; the patch doesn't handle that > though. See PR93990 for that. > > The other problem is optimization of successive uses of the builtin > e.g. for arbitrary precision arithmetic additions/subtractions. > As shown PR93990, combine is able to optimize the case when the first > argument to these builtins is 0 (the first instance when several are used > together), and also the last one if the last one ignores its result (i.e. > the carry/borrow is dead and thrown away in that case). > As shown in this PR, combiner refuses to optimize the rest, where it > sees: > (insn 10 9 11 2 (set (reg:QI 88 [ _31 ]) > (ltu:QI (reg:CCC 17 flags) > (const_int 0 [0]))) "include/adxintrin.h":69:10 785 > {*setcc_qi} > (expr_list:REG_DEAD (reg:CCC 17 flags) > (nil))) > - set pseudo 88 to CF from flags, then some uninteresting insns that > don't modify flags, and finally: > (insn 17 15 18 2 (parallel [ > (set (reg:CCC 17 flags) > (compare:CCC (plus:QI (reg:QI 88 [ _31 ]) > (const_int -1 [0xffffffffffffffff])) > (reg:QI 88 [ _31 ]))) > (clobber (scratch:QI)) > ]) "include/adxintrin.h":69:10 350 {*addqi3_cconly_overflow_1} > (expr_list:REG_DEAD (reg:QI 88 [ _31 ]) > (nil))) > to set CF in flags back to what we saved earlier. The combiner just > punts > trying to combine the 10, 17 and following addcarrydi (etc.) instruction, > because > if (i1 && !can_combine_p (i1, i3, i0, NULL, i2, NULL, &i1dest, &i1src)) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "Can't combine i1 into i3\n"); > undo_all (); > return 0; > } > fails - the 3 insns aren't all adjacent and > || (! all_adjacent > && (((!MEM_P (src) > || ! find_reg_note (insn, REG_EQUIV, src)) > && modified_between_p (src, insn, i3)) > src (flags hard register) is modified between the first and third insn - > in > the second insn. > > The following patch optimizes this by optimizing just the two insns, > 10 and 17 above, i.e. save CF into pseudo, set CF from that pseudo, into > a nop. The new define_insn_and_split matches how combine simplifies > those > two together (except without the ix86_cc_mode change it was choosing > CCmode > for the destination instead of CCCmode, so had to change that function > too, > and also adjust costs so that combiner understand it is beneficial). > > With this, all the testcases are optimized, so that the: > setc %dl > ... > addb $-1, %dl > insns in between the ad[dc][lq] or s[ub]b[lq] instructions are all > optimized > away (sure, if something would clobber flags in between they wouldn't, > but > there is nothing that can be done about that). > > 2020-10-14 Jakub Jelinek <jakub@redhat.com> > > PR target/97387 > * config/i386/i386.md (CC_CCC): New mode iterator. > (*setcc_qi_addqi3_cconly_overflow_1_<mode>): New > define_insn_and_split. > * config/i386/i386.c (ix86_cc_mode): Return CCCmode > for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern operands. > (ix86_rtx_costs): Return true and *total = 0; > for *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern. Use op0 > and > op1 temporaries to simplify COMPARE checks. > > * gcc.target/i386/pr97387-1.c: New test. > * gcc.target/i386/pr97387-2.c: New test. awesome
https://godbolt.org/z/fqGrz1 After this patch, the assembly generated is much better now. However, it still contains many optimization problems. The problem is the code like this. Let's just walk through the assembly and see the problems here. field_number operator-(field_number const& x,field_number const& y) noexcept { using namespace intrinsics; using unsigned_type = field_number::value_type; constexpr unsigned_type zero{}; field_number f; bool borrow{sub_borrow(false,x[0],y[0],f[0])}; borrow=sub_borrow(borrow,x[1],y[1],f[1]); borrow=sub_borrow(borrow,x[2],y[2],f[2]); borrow=sub_borrow(borrow,x[3],y[3],f[3]); unsigned_type v{}; sub_borrow(borrow,v,v,v); v&=static_cast<unsigned_type>(38); borrow=sub_borrow(false,f[0],v,f[0]); borrow=sub_borrow(borrow,f[1],zero,f[1]); borrow=sub_borrow(borrow,f[2],zero,f[2]); borrow=sub_borrow(borrow,f[3],zero,f[3]); sub_borrow(borrow,v,v,v); v&=static_cast<unsigned_type>(38); borrow=sub_borrow(false,f[0],v,f[0]); borrow=sub_borrow(borrow,f[1],zero,f[1]); borrow=sub_borrow(borrow,f[2],zero,f[2]); borrow=sub_borrow(borrow,f[2],zero,f[3]); return f; } _ZN7fast_io10curve25519miERKNS0_12field_numberES3_: .LFB5431: .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 movq %rsi, %rcx movq %rdx, %rax movq %rdi, %r8 movq (%rsi), %rdi movq 24(%rcx), %r9 subq (%rdx), %rdi movq 8(%rsi), %rsi sbbq 8(%rdx), %rsi movq 16(%rcx), %rdx sbbq 16(%rax), %rdx movq 24(%rax), %rax sbbq %rax, %r9 movl $0, %eax movq %rax, %rcx //The value of sbbq to itself does not matter I should be sbbq %r9 %r9 and you are done. sbbq %rax, %rcx andl $38, %ecx subq %rcx, %rdi //The 2nd problems are these %rax stuffs. The should be just imm 0 // sbbq $0,%rsi sbbq %rax, %rsi sbbq %rax, %rdx sbbq %rax, %r9 sbbq %rcx, %rcx andl $38, %ecx subq %rcx, %rdi sbbq %rax, %rsi movq %rdi, -40(%rsp) sbbq %rax, %rdx movq %rsi, -32(%rsp) movdqa -40(%rsp), %xmm0 movq %rdx, -24(%rsp) sbbq %rax, %rdx movq %r8, %rax movq %rdx, -16(%rsp) movdqa -24(%rsp), %xmm1 movups %xmm0, (%r8) movups %xmm1, 16(%r8) ret https://godbolt.org/z/nKPWx3
(In reply to fdlbxtqi from comment #13) > https://godbolt.org/z/fqGrz1 > > After this patch, the assembly generated is much better now. However, it > still contains many optimization problems. > > The problem is the code like this. > > Let's just walk through the assembly and see the problems here. > > field_number operator-(field_number const& x,field_number const& y) noexcept > { > using namespace intrinsics; > using unsigned_type = field_number::value_type; > constexpr unsigned_type zero{}; > field_number f; > bool borrow{sub_borrow(false,x[0],y[0],f[0])}; > borrow=sub_borrow(borrow,x[1],y[1],f[1]); > borrow=sub_borrow(borrow,x[2],y[2],f[2]); > borrow=sub_borrow(borrow,x[3],y[3],f[3]); > unsigned_type v{}; > sub_borrow(borrow,v,v,v); > v&=static_cast<unsigned_type>(38); > borrow=sub_borrow(false,f[0],v,f[0]); > borrow=sub_borrow(borrow,f[1],zero,f[1]); > borrow=sub_borrow(borrow,f[2],zero,f[2]); > borrow=sub_borrow(borrow,f[3],zero,f[3]); > sub_borrow(borrow,v,v,v); > v&=static_cast<unsigned_type>(38); > borrow=sub_borrow(false,f[0],v,f[0]); > borrow=sub_borrow(borrow,f[1],zero,f[1]); > borrow=sub_borrow(borrow,f[2],zero,f[2]); > borrow=sub_borrow(borrow,f[2],zero,f[3]); > return f; > } > > > _ZN7fast_io10curve25519miERKNS0_12field_numberES3_: > .LFB5431: > .cfi_startproc > .cfi_personality 0x3,__gxx_personality_v0 > movq %rsi, %rcx > movq %rdx, %rax > movq %rdi, %r8 > movq (%rsi), %rdi > movq 24(%rcx), %r9 > subq (%rdx), %rdi > movq 8(%rsi), %rsi > sbbq 8(%rdx), %rsi > movq 16(%rcx), %rdx > sbbq 16(%rax), %rdx > movq 24(%rax), %rax > sbbq %rax, %r9 > > movl $0, %eax > movq %rax, %rcx > //The value of sbbq to itself does not matter I should be sbbq %r9 %r9 and > you are done. > > sbbq %rax, %rcx > andl $38, %ecx > subq %rcx, %rdi > > //The 2nd problems are these %rax stuffs. The should be just imm 0 > // sbbq $0,%rsi > sbbq %rax, %rsi > sbbq %rax, %rdx > sbbq %rax, %r9 > sbbq %rcx, %rcx > andl $38, %ecx > subq %rcx, %rdi > sbbq %rax, %rsi > movq %rdi, -40(%rsp) > sbbq %rax, %rdx > movq %rsi, -32(%rsp) > movdqa -40(%rsp), %xmm0 > movq %rdx, -24(%rsp) > sbbq %rax, %rdx > movq %r8, %rax > movq %rdx, -16(%rsp) > movdqa -24(%rsp), %xmm1 > movups %xmm0, (%r8) > movups %xmm1, 16(%r8) > ret > > https://godbolt.org/z/nKPWx3 I think the optimal instruction number should be 26.
PR 93990 is another related issue here.