Bug 108477 - fwprop over-optimizes conversion from + to |
Summary: fwprop over-optimizes conversion from + to |
Status: UNCONFIRMED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 13.0
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2023-01-20 10:59 UTC by Uroš Bizjak
Modified: 2024-01-08 16:54 UTC (History)
0 users

See Also:
Host:
Target: x86_64-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Uroš Bizjak 2023-01-20 10:59:39 UTC
In the unsigned int case (baz) fwprop over-optimizes the addition to a logical or:

--cut here--
int lock;

int bar (int old)
{
  int val = (old >> 1) & 0x1;
  int new = (old & ~0x3) + 0x2 + val;

  lock = new;
  return val ? 0 : -1;
}

int ulock;

int baz (unsigned int old)
{
  unsigned int val = (old >> 1) & 0x1;
  unsigned int new = (old & ~0x3) + 0x2 + val;

  ulock = new;
  return val ? 0 : -1;
}
--cut here--

resulting in:

bar:
        movl    %edi, %eax
        andl    $-4, %edi
        sarl    %eax
        andl    $1, %eax
        leal    2(%rax,%rdi), %edx    <---- here
        subl    $1, %eax
        movl    %edx, lock(%rip)
        ret

baz:
        movl    %edi, %eax
        andl    $-4, %edi
        shrl    %eax
        andl    $1, %eax
        orl     %eax, %edi    <--- here ...
        subl    $1, %eax
        addl    $2, %edi      <--- ... and here
        movl    %edi, ulock(%rip)
        ret

Please note the three-operand addition, implemented with LEAL instruction in the signed case, which is not emitted in the unsigned case. The reason is fwprop pass that substitutes addition with the equivalent or operation.
Comment 1 Uroš Bizjak 2024-01-08 09:53:22 UTC
This conversion happens due to th following code in match.pd:

/* If we are XORing or adding two BIT_AND_EXPR's, both of which are and'ing
   with a constant, and the two constants have no bits in common,
   we should treat this as a BIT_IOR_EXPR since this may produce more
   simplifications.  */
(for op (bit_xor plus)
 (simplify
  (op (convert1? (bit_and@4 @0 INTEGER_CST@1))
      (convert2? (bit_and@5 @2 INTEGER_CST@3)))
  (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
       && tree_nop_conversion_p (type, TREE_TYPE (@2))
       && (wi::to_wide (@1) & wi::to_wide (@3)) == 0)
   (bit_ior (convert @4) (convert @5)))))
Comment 2 Uroš Bizjak 2024-01-08 11:00:02 UTC
If we consider the following testcase:

--cut here--
unsigned int foo (unsigned int a, unsigned int b)
{
  unsigned int r = a & 0x1;
  unsigned int p = b & ~0x3;

  return r + p + 2;
}

unsigned int bar (unsigned int a, unsigned int b)
{
  unsigned int r = a & 0x1;
  unsigned int p = b & ~0x3;

  return r | p | 2;
}
--cut here--

the above testcase compiles (x86_64 -O2) to:

foo:
        andl    $1, %edi
        andl    $-4, %esi
        orl     %esi, %edi
        leal    2(%rdi), %eax
        ret

bar:
        andl    $1, %edi
        andl    $-4, %esi
        orl     %esi, %edi
        movl    %edi, %eax
        orl     $2, %eax
        ret

So, there is no further simplification in any case, we can't combine OR with a PLUS in the first case, and we don't have OR instruction with multiple inputs in the second case.

If we switch around the logic in the conversion and convert from IOR/XOR to PLUS, as is the case in the following patch:

--cut here--
diff --git a/gcc/match.pd b/gcc/match.pd
index 7b4b15acc41..deac18a7635 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1830,18 +1830,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        && element_precision (type) <= element_precision (TREE_TYPE (@1)))
    (bit_not (rop (convert @0) (convert @1))))))
 
-/* If we are XORing or adding two BIT_AND_EXPR's, both of which are and'ing
+/* If we are ORing or XORing two BIT_AND_EXPR's, both of which are and'ing
    with a constant, and the two constants have no bits in common,
-   we should treat this as a BIT_IOR_EXPR since this may produce more
+   we should treat this as a PLUS_EXPR since this may produce more
    simplifications.  */
-(for op (bit_xor plus)
+(for op (bit_ior bit_xor)
  (simplify
   (op (convert1? (bit_and@4 @0 INTEGER_CST@1))
       (convert2? (bit_and@5 @2 INTEGER_CST@3)))
   (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
        && tree_nop_conversion_p (type, TREE_TYPE (@2))
        && (wi::to_wide (@1) & wi::to_wide (@3)) == 0)
-   (bit_ior (convert @4) (convert @5)))))
+   (plus (convert @4) (convert @5)))))
 
 /* (X | Y) ^ X -> Y & ~ X*/
 (simplify
--cut here--

then the resulting assembly reads:

foo:
        andl    $-4, %esi
        andl    $1, %edi
        leal    2(%rsi,%rdi), %eax
        ret

bar:
        andl    $1, %edi
        andl    $-4, %esi
        leal    (%rdi,%rsi), %eax
        orl     $2, %eax
        ret

On x86, the conversion can now use LEA instruction, which is much more usable than OR instruction. In the first case, LEA implements three input PLUS instruction, while in the second case, even though the instruction can't be combined with a follow-up OR, the non-destructive LEA avoids a move.