Speedup bitmap iterators

Richard Guenther richard.guenther@gmail.com
Wed Jun 9 11:10:00 GMT 2010


On Wed, Jun 9, 2010 at 11:51 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> Hi,
> this patch avoid dififcult to predict branch in bmp walking. Improving:
>               :static inline bool
>               :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
>               :{
>               :  /* If our current word is nonzero, it contains the bit we want.  */
>  1551  0.1115 :  if (bi->bits)
>               :    {
>               :    next_bit:
>  7282  0.5237 :      while (!(bi->bits & 1))
>               :        {
>  5312  0.3820 :          bi->bits >>= 1;
>  7904  0.5685 :          *bit_no += 1;
>               :        }
>               :      return true;
>               :    }
>
> to:
>
>               : bool
>               :bmp_iter_set (bitmap_iterator *bi, unsigned *bit_no)
>               :{
>               :  /* If our current word is nonzero, it contains the bit we want.  */
>  1274  0.0973 :  if (bi->bits)
>               :    {
>               :    next_bit:
>               :#if (GCC_VERSION >= 3004)
>               :      {
>  1106  0.0845 :        unsigned int n = __builtin_ctzl (bi->bits);
>               :        gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
>  1811  0.1384 :        bi->bits >>= n;
>  1892  0.1446 :        *bit_no += n;
>               :      }
>               :#else
>
> The code (including checking for 3.4) is the same as one present in
> bitmap_first_bit already.
>
> Bootstrapped/regtested x86_64-linux, OK?

Hm.  I am concerned about using __builtin_ctzl unconditionally
as it might expand to a call to libgcc which for sure wouldn't be
faster(?).  Maybe that's also a question as instead of a loop it
would use a few table lookups.

Note that the common case might even be that bi->bits & 1 already
which the dispatching to ctzl would be even worse.

So I guess you should probably use the count_leading/trailing_zeros
facility from longlong.h instead and just for selected host archs
use __builtin_ctzl (it wouldn't be available for generic x86_64 even).

Also duplicating all this code again and again is bad - can you
factor it out into a single macro first?

Thanks,
Richard.

> Honza
>
>        * bitmap.h (bmp_iter_set, bmp_iter_and, bmp_iter_and_compl):
>        Use __builtin_ctzl where possible.
> Index: bitmap.h
> ===================================================================
> --- bitmap.h    (revision 160447)
> +++ bitmap.h    (working copy)
> @@ -396,11 +396,20 @@ bmp_iter_set (bitmap_iterator *bi, unsig
>   if (bi->bits)
>     {
>     next_bit:
> +#if (GCC_VERSION >= 3004)
> +      {
> +       unsigned int n = __builtin_ctzl (bi->bits);
> +       gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
> +       bi->bits >>= n;
> +       *bit_no += n;
> +      }
> +#else
>       while (!(bi->bits & 1))
>        {
>          bi->bits >>= 1;
>          *bit_no += 1;
>        }
> +#endif
>       return true;
>     }
>
> @@ -443,11 +452,20 @@ bmp_iter_and (bitmap_iterator *bi, unsig
>   if (bi->bits)
>     {
>     next_bit:
> +#if (GCC_VERSION >= 3004)
> +      {
> +       unsigned int n = __builtin_ctzl (bi->bits);
> +       gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
> +       bi->bits >>= n;
> +       *bit_no += n;
> +      }
> +#else
>       while (!(bi->bits & 1))
>        {
>          bi->bits >>= 1;
>          *bit_no += 1;
>        }
> +#endif
>       return true;
>     }
>
> @@ -510,11 +528,20 @@ bmp_iter_and_compl (bitmap_iterator *bi,
>   if (bi->bits)
>     {
>     next_bit:
> +#if (GCC_VERSION >= 3004)
> +      {
> +       unsigned int n = __builtin_ctzl (bi->bits);
> +       gcc_assert (sizeof (unsigned long) == sizeof (BITMAP_WORD));
> +       bi->bits >>= n;
> +       *bit_no += n;
> +      }
> +#else
>       while (!(bi->bits & 1))
>        {
>          bi->bits >>= 1;
>          *bit_no += 1;
>        }
> +#endif
>       return true;
>     }
>
>



More information about the Gcc-patches mailing list