[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: ffsl: optimization for non-GCC
From: |
Bruno Haible |
Subject: |
Re: ffsl: optimization for non-GCC |
Date: |
Fri, 14 Oct 2011 22:24:50 +0200 |
User-agent: |
KMail/1.13.6 (Linux/2.6.37.6-0.5-desktop; KDE/4.6.0; x86_64; ; ) |
> 2011-10-13 Bruno Haible <address@hidden>
>
> ffsl: Optimize on 32-bit platforms.
> * lib/ffsl.h (FUNC): If TYPE has the same representation as 'int', just
> use ffs() without a loop.
The other frequent case is that sizeof (TYPE) == 2 * sizeof (int).
This means, the loop is executed just twice. And in the last loop
round, we are making an unnecessary test: If j != 0 and the low 32 bits
of j are known to be 0, we know that the high 32 bits of j must be non-zero.
We don't need to test that.
So I
- introduced a variable k that counts the number of loop rounds,
so as to avoid the non-zero test in the last loop round,
- unrolled the loop, extracting the first and last loop round,
letting the compiler realize that the middle part of the loop
is empty code,
- realized that this code also works when sizeof (TYPE) == sizeof (int),
- removed the workaround for the gcc warning, that no longer exists.
On MacOS X in 32-bit mode, disabling the GCC built-ins, the code for
ffsll.c got simplified from
=================================================================
.globl _ffsll
_ffsll:
pushl %esi
movl 12(%esp), %ecx
xorl %eax, %eax
movl 8(%esp), %edx
movl %ecx, %esi
orl %edx, %esi
je L6
xorl %esi, %esi
testl %edx, %edx
movl %edx, %eax
jne L5
.align 4,0x90
L8:
movl %ecx, %edx
addl $32, %esi
xorl %ecx, %ecx
testl %edx, %edx
movl %edx, %eax
je L8
L5:
bsfl %eax, %eax
movl $-1, %edx
cmove %edx, %eax
incl %eax
addl %esi, %eax
L6:
popl %esi
ret
=================================================================
to
=================================================================
.globl _ffsll
_ffsll:
pushl %esi
movl 12(%esp), %edx
xorl %ecx, %ecx
movl 8(%esp), %eax
movl %edx, %esi
orl %eax, %esi
je L4
testl %eax, %eax
movl %eax, %ecx
jne L9
movl %edx, %eax
movl $-1, %ecx
bsfl %eax, %eax
cmove %ecx, %eax
incl %eax
leal 32(%eax), %ecx
L4:
popl %esi
movl %ecx, %eax
ret
.align 4,0x90
L9:
bsfl %ecx, %ecx
movl $-1, %eax
popl %esi
cmove %eax, %ecx
incl %ecx
movl %ecx, %eax
ret
=================================================================
Yes, this is simpler: it contains only 2 conditional branches,
rather than 3.
2011-10-14 Bruno Haible <address@hidden>
ffsl: Optimize on 64-bit platforms.
* lib/ffsl.h (FUNC): Omit a test from the last loop round. Do loop
unrolling.
*** lib/ffsl.h.orig Fri Oct 14 22:13:03 2011
--- lib/ffsl.h Fri Oct 14 22:12:47 2011
***************
*** 34,67 ****
#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) && defined
GCC_BUILTIN
return GCC_BUILTIN (i);
#else
! if (sizeof (TYPE) == sizeof (int))
! return ffs (i);
! else
{
! unsigned TYPE j = i;
! /* Split j into chunks, and look at one chunk after the other. */
! /* Define chunk_bits so as to avoid a GCC warning
! "right shift count >= width of type"
! if sizeof (TYPE) == sizeof (int). */
! enum
! {
! chunk_bits = (sizeof (TYPE) != sizeof (int)
! ? CHAR_BIT * sizeof (unsigned int)
! : 0)
! };
! int result = 0;
/* It is tempting to write if (!j) here, but if we do this,
Solaris 10/x86 "cc -O" miscompiles the code. */
if (!i)
return 0;
! while (1)
! {
! if ((unsigned int) j)
! return result + ffs ((unsigned int) j);
! j >>= chunk_bits;
! result += chunk_bits;
! }
}
#endif
}
--- 34,64 ----
#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) && defined
GCC_BUILTIN
return GCC_BUILTIN (i);
#else
! unsigned TYPE j = i;
! /* Split j into chunks, and look at one chunk after the other. */
! enum { chunk_bits = CHAR_BIT * sizeof (unsigned int) };
! /* The number of chunks is ceil (sizeof (TYPE) / sizeof (unsigned int))
! = (sizeof (TYPE) - 1) / sizeof (unsigned int) + 1. */
! enum { chunk_count = (sizeof (TYPE) - 1) / sizeof (unsigned int) + 1 };
!
! if (chunk_count > 1)
{
! size_t k;
/* It is tempting to write if (!j) here, but if we do this,
Solaris 10/x86 "cc -O" miscompiles the code. */
if (!i)
return 0;
! /* Unroll the first loop round. k = 0. */
! if ((unsigned int) j)
! return ffs ((unsigned int) j);
! /* Generic loop. */
! for (k = 1; k < chunk_count - 1; k++)
! if ((unsigned int) (j >> (k * chunk_bits)) != 0)
! return k * chunk_bits + ffs ((unsigned int) (j >> (k *
chunk_bits)));
}
+ /* Last loop round. k = chunk_count - 1. */
+ return (chunk_count - 1) * chunk_bits
+ + ffs ((unsigned int) (j >> ((chunk_count - 1) * chunk_bits)));
#endif
}
--
In memoriam Robert Dziekański <http://en.wikipedia.org/wiki/Robert_Dziekański>