From: Andi Kleen Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200) Subject: [PATCH] optimize hweight64 for x86_64 X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=0136611c62e8650e354b95c76dff6d2ce6030eff;p=openwrt%2Fstaging%2Fblogic.git [PATCH] optimize hweight64 for x86_64 Based on patch from David Rientjes , but changed by AK. Optimizes the 64-bit hamming weight for x86_64 processors assuming they have fast multiplication. Uses five fewer bitops than the generic hweight64. Benchmark on one EMT64 showed ~25% speedup with 2^24 consecutive calls. Define a new ARCH_HAS_FAST_MULTIPLIER that can be set by other architectures that can also multiply fast. Signed-off-by: Andi Kleen --- diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index f7ba57b1cc08..5b535eaf5309 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -399,6 +399,8 @@ static __inline__ int fls(int x) return r+1; } +#define ARCH_HAS_FAST_MULTIPLIER 1 + #include #endif /* __KERNEL__ */ diff --git a/lib/hweight.c b/lib/hweight.c index 438257671708..360556a7803d 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -1,5 +1,6 @@ #include #include +#include /** * hweightN - returns the hamming weight of a N-bit word @@ -40,14 +41,19 @@ unsigned long hweight64(__u64 w) #if BITS_PER_LONG == 32 return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); #elif BITS_PER_LONG == 64 +#ifdef ARCH_HAS_FAST_MULTIPLIER + w -= (w >> 1) & 0x5555555555555555ul; + w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul); + w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful; + return (w * 0x0101010101010101ul) >> 56; +#else __u64 res = w - ((w >> 1) & 0x5555555555555555ul); res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful; res = res + (res >> 8); res = res + (res >> 16); return (res + (res >> 32)) & 0x00000000000000FFul; -#else -#error BITS_PER_LONG not defined +#endif #endif } EXPORT_SYMBOL(hweight64);