#!/usr/bin/perl -w
#
# bit-count
#
# explore different ways of counting the number of "on" bits in a 32-bit word.
#
# 1999-06-02 -- tkil@scrye.com
#
# released into the public domain. share and enjoy.
use strict;
use Benchmark;
# timing information on my machine (400MHz k6-2, linux 2.2.6, perl 5.005_02):
#
# count_em: 15 wallclock secs (14.87 usr + 0.02 sys = 14.89 CPU)
# count_masks: 12 wallclock secs (12.35 usr + 0.01 sys = 12.36 CPU)
# lucs: 11 wallclock secs (10.56 usr + 0.00 sys = 10.56 CPU)
# lucs_int: 7 wallclock secs ( 7.16 usr + 0.00 sys = 7.16 CPU)
# table: 5 wallclock secs ( 4.51 usr + 0.01 sys = 4.52 CPU)
# table_2: 2 wallclock secs ( 2.21 usr + 0.00 sys = 2.21 CPU)
# tkil: 4 wallclock secs ( 3.84 usr + 0.00 sys = 3.84 CPU)
# tkil_int: 3 wallclock secs ( 3.20 usr + 0.00 sys = 3.20 CPU)
# tony: 2 wallclock secs ( 2.37 usr + 0.00 sys = 2.37 CPU)
# =============================================================================
# not sure where i found this algorithm; i thought i had read about it
# in HakMem, but the closest they have there is an octal-marshalling
# technique. this technique *is* described in Graphics Gems II, gem
# VIII.3, page 371.
#
# in our timing runs, this was the fastest algorithm (although the
# "use integer" pragma, enabled in &tkil_int, squeezes an extra 20%
# out of it)
#
# for the curious, a 64-bit version simply requires doubling the width
# of all these masks (by duplicating the bit pattern) and adding one
# more line to sum up the two 32-bit sums. something like this:
#
# sub tkil_64
# {
# my $n_bits = shift;
# $n_bits = ((($n_bits & 0xaaaaaaaaaaaaaaaa) >> 1) +
# ($n_bits & 0x5555555555555555));
# $n_bits = ((($n_bits & 0xcccccccccccccccc) >> 2) +
# ($n_bits & 0x3333333333333333));
# $n_bits = ((($n_bits & 0xf0f0f0f0f0f0f0f0) >> 4) +
# ($n_bits & 0x0f0f0f0f0f0f0f0f));
# $n_bits = ((($n_bits & 0xff00ff00ff00ff00) >> 8) +
# ($n_bits & 0x00ff00ff00ff00ff));
# $n_bits = ((($n_bits & 0xffff0000ffff0000) >> 16) +
# ($n_bits & 0x0000ffff0000ffff));
# $n_bits = ((($n_bits & 0xffffffff00000000) >> 32) +
# ($n_bits & 0x00000000ffffffff));
# }
#
#
sub tkil
{
my $n_bits = shift;
$n_bits = (($n_bits & 0xaaaaaaaa) >> 1) + ($n_bits & 0x55555555);
$n_bits = (($n_bits & 0xcccccccc) >> 2) + ($n_bits & 0x33333333);
$n_bits = (($n_bits & 0xf0f0f0f0) >> 4) + ($n_bits & 0x0f0f0f0f);
$n_bits = (($n_bits & 0xff00ff00) >> 8) + ($n_bits & 0x00ff00ff);
$n_bits = (($n_bits & 0xffff0000) >> 16) + ($n_bits & 0x0000ffff);
return $n_bits;
}
# identical to above, but a bit faster by using integer arithmetic.
#
# WARNING, as noted by TonyC:
# I also found that tkil_int() produces an incorrect value on my
# x86 Linux box when the number is over 0x7FFFFFFF, presumably
# because the high-bit is becoming a sign bit.
sub tkil_int
{
use integer;
my $n_bits = shift;
$n_bits = (($n_bits & 0xaaaaaaaa) >> 1) + ($n_bits & 0x55555555);
$n_bits = (($n_bits & 0xcccccccc) >> 2) + ($n_bits & 0x33333333);
$n_bits = (($n_bits & 0xf0f0f0f0) >> 4) + ($n_bits & 0x0f0f0f0f);
$n_bits = (($n_bits & 0xff00ff00) >> 8) + ($n_bits & 0x00ff00ff);
$n_bits = (($n_bits & 0xffff0000) >> 16) + ($n_bits & 0x0000ffff);
return $n_bits;
}
# 2000-02-25: update
#
# Daniel Thompson submitted this one:
#
# I was looking at you perl bit-count experiments recently. A came
# across this optimisation of one of the alogorithms you use. It is
# derived from the Intel assembler displayed on
# http://www.df.lth.se/~john_e/index.html which is taken from an
# AMD optimiation paper. It optimises that last few stages into a
# multiply.
#
# 2002-06-08: that page seems to have moved to:
#
# http://www.df.lth.se/~john_e/gems/gem002d.html
#
# my (tkil's) gut reaction is that it is unlikely to be faster than
# the pure integer stuff above, but we'll find out.
#
# pardon me while i eat my words. the output is wrong, but it
# executes noticably faster than either of my versions above. now to
# talk to danial to see if i can't get it working correctly...
#
# 2002-06-08 -- alert reader PG pointed out the fix in the final
# scaling step.
#
# hm. i think this is fundamentally broken. looking at the first two
# steps, consider what happens when we feed 0xf into it. first:
#
# (1111 & 0101) + ((1111 >> 1) & 0101)
# = 0101 + 0101
# = 1010
#
# ok so far; we have a value of 2 in the bottom two bits. but watch
# what happens next:
#
# (1010 + (1010 >> 2)) & 0011
# = (1010 + 0010) & 0011
# = 1100 & 0011
# = 0000
#
# oops! all gone! did i miss something here? i don't think we can
# fix it easily, because we're not clearing out the top two bits of
# the sum of each 4 bits. in "tkil" above, we do that before adding,
# so we are guaranteed to have zeros to carry the all-ones value into.
# i'm pretty sure that we can't do it this way.
#
# unless maybe we zero out everything at the bottom only? no, then we
# have random crap coming in from the high bits. ahh... but if we
# shove everything up to the top? we're ok as long as it's not all
# ones, i think.
#
# stupid precedence tricks: tight * + >> & loose
sub d_thompson
{
my $n = shift;
# don't fully understand this, but it's a direct quote...
# $n = $n - (($n >> 1) & 0x55555555); # doesn't make sense
$n = (($n & 0xaaaaaaaa) >> 1) + ($n & 0x55555555); # simple
# this is the same as in "tkil"...
$n = ($n & 0x33333333) + (($n >> 2) & 0x33333333);
# each sub-sum will now fit in four bits, so:
$n = ($n + ($n >> 4)) & 0x0f0f0f0f;
# finally, we use a 32-bit multiply to sum up the four four-bit
# subtotals into the most signifcant 8 bits
$n = (($n * 0x01010101) >> 24) & 0xff;
# and we only care about the top 8 bits:
return ($n >> 24) & 0xff;
}
sub d_thompson_old_and_broken
{
my $n_bits = shift;
$n_bits = ($n_bits & 0x55555555) + (($n_bits >> 1) & 0x55555555);
$n_bits = ($n_bits + ($n_bits >> 2)) & 0x33333333;
$n_bits = ($n_bits + ($n_bits >> 4)) & 0x0f0f0f0f;
$n_bits = ($n_bits + ($n_bits >> 8)) & 0x00ff00ff;
$n_bits = ($n_bits + ($n_bits >> 16));
# $n_bits = $n_bits * 0x01010101;
$n_bits = $n_bits * 0x01000000; # fix from PG
return ($n_bits >> 24) & 0xff;
# return $n_bits & 0xff;
}
# =============================================================================
# submitted by lucs on IRC, this one uses the fact that -x == (~x + 1)
# in twos-complement representation.
sub lucs
{
my $x = shift;
my $b = 0;
while ($x != 0) {
++$b;
$x &= $x - 1;
}
return $b;
}
# and the same, but with integer.
sub lucs_int
{
use integer;
my $x = shift;
my $b = 0;
while ($x != 0) {
++$b;
$x &= $x - 1;
}
return $b;
}
# =============================================================================
# a naive counting algorithm.
sub count_em
{
use integer;
my $x = shift;
my $i = 0;
my $b = 0;
for (my $i = 0; $i < 32; ++$i)
{
++$b if $x & 1;
$x >>= 1;
}
return $b;
}
# a similar, but hopefully less-naive, algorithm.
# my @masks_precalc = ( 0x1, 0x2, 0x4, 0x8,
# 0x10, 0x20, 0x40, 0x80,
# 0x100, 0x200, 0x400, 0x800,
# 0x1000, 0x2000, 0x4000, 0x8000,
# 0x10000, 0x20000, 0x40000, 0x80000,
# 0x100000, 0x200000, 0x400000, 0x800000,
# 0x1000000, 0x2000000, 0x4000000, 0x8000000,
# 0x10000000, 0x20000000, 0x40000000, 0x80000000 );
my @masks = map { 1 << $_ } 0 .. 31;
sub count_masks
{
use integer;
my $x = shift;
my $b = 0;
foreach (@masks)
{
++$b if $x & $_;
}
return $b;
}
# =============================================================================
# these two attempt to treat the number as a string. they're both
# abysimally slow; don't use either of them, unless you're trying to
# be cute.
sub stringy
{
my $n_bits;
$n_bits += $_ foreach split //, unpack "B*", pack "N", shift;
return $n_bits;
}
sub stringy_2
{
return eval join "+", split //, unpack "B*", pack "N", shift;
}
# =============================================================================
# a table-lookup algorithm, at the perl level, was the second-place
# winner (after the bit-twiddling version above called "tkil"). as
# various people on #perl pointed out, the actual speed of this sort
# of thing would depend a lot on the CPU architecture (especially if
# this table is big enough to cause cache problems), compared to bit
# twiddling (which could cause pipeline stalls).
# my @lookup_precalc = ( 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
# 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
# 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
# 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
# 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
# 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
# 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
# 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
# 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
# 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
# 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
# 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
# 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
# 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
# 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
# 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 );
my @lookup = map { tkil($_) } 0 .. 255;
sub table
{
my @b = unpack "C4", pack "N", shift;
return $lookup[$b[0]] + $lookup[$b[1]] + $lookup[$b[2]] + $lookup[$b[3]];
}
sub table_2
{
my $n = shift;
return ($lookup[$n & 0xff] +
$lookup[$n >> 8 & 0xff] +
$lookup[$n >> 16 & 0xff] +
$lookup[$n >> 24 & 0xff]);
}
# =============================================================================
# a version contributed by TonyC. i can forward e-mail to him, or i
# can publish his e-mail address here; i simply have not yet asked him
# whether he cares one way or another. his comments:
#
# This tests as faster than the other methods on my machine. It's
# based on a note in the unpack documentation.
sub tony {
my $n = shift;
return unpack("%32b*", pack("I", $n))
}
# =============================================================================
my @data = map { int rand 0xffffffff } 1 .. 1000;
for my $t (@data[0,3])
{
print
"testing:\n",
" data = $t (", unpack("B*", pack "N", $t), ")\n",
" tkil => ", tkil ($t), "\n",
" tkil_int => ", tkil_int ($t), "\n",
" d_thompson => ", d_thompson ($t), "\n",
" lucs => ", lucs ($t), "\n",
" lucs_int => ", lucs_int ($t), "\n",
" count_em => ", count_em ($t), "\n",
" count_masks => ", count_masks ($t), "\n",
" stringy => ", stringy ($t), "\n",
" stringy_2 => ", stringy_2 ($t), "\n",
" table => ", table ($t), "\n",
" table_2 => ", table_2 ($t), "\n",
" tony => ", tony ($t), "\n",
"\n";
}
timethese(200,
{
tkil => sub { tkil ($_) foreach @data },
tkil_int => sub { tkil_int ($_) foreach @data },
d_thompson => sub { d_thompson ($_) foreach @data },
lucs => sub { lucs ($_) foreach @data },
lucs_int => sub { lucs_int ($_) foreach @data },
count_em => sub { count_em ($_) foreach @data },
count_masks => sub { count_masks ($_) foreach @data },
# stringy => sub { stringy ($_) foreach @data },
# stringy_2 => sub { stringy_2 ($_) foreach @data },
table => sub { table ($_) foreach @data },
table_2 => sub { table_2 ($_) foreach @data },
tony => sub { tony ($_) foreach @data },
} );
exit;