#!/usr/bin/perl -w # # bit-count # # explore different ways of counting the number of "on" bits in a 32-bit word. # # 1999-06-02 -- tkil@scrye.com # # released into the public domain. share and enjoy. use strict; use Benchmark; # timing information on my machine (400MHz k6-2, linux 2.2.6, perl 5.005_02): # # count_em: 15 wallclock secs (14.87 usr + 0.02 sys = 14.89 CPU) # count_masks: 12 wallclock secs (12.35 usr + 0.01 sys = 12.36 CPU) # lucs: 11 wallclock secs (10.56 usr + 0.00 sys = 10.56 CPU) # lucs_int: 7 wallclock secs ( 7.16 usr + 0.00 sys = 7.16 CPU) # table: 5 wallclock secs ( 4.51 usr + 0.01 sys = 4.52 CPU) # table_2: 2 wallclock secs ( 2.21 usr + 0.00 sys = 2.21 CPU) # tkil: 4 wallclock secs ( 3.84 usr + 0.00 sys = 3.84 CPU) # tkil_int: 3 wallclock secs ( 3.20 usr + 0.00 sys = 3.20 CPU) # tony: 2 wallclock secs ( 2.37 usr + 0.00 sys = 2.37 CPU) # ============================================================================= # not sure where i found this algorithm; i thought i had read about it # in HakMem, but the closest they have there is an octal-marshalling # technique. this technique *is* described in Graphics Gems II, gem # VIII.3, page 371. # # in our timing runs, this was the fastest algorithm (although the # "use integer" pragma, enabled in &tkil_int, squeezes an extra 20% # out of it) # # for the curious, a 64-bit version simply requires doubling the width # of all these masks (by duplicating the bit pattern) and adding one # more line to sum up the two 32-bit sums. something like this: # # sub tkil_64 # { # my $n_bits = shift; # $n_bits = ((($n_bits & 0xaaaaaaaaaaaaaaaa) >> 1) + # ($n_bits & 0x5555555555555555)); # $n_bits = ((($n_bits & 0xcccccccccccccccc) >> 2) + # ($n_bits & 0x3333333333333333)); # $n_bits = ((($n_bits & 0xf0f0f0f0f0f0f0f0) >> 4) + # ($n_bits & 0x0f0f0f0f0f0f0f0f)); # $n_bits = ((($n_bits & 0xff00ff00ff00ff00) >> 8) + # ($n_bits & 0x00ff00ff00ff00ff)); # $n_bits = ((($n_bits & 0xffff0000ffff0000) >> 16) + # ($n_bits & 0x0000ffff0000ffff)); # $n_bits = ((($n_bits & 0xffffffff00000000) >> 32) + # ($n_bits & 0x00000000ffffffff)); # } # # sub tkil { my $n_bits = shift; $n_bits = (($n_bits & 0xaaaaaaaa) >> 1) + ($n_bits & 0x55555555); $n_bits = (($n_bits & 0xcccccccc) >> 2) + ($n_bits & 0x33333333); $n_bits = (($n_bits & 0xf0f0f0f0) >> 4) + ($n_bits & 0x0f0f0f0f); $n_bits = (($n_bits & 0xff00ff00) >> 8) + ($n_bits & 0x00ff00ff); $n_bits = (($n_bits & 0xffff0000) >> 16) + ($n_bits & 0x0000ffff); return $n_bits; } # identical to above, but a bit faster by using integer arithmetic. # # WARNING, as noted by TonyC: # I also found that tkil_int() produces an incorrect value on my # x86 Linux box when the number is over 0x7FFFFFFF, presumably # because the high-bit is becoming a sign bit. sub tkil_int { use integer; my $n_bits = shift; $n_bits = (($n_bits & 0xaaaaaaaa) >> 1) + ($n_bits & 0x55555555); $n_bits = (($n_bits & 0xcccccccc) >> 2) + ($n_bits & 0x33333333); $n_bits = (($n_bits & 0xf0f0f0f0) >> 4) + ($n_bits & 0x0f0f0f0f); $n_bits = (($n_bits & 0xff00ff00) >> 8) + ($n_bits & 0x00ff00ff); $n_bits = (($n_bits & 0xffff0000) >> 16) + ($n_bits & 0x0000ffff); return $n_bits; } # 2000-02-25: update # # Daniel Thompson submitted this one: # # I was looking at you perl bit-count experiments recently. A came # across this optimisation of one of the alogorithms you use. It is # derived from the Intel assembler displayed on # http://www.df.lth.se/~john_e/index.html which is taken from an # AMD optimiation paper. It optimises that last few stages into a # multiply. # # 2002-06-08: that page seems to have moved to: # # http://www.df.lth.se/~john_e/gems/gem002d.html # # my (tkil's) gut reaction is that it is unlikely to be faster than # the pure integer stuff above, but we'll find out. # # pardon me while i eat my words. the output is wrong, but it # executes noticably faster than either of my versions above. now to # talk to danial to see if i can't get it working correctly... # # 2002-06-08 -- alert reader PG pointed out the fix in the final # scaling step. # # hm. i think this is fundamentally broken. looking at the first two # steps, consider what happens when we feed 0xf into it. first: # # (1111 & 0101) + ((1111 >> 1) & 0101) # = 0101 + 0101 # = 1010 # # ok so far; we have a value of 2 in the bottom two bits. but watch # what happens next: # # (1010 + (1010 >> 2)) & 0011 # = (1010 + 0010) & 0011 # = 1100 & 0011 # = 0000 # # oops! all gone! did i miss something here? i don't think we can # fix it easily, because we're not clearing out the top two bits of # the sum of each 4 bits. in "tkil" above, we do that before adding, # so we are guaranteed to have zeros to carry the all-ones value into. # i'm pretty sure that we can't do it this way. # # unless maybe we zero out everything at the bottom only? no, then we # have random crap coming in from the high bits. ahh... but if we # shove everything up to the top? we're ok as long as it's not all # ones, i think. # # stupid precedence tricks: tight * + >> & loose sub d_thompson { my $n = shift; # don't fully understand this, but it's a direct quote... # $n = $n - (($n >> 1) & 0x55555555); # doesn't make sense $n = (($n & 0xaaaaaaaa) >> 1) + ($n & 0x55555555); # simple # this is the same as in "tkil"... $n = ($n & 0x33333333) + (($n >> 2) & 0x33333333); # each sub-sum will now fit in four bits, so: $n = ($n + ($n >> 4)) & 0x0f0f0f0f; # finally, we use a 32-bit multiply to sum up the four four-bit # subtotals into the most signifcant 8 bits $n = (($n * 0x01010101) >> 24) & 0xff; # and we only care about the top 8 bits: return ($n >> 24) & 0xff; } sub d_thompson_old_and_broken { my $n_bits = shift; $n_bits = ($n_bits & 0x55555555) + (($n_bits >> 1) & 0x55555555); $n_bits = ($n_bits + ($n_bits >> 2)) & 0x33333333; $n_bits = ($n_bits + ($n_bits >> 4)) & 0x0f0f0f0f; $n_bits = ($n_bits + ($n_bits >> 8)) & 0x00ff00ff; $n_bits = ($n_bits + ($n_bits >> 16)); # $n_bits = $n_bits * 0x01010101; $n_bits = $n_bits * 0x01000000; # fix from PG return ($n_bits >> 24) & 0xff; # return $n_bits & 0xff; } # ============================================================================= # submitted by lucs on IRC, this one uses the fact that -x == (~x + 1) # in twos-complement representation. sub lucs { my $x = shift; my $b = 0; while ($x != 0) { ++$b; $x &= $x - 1; } return $b; } # and the same, but with integer. sub lucs_int { use integer; my $x = shift; my $b = 0; while ($x != 0) { ++$b; $x &= $x - 1; } return $b; } # ============================================================================= # a naive counting algorithm. sub count_em { use integer; my $x = shift; my $i = 0; my $b = 0; for (my $i = 0; $i < 32; ++$i) { ++$b if $x & 1; $x >>= 1; } return $b; } # a similar, but hopefully less-naive, algorithm. # my @masks_precalc = ( 0x1, 0x2, 0x4, 0x8, # 0x10, 0x20, 0x40, 0x80, # 0x100, 0x200, 0x400, 0x800, # 0x1000, 0x2000, 0x4000, 0x8000, # 0x10000, 0x20000, 0x40000, 0x80000, # 0x100000, 0x200000, 0x400000, 0x800000, # 0x1000000, 0x2000000, 0x4000000, 0x8000000, # 0x10000000, 0x20000000, 0x40000000, 0x80000000 ); my @masks = map { 1 << $_ } 0 .. 31; sub count_masks { use integer; my $x = shift; my $b = 0; foreach (@masks) { ++$b if $x & $_; } return $b; } # ============================================================================= # these two attempt to treat the number as a string. they're both # abysimally slow; don't use either of them, unless you're trying to # be cute. sub stringy { my $n_bits; $n_bits += $_ foreach split //, unpack "B*", pack "N", shift; return $n_bits; } sub stringy_2 { return eval join "+", split //, unpack "B*", pack "N", shift; } # ============================================================================= # a table-lookup algorithm, at the perl level, was the second-place # winner (after the bit-twiddling version above called "tkil"). as # various people on #perl pointed out, the actual speed of this sort # of thing would depend a lot on the CPU architecture (especially if # this table is big enough to cause cache problems), compared to bit # twiddling (which could cause pipeline stalls). # my @lookup_precalc = ( 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, # 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, # 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, # 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, # 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, # 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, # 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, # 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, # 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, # 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, # 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, # 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, # 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, # 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, # 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, # 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 ); my @lookup = map { tkil($_) } 0 .. 255; sub table { my @b = unpack "C4", pack "N", shift; return $lookup[$b[0]] + $lookup[$b[1]] + $lookup[$b[2]] + $lookup[$b[3]]; } sub table_2 { my $n = shift; return ($lookup[$n & 0xff] + $lookup[$n >> 8 & 0xff] + $lookup[$n >> 16 & 0xff] + $lookup[$n >> 24 & 0xff]); } # ============================================================================= # a version contributed by TonyC. i can forward e-mail to him, or i # can publish his e-mail address here; i simply have not yet asked him # whether he cares one way or another. his comments: # # This tests as faster than the other methods on my machine. It's # based on a note in the unpack documentation. sub tony { my $n = shift; return unpack("%32b*", pack("I", $n)) } # ============================================================================= my @data = map { int rand 0xffffffff } 1 .. 1000; for my $t (@data[0,3]) { print "testing:\n", " data = $t (", unpack("B*", pack "N", $t), ")\n", " tkil => ", tkil ($t), "\n", " tkil_int => ", tkil_int ($t), "\n", " d_thompson => ", d_thompson ($t), "\n", " lucs => ", lucs ($t), "\n", " lucs_int => ", lucs_int ($t), "\n", " count_em => ", count_em ($t), "\n", " count_masks => ", count_masks ($t), "\n", " stringy => ", stringy ($t), "\n", " stringy_2 => ", stringy_2 ($t), "\n", " table => ", table ($t), "\n", " table_2 => ", table_2 ($t), "\n", " tony => ", tony ($t), "\n", "\n"; } timethese(200, { tkil => sub { tkil ($_) foreach @data }, tkil_int => sub { tkil_int ($_) foreach @data }, d_thompson => sub { d_thompson ($_) foreach @data }, lucs => sub { lucs ($_) foreach @data }, lucs_int => sub { lucs_int ($_) foreach @data }, count_em => sub { count_em ($_) foreach @data }, count_masks => sub { count_masks ($_) foreach @data }, # stringy => sub { stringy ($_) foreach @data }, # stringy_2 => sub { stringy_2 ($_) foreach @data }, table => sub { table ($_) foreach @data }, table_2 => sub { table_2 ($_) foreach @data }, tony => sub { tony ($_) foreach @data }, } ); exit;