#!/usr/bin/perl -w
#
# bit-count
#
# explore different ways of counting the number of "on" bits in a 32-bit word.
#
# 1999-06-02 -- tkil@scrye.com
#
# released into the public domain.  share and enjoy.

use strict;
use Benchmark;

# timing information on my machine (400MHz k6-2, linux 2.2.6, perl 5.005_02):
#
#   count_em: 15 wallclock secs (14.87 usr +  0.02 sys = 14.89 CPU)
# count_masks: 12 wallclock secs (12.35 usr +  0.01 sys = 12.36 CPU)
#       lucs: 11 wallclock secs (10.56 usr +  0.00 sys = 10.56 CPU)
#   lucs_int:  7 wallclock secs ( 7.16 usr +  0.00 sys =  7.16 CPU)
#      table:  5 wallclock secs ( 4.51 usr +  0.01 sys =  4.52 CPU)
#    table_2:  2 wallclock secs ( 2.21 usr +  0.00 sys =  2.21 CPU)
#       tkil:  4 wallclock secs ( 3.84 usr +  0.00 sys =  3.84 CPU)
#   tkil_int:  3 wallclock secs ( 3.20 usr +  0.00 sys =  3.20 CPU)
#       tony:  2 wallclock secs ( 2.37 usr +  0.00 sys =  2.37 CPU)

# =============================================================================

# not sure where i found this algorithm; i thought i had read about it
# in HakMem, but the closest they have there is an octal-marshalling
# technique.  this technique *is* described in Graphics Gems II, gem
# VIII.3, page 371.
#
# in our timing runs, this was the fastest algorithm (although the
# "use integer" pragma, enabled in &tkil_int, squeezes an extra 20%
# out of it)
#
# for the curious, a 64-bit version simply requires doubling the width
# of all these masks (by duplicating the bit pattern) and adding one
# more line to sum up the two 32-bit sums.  something like this:
#
# sub tkil_64
# {
#   my $n_bits = shift;
#   $n_bits = ((($n_bits & 0xaaaaaaaaaaaaaaaa) >>  1) +
# 	        ($n_bits & 0x5555555555555555));
#   $n_bits = ((($n_bits & 0xcccccccccccccccc) >>  2) +
# 	        ($n_bits & 0x3333333333333333));
#   $n_bits = ((($n_bits & 0xf0f0f0f0f0f0f0f0) >>  4) + 
# 	        ($n_bits & 0x0f0f0f0f0f0f0f0f));
#   $n_bits = ((($n_bits & 0xff00ff00ff00ff00) >>  8) +
# 	        ($n_bits & 0x00ff00ff00ff00ff));
#   $n_bits = ((($n_bits & 0xffff0000ffff0000) >> 16) + 
# 	        ($n_bits & 0x0000ffff0000ffff));
#   $n_bits = ((($n_bits & 0xffffffff00000000) >> 32) + 
# 	        ($n_bits & 0x00000000ffffffff));
# }
#
#

sub tkil
{
  my $n_bits = shift;
  $n_bits = (($n_bits & 0xaaaaaaaa) >>  1) + ($n_bits & 0x55555555);
  $n_bits = (($n_bits & 0xcccccccc) >>  2) + ($n_bits & 0x33333333);
  $n_bits = (($n_bits & 0xf0f0f0f0) >>  4) + ($n_bits & 0x0f0f0f0f);
  $n_bits = (($n_bits & 0xff00ff00) >>  8) + ($n_bits & 0x00ff00ff);
  $n_bits = (($n_bits & 0xffff0000) >> 16) + ($n_bits & 0x0000ffff);
  return $n_bits;
}

# identical to above, but a bit faster by using integer arithmetic.
#
# WARNING, as noted by TonyC:
#    I also found that tkil_int() produces an incorrect value on my
#    x86 Linux box when the number is over 0x7FFFFFFF, presumably
#    because the high-bit is becoming a sign bit.

sub tkil_int
{
  use integer;

  my $n_bits = shift;
  $n_bits = (($n_bits & 0xaaaaaaaa) >>  1) + ($n_bits & 0x55555555);
  $n_bits = (($n_bits & 0xcccccccc) >>  2) + ($n_bits & 0x33333333);
  $n_bits = (($n_bits & 0xf0f0f0f0) >>  4) + ($n_bits & 0x0f0f0f0f);
  $n_bits = (($n_bits & 0xff00ff00) >>  8) + ($n_bits & 0x00ff00ff);
  $n_bits = (($n_bits & 0xffff0000) >> 16) + ($n_bits & 0x0000ffff);
  return $n_bits;
}

# 2000-02-25: update
#
# Daniel Thompson <daniel.thompson@st.com> submitted this one:
#
#    I was looking at you perl bit-count experiments recently. A came
#    across this optimisation of one of the alogorithms you use. It is
#    derived from the Intel assembler displayed on
#    http://www.df.lth.se/~john_e/index.html which is taken from an
#    AMD optimiation paper. It optimises that last few stages into a
#    multiply.
#
#    2002-06-08: that page seems to have moved to:
#
#       http://www.df.lth.se/~john_e/gems/gem002d.html
#
# my (tkil's) gut reaction is that it is unlikely to be faster than
# the pure integer stuff above, but we'll find out.
#
# pardon me while i eat my words.  the output is wrong, but it
# executes noticably faster than either of my versions above.  now to
# talk to danial to see if i can't get it working correctly...
#
# 2002-06-08 -- alert reader PG pointed out the fix in the final
# scaling step.
#
# hm.  i think this is fundamentally broken.  looking at the first two
# steps, consider what happens when we feed 0xf into it.  first:
#
#       (1111 & 0101) + ((1111 >> 1) & 0101)
#     = 0101 + 0101
#     = 1010
#
# ok so far; we have a value of 2 in the bottom two bits.  but watch
# what happens next:
#
#       (1010 + (1010 >> 2)) & 0011
#     = (1010 + 0010) & 0011
#     = 1100 & 0011
#     = 0000
#
# oops!  all gone!  did i miss something here?  i don't think we can
# fix it easily, because we're not clearing out the top two bits of
# the sum of each 4 bits.  in "tkil" above, we do that before adding,
# so we are guaranteed to have zeros to carry the all-ones value into.
# i'm pretty sure that we can't do it this way.  
#
# unless maybe we zero out everything at the bottom only?  no, then we
# have random crap coming in from the high bits.  ahh... but if we
# shove everything up to the top?  we're ok as long as it's not all
# ones, i think.
#
# stupid precedence tricks: tight * + >> & loose

sub d_thompson
{
    my $n = shift;

    # don't fully understand this, but it's a direct quote...
    # $n = $n - (($n >> 1) & 0x55555555); # doesn't make sense
    $n = (($n & 0xaaaaaaaa) >> 1) + ($n & 0x55555555); # simple

    # this is the same as in "tkil"...
    $n = ($n & 0x33333333) + (($n >> 2) & 0x33333333);

    # each sub-sum will now fit in four bits, so: 
    $n = ($n + ($n >> 4)) & 0x0f0f0f0f;
    
    # finally, we use a 32-bit multiply to sum up the four four-bit
    # subtotals into the most signifcant 8 bits
    $n = (($n * 0x01010101) >> 24) & 0xff;

    # and we only care about the top 8 bits:
    return ($n >> 24) & 0xff;    
}

sub d_thompson_old_and_broken
{
    my $n_bits = shift;

    $n_bits = ($n_bits & 0x55555555) + (($n_bits >> 1) & 0x55555555);
    $n_bits = ($n_bits + ($n_bits >> 2)) & 0x33333333;
    $n_bits = ($n_bits + ($n_bits >> 4)) & 0x0f0f0f0f;
    $n_bits = ($n_bits + ($n_bits >> 8)) & 0x00ff00ff;
    $n_bits = ($n_bits + ($n_bits >> 16));

    # $n_bits = $n_bits * 0x01010101;
    $n_bits = $n_bits * 0x01000000; # fix from PG
    return ($n_bits >> 24) & 0xff;

    # return $n_bits & 0xff;
}

# =============================================================================

# submitted by lucs on IRC, this one uses the fact that -x == (~x + 1)
# in twos-complement representation.
sub lucs
{
  my $x = shift;
  my $b = 0;
  while ($x != 0) { 
    ++$b; 
    $x &= $x - 1;
  }
  return $b;
}

# and the same, but with integer.

sub lucs_int
{
  use integer;
  my $x = shift;
  my $b = 0;
  while ($x != 0) { 
    ++$b; 
    $x &= $x - 1;
  }
  return $b;
}

# =============================================================================

# a naive counting algorithm.
sub count_em
{
  use integer; 
  my $x = shift;
  my $i = 0;
  my $b = 0;
  for (my $i = 0; $i < 32; ++$i)
  {
    ++$b if $x & 1;
    $x >>= 1;
  }
  return $b;
}

# a similar, but hopefully less-naive, algorithm.

# my @masks_precalc = (        0x1,        0x2,        0x4,        0x8,
#                             0x10,       0x20,       0x40,       0x80,
#                            0x100,      0x200,      0x400,      0x800,
#                           0x1000,     0x2000,     0x4000,     0x8000,
#                          0x10000,    0x20000,    0x40000,    0x80000,
#                         0x100000,   0x200000,   0x400000,   0x800000,
#                        0x1000000,  0x2000000,  0x4000000,  0x8000000,
#                       0x10000000, 0x20000000, 0x40000000, 0x80000000 );

my @masks = map { 1 << $_ } 0 .. 31;
             
sub count_masks
{
  use integer;
  my $x = shift;
  my $b = 0;
  foreach (@masks) 
  {
    ++$b if $x & $_;
  }
  return $b;
}

# =============================================================================

# these two attempt to treat the number as a string.  they're both
# abysimally slow; don't use either of them, unless you're trying to
# be cute.

sub stringy
{
  my $n_bits;
  $n_bits += $_ foreach split //, unpack "B*", pack "N", shift;
  return $n_bits;
}

sub stringy_2
{
  return eval join "+", split //, unpack "B*", pack "N", shift;
}

# =============================================================================

# a table-lookup algorithm, at the perl level, was the second-place
# winner (after the bit-twiddling version above called "tkil").  as
# various people on #perl pointed out, the actual speed of this sort
# of thing would depend a lot on the CPU architecture (especially if
# this table is big enough to cause cache problems), compared to bit
# twiddling (which could cause pipeline stalls).

# my @lookup_precalc = ( 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
#                        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
#                        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 
#                        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
#                        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
#                        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
#                        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 
#                        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 
#                        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 
#                        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 
#                        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 
#                        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 
#                        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
#                        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 
#                        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
#                        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 );

my @lookup = map { tkil($_) } 0 .. 255;

sub table
{
  my @b = unpack "C4", pack "N", shift;
  return $lookup[$b[0]] + $lookup[$b[1]] + $lookup[$b[2]] + $lookup[$b[3]];
}

sub table_2
{
  my $n = shift;
  return ($lookup[$n       & 0xff] +
	  $lookup[$n >>  8 & 0xff] + 
	  $lookup[$n >> 16 & 0xff] +
	  $lookup[$n >> 24 & 0xff]);
}

# =============================================================================
# a version contributed by TonyC.  i can forward e-mail to him, or i
# can publish his e-mail address here; i simply have not yet asked him
# whether he cares one way or another.  his comments:
#
#    This tests as faster than the other methods on my machine.  It's
#    based on a note in the unpack documentation.
 
sub tony {
    my $n = shift;
    return unpack("%32b*", pack("I", $n))
}

# =============================================================================

my @data = map { int rand 0xffffffff } 1 .. 1000;
for my $t (@data[0,3])
{
    print 
      "testing:\n",
      "  data = $t (", unpack("B*", pack "N", $t), ")\n",
      "  tkil         => ", tkil        ($t), "\n",
      "  tkil_int     => ", tkil_int    ($t), "\n",
      "  d_thompson   => ", d_thompson  ($t), "\n",
      "  lucs         => ", lucs        ($t), "\n",
      "  lucs_int     => ", lucs_int    ($t), "\n",
      "  count_em     => ", count_em    ($t), "\n",
      "  count_masks  => ", count_masks ($t), "\n",
      "  stringy      => ", stringy     ($t), "\n",
      "  stringy_2    => ", stringy_2   ($t), "\n",
      "  table        => ", table       ($t), "\n",
      "  table_2      => ", table_2     ($t), "\n",
      "  tony         => ", tony        ($t), "\n",
      "\n";
}

timethese(200,
          {
           tkil         => sub { tkil        ($_) foreach @data },
           tkil_int     => sub { tkil_int    ($_) foreach @data },
           d_thompson   => sub { d_thompson  ($_) foreach @data },
           lucs         => sub { lucs        ($_) foreach @data },
           lucs_int     => sub { lucs_int    ($_) foreach @data },
           count_em     => sub { count_em    ($_) foreach @data },
           count_masks  => sub { count_masks ($_) foreach @data },
           # stringy      => sub { stringy     ($_) foreach @data },
           # stringy_2    => sub { stringy_2   ($_) foreach @data },
           table        => sub { table       ($_) foreach @data },
           table_2      => sub { table_2     ($_) foreach @data },
           tony         => sub { tony        ($_) foreach @data },
          } );

exit;