LogLog and HyperLogLog algorithms for counting of large cardinalities

JavascriptAlgorithmCountingLoglogHyperloglog

Javascript Problem Overview


Where can I find a valid implementation of LogLog algorithm? Have tried to implement it by myself but my draft implementation yields strange results.

Here it is:

function LogLog(max_error, max_count)
{
	function log2(x)
	{
		 return Math.log(x) / Math.LN2;
	}
	 
	var m = 1.30 / max_error;
	var k = Math.ceil(log2(m * m));
	m = Math.pow(2, k);
	 
	var k_comp = 32 - k;
	 
	var l = log2(log2(max_count / m));
	if (isNaN(l)) l = 1; else l = Math.ceil(l);
	var l_mask = ((1 << l) - 1) >>> 0;
	 
	var M = [];
	for (var i = 0; i < m; ++i) M[i] = 0;
 
	function count(hash)
	{
		  if (hash !== undefined)
		  {
				var j = hash >>> k_comp;
				
				var rank = 0;
				for (var i = 0; i < k_comp; ++i)
				{
					 if ((hash >>> i) & 1)
					 {
						  rank = i + 1;
						  break;
					 }
				}
				
				M[j] = Math.max(M[j], rank & l_mask);
		  }
		  else
		  {
				var c = 0;
				for (var i = 0; i < m; ++i) c += M[i];
				return 0.79402 * m * Math.pow(2, c / m);
		  }
	}
	
	return {count: count};
}

function fnv1a(text)
{
	 var hash = 2166136261;
	 for (var i = 0; i < text.length; ++i)
	 {
		  hash ^= text.charCodeAt(i);
		  hash += (hash << 1) + (hash << 4) + (hash << 7) +
			(hash << 8) + (hash << 24);
	 }
	return hash >>> 0;
}

var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words

var log_log = LogLog(0.01, 100000);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]));
alert(log_log.count());

For unknown reason implementation is very sensitive to max_error parameter, it is the main factor that determines the magnitude of the result. I'm sure, there is some stupid mistake :)

UPDATE: This problem is solved in the newer version of algorithm. I will post its implementation later.

Javascript Solutions


Solution 1 - Javascript

Here it is the updated version of the algorithm based on the newer paper:

var pow_2_32 = 0xFFFFFFFF + 1;

function HyperLogLog(std_error)
{
	 function log2(x)
	 {
		  return Math.log(x) / Math.LN2;
	 }
	 
	 function rank(hash, max)
	 {
		  var r = 1;
		  while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
		  return r;
	 }
	
	 var m = 1.04 / std_error;
	 var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
	 m = Math.pow(2, k);
	 
	 var alpha_m = m == 16 ? 0.673
		  : m == 32 ? 0.697
		  : m == 64 ? 0.709
		  : 0.7213 / (1 + 1.079 / m);
	 
	 var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
	 
	 function count(hash)
	 {
		  if (hash !== undefined)
		  {
				var j = hash >>> k_comp;
				M[j] = Math.max(M[j], rank(hash, k_comp));
		  }
		  else
		  {
				var c = 0.0;
				for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
				var E = alpha_m * m * m / c;
				
				// -- make corrections
				
				if (E <= 5/2 * m)
				{
					 var V = 0;
					 for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
					 if (V > 0) E = m * Math.log(m / V);
				}
				else if (E > 1/30 * pow_2_32)
					 E = -pow_2_32 * Math.log(1 - E / pow_2_32);
				
				// --

				return E;
		  }
	}
	
	return {count: count};
}

function fnv1a(text)
{
	 var hash = 2166136261;
	 for (var i = 0; i < text.length; ++i)
	 {
		  hash ^= text.charCodeAt(i);
		  hash += (hash << 1) + (hash << 4) + (hash << 7) +
			(hash << 8) + (hash << 24);
	 }
	 return hash >>> 0;
}

var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words

var seed = Math.floor(Math.random() * pow_2_32); // make more fun

var log_log = HyperLogLog(0.065);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed);
var count = log_log.count();
alert(count + ', error ' +
	(count - words.length) / (words.length / 100.0) + '%');

Solution 2 - Javascript

Here is a slightly modified version which adds the merge operation.

Merge allows you to take the counters from several instances of HyperLogLog, and determine the unique counters overall.

For example, if you have unique visitors collected on Monday, Tuesday and Wednesday, then you can merge the buckets together and count the number of unique visitors over the three day span:

var pow_2_32 = 0xFFFFFFFF + 1; 
function HyperLogLog(std_error)
{
    function log2(x)
    {
        return Math.log(x) / Math.LN2;
    }

    function rank(hash, max)
    {
        var r = 1;
        while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
        return r;
    }

    var m = 1.04 / std_error;
    var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
    m = Math.pow(2, k);

    var alpha_m = m == 16 ? 0.673
         : m == 32 ? 0.697
         : m == 64 ? 0.709
         : 0.7213 / (1 + 1.079 / m);

    var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
    
    function merge(other)
    {
        for (var i = 0; i < m; i++)
        M[i] = Math.max(M[i], other.buckets[i]);
    }

    function count(hash)
    {
        if (hash !== undefined)
        {
            var j = hash >>> k_comp;
            M[j] = Math.max(M[j], rank(hash, k_comp));
        }
        else
        {
            var c = 0.0;
            for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
            var E = alpha_m * m * m / c;

            // -- make corrections

            if (E <= 5/2 * m)
            {
                 var V = 0;
                 for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
                 if (V > 0) E = m * Math.log(m / V);
            }
            else if (E > 1/30 * pow_2_32)
                 E = -pow_2_32 * Math.log(1 - E / pow_2_32);

            // --

            return E;
        }
    }

    return {count: count, merge: merge, buckets: M};
}

function fnv1a(text)
{
    var hash = 2166136261;
    for (var i = 0; i < text.length; ++i)
    {
        hash ^= text.charCodeAt(i);
        hash += (hash << 1) + (hash << 4) + (hash << 7) +
          (hash << 8) + (hash << 24);
    }
    return hash >>> 0;
}

Then you can do something like this:

// initialize one counter per day
var ll_monday = HyperLogLog(0.01);
var ll_tuesday = HyperLogLog(0.01);
var ll_wednesday = HyperLogLog(0.01);


// add 5000 unique values in each day
for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random()));

// add 5000 values which appear every day
for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i));   ll_wednesday.count(fnv1a('' + i));}


// merge three days together
together = HyperLogLog(0.01);
together.merge(ll_monday);
together.merge(ll_tuesday);
together.merge(ll_wednesday);

// report
console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count()));
console.log('unique numbers overall: ' + Math.round(together.count()));

Solution 3 - Javascript

We've open sourced a project called Stream-Lib that has a LogLog implementation. The work was based on this paper.

Solution 4 - Javascript

Using the js version @actual provided, I tried to implement the same in C#, which seems close enough. Just changed fnv1a function a little bit and renamed it to getHashCode. (Credit goes to Jenkins hash function, http://en.wikipedia.org/wiki/Jenkins_hash_function)

public class HyperLogLog
{
    private double mapSize, alpha_m, k;
    private int kComplement;
    private Dictionary<int, int> Lookup = new Dictionary<int, int>();
    private const double pow_2_32 = 4294967297;

    public HyperLogLog(double stdError)
    {
        mapSize = (double)1.04 / stdError;
        k = (long)Math.Ceiling(log2(mapSize * mapSize));

        kComplement = 32 - (int)k;
        mapSize = (long)Math.Pow(2, k);

        alpha_m = mapSize == 16 ? (double)0.673
              : mapSize == 32 ? (double)0.697
              : mapSize == 64 ? (double)0.709
              : (double)0.7213 / (double)(1 + 1.079 / mapSize);
        for (int i = 0; i < mapSize; i++)
            Lookup[i] = 0;
    }

    private static double log2(double x)
    {
        return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2
    }
    private static int getRank(uint hash, int max)
    {
        int r = 1;
        uint one = 1;
        while ((hash & one) == 0 && r <= max)
        {
            ++r;
            hash >>= 1;
        }
        return r;
    }
    public static uint getHashCode(string text)
    {
        uint hash = 0;

        for (int i = 0, l = text.Length; i < l; i++)
        {
            hash += (uint)text[i];
            hash += hash << 10;
            hash ^= hash >> 6;
        }
        hash += hash << 3;
        hash ^= hash >> 6;
        hash += hash << 16;
      
        return hash;
    }

    public int Count()
    {
        double c = 0, E;

        for (var i = 0; i < mapSize; i++)
            c += 1d / Math.Pow(2, (double)Lookup[i]);

        E = alpha_m * mapSize * mapSize / c;

        // Make corrections & smoothen things.
        if (E <= (5 / 2) * mapSize)
        {
            double V = 0;
            for (var i = 0; i < mapSize; i++)
                if (Lookup[i] == 0) V++;
            if (V > 0)
                E = mapSize * Math.Log(mapSize / V);
        }
        else
            if (E > (1 / 30) * pow_2_32)
                E = -pow_2_32 * Math.Log(1 - E / pow_2_32);
        // Made corrections & smoothen things, or not.

        return (int)E;
    }

    public void Add(object val)
    {
        uint hashCode = getHashCode(val.ToString());
        int j = (int)(hashCode >> kComplement);

        Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement));
    }
} 

Solution 5 - Javascript

I know this is an old post but the @buryat implementation has moved, and is in any case incomplete, and a bit on the slow side (sorry o_o ).

I've taken the implementation used by the new Redis release which can be found here and ported it to PHP. The repo is here https://github.com/joegreen0991/HyperLogLog

<?php

class HyperLogLog {

    private $HLL_P_MASK;

    private $HLL_REGISTERS;

    private $ALPHA;

    private $registers;

    public function __construct($HLL_P = 14)
    {
        $this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */

        $this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */

        $this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS);

        $this->registers = new SplFixedArray($this->HLL_REGISTERS);

        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $this->registers[$i] = 0;
        }
    }

    public function add($v)
    {
        $h = crc32(md5($v));

        $h |= 1 << 63; /* Make sure the loop terminates. */
        $bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */
        $count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
        while(($h & $bit) == 0) {
            $count++;
            $bit <<= 1;
        }

        /* Update the register if this element produced a longer run of zeroes. */
        $index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */

        if ($this->registers[$index] < $count) {
            $this->registers[$index] = $count;
        }
    }

    public function export()
    {
        $str = '';
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $str .= chr($this->registers[$i]);
        }
        return $str;
    }

    public function import($str)
    {
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0;
        }
    }

    public function merge($str)
    {
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            if(isset($str[$i]))
            {
                $ord = ord($str[$i]);
                if ($this->registers[$i] < $ord) {
                    $this->registers[$i] = $ord;
                }
            }

        }
    }

    /**
     * @static
     * @param $arr
     * @return int Number of unique items in $arr
     */
    public function count() {
        $E = 0;

        $ez = 0;

        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            if ($this->registers[$i] !== 0) {
                $E += (1.0 / pow(2, $this->registers[$i]));
            } else {
                $ez++;
                $E += 1.0;
            }
        }

        $E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS;

        /* Use the LINEARCOUNTING algorithm for small cardinalities.
         * For larger values but up to 72000 HyperLogLog raw approximation is
         * used since linear counting error starts to increase. However HyperLogLog
         * shows a strong bias in the range 2.5*16384 - 72000, so we try to
         * compensate for it. */
        if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) {
            $E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez);
        }

        else if ($this->HLL_REGISTERS == 16384 && $E < 72000) {
            // We did polynomial regression of the bias for this range, this
            // way we can compute the bias for a given cardinality and correct
            // according to it. Only apply the correction for P=14 that's what
            // we use and the value the correction was verified with.
            $bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E)
                -1.4253 * 1.0e-12 * ($E*$E*$E)+
                1.2940 * 1.0e-7 * ($E*$E)
                -5.2921 * 1.0e-3 * $E+
                83.3216;
            $E -= $E * ($bias/100);
        }

        return floor($E);
    }
}

Solution 6 - Javascript

I implemented loglog and hyperloglog in JS and PHP and well-commented code https://github.com/buryat/loglog

Attributions

All content for this solution is sourced from the original question on Stackoverflow.

The content on this page is licensed under the Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.

Content TypeOriginal AuthorOriginal Content on Stackoverflow
QuestionactualView Question on Stackoverflow
Solution 1 - JavascriptactualView Answer on Stackoverflow
Solution 2 - JavascriptGary WeissView Answer on Stackoverflow
Solution 3 - Javascriptuser1032819View Answer on Stackoverflow
Solution 4 - Javascriptadnan korkmazView Answer on Stackoverflow
Solution 5 - JavascriptJoe GreenView Answer on Stackoverflow
Solution 6 - JavascriptburyatView Answer on Stackoverflow