用于计算大基数的 LogLog 和 HyperLogLog 算法



在哪里可以找到 LogLog 算法的有效实现?我试图自己实现它,但我的执行草案产生了奇怪的结果。

在这里:

function LogLog(max_error, max_count)
{
    function log2(x)
    {
         return Math.log(x) / Math.LN2;
    }
    var m = 1.30 / max_error;
    var k = Math.ceil(log2(m * m));
    m = Math.pow(2, k);
    var k_comp = 32 - k;
    var l = log2(log2(max_count / m));
    if (isNaN(l)) l = 1; else l = Math.ceil(l);
    var l_mask = ((1 << l) - 1) >>> 0;
    var M = [];
    for (var i = 0; i < m; ++i) M[i] = 0;
    function count(hash)
    {
          if (hash !== undefined)
          {
                var j = hash >>> k_comp;
                var rank = 0;
                for (var i = 0; i < k_comp; ++i)
                {
                     if ((hash >>> i) & 1)
                     {
                          rank = i + 1;
                          break;
                     }
                }
                M[j] = Math.max(M[j], rank & l_mask);
          }
          else
          {
                var c = 0;
                for (var i = 0; i < m; ++i) c += M[i];
                return 0.79402 * m * Math.pow(2, c / m);
          }
    }
    return {count: count};
}
function fnv1a(text)
{
     var hash = 2166136261;
     for (var i = 0; i < text.length; ++i)
     {
          hash ^= text.charCodeAt(i);
          hash += (hash << 1) + (hash << 4) + (hash << 7) +
            (hash << 8) + (hash << 24);
     }
    return hash >>> 0;
}
var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words
var log_log = LogLog(0.01, 100000);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]));
alert(log_log.count());

由于未知原因,实现对max_error参数非常敏感,它是决定结果大小的主要因素。我敢肯定,有一些愚蠢的错误:)

更新:此问题在较新版本的算法中得到解决。我稍后会发布它的实现。

这是基于新论文的算法的更新版本:

var pow_2_32 = 0xFFFFFFFF + 1;
function HyperLogLog(std_error)
{
     function log2(x)
     {
          return Math.log(x) / Math.LN2;
     }
     function rank(hash, max)
     {
          var r = 1;
          while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
          return r;
     }
     var m = 1.04 / std_error;
     var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
     m = Math.pow(2, k);
     var alpha_m = m == 16 ? 0.673
          : m == 32 ? 0.697
          : m == 64 ? 0.709
          : 0.7213 / (1 + 1.079 / m);
     var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
     function count(hash)
     {
          if (hash !== undefined)
          {
                var j = hash >>> k_comp;
                M[j] = Math.max(M[j], rank(hash, k_comp));
          }
          else
          {
                var c = 0.0;
                for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
                var E = alpha_m * m * m / c;
                // -- make corrections
                if (E <= 5/2 * m)
                {
                     var V = 0;
                     for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
                     if (V > 0) E = m * Math.log(m / V);
                }
                else if (E > 1/30 * pow_2_32)
                     E = -pow_2_32 * Math.log(1 - E / pow_2_32);
                // --
                return E;
          }
    }
    return {count: count};
}
function fnv1a(text)
{
     var hash = 2166136261;
     for (var i = 0; i < text.length; ++i)
     {
          hash ^= text.charCodeAt(i);
          hash += (hash << 1) + (hash << 4) + (hash << 7) +
            (hash << 8) + (hash << 24);
     }
     return hash >>> 0;
}
var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words
var seed = Math.floor(Math.random() * pow_2_32); // make more fun
var log_log = HyperLogLog(0.065);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed);
var count = log_log.count();
alert(count + ', error ' +
    (count - words.length) / (words.length / 100.0) + '%');

这是一个稍作修改的版本,它添加了合并操作。

合并允许您从HyperLogLog的多个实例中获取计数器,并确定整体唯一计数器。

例如,如果您在周一、周二和周三收集了唯一身份访问者,然后,您可以将存储桶合并在一起并计算唯一身份访问者的数量在三天的跨度内:

var pow_2_32 = 0xFFFFFFFF + 1; 
function HyperLogLog(std_error)
{
    function log2(x)
    {
        return Math.log(x) / Math.LN2;
    }
    function rank(hash, max)
    {
        var r = 1;
        while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
        return r;
    }
    var m = 1.04 / std_error;
    var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
    m = Math.pow(2, k);
    var alpha_m = m == 16 ? 0.673
         : m == 32 ? 0.697
         : m == 64 ? 0.709
         : 0.7213 / (1 + 1.079 / m);
    var M = []; for (var i = 0; i < m; ++i) M[i] = 0;
    function merge(other)
    {
        for (var i = 0; i < m; i++)
        M[i] = Math.max(M[i], other.buckets[i]);
    }
    function count(hash)
    {
        if (hash !== undefined)
        {
            var j = hash >>> k_comp;
            M[j] = Math.max(M[j], rank(hash, k_comp));
        }
        else
        {
            var c = 0.0;
            for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
            var E = alpha_m * m * m / c;
            // -- make corrections
            if (E <= 5/2 * m)
            {
                 var V = 0;
                 for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
                 if (V > 0) E = m * Math.log(m / V);
            }
            else if (E > 1/30 * pow_2_32)
                 E = -pow_2_32 * Math.log(1 - E / pow_2_32);
            // --
            return E;
        }
    }
    return {count: count, merge: merge, buckets: M};
}
function fnv1a(text)
{
    var hash = 2166136261;
    for (var i = 0; i < text.length; ++i)
    {
        hash ^= text.charCodeAt(i);
        hash += (hash << 1) + (hash << 4) + (hash << 7) +
          (hash << 8) + (hash << 24);
    }
    return hash >>> 0;
}

然后你可以做这样的事情:

// initialize one counter per day
var ll_monday = HyperLogLog(0.01);
var ll_tuesday = HyperLogLog(0.01);
var ll_wednesday = HyperLogLog(0.01);

// add 5000 unique values in each day
for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random()));
// add 5000 values which appear every day
for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i));   ll_wednesday.count(fnv1a('' + i));}

// merge three days together
together = HyperLogLog(0.01);
together.merge(ll_monday);
together.merge(ll_tuesday);
together.merge(ll_wednesday);
// report
console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count()));
console.log('unique numbers overall: ' + Math.round(together.count()));

我们已经开源了一个名为Stream-Lib的项目,该项目具有LogLog实现。 这项工作基于这篇论文。

> 使用@actual提供的 js 版本,我尝试在 C# 中实现相同的内容,这似乎足够接近。只是稍微更改了fnv1a函数并将其重命名为getHashCode。(功劳归于 Jenkins 哈希函数,http://en.wikipedia.org/wiki/Jenkins_hash_function)

public class HyperLogLog
{
    private double mapSize, alpha_m, k;
    private int kComplement;
    private Dictionary<int, int> Lookup = new Dictionary<int, int>();
    private const double pow_2_32 = 4294967297;
    public HyperLogLog(double stdError)
    {
        mapSize = (double)1.04 / stdError;
        k = (long)Math.Ceiling(log2(mapSize * mapSize));
        kComplement = 32 - (int)k;
        mapSize = (long)Math.Pow(2, k);
        alpha_m = mapSize == 16 ? (double)0.673
              : mapSize == 32 ? (double)0.697
              : mapSize == 64 ? (double)0.709
              : (double)0.7213 / (double)(1 + 1.079 / mapSize);
        for (int i = 0; i < mapSize; i++)
            Lookup[i] = 0;
    }
    private static double log2(double x)
    {
        return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2
    }
    private static int getRank(uint hash, int max)
    {
        int r = 1;
        uint one = 1;
        while ((hash & one) == 0 && r <= max)
        {
            ++r;
            hash >>= 1;
        }
        return r;
    }
    public static uint getHashCode(string text)
    {
        uint hash = 0;
        for (int i = 0, l = text.Length; i < l; i++)
        {
            hash += (uint)text[i];
            hash += hash << 10;
            hash ^= hash >> 6;
        }
        hash += hash << 3;
        hash ^= hash >> 6;
        hash += hash << 16;
        return hash;
    }
    public int Count()
    {
        double c = 0, E;
        for (var i = 0; i < mapSize; i++)
            c += 1d / Math.Pow(2, (double)Lookup[i]);
        E = alpha_m * mapSize * mapSize / c;
        // Make corrections & smoothen things.
        if (E <= (5 / 2) * mapSize)
        {
            double V = 0;
            for (var i = 0; i < mapSize; i++)
                if (Lookup[i] == 0) V++;
            if (V > 0)
                E = mapSize * Math.Log(mapSize / V);
        }
        else
            if (E > (1 / 30) * pow_2_32)
                E = -pow_2_32 * Math.Log(1 - E / pow_2_32);
        // Made corrections & smoothen things, or not.
        return (int)E;
    }
    public void Add(object val)
    {
        uint hashCode = getHashCode(val.ToString());
        int j = (int)(hashCode >> kComplement);
        Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement));
    }
} 
我知道

这是一个旧帖子,但@buryat实现已经移动,无论如何都不完整,而且有点慢(对不起o_o)。

我已经采用了新的 Redis 版本使用的实现,可以在这里找到,并将其移植到 PHP。回购在这里 https://github.com/joegreen0991/HyperLogLog

<?php
class HyperLogLog {
    private $HLL_P_MASK;
    private $HLL_REGISTERS;
    private $ALPHA;
    private $registers;
    public function __construct($HLL_P = 14)
    {
        $this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */
        $this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */
        $this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS);
        $this->registers = new SplFixedArray($this->HLL_REGISTERS);
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $this->registers[$i] = 0;
        }
    }
    public function add($v)
    {
        $h = crc32(md5($v));
        $h |= 1 << 63; /* Make sure the loop terminates. */
        $bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */
        $count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
        while(($h & $bit) == 0) {
            $count++;
            $bit <<= 1;
        }
        /* Update the register if this element produced a longer run of zeroes. */
        $index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */
        if ($this->registers[$index] < $count) {
            $this->registers[$index] = $count;
        }
    }
    public function export()
    {
        $str = '';
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $str .= chr($this->registers[$i]);
        }
        return $str;
    }
    public function import($str)
    {
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0;
        }
    }
    public function merge($str)
    {
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            if(isset($str[$i]))
            {
                $ord = ord($str[$i]);
                if ($this->registers[$i] < $ord) {
                    $this->registers[$i] = $ord;
                }
            }
        }
    }
    /**
     * @static
     * @param $arr
     * @return int Number of unique items in $arr
     */
    public function count() {
        $E = 0;
        $ez = 0;
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            if ($this->registers[$i] !== 0) {
                $E += (1.0 / pow(2, $this->registers[$i]));
            } else {
                $ez++;
                $E += 1.0;
            }
        }
        $E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS;
        /* Use the LINEARCOUNTING algorithm for small cardinalities.
         * For larger values but up to 72000 HyperLogLog raw approximation is
         * used since linear counting error starts to increase. However HyperLogLog
         * shows a strong bias in the range 2.5*16384 - 72000, so we try to
         * compensate for it. */
        if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) {
            $E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez);
        }
        else if ($this->HLL_REGISTERS == 16384 && $E < 72000) {
            // We did polynomial regression of the bias for this range, this
            // way we can compute the bias for a given cardinality and correct
            // according to it. Only apply the correction for P=14 that's what
            // we use and the value the correction was verified with.
            $bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E)
                -1.4253 * 1.0e-12 * ($E*$E*$E)+
                1.2940 * 1.0e-7 * ($E*$E)
                -5.2921 * 1.0e-3 * $E+
                83.3216;
            $E -= $E * ($bias/100);
        }
        return floor($E);
    }
}

我在JS和PHP中实现了loglog和hyperloglog,并 https://github.com/buryat/loglog 注释良好的代码

相关内容

  • 没有找到相关文章

最新更新