redis源码分析之布隆过滤器-机器学习

/* Given a string element to add to the HyperLogLog, returns the length
 * of the pattern 000..1 of the element hash. As a side effect 'regp' is
 * set to the register index this element hashes to. */
int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
    uint64_t hash, bit, index;
    int count;

    /* Count the number of zeroes starting from bit HLL_REGISTERS
     * (that is a power of two corresponding to the first bit we don't use
     * as index). The max run can be 64-P+1 = Q+1 bits.
     *
     * Note that the final "1" ending the sequence of zeroes must be
     * included in the count, so if we find "001" the count is 3, and
     * the smallest count possible is no zeroes at all, just a 1 bit
     * at the first position, that is a count of 1.
     *
     * This may sound like inefficient, but actually in the average case
     * there are high probabilities to find a 1 after a few iterations. */
    hash = MurmurHash64A(ele,elesize,0xadc83b19ULL);
	// 低的14位作为下标索引  
    index = hash & HLL_P_MASK/*‭0011 1111 1111 1111‬*/; /* Register index. */
    //取高的50位数据作为count数据的操作
	//00 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 
	hash >>= HLL_P; /* 高于14位 数据保留下来 Remove bits used to address the register. */
	// 把第50位改为1 决定count的最大值是 50
	//10 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 
    hash |= ((uint64_t)1<<HLL_Q); /* Make sure the loop terminates
                                     and count will be <= Q+1. */
    bit = 1; // 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001
    count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
    while((hash & bit) == 0) {
        count++; //为什么要这样制定count的值
        bit <<= 1;
    }
    *regp = (int) index;
    return count;
}

hllDenseSet方法分析

这个需要知道 count实际储存到内存中只有6个字节的

/* Note: if we access the last counter, we will also access the b+1 byte
 * that is out of the array, but sds strings always have an implicit null
 * term, so the byte exists, and we can skip the conditional (or the need
 * to allocate 1 byte more explicitly). */

/* Store the value of the register at position 'regnum' into variable 'target'.
 * 'p' is an array of unsigned bytes. */
// 它里面操作的目的是 保存 1-52的数字到6个bit位中去 这个是这个函数要达到的目的 所以怎么实现也就无所谓了
#define HLL_DENSE_GET_REGISTER(target,p,regnum) do { \
    uint8_t *_p = (uint8_t*) p; \
    unsigned long _byte = regnum*HLL_BITS/8;/* 8 = 2 ^ 3 ==>    操作数 >> 3*/ \  
    unsigned long _fb = regnum*HLL_BITS&7; \
    unsigned long _fb8 = 8 - _fb; \
    unsigned long b0 = _p[_byte]; \
    unsigned long b1 = _p[_byte+1]; \
    target = ((b0 >> _fb) | (b1 << _fb8)) & HLL_REGISTER_MAX; /* HLL_REGISTER_MAX ==> 0011 1111*/ \
} while(0)

/* Set the value of the register at position 'regnum' to 'val'.
 * 'p' is an array of unsigned bytes. */
#define HLL_DENSE_SET_REGISTER(p,regnum,val) do { \
    uint8_t *_p = (uint8_t*) p; \
    unsigned long _byte = regnum*HLL_BITS/8;/*在初始化时 内存大小计算公式 = (HLL_REGISTERS*HLL_BITS+7)/8) */ \
    unsigned long _fb = regnum*HLL_BITS&7; /* 只有regnum的个位上是2和7可以 _fd != 0的操作*/ \ 
    unsigned long _fb8 = 8 - _fb; \
    unsigned long _v = val; \
    _p[_byte] &= ~(HLL_REGISTER_MAX << _fb); \
    _p[_byte] |= _v << _fb; /*‭0011 0100 => [<< 2] => 1101 0000 ‬*/ \
    _p[_byte+1] &= ~(HLL_REGISTER_MAX >> _fb8); \
    _p[_byte+1] |= _v >> _fb8; /*0011 0100 => [>> 6] => 0000 0011*/\
} while(0)

int hllDenseSet(uint8_t *registers, long index, uint8_t count) {
    uint8_t oldcount;

    HLL_DENSE_GET_REGISTER(oldcount,registers,index);
    if (count > oldcount) {
        HLL_DENSE_SET_REGISTER(registers,index,count);
        return 1;
    } else {
        return 0;
    }
}

2，查询有多少条数据

查询的流程图

在这里插入图片描述

52个桶数据获取代码分析

/* Compute the register histogram in the dense representation. */
void hllDenseRegHisto(uint8_t *registers/*实际内存数据*/, int* reghisto/*桶*/) {
    int j;
    /* Redis default is to use 16384 registers 6 bits each. The code works
     * with other values by modifying the defines, but for our target value
     * we take a faster path with unrolled loops. */
    if (HLL_REGISTERS == 16384 && HLL_BITS == 6) {
		
		/*char * ptr = s_malloc(86016>>2);
		if (ptr)
		{
			byte2hex(ptr, (const unsigned char *)registers, 86016>>2);
			printf("[%s][%d][hex=%s]\n", __PRETTY_FUNCTION__, __LINE__, ptr);
			s_free(ptr);
		}*/
		
        uint8_t *r = registers;
        unsigned long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,
                      r10, r11, r12, r13, r14, r15;
		// 一共内存的大小是 86016  ---> 下面统计时使用的内存大小是 12288--->还有没有使用的内存？？怎么处理呢
        for (j = 0; j < 1024; j++) {
            /* Handle 16 registers per iteration. */
            r0 = r[0] & 63; // 63 => 0011 1111
            r1 = (r[0] >> 6 | r[1] << 2) & 63;
            r2 = (r[1] >> 4 | r[2] << 4) & 63;
            r3 = (r[2] >> 2) & 63;
            r4 = r[3] & 63;
            r5 = (r[3] >> 6 | r[4] << 2) & 63;
            r6 = (r[4] >> 4 | r[5] << 4) & 63;
            r7 = (r[5] >> 2) & 63;
            r8 = r[6] & 63;
            r9 = (r[6] >> 6 | r[7] << 2) & 63;
            r10 = (r[7] >> 4 | r[8] << 4) & 63;
            r11 = (r[8] >> 2) & 63;
            r12 = r[9] & 63;
            r13 = (r[9] >> 6 | r[10] << 2) & 63;
            r14 = (r[10] >> 4 | r[11] << 4) & 63;
            r15 = (r[11] >> 2) & 63;
			//  还记得在生成hash值时产生的count值的范围吗？ [1 <= count <= 51]  ?? 思考一下 
            reghisto[r0]++;
            reghisto[r1]++;
            reghisto[r2]++;
            reghisto[r3]++;
            reghisto[r4]++;
            reghisto[r5]++;
            reghisto[r6]++;
            reghisto[r7]++;
            reghisto[r8]++;
            reghisto[r9]++;
            reghisto[r10]++;
            reghisto[r11]++;
            reghisto[r12]++;
            reghisto[r13]++;
            reghisto[r14]++;
            reghisto[r15]++;
			// 指针数组增加移动
            r += 12;
        }
    } else {
        for(j = 0; j < HLL_REGISTERS; j++) {
            unsigned long reg;
            HLL_DENSE_GET_REGISTER(reg,registers,j);
            reghisto[reg]++;
        }
    }
}

基于类似于伯努利试验的代码分析

/* Return the approximated cardinality of the set based on the harmonic
 * mean of the registers values. 'hdr' points to the start of the SDS
 * representing the String object holding the HLL representation.
 *
 * If the sparse representation of the HLL object is not valid, the integer
 * pointed by 'invalid' is set to non-zero, otherwise it is left untouched.
 *
 * hllCount() supports a special internal-only encoding of HLL_RAW, that
 * is, hdr->registers will point to an uint8_t array of HLL_REGISTERS element.
 * This is useful in order to speedup PFCOUNT when called against multiple
 * keys (no need to work with 6-bit integers encoding). */
uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
    double m = HLL_REGISTERS;
    double E;
    int j;
    int reghisto[HLL_Q+2] = {0};  // 52

    /* Compute register histogram */
    if (hdr->encoding == HLL_DENSE) {
        hllDenseRegHisto(hdr->registers,reghisto);
    } else if (hdr->encoding == HLL_SPARSE) {
        hllSparseRegHisto(hdr->registers, sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,reghisto);
    } else if (hdr->encoding == HLL_RAW) {
        hllRawRegHisto(hdr->registers,reghisto);
    } else {
        serverPanic("Unknown HyperLogLog encoding in hllCount()");
    }

    /* Estimate cardinality form register histogram. See:
     * "New cardinality estimation algorithms for HyperLogLog sketches"
     * Otmar Ertl, arXiv:1702.01284 */ 
	// m *((16384 - [52]) /m)
	//1. 调和平均数
	// 调和平均数 公式 = n((1/a1 + 1/a2 + ... + 1/an)/2)
    double z = m * hllTau((m-reghisto[HLL_Q+1])/(double)m);
    //仔细观察    没有把第一个数据reghisto[0]和最后一个数据reghisto[51] 加入进入  ---> 正态分布
	for (j = HLL_Q; j >= 1; --j) {
        z += reghisto[j];
        z *= 0.5;
    }
	//2. 标准误差 计算
    z += m * hllSigma(reghisto[0]/(double)m);
	// 调和平均数使用 0.5/ln(2)
	// 公式的应用 基数统计
    E = llroundl(HLL_ALPHA_INF*m*m/z);
    return (uint64_t) E;
}

二，布隆过滤器的数学原理分析

伯努利模型的原理

事件的各自独立的事件
结果只有两个可能(类似于抛硬币的只有正反面两种可能)

公式 $P_n(k) = C^k_np^k(1-p)^{n-k} = C^k_np^kq^{n-k} [q = 1-p]$

基于上面的抛硬币的试验

我们抛硬币只记录抛硬币出现正面的时的抛硬币的次数记为A 当我们继续大量试验，看可以得到一个比较均匀的A的值，

这个就和我们布隆过滤器的原理是差不多的，布隆过滤器中使用桶为我们的次数A，中可能就6个字节的

在这里插入图片描述

概率统计,机器学习

前言

正文

一， redis中布隆过滤器代码分析

1，插入”一条数据”

2，查询有多少条数据

二，布隆过滤器的数学原理分析

结语

CATALOG

FEATURED TAGS

FRIENDS

前言

正文

一， redis中布隆过滤器代码分析

1， 插入”一条数据”

2，查询有多少条数据

二， 布隆过滤器的数学原理分析

结语

CATALOG

FEATURED TAGS

FRIENDS

1，插入”一条数据”

二，布隆过滤器的数学原理分析