Python实现布隆过滤器

发布时间:2019-08-26 07:55:33编辑:auto阅读(1919)

    转载自:http://blog.csdn.net/demon24/article/details/8537665

    http://blog.csdn.net/u013402746/article/details/28414901          http://palydawn.blog.163.com/blog/static/182969056201210260470485/

    #_*_coding:utf_8_
    import BitVector
    import os
    import sys
    
    class SimpleHash():  
        
        def __init__(self, capability, seed):
            self.capability = capability
            self.seed = seed
    
        #传入的value即为url值,ord(value[i])表示第i位字符的ascii码值
        def hash(self, value):
            ret = 0
            for i in range(len(value)):
                ret += self.seed*ret + ord(value[i])
            #最终产生的随机数是二进制向量最大下标与随机数的按位与结果
            return (self.capability-1) & ret    
    
    class BloomFilter():
        
        def __init__(self, BIT_SIZE=1<<25):
            self.BIT_SIZE = 1 << 25
            self.seeds = [5, 7, 11, 13, 31, 37, 61]
            #建立一个大小为1<<25=33554432位的二进制向量,分配内存
        	self.bitset = BitVector.BitVector(size=self.BIT_SIZE)
            self.hashFunc = []
            #利用8个素数初始化8个随机数生成器
            for i in range(len(self.seeds)):
                self.hashFunc.append(SimpleHash(self.BIT_SIZE, self.seeds[i]))
            
        def insert(self, value):
            for f in self.hashFunc:
                loc = f.hash(value)
                self.bitset[loc] = 1
        def isContaions(self, value):
            if value == None:
                return False
            ret = True
            for f in self.hashFunc:
                loc = f.hash(value)
                #用同样的随机数产生方法对比相应位的二进制值,只要发现有一个不同即返回结果为假
                ret = ret & self.bitset[loc]
                if ret==False:
                    return ret
            #只有当8个二进制位都相等时才返回真
            return ret
    
    def main():
        fd = open("urls.txt")
        bloomfilter = BloomFilter()
        while True:
            #url = raw_input()
            url = fd.readline()
            if cmp(url, 'exit') == 0: #if 'exit' then break
                break
            if bloomfilter.isContaions(url) == False:
                bloomfilter.insert(url)
            else:
                print 'url :%s has exist' % url 
                
    main()

    程序根据网页链接的个数计算所需最佳空间大小,即二进制向量的位数,以及所需随机生成器的哈希函数个数:

    def __init__(self, error_rate, elementNum):
    
            #计算所需要的bit数
            self.bit_num = -1 * elementNum * cmath.log(error_rate) / (cmath.log(2.0) * cmath.log(2.0))
            #四字节对齐
            self.bit_num = self.align_4byte(self.bit_num.real)
    
            #分配内存
            self.bit_array = BitVector(size=self.bit_num)
    
            #计算hash函数个数
            self.hash_num = cmath.log(2) * self.bit_num / elementNum
            self.hash_num = self.hash_num.real
            #向上取整
            self.hash_num = int(self.hash_num) + 1
    
            #产生hash函数种子
            self.hash_seeds = self.generate_hashseeds(self.hash_num)

    生成指定的前n位素数:

    def is_prime(n):
    
        if n == 9:
            return False
        for i in range(3,int(n**0.5)):
            if n%i == 0:
                return False
        return True
    
    def find_prime( n ): #找到从5开始的n个素数,使用素数筛法
        prime = []
        i = 5
        while len(prime) != n:
            flag = False
            for j in prime:
                if i % j == 0:
                    flag = True
                    i += 1
                    break
            if flag: #如果能被素数整除就跳过一轮循环
                continue
    
            if is_prime(i):
                prime.append(i)
            i += 1
    
        return prime

     

关键字