## 一般方法

import random
import sys

def generate_random_phone():
"""
随机生成11位的字符串
"""
phone = ''
for j in range(0, 11):
phone += str(random.randint(0, 9))
return phone

# 10万个黑名单用户
black_list = []
for i in range(0, 100000):
black_list.append(generate_random_phone())

# 转成集合
black_set = set(black_list)
print(len(black_list), len(black_set))
# 看一下两种数据结构的空间占用
print("size of black_list: %f M" % (sys.getsizeof(black_list) / 1024 / 1024))
print("size of black_set: %f M" % (sys.getsizeof(black_set) / 1024 / 1024))

def brute_force_find():
"""
直接列表线性查找，随机查一个存在或者不存在的元素, O(n)
"""
if random.randint(0, 10) % 2:
target = black_list[random.randint(0, len(black_list))]
return __brute_force_find(target)
else:
return __brute_force_find(generate_random_phone())

def __brute_force_find(target):
for i in range(0, len(black_list)):
if target == black_list[i]:
return True
return False

def set_find():
"""
集合查找，随机查一个存在或者不存在的元素, O(1)
"""
if random.randint(0, 10) % 2:
target = black_list[random.randint(0, len(black_list))]
return __set_find(target)
else:
return __set_find(generate_random_phone())

def __set_find(target):
return target in black_set

print(brute_force_find())
print(set_find())

100000 100000
size of black_list: 0.786270 M
size of black_set: 4.000214 M
True
True


import timeit

print(timeit.repeat('brute_force_find()', number=100, setup="from __main__ import brute_force_find"))
print(timeit.repeat('set_find()', number=100, setup="from __main__ import set_find"))

[0.8502976149320602, 0.8765472685918212, 0.9624506058171391]
[0.0016423738561570644, 0.0013590981252491474, 0.0014535998925566673]


## 布隆过滤器原理

from bitarray import bitarray
import mmh3

class BloomFilter:
def __init__(self, arr):
# 位图长度暂定为20倍黑名单库的大小
self.SIZE = 20 * len(arr)
self.bit_array = bitarray(self.SIZE)
self.bit_array.setall(0)
for item in arr:
for pos in self.get_positions(item):
self.bit_array[pos] = 1

def get_positions(self, val):
# 使用10个哈希函数，murmurhash算法，返回索引值
return [mmh3.hash(val, i) % self.SIZE for i in range(40, 50)]

def find(self, val):
for pos in self.get_positions(val):
if self.bit_array[pos] == 0:
return False
return True

bloomFilter = BloomFilter(black_list)
print("size of bloomFilter's bit_array: %f M" % (sys.getsizeof(bloomFilter.bit_array) / 1024 / 1024))

def get_error_rate():
# 用1w个随机手机号，测试布隆过滤器的错误率
size = 10000
error_count = 0
for i in range(0, size):
phone = generate_random_phone()
bloom_filter_result = bloomFilter.find(phone)
set_result = __set_find(phone)
if bloom_filter_result != set_result:
error_count += 1
return error_count / size

print(get_error_rate())

size of bloomFilter's bit_array: 0.000092 M
0.0001


def bloom_filter_find():
if random.randint(0, 10) % 2:
target = black_list[random.randint(0, len(black_list))]
return bloomFilter.find(target)
else:
return bloomFilter.find(generate_random_phone())

print(timeit.repeat('brute_force_find()', number=100, setup="from __main__ import brute_force_find"))
print(timeit.repeat('set_find()', number=100, setup="from __main__ import set_find"))
print(timeit.repeat('bloom_filter_find()', number=100, setup="from __main__ import bloom_filter_find"))

[0.70748823415488, 0.7686979519203305, 0.7785645266994834]
[0.001686999574303627, 0.002007704693824053, 0.0013333242386579514]
[0.001962156966328621, 0.0018132571130990982, 0.0023592300713062286]