Java實現的Bloom Filter
Java實現簡單的Bloom Filter
布隆過濾器在信息去重,比如網頁的去重、郵件去重以及URL去重上很常用,其原理也相對簡單,主要是通過多個哈希函數來設置bit位作為某條記錄的指紋,查詢和空間效率都非常不錯。
一、基本原理
這篇博客講的非常好,我就不再狗尾續貂了:
http://www.cnblogs.com/haippy/archive/2012/07/13/2590351.html
簡單來說,就是對數據庫中的記錄$X,Y,Z$分別使用多個(圖中為3個)哈希函數,映射到對應的比特位,將比特位置1,如果下一個數據使用相同三個哈希后對應的比特位全為1,則該數據已存在數據庫中。
推導結論為(m為bit位個數,n為待添加的記錄數,p為沖突概率)
如果已經限定了bitset容量和待添加的記錄數,則最優的哈希函數的個數k為
此時沖突率為:
如果已經限定了沖突概率,則bit數組的大小最優為:
二、構造函數
public class Bloomfilter {
private BitSet bitSet;
private int bitSetSize;
private int addedElements;
private int hashFunctionNumber;
/**
* 構造一個布隆過濾器,過濾器的容量為c * n 個bit.
* @param c 當前過濾器預先開辟的最大包含記錄,通常要比預計存入的記錄多一倍.
* @param n 當前過濾器預計所要包含的記錄.
* @param k 哈希函數的個數,等同每條記錄要占用的bit數.
*/
public Bloomfilter(int c, int n, int k) {
this.hashFunctionNumber = k;
this.bitSetSize = (int) Math.ceil(c * k);
this.addedElements = n;
this.bitSet = new BitSet(this.bitSetSize);
}</pre><br />
通常我們要預設一個相對大一點的bit空間,這樣沖突才會比較小,哈希函數的數量K也是越多越好,但是通常8個以上就行了,要看能接受的沖突閾值。
代碼中也提供了獲得當前沖突率的函數。
三、完整代碼
完整代碼包括兩個類,一個哈希函數庫(從網上搜的8個hash函數),和過濾器類:
package crawler.analysis;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
public class Bloomfilter {
private BitSet bitSet;
private int bitSetSize;
private int addedElements;
private int hashFunctionNumber;
/**
* 構造一個布隆過濾器,過濾器的容量為c * n 個bit.
* @param c 當前過濾器預先開辟的最大包含記錄,通常要比預計存入的記錄多一倍.
* @param n 當前過濾器預計所要包含的記錄.
* @param k 哈希函數的個數,等同每條記錄要占用的bit數.
*/
public Bloomfilter(int c, int n, int k) {
this.hashFunctionNumber = k;
this.bitSetSize = (int) Math.ceil(c * k);
this.addedElements = n;
this.bitSet = new BitSet(this.bitSetSize);
}
/**
* 通過文件初始化過濾器.
* @param file
*/
public void init(String file) {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String line = reader.readLine();
while (line != null && line.length() > 0) {
this.put(line);
line = reader.readLine();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (reader != null) reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public void put(String str) {
int[] positions = createHashes(str.getBytes(), hashFunctionNumber);
for (int i = 0; i < positions.length; i++) {
int position = Math.abs(positions[i] % bitSetSize);
bitSet.set(position, true);
}
}
public boolean contains(String str) {
byte[] bytes = str.getBytes();
int[] positions = createHashes(bytes, hashFunctionNumber);
for (int i : positions) {
int position = Math.abs(i % bitSetSize);
if (!bitSet.get(position)) {
return false;
}
}
return true;
}
/**
* 得到當前過濾器的錯誤率.
* @return
*/
public double getFalsePositiveProbability() {
// (1 - e^(-k * n / m)) ^ k
return Math.pow((1 - Math.exp(-hashFunctionNumber * (double) addedElements / bitSetSize)),
hashFunctionNumber);
}
/**
* 將字符串的字節表示進行多哈希編碼.
* @param bytes 待添加進過濾器的字符串字節表示.
* @param hashNumber 要經過的哈希個數.
* @return 各個哈希的結果數組.
*/
public static int[] createHashes(byte[] bytes, int hashNumber) {
int[] result = new int[hashNumber];
int k = 0;
while (k < hashNumber) {
result[k] = HashFunctions.hash(bytes, k);
k++;
}
return result;
}
public static void main(String[] args) throws Exception {
Bloomfilter bloomfilter = new Bloomfilter(30000000, 10000000, 8);
System.out.println("Bloom Filter Initialize ... ");
bloomfilter.init("data/base.txt");
System.out.println("Bloom Filter Ready");
System.out.println("False Positive Probability : "
+ bloomfilter.getFalsePositiveProbability());
// 查找新數據
List<String> result = new ArrayList<String>();
long t1 = System.currentTimeMillis();
BufferedReader reader = new BufferedReader(new FileReader("data/input.txt"));
String line = reader.readLine();
while (line != null && line.length() > 0) {
if (!bloomfilter.contains(line)) {
result.add(line);
}
line = reader.readLine();
}
reader.close();
long t2 = System.currentTimeMillis();
System.out.println("Parse 9900000 items, Time : " + (t2 - t1) + "ms , find "
+ result.size() + " new items.");
System.out.println("Average : " + 9900000 / ((t2 - t1) / 1000) + " items/second");
}
}
class HashFunctions {
public static int hash(byte[] bytes, int k) {
switch (k) {
case 0:
return RSHash(bytes);
case 1:
return JSHash(bytes);
case 2:
return ELFHash(bytes);
case 3:
return BKDRHash(bytes);
case 4:
return APHash(bytes);
case 5:
return DJBHash(bytes);
case 6:
return SDBMHash(bytes);
case 7:
return PJWHash(bytes);
}
return 0;
}
public static int RSHash(byte[] bytes) {
int hash = 0;
int magic = 63689;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = hash magic + bytes[i];
magic = magic 378551;
}
return hash;
}
public static int JSHash(byte[] bytes) {
int hash = 1315423911;
for (int i = 0; i < bytes.length; i++) {
hash ^= ((hash << 5) + bytes[i] + (hash >> 2));
}
return hash;
}
public static int ELFHash(byte[] bytes) {
int hash = 0;
int x = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = (hash << 4) + bytes[i];
if ((x = hash & 0xF0000000) != 0) {
hash ^= (x >> 24);
hash &= ~x;
}
}
return hash;
}
public static int BKDRHash(byte[] bytes) {
int seed = 131;
int hash = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = (hash seed) + bytes[i];
}
return hash;
}
public static int APHash(byte[] bytes) {
int hash = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
if ((i & 1) == 0) {
hash ^= ((hash << 7) ^ bytes[i] ^ (hash >> 3));
} else {
hash ^= (~((hash << 11) ^ bytes[i] ^ (hash >> 5)));
}
}
return hash;
}
public static int DJBHash(byte[] bytes) {
int hash = 5381;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = ((hash << 5) + hash) + bytes[i];
}
return hash;
}
public static int SDBMHash(byte[] bytes) {
int hash = 0;
int len = bytes.length;
for (int i = 0; i < len; i++) {
hash = bytes[i] + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
public static int PJWHash(byte[] bytes) {
long BitsInUnsignedInt = (4 8);
long ThreeQuarters = ((BitsInUnsignedInt * 3) / 4);
long OneEighth = (BitsInUnsignedInt / 8);
long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
int hash = 0;
long test = 0;
for (int i = 0; i < bytes.length; i++) {
hash = (hash << OneEighth) + bytes[i];
if ((test = hash & HighBits) != 0) {
hash = (int) ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
}
}
return hash;
}
}</pre>
直接運行代碼,可以得到:
Bloom Filter Initialize ...
Bloom Filter Ready
False Positive Probability : 4.169085162009671E-5
Parse 9900000 items, Time : 17288ms , find 100 new items.
Average : 582352 items/second