Java實現的Bloom Filter

jopen 8年前發布 | 12K 次閱讀 算法 Bloom Filter

原文  http://sobuhu.com/algorithm/2013/03/04/java-bloom-filter.html


Java實現簡單的Bloom Filter

布隆過濾器在信息去重,比如網頁的去重、郵件去重以及URL去重上很常用,其原理也相對簡單,主要是通過多個哈希函數來設置bit位作為某條記錄的指紋,查詢和空間效率都非常不錯。

一、基本原理

這篇博客講的非常好,我就不再狗尾續貂了:

http://www.cnblogs.com/haippy/archive/2012/07/13/2590351.html

Java實現的Bloom Filter

簡單來說,就是對數據庫中的記錄$X,Y,Z$分別使用多個(圖中為3個)哈希函數,映射到對應的比特位,將比特位置1,如果下一個數據使用相同三個哈希后對應的比特位全為1,則該數據已存在數據庫中。

推導結論為(m為bit位個數,n為待添加的記錄數,p為沖突概率)

如果已經限定了bitset容量和待添加的記錄數,則最優的哈希函數的個數k為

此時沖突率為:

如果已經限定了沖突概率,則bit數組的大小最優為:

二、構造函數

public class Bloomfilter {
    private BitSet bitSet;
    private int bitSetSize;
    private int addedElements;
    private int hashFunctionNumber;
    /**

 * 構造一個布隆過濾器,過濾器的容量為c * n 個bit.
 * @param c 當前過濾器預先開辟的最大包含記錄,通常要比預計存入的記錄多一倍.
 * @param n 當前過濾器預計所要包含的記錄.
 * @param k 哈希函數的個數,等同每條記錄要占用的bit數.
 */
public Bloomfilter(int c, int n, int k) {
    this.hashFunctionNumber = k;
    this.bitSetSize = (int) Math.ceil(c * k);
    this.addedElements = n;
    this.bitSet = new BitSet(this.bitSetSize);
}</pre><br />

通常我們要預設一個相對大一點的bit空間,這樣沖突才會比較小,哈希函數的數量K也是越多越好,但是通常8個以上就行了,要看能接受的沖突閾值。

代碼中也提供了獲得當前沖突率的函數。

三、完整代碼

完整代碼包括兩個類,一個哈希函數庫(從網上搜的8個hash函數),和過濾器類:

package crawler.analysis;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
public class Bloomfilter {
    private BitSet bitSet;
    private int bitSetSize;
    private int addedElements;
    private int hashFunctionNumber;
    /**

 * 構造一個布隆過濾器,過濾器的容量為c * n 個bit.
 * @param c 當前過濾器預先開辟的最大包含記錄,通常要比預計存入的記錄多一倍.
 * @param n 當前過濾器預計所要包含的記錄.
 * @param k 哈希函數的個數,等同每條記錄要占用的bit數.
 */
public Bloomfilter(int c, int n, int k) {
    this.hashFunctionNumber = k;
    this.bitSetSize = (int) Math.ceil(c * k);
    this.addedElements = n;
    this.bitSet = new BitSet(this.bitSetSize);
}
/**
 * 通過文件初始化過濾器.
 * @param file
 */
public void init(String file) {
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(file));
        String line = reader.readLine();
        while (line != null && line.length() > 0) {
            this.put(line);
            line = reader.readLine();
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (reader != null) reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
public void put(String str) {
    int[] positions = createHashes(str.getBytes(), hashFunctionNumber);
    for (int i = 0; i < positions.length; i++) {
        int position = Math.abs(positions[i] % bitSetSize);
        bitSet.set(position, true);
    }
}
public boolean contains(String str) {
    byte[] bytes = str.getBytes();
    int[] positions = createHashes(bytes, hashFunctionNumber);
    for (int i : positions) {
        int position = Math.abs(i % bitSetSize);
        if (!bitSet.get(position)) {
            return false;
        }
    }
    return true;
}
/**
 * 得到當前過濾器的錯誤率.
 * @return
 */
public double getFalsePositiveProbability() {
    // (1 - e^(-k * n / m)) ^ k
    return Math.pow((1 - Math.exp(-hashFunctionNumber * (double) addedElements / bitSetSize)),
            hashFunctionNumber);
}
/**
 * 將字符串的字節表示進行多哈希編碼.
 * @param bytes 待添加進過濾器的字符串字節表示.
 * @param hashNumber 要經過的哈希個數.
 * @return 各個哈希的結果數組.
 */
public static int[] createHashes(byte[] bytes, int hashNumber) {
    int[] result = new int[hashNumber];
    int k = 0;
    while (k < hashNumber) {
        result[k] = HashFunctions.hash(bytes, k);
        k++;
    }
    return result;
}
public static void main(String[] args) throws Exception {
    Bloomfilter bloomfilter = new Bloomfilter(30000000, 10000000, 8);
    System.out.println("Bloom Filter Initialize ... ");
    bloomfilter.init("data/base.txt");
    System.out.println("Bloom Filter Ready");
    System.out.println("False Positive Probability : "
            + bloomfilter.getFalsePositiveProbability());
    // 查找新數據
    List<String> result = new ArrayList<String>();
    long t1 = System.currentTimeMillis();
    BufferedReader reader = new BufferedReader(new FileReader("data/input.txt"));
    String line = reader.readLine();
    while (line != null && line.length() > 0) {
        if (!bloomfilter.contains(line)) {
            result.add(line);
        }
        line = reader.readLine();
    }
    reader.close();
    long t2 = System.currentTimeMillis();
    System.out.println("Parse 9900000 items, Time : " + (t2 - t1) + "ms , find "
            + result.size() + " new items.");
    System.out.println("Average : " + 9900000 / ((t2 - t1) / 1000) + " items/second");
}

} class HashFunctions { public static int hash(byte[] bytes, int k) { switch (k) { case 0: return RSHash(bytes); case 1: return JSHash(bytes); case 2: return ELFHash(bytes); case 3: return BKDRHash(bytes); case 4: return APHash(bytes); case 5: return DJBHash(bytes); case 6: return SDBMHash(bytes); case 7: return PJWHash(bytes); } return 0; } public static int RSHash(byte[] bytes) { int hash = 0; int magic = 63689; int len = bytes.length; for (int i = 0; i < len; i++) { hash = hash magic + bytes[i]; magic = magic 378551; } return hash; } public static int JSHash(byte[] bytes) { int hash = 1315423911; for (int i = 0; i < bytes.length; i++) { hash ^= ((hash << 5) + bytes[i] + (hash >> 2)); } return hash; } public static int ELFHash(byte[] bytes) { int hash = 0; int x = 0; int len = bytes.length; for (int i = 0; i < len; i++) { hash = (hash << 4) + bytes[i]; if ((x = hash & 0xF0000000) != 0) { hash ^= (x >> 24); hash &= ~x; } } return hash; } public static int BKDRHash(byte[] bytes) { int seed = 131; int hash = 0; int len = bytes.length; for (int i = 0; i < len; i++) { hash = (hash seed) + bytes[i]; } return hash; } public static int APHash(byte[] bytes) { int hash = 0; int len = bytes.length; for (int i = 0; i < len; i++) { if ((i & 1) == 0) { hash ^= ((hash << 7) ^ bytes[i] ^ (hash >> 3)); } else { hash ^= (~((hash << 11) ^ bytes[i] ^ (hash >> 5))); } } return hash; } public static int DJBHash(byte[] bytes) { int hash = 5381; int len = bytes.length; for (int i = 0; i < len; i++) { hash = ((hash << 5) + hash) + bytes[i]; } return hash; } public static int SDBMHash(byte[] bytes) { int hash = 0; int len = bytes.length; for (int i = 0; i < len; i++) { hash = bytes[i] + (hash << 6) + (hash << 16) - hash; } return hash; } public static int PJWHash(byte[] bytes) { long BitsInUnsignedInt = (4 8); long ThreeQuarters = ((BitsInUnsignedInt * 3) / 4); long OneEighth = (BitsInUnsignedInt / 8); long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); int hash = 0; long test = 0; for (int i = 0; i < bytes.length; i++) { hash = (hash << OneEighth) + bytes[i]; if ((test = hash & HighBits) != 0) { hash = (int) ((hash ^ (test >> ThreeQuarters)) & (~HighBits)); } } return hash; } }</pre>
直接運行代碼,可以得到:

Bloom Filter Initialize ...
Bloom Filter Ready
False Positive Probability : 4.169085162009671E-5
Parse 9900000 items, Time : 17288ms , find 100 new items.
Average : 582352 items/second

 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!