Java實現的Bloom Filter

jopen 10年前發布 | 12K 次閱讀算法 Bloom Filter

原文 http://sobuhu.com/algorithm/2013/03/04/java-bloom-filter.html

Java實現簡單的Bloom Filter

布隆過濾器在信息去重，比如網頁的去重、郵件去重以及URL去重上很常用，其原理也相對簡單，主要是通過多個哈希函數來設置bit位作為某條記錄的指紋，查詢和空間效率都非常不錯。

一、基本原理

這篇博客講的非常好，我就不再狗尾續貂了：

http://www.cnblogs.com/haippy/archive/2012/07/13/2590351.html

Java實現的Bloom Filter

簡單來說，就是對數據庫中的記錄$X，Y，Z$分別使用多個（圖中為3個）哈希函數，映射到對應的比特位，將比特位置1，如果下一個數據使用相同三個哈希后對應的比特位全為1，則該數據已存在數據庫中。

推導結論為(m為bit位個數，n為待添加的記錄數，p為沖突概率)

如果已經限定了bitset容量和待添加的記錄數，則最優的哈希函數的個數k為

此時沖突率為：

如果已經限定了沖突概率，則bit數組的大小最優為：

二、構造函數

public class Bloomfilter {
    private BitSet bitSet;
    private int bitSetSize;
    private int addedElements;
    private int hashFunctionNumber;
    /**

 * 構造一個布隆過濾器，過濾器的容量為c * n 個bit.
 * @param c 當前過濾器預先開辟的最大包含記錄,通常要比預計存入的記錄多一倍.
 * @param n 當前過濾器預計所要包含的記錄.
 * @param k 哈希函數的個數，等同每條記錄要占用的bit數.
 */
public Bloomfilter(int c, int n, int k) {
    this.hashFunctionNumber = k;
    this.bitSetSize = (int) Math.ceil(c * k);
    this.addedElements = n;
    this.bitSet = new BitSet(this.bitSetSize);
}</pre><br />

通常我們要預設一個相對大一點的bit空間，這樣沖突才會比較小，哈希函數的數量K也是越多越好，但是通常8個以上就行了，要看能接受的沖突閾值。 

代碼中也提供了獲得當前沖突率的函數。 

三、完整代碼 

完整代碼包括兩個類，一個哈希函數庫（從網上搜的8個hash函數），和過濾器類： 

package crawler.analysis;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
public class Bloomfilter {
    private BitSet bitSet;
    private int bitSetSize;
    private int addedElements;
    private int hashFunctionNumber;
    /**

 * 構造一個布隆過濾器，過濾器的容量為c * n 個bit.
 * @param c 當前過濾器預先開辟的最大包含記錄,通常要比預計存入的記錄多一倍.
 * @param n 當前過濾器預計所要包含的記錄.
 * @param k 哈希函數的個數，等同每條記錄要占用的bit數.
 */
public Bloomfilter(int c, int n, int k) {
    this.hashFunctionNumber = k;
    this.bitSetSize = (int) Math.ceil(c * k);
    this.addedElements = n;
    this.bitSet = new BitSet(this.bitSetSize);
}
/**
 * 通過文件初始化過濾器.
 * @param file
 */
public void init(String file) {
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new FileReader(file));
        String line = reader.readLine();
        while (line != null && line.length() > 0) {
            this.put(line);
            line = reader.readLine();
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (reader != null) reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
public void put(String str) {
    int[] positions = createHashes(str.getBytes(), hashFunctionNumber);
    for (int i = 0; i < positions.length; i++) {
        int position = Math.abs(positions[i] % bitSetSize);
        bitSet.set(position, true);
    }
}
public boolean contains(String str) {
    byte[] bytes = str.getBytes();
    int[] positions = createHashes(bytes, hashFunctionNumber);
    for (int i : positions) {
        int position = Math.abs(i % bitSetSize);
        if (!bitSet.get(position)) {
            return false;
        }
    }
    return true;
}
/**
 * 得到當前過濾器的錯誤率.
 * @return
 */
public double getFalsePositiveProbability() {
    // (1 - e^(-k * n / m)) ^ k
    return Math.pow((1 - Math.exp(-hashFunctionNumber * (double) addedElements / bitSetSize)),
            hashFunctionNumber);
}
/**
 * 將字符串的字節表示進行多哈希編碼.
 * @param bytes 待添加進過濾器的字符串字節表示.
 * @param hashNumber 要經過的哈希個數.
 * @return 各個哈希的結果數組.
 */
public static int[] createHashes(byte[] bytes, int hashNumber) {
    int[] result = new int[hashNumber];
    int k = 0;
    while (k < hashNumber) {
        result[k] = HashFunctions.hash(bytes, k);
        k++;
    }
    return result;
}
public static void main(String[] args) throws Exception {
    Bloomfilter bloomfilter = new Bloomfilter(30000000, 10000000, 8);
    System.out.println("Bloom Filter Initialize ... ");
    bloomfilter.init("data/base.txt");
    System.out.println("Bloom Filter Ready");
    System.out.println("False Positive Probability : "
            + bloomfilter.getFalsePositiveProbability());
    // 查找新數據
    List<String> result = new ArrayList<String>();
    long t1 = System.currentTimeMillis();
    BufferedReader reader = new BufferedReader(new FileReader("data/input.txt"));
    String line = reader.readLine();
    while (line != null && line.length() > 0) {
        if (!bloomfilter.contains(line)) {
            result.add(line);
        }
        line = reader.readLine();
    }
    reader.close();
    long t2 = System.currentTimeMillis();
    System.out.println("Parse 9900000 items, Time : " + (t2 - t1) + "ms , find "
            + result.size() + " new items.");
    System.out.println("Average : " + 9900000 / ((t2 - t1) / 1000) + " items/second");
}

}
class HashFunctions {
    public static int hash(byte[] bytes, int k) {
        switch (k) {
            case 0:
                return RSHash(bytes);
            case 1:
                return JSHash(bytes);
            case 2:
                return ELFHash(bytes);
            case 3:
                return BKDRHash(bytes);
            case 4:
                return APHash(bytes);
            case 5:
                return DJBHash(bytes);
            case 6:
                return SDBMHash(bytes);
            case 7:
                return PJWHash(bytes);
        }
        return 0;
    }
    public static int RSHash(byte[] bytes) {
        int hash = 0;
        int magic = 63689;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = hash  magic + bytes[i];
            magic = magic  378551;
        }
        return hash;
    }
    public static int JSHash(byte[] bytes) {
        int hash = 1315423911;
        for (int i = 0; i < bytes.length; i++) {
            hash ^= ((hash << 5) + bytes[i] + (hash >> 2));
        }
        return hash;
    }
    public static int ELFHash(byte[] bytes) {
        int hash = 0;
        int x = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = (hash << 4) + bytes[i];
            if ((x = hash & 0xF0000000) != 0) {
                hash ^= (x >> 24);
                hash &= ~x;
            }
        }
        return hash;
    }
    public static int BKDRHash(byte[] bytes) {
        int seed = 131;
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = (hash  seed) + bytes[i];
        }
        return hash;
    }
    public static int APHash(byte[] bytes) {
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            if ((i & 1) == 0) {
                hash ^= ((hash << 7) ^ bytes[i] ^ (hash >> 3));
            } else {
                hash ^= (~((hash << 11) ^ bytes[i] ^ (hash >> 5)));
            }
        }
        return hash;
    }
    public static int DJBHash(byte[] bytes) {
        int hash = 5381;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = ((hash << 5) + hash) + bytes[i];
        }
        return hash;
    }
    public static int SDBMHash(byte[] bytes) {
        int hash = 0;
        int len = bytes.length;
        for (int i = 0; i < len; i++) {
            hash = bytes[i] + (hash << 6) + (hash << 16) - hash;
        }
        return hash;
    }
    public static int PJWHash(byte[] bytes) {
        long BitsInUnsignedInt = (4  8);
        long ThreeQuarters = ((BitsInUnsignedInt * 3) / 4);
        long OneEighth = (BitsInUnsignedInt / 8);
        long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
        int hash = 0;
        long test = 0;
        for (int i = 0; i < bytes.length; i++) {
            hash = (hash << OneEighth) + bytes[i];
            if ((test = hash & HighBits) != 0) {
                hash = (int) ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
            }
        }
        return hash;
    }
}</pre>

直接運行代碼，可以得到： 
Bloom Filter Initialize ...
Bloom Filter Ready
False Positive Probability : 4.169085162009671E-5
Parse 9900000 items, Time : 17288ms , find 100 new items.
Average : 582352 items/second

本文由用戶 jopen 自行上傳分享，僅供網友學習交流。所有權歸原作者，若您的權利被侵害，請聯系管理員。

轉載本站原創文章，請注明出處，并保留原始鏈接、圖片水印。

本站是一個以用戶分享為主的開源技術平臺，歡迎各類分享！

本文地址：http://www.baiduhome.net/lib/view/open1450879369292.html

算法 Bloom Filter

Java實現的Bloom Filter

Java實現簡單的Bloom Filter

一、基本原理

推導結論為(m為bit位個數，n為待添加的記錄數，p為沖突概率)

二、構造函數

相關經驗

相關資訊

相關文檔

目錄