Naive Bayesian文本分類器

jopen 8年前發布 | 11K 次閱讀 機器學習

貝葉斯學習方法中實用性很高的一種為樸素貝葉斯學習器,常被稱為樸素貝葉斯分類器。在某些領域中與神經網絡和決策樹學習相當。雖然樸素貝葉斯分類器忽略單詞間的依賴關系,即假設所有單詞是條件獨立的,但樸素貝葉斯分類在實際應用中有很出色的表現。

樸素貝葉斯文本分類算法偽代碼:


樸素貝葉斯文本分類算法流程:


通過計算訓練集中每個類別的概率與不同類別下每個單詞的概率,然后利用樸素貝葉斯公式計算新文檔被分類為各個類別的概率,最終輸出概率最大的類別。

C++源碼:

/*
    Bayesian classifier for document classification
    15S103182
    Ethan
    2015.12.27
*/
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
/*
 * Convert a string to an int via stream extraction.
 * Returns 0 when the string does not begin with a valid integer
 * (previously `b` was left uninitialized on a failed parse, so the
 * function could return an indeterminate value).
 */
int stringToInteger(string a){
    stringstream ss;
    ss<<a;
    int b = 0; // defined fallback if extraction fails
    ss>>b;
    return b;
}
/*
 * Read whitespace-separated integer class labels from `dataset`.
 * Returns an empty vector (after printing a diagnostic) when the file
 * cannot be opened.
 *
 * Fixed: the original `while(!file.eof())` loop pushed one bogus
 * trailing element, because eof() only becomes true AFTER a read
 * fails; testing the extraction itself avoids that. The unused
 * counter `i` was removed, and labels are now extracted directly as
 * ints instead of going through a string round-trip.
 */
vector<int> openClassificationFile(const char* dataset){
    ifstream file(dataset);
    if(!file)
    {
        cout <<"Open File Failed!" <<endl;
        return vector<int>();
    }
    vector<int> data;
    int label;
    while(file >> label){ // stops cleanly at end-of-file or bad token
        data.push_back(label);
    }
    return data; // ifstream closes itself (RAII)
}
/*
 * Read a document as whitespace-separated words, echo the words and a
 * success message to stdout, and return them in file order.
 * Returns an empty vector (after printing a diagnostic) when the file
 * cannot be opened.
 *
 * Fixed: the original `while(!file.eof())` loop appended one empty
 * string after the last word (eof() is only set after a failed read).
 * The unused counter `i` was removed.
 */
vector<string> openFile(const char* dataset){
    ifstream file(dataset);
    if(!file)
    {
        cout <<"Open File Failed!" <<endl;
        return vector<string>();
    }
    vector<string> data;
    string word;
    while(file >> word){ // extraction test avoids the trailing empty token
        data.push_back(word);
    }
    for(size_t i=0;i<data.size();i++) cout<<data[i]<<"\t";
    cout<<endl;
    cout<<"Open file successfully!"<<endl;
    return data; // ifstream closes itself (RAII)
}
/*
 * Load every training document named in `files`; each document becomes
 * one vector of words, kept in the same order as `files`.
 */
vector<vector<string> > openFiles(const vector<char*> files){
    vector<vector<string> > docs;
    docs.reserve(files.size()); // one allocation, no growth reallocs
    for(size_t idx = 0; idx < files.size(); ++idx){
        docs.push_back(openFile(files[idx]));
    }
    return docs;
}
/*
 * Multinomial naive-Bayes classification with Laplace (add-one)
 * smoothing.
 *
 * docs : training documents, one vector of words per document
 * c    : class label of each training document (parallel to docs)
 * d    : the new, unclassified document
 *
 * Prints the corpus word frequencies, the conditional probability
 * P(word|class) for every word of `d`, the (unnormalized) posterior of
 * each class, and finally the most probable class.
 *
 * Fixed: the original looped classes as `j = 0..size()-1` and indexed
 * the maps with `j`, which silently assumed labels are exactly 0..k-1
 * and inserted spurious keys via operator[] when they were not. We now
 * iterate the actual class keys. Also: guard against an empty training
 * set (division by zero), drop the unused `tw` local, and pass the
 * containers by const reference instead of by value (backward
 * compatible for all callers).
 */
void bayesian(const vector<vector<string> > &docs,const vector<int> &c,const vector<string> &d){
    map<string,int> wordFrequency;                  // corpus-wide count of each word
    map<int,float> cWordProbability;                // per-class prior (word-count based)
    map<int,int> cTotalFrequency;                   // total word count per class
    map<int,map<string,int> > cWordTotalFrequency;  // per-class count of each word
    int totalWords = 0;
    for(size_t i = 0; i < docs.size(); i++){
        totalWords += docs[i].size();
        cWordProbability[c[i]] += docs[i].size();
        for(size_t j = 0; j < docs[i].size(); j++){
            wordFrequency[docs[i][j]]++;
            cWordTotalFrequency[c[i]][docs[i][j]]++;
        }
    }
    if(totalWords == 0){ // no training data: nothing sensible to compute
        cout<<"No training data!"<<endl;
        return;
    }
    // Convert accumulated per-class word counts into priors, keeping the
    // raw counts in cTotalFrequency for the smoothing denominators.
    for(map<int,float>::iterator ic = cWordProbability.begin(); ic != cWordProbability.end(); ++ic){
        cTotalFrequency[ic->first] = (int)ic->second;
        ic->second = ic->second / totalWords;
    }
    cout<<"Word Frequency:"<<endl;
    for(map<string,int>::iterator iw = wordFrequency.begin(); iw != wordFrequency.end(); ++iw){
        cout<<setw(8)<<iw->first<<"\tFrequency:"<<iw->second<<endl;
    }
    cout<<"Conditional Probability:"<<endl;
    map<string,int> dtw; // word counts of the document to classify
    for(size_t i = 0; i < d.size(); i++) dtw[d[i]]++;
    map<string,map<int,float> > cp; // cp[word][class] = P(word|class)
    const int vocabulary = wordFrequency.size(); // Laplace denominator term
    for(map<string,int>::iterator iw = dtw.begin(); iw != dtw.end(); ++iw){
        for(map<int,int>::iterator ic = cTotalFrequency.begin(); ic != cTotalFrequency.end(); ++ic){
            const int cls = ic->first;
            // add-one smoothing: unseen (word, class) pairs get nonzero mass
            float p = (float)(cWordTotalFrequency[cls][iw->first] + 1) / (ic->second + vocabulary);
            cp[iw->first][cls] = p;
            cout<<"P("<<iw->first<<"|"<<cls<<") \t= "<<p<<endl;
        }
    }
    cout<<"Classification Probability:"<<endl;
    float mp = 0;
    int classification = 0;
    for(map<int,int>::iterator ic = cTotalFrequency.begin(); ic != cTotalFrequency.end(); ++ic){
        const int cls = ic->first;
        float tcp = cWordProbability[cls]; // start from the class prior
        for(size_t j = 0; j < d.size(); j++){
            tcp *= cp[d[j]][cls];
        }
        cout<<"classification:"<<cls<<"\t"<<"Probability:"<<tcp<<endl;
        if(mp < tcp){
            mp = tcp;
            classification = cls;
        }
    }
    cout<<"The new document classification is:"<<classification<<endl;
}

/*
 * Entry point: load the class labels and the five training documents,
 * read the unclassified document, and run the naive-Bayes classifier.
 */
int main(int argc, char** argv) {
    // One integer class label per training document.
    vector<int> c = openClassificationFile("classification.txt");
    // String literals are const; openFiles() takes vector<char*>, so the
    // cast must be explicit (the pointers are only ever read, never
    // written through). The original implicit const char* -> char*
    // conversion is ill-formed in modern C++.
    const char* names[] = {"1.txt","2.txt","3.txt","4.txt","5.txt"};
    vector<char*> files;
    for(size_t i = 0; i < sizeof(names)/sizeof(names[0]); i++){
        files.push_back(const_cast<char*>(names[i]));
    }
    cout<<"訓練文檔集:"<<endl;
    vector<vector<string> > docs = openFiles(files);
    cout<<"待分類文檔:"<<endl;
    vector<string> d = openFile("new.txt");
    bayesian(docs,c,d);
    return 0;
}

效果展示:


結論:

樸素貝葉斯分類器用于處理離散型的文本數據,能夠有效對文本文檔進行分類。在實驗過程中,最困難的地方在于數據結構的設計,由于要統計每個文檔類別的頻數和每個文檔類別下單詞的概率,這個地方需要用到復雜映射與統計,在編碼過程中經過不斷的思考,最終通過多級映射的形式儲存所需的數據,最終計算出新文檔的類別。通過實驗,成功將新的未分類文檔輸入例子分類為期待的文檔類型,實驗結果較為滿意。


來自: http://blog.csdn.net/k76853/article/details/50532195

 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!