NLTK: Comparing the Similarity of Chinese Documents
NLTK can also handle Chinese text; only the following changes are needed:
- Use a Chinese word segmenter (jieba is used here)
- Handle Chinese characters as unicode
- Declare the Python source encoding as gbk
- Use a corpus that supports Chinese

The full code, which requires jieba, follows after a quick jieba check.
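Before the full listing, here is a quick way to confirm jieba is installed and segmenting as expected. This is a minimal sketch of my own; the sample sentence is made up and not data from the post:

# -*- coding: gbk -*-
# Minimal jieba check (the sample sentence is an assumption, not from the original data)
import jieba
import jieba.analyse

sentence = u'沒有一個課程與它很相似'                       # hypothetical sample sentence
print '/'.join(jieba.lcut(sentence))                      # plain word segmentation
print '/'.join(jieba.analyse.extract_tags(sentence, 10))  # top-10 keywords, the same call used in the listing below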
#!/usr/bin/env python
# -*- coding: gbk -*-
""" 原始數據,用于建立模型 """
縮水版的courses,實際數據的格式應該為 課程名\t課程簡介\t課程詳情,并已去除html等干擾因素
courses = [
    u'Writing II: Rhetorical Composing',
    u'Genetics and Society: A Course for Educators',
    u'General Game Playing',
    u'Genes and the Human Condition (From Behavior to Biotechnology)',
    u'A Brief History of Humankind',
    u'New Models of Business in Society',
    u'Analyse Numérique pour Ingénieurs',
    u'Evolution: A Course for Educators',
    u'Coding the Matrix: Linear Algebra through Computer Science Applications',
    u'The Dynamic Earth: A Course for Educators',
    u'Tiny Wings\tYou have always dreamed of flying - but your wings are tiny. Luckily the world is full of beautiful hills. Use the hills as jumps - slide down, flap your wings and fly! At least for a moment - until this annoying gravity brings you back down to earth. But the next hill is waiting for you already. Watch out for the night and fly as fast as you can. ',
    u'Angry Birds Free',
    u'沒有\t它很相似',
    u'沒有\t它很相似',
    u'沒有\t他很相似',
    u'沒有\t他不很相似',
    u'沒有',
    u'可以沒有',
    u'也沒有',
    u'有沒有也不管',
    u'Angry Birds Stella',
    u'Flappy Wings - FREE\tFly into freedom!A parody of the #1 smash hit game!',
    u'沒有一個',
    u'沒有一個2',
]
# Kept like this only so the final lookup is convenient; with real data it would be
# courses_name = [course.split('\t')[0] for course in courses]
courses_name = courses
""" 預處理(easy_install nltk) """ def pre_process_cn(courses, low_freq_filter = True): """ 簡化的 中文+英文 預處理 1.去掉停用詞 2.去掉標點符號 3.處理為詞干 4.去掉低頻詞
""" import nltk import jieba.analyse from nltk.tokenize import word_tokenize texts_tokenized = [] for document in courses: texts_tokenized_tmp = [] for word in word_tokenize(document): texts_tokenized_tmp += jieba.analyse.extract_tags(word,10) texts_tokenized.append(texts_tokenized_tmp) texts_filtered_stopwords = texts_tokenized #去除標點符號 english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'] texts_filtered = [[word for word in document if not word in english_punctuations] for document in texts_filtered_stopwords] #詞干化 from nltk.stem.lancaster import LancasterStemmer st = LancasterStemmer() texts_stemmed = [[st.stem(word) for word in docment] for docment in texts_filtered] #去除過低頻詞 if low_freq_filter: all_stems = sum(texts_stemmed, []) stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1) texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed] else: texts = texts_stemmed return texts
lib_texts = pre_process_cn(courses)
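One gap worth flagging: the docstring of pre_process_cn promises stop-word removal, but texts_filtered_stopwords = texts_tokenized is a pass-through. A minimal sketch of how that step inside the function could be filled in, assuming NLTK's English stopword corpus has been downloaded and using a tiny hand-rolled Chinese list (both lists are my assumptions, not part of the original post):

    # replacement for the pass-through line inside pre_process_cn
    from nltk.corpus import stopwords                      # requires nltk.download('stopwords')
    english_stopwords = set(stopwords.words('english'))
    chinese_stopwords = set([u'的', u'了', u'和', u'是'])   # hypothetical mini stop list
    all_stopwords = english_stopwords | chinese_stopwords
    texts_filtered_stopwords = [[word for word in document if word not in all_stopwords]
                                for document in texts_tokenized]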
""" 引入gensim,正式開始處理(easy_install gensim) """
def train_by_lsi(lib_texts):
    """
    Train an LSI model
    """
    from gensim import corpora, models, similarities

    # To see progress logs, uncomment:
    #import logging
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]  # doc2bow(): turn a list of words into a bag of words, i.e. (word_id, word_frequency) tuples
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # The topic count of 10 is a rough guess
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    index = similarities.MatrixSimilarity(lsi[corpus])  # index is a gensim.similarities.docsim.MatrixSimilarity instance

    return (index, dictionary, lsi)
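To make the doc2bow step concrete, here is a tiny, self-contained example with made-up token lists (not the real lib_texts):

from gensim import corpora

toy_texts = [[u'沒有', u'相似'], [u'沒有']]             # made-up tokenised documents
toy_dict = corpora.Dictionary(toy_texts)
print toy_dict.token2id                                 # token -> integer id mapping
print [toy_dict.doc2bow(text) for text in toy_texts]    # each document as (word_id, count) pairs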
# The model library is now built -- this part may involve a lot of data,
# so it can be precomputed ahead of time and stored
(index, dictionary, lsi) = train_by_lsi(lib_texts)
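As the comment above hints, these trained artifacts can be saved and reloaded instead of being rebuilt on every run. A sketch using gensim's save()/load(); the file names are placeholders of my choosing:

dictionary.save('courses.dict')
lsi.save('courses.lsi')
index.save('courses.index')

# later, e.g. in the process that serves queries:
from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('courses.dict')
lsi = models.LsiModel.load('courses.lsi')
index = similarities.MatrixSimilarity.load('courses.index')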
# The query object enters the stage
target_courses = [u'沒有']
target_text = pre_process_cn(target_courses, low_freq_filter=False)
""" 對具體對象相似度匹配 """
選擇一個基準數據
ml_course = target_text[0]
# Convert it to a bag of words
ml_bow = dictionary.doc2bow(ml_course)
# Using the lsi model built above, compute how similar every other document is to it
ml_lsi = lsi[ml_bow]  # ml_lsi looks like [(topic_id, topic_value), ...]
sims = index[ml_lsi]  # sims is the final result; index[xxx] invokes the built-in __getitem__() to score ml_lsi against the index
# Sort, to make the output easier to read
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
# Inspect the results
print sort_sims[0:10]                 # the 10 most similar entries; the first is the query itself
print courses_name[sort_sims[1][0]]   # the name of the most similar real document
print courses_name[sort_sims[2][0]]   # the second most similar
print courses_name[sort_sims[3][0]]   # the third most similar
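The query steps above are easy to wrap into a reusable helper. This is a sketch built only from the functions already defined in this post; the helper name and the top_n parameter are my own:

def most_similar(query, index, dictionary, lsi, top_n=10):
    """Return the top_n (position, score) pairs for a unicode query string."""
    tokens = pre_process_cn([query], low_freq_filter=False)[0]
    bow = dictionary.doc2bow(tokens)
    sims = index[lsi[bow]]
    return sorted(enumerate(sims), key=lambda item: -item[1])[:top_n]

# usage:
# for pos, score in most_similar(u'沒有', index, dictionary, lsi):
#     print courses_name[pos], score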
Source: http://my.oschina.net/kakablue/blog/314513