Lucene 4.x 自定義停用詞分詞器
package com.kkrgwbj.util;import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.Version;
import java.io.Reader; import java.util.HashSet; import java.util.Set;
/**
- 自定義停用分詞器
Created by lvbingyang on 2015/11/25 0025. */ public class MyStopAnalyzer extends Analyzer { private Set stops;
public MyStopAnalyzer(String[] sws) {
//將字符串數組添加到停用詞的set集合中 stops = StopFilter.makeStopSet(Version.LUCENE_45, sws, true); //加入原來的停用詞 stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/**
默認構造方法 */ public MyStopAnalyzer() { stops = new HashSet<>(); stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);//加入原來的停用詞 }
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { //主要負責接收reader,將reader進行分詞操作 Tokenizer tokenizer = new LetterTokenizer(Version.LUCENE_45, reader); //創建停用詞的set對象 CharArraySet charArraySet = CharArraySet.copy(Version.LUCENE_45, stops); //分詞器做好處理之后得到的一個流,這個流中存儲了分詞的信息 //使用了忽略大小寫的filter,停用filter過濾 TokenStream tokenStream = new LowerCaseFilter(Version.LUCENE_45, new StopFilter(Version.LUCENE_45, tokenizer, charArraySet)); return new TokenStreamComponents(tokenizer, tokenStream); } }</pre>
JUnit 測試:
@Test public void test2() { Analyzer analyzer = new MyStopAnalyzer(new String[]{"I", "you", "hate"}); Analyzer analyzer1 = new StopAnalyzer(Version.LUCENE_45); String txt = "i love you,i hate you"; //自定義的停用詞分詞器 AnalyzerUtils.displayToken(txt, analyzer); //默認的停用詞分詞器 AnalyzerUtils.displayToken(txt, analyzer1); }
在這里,我們停用了i,you,hate,運行結果: