Lucene 3.6: Chinese word segmentation, paged queries, highlighting, and more


1. Preparation

Download Lucene 3.6.1: http://lucene.apache.org/

Download the IK Analyzer Chinese analyzer: http://code.google.com/p/ik-analyzer/downloads/list (make sure to download IK Analyzer 2012_u5_source.zip; other versions have bugs)

Download Solr 3.6.1: http://lucene.apache.org/solr/ (its jars are needed when compiling IK Analyzer)

Copy the Lucene and Solr jars (lucene-core-3.6.1.jar, lucene-highlighter-3.6.1.jar, lucene-analyzers-3.6.1.jar, apache-solr-core-3.6.1.jar, apache-solr-solrj-3.6.1.jar) into the project's lib directory, and place the IK Analyzer source under the project's src directory.
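
Before wiring IK into the indexer, it is worth running a quick tokenization smoke test to confirm that the jars and the IK source compile and load correctly. The sketch below is only illustrative: the class name, field name, and sample sentence are arbitrary, but it uses the same IKAnalyzer(true) smart-mode constructor as the code in the following sections.

package lucene.util;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

// Prints the terms IK Analyzer produces for a short Chinese sentence.
public class IKSmokeTest {
    public static void main(String[] args) throws Exception {
        IKAnalyzer analyzer = new IKAnalyzer(true); // true = smart segmentation mode
        TokenStream ts = analyzer.tokenStream("title", new StringReader("Lucene 3.6 中文分詞測試"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}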

2. Building the index from Oracle data (using the IK analyzer)

package lucene.util;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.sql.Connection;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;

import modules.gk.Gk_info;
import modules.gk.Gk_infoSub;
import web.sys.Globals;
import web.db.DBConnector;
import web.db.ObjectCtl;
import web.util.StringUtil;

//Wizzer.cn
public class LuceneIndex {
    IndexWriter writer = null;
    FSDirectory dir = null;
    boolean create = true;

public void init() {
    long a1 = System.currentTimeMillis();
    System.out.println("[Lucene indexing started: " + new Date() + "]");
    Connection con = DBConnector.getconecttion(); // obtain a database connection
    try {
        final File docDir = new File(Globals.SYS_COM_CONFIG.get("sys.index.path").toString()); // e.g. E:\lucene
        if (!docDir.exists()) {
            docDir.mkdirs();
        }
        String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString(); // "true" or "false"
        if ("false".equals(cr.toLowerCase())) {
            create = false;
        }
        dir = FSDirectory.open(docDir); // assign the field so it can be unlocked in finally

        // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        Analyzer analyzer = new IKAnalyzer(true);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        if (create) {
            // Create a new index in the directory, removing any previously indexed documents:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        }
        writer = new IndexWriter(dir, iwc); // assign the field so it can be closed in finally
        String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 ";
        int rowCount = ObjectCtl.getRowCount(con, sql);
        int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString()); // records per page
        int pages = (rowCount - 1) / pageSize + 1; // total number of pages
        ArrayList list = null;
        Gk_infoSub gk = null;
        for (int i = 1; i < pages + 1; i++) {
            long a = System.currentTimeMillis();
            list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub());
            for (int j = 0; j < list.size(); j++) {
                gk = (Gk_infoSub) list.get(j);
                Document doc = new Document();
                doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // primary key, not analyzed
                doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // date, not analyzed
                doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc);
                ObjectCtl.executeUpdateBySql(con, "UPDATE TABLEA SET SSTAG=1 WHERE indexno='" + gk.getIndexno() + "'"); // mark the row as indexed
            }

            long b = System.currentTimeMillis();
            long c = b - a;
            System.out.println("[Lucene " + rowCount + " rows, " + pages + " pages, page " + i + " took " + c + " ms]");
        }
        writer.commit();

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        DBConnector.freecon(con); // release the database connection
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (dir != null && IndexWriter.isLocked(dir)) {
                    IndexWriter.unlock(dir); // make sure the index lock is released
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    long b1 = System.currentTimeMillis();
    long c1 = b1 - a1;
    System.out.println("[Lucene 執行完畢,花費時間:" + c1 + "毫秒,完成時間:" + new Date() + "]");
}

}
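
The indexer can then be triggered from application startup or a scheduled job; a minimal sketch of the call (nothing beyond the class above is assumed):

        // Build or refresh the index. Because the SQL only selects rows with SSTAG<>1
        // and flags them afterwards, repeated runs behave incrementally.
        new LuceneIndex().init();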

3. Single-field queries and multi-field paged queries with highlighting

 
package lucene.util;

import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.Version;
import modules.gk.Gk_infoSub;

import java.util.ArrayList;
import java.io.File;
import java.io.StringReader;
import java.lang.reflect.Constructor;

import web.util.StringUtil;
import web.sys.Globals;
import org.wltea.analyzer.lucene.IKAnalyzer;

//Wizzer.cn
public class LuceneQuery {
    private static String indexPath; // directory where the index is stored
    private int rowCount;    // number of matching records
    private int pages;       // total number of pages
    private int currentPage; // current page number
    private int pageSize;    // records per page

public LuceneQuery() {
    this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString();
}

public int getRowCount() {
    return rowCount;
}

public int getPages() {
    return pages;
}

public int getPageSize() {
    return pageSize;
}

public int getCurrentPage() {
    return currentPage;
}

/**
 * Query the index by the title field (paged).
 */
public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) {
    ArrayList list = new ArrayList();
    try {
        if (curpage <= 0) {
            curpage = 1;
        }
        if (pageSize <= 0) {
            pageSize = 20;
        }
        this.pageSize = pageSize;   // records per page
        this.currentPage = curpage;   // current page
        int start = (curpage - 1) * pageSize;
        Directory dir = FSDirectory.open(new File(indexPath));
        IndexReader reader = IndexReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new IKAnalyzer(true);
        QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer);
        queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = queryParser.parse(keyWord);
        int hm = start + pageSize;
        TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
        searcher.search(query, res);

        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        this.rowCount = res.getTotalHits();
        this.pages = (rowCount - 1) / pageSize + 1; // compute the total number of pages
        TopDocs tds = res.topDocs(start, pageSize);
        ScoreDoc[] sd = tds.scoreDocs;
        for (int i = 0; i < sd.length; i++) {
            Document hitDoc = reader.document(sd[i].doc);
            list.add(createObj(hitDoc, analyzer, highlighter));
        }

    } catch (Exception e) {
        e.printStackTrace();
    }

    return list;

}
/**
 * Multi-field query: all keywords / any keyword / excluded keywords (paged).
 */
public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) {
    ArrayList list = new ArrayList();
    try {
        if (curpage <= 0) {
            curpage = 1;
        }
        if (pageSize <= 0) {
            pageSize = 20;
        }
        this.pageSize = pageSize;   // records per page
        this.currentPage = curpage;   // current page
        int start = (curpage - 1) * pageSize;
        Directory dir = FSDirectory.open(new File(indexPath));
        IndexReader reader = IndexReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanQuery bQuery = new BooleanQuery();  // combined boolean query
        if (!"".equals(allkeyword)) {//包含全部關鍵詞
            KeywordAnalyzer analyzer = new KeywordAnalyzer();
            BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//AND
            Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
            bQuery.add(query, BooleanClause.Occur.MUST);  //AND
        }
        if (!"".equals(onekeyword)) { //包含任意關鍵詞
            Analyzer analyzer = new IKAnalyzer(true);
            BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//OR
            Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
            bQuery.add(query, BooleanClause.Occur.MUST);  //AND
        }
        if (!"".equals(nokeyword)) { //排除關鍵詞
            Analyzer analyzer = new IKAnalyzer(true);
            BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//NOT
            Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
            bQuery.add(query, BooleanClause.Occur.MUST_NOT);  //AND

        }
        int hm = start + pageSize;
        TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
        searcher.search(bQuery, res);
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery));
        this.rowCount = res.getTotalHits();
        this.pages = (rowCount - 1) / pageSize + 1; // compute the total number of pages
        System.out.println("rowCount:" + rowCount);
        TopDocs tds = res.topDocs(start, pageSize);
        ScoreDoc[] sd = tds.scoreDocs;
        Analyzer analyzer = new IKAnalyzer();
        for (int i = 0; i < sd.length; i++) {
            Document hitDoc = reader.document(sd[i].doc);
            list.add(createObj(hitDoc, analyzer, highlighter));
        }

    } catch (Exception e) {
        e.printStackTrace();
    }

    return list;

}

/**
 * Build the result object (with highlighting).
 */

private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) {

    Gk_infoSub gk = new Gk_infoSub();
    try {

        if (doc != null) {
            gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
            gk.setPdate(StringUtil.null2String(doc.get("pdate")));
            String title = StringUtil.null2String(doc.get("title"));
            gk.setTitle(title);
            if (!"".equals(title)) {
                highlighter.setTextFragmenter(new SimpleFragmenter(title.length()));
                TokenStream tk = analyzer.tokenStream("title", new StringReader(title));
                String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title));
                if (!"".equals(htext)) {
                    gk.setTitle(htext);
                }
            }
            String keywords = StringUtil.null2String(doc.get("keywords"));
            gk.setKeywords(keywords);
            if (!"".equals(keywords)) {
                highlighter.setTextFragmenter(new SimpleFragmenter(keywords.length()));
                TokenStream tk = analyzer.tokenStream("keywords", new StringReader(keywords));
                String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords));
                if (!"".equals(htext)) {
                    gk.setKeywords(htext);
                }
            }
            String describes = StringUtil.null2String(doc.get("describes"));
            gk.setDescribes(describes);
            if (!"".equals(describes)) {
                highlighter.setTextFragmenter(new SimpleFragmenter(describes.length()));
                TokenStream tk = analyzer.tokenStream("keywords", new StringReader(describes));
                String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes));
                if (!"".equals(htext)) {
                    gk.setDescribes(htext);
                }
            }

        }
        return gk;
    }
    catch (Exception e) {

        e.printStackTrace();
        return null;
    }
    finally {
        gk = null;
    }

}

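/**
 * Build the result object (without highlighting).
 */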
private synchronized static Object createObj(Document doc) {

    Gk_infoSub gk = new Gk_infoSub();
    try {

        if (doc != null) {
            gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
            gk.setPdate(StringUtil.null2String(doc.get("pdate")));
            gk.setTitle(StringUtil.null2String(doc.get("title")));
            gk.setKeywords(StringUtil.null2String(doc.get("keywords")));
            gk.setDescribes(StringUtil.null2String(doc.get("describes")));
        }
        return gk;
    }
    catch (Exception e) {

        e.printStackTrace();
        return null;
    }
    finally {
        gk = null;
    }

}

}

Single-field query:

        long a = System.currentTimeMillis();
        try {
            int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
            int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
            String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title")));
            LuceneQuery lu = new LuceneQuery();
            form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize));
            form.addResult("curPage", lu.getCurrentPage());
            form.addResult("pageSize", lu.getPageSize());
            form.addResult("rowCount", lu.getRowCount());
            form.addResult("pageCount", lu.getPages());
        } catch (Exception e) {
            e.printStackTrace();
        }
        long b = System.currentTimeMillis();
        long c = b - a;
        System.out.println("[搜索信息花費時間:" + c + "毫秒]");
Multi-field query:
        long a = System.currentTimeMillis();
        try {
            int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));
            int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));
            String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword")));
            String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword")));
            String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword")));
            LuceneQuery lu = new LuceneQuery();
            form.addResult("list", lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize));
            form.addResult("curPage", lu.getCurrentPage());
            form.addResult("pageSize", lu.getPageSize());
            form.addResult("rowCount", lu.getRowCount());
            form.addResult("pageCount", lu.getPages());
        } catch (Exception e) {
            e.printStackTrace();
        }
        long b = System.currentTimeMillis();
        long c = b - a;
        System.out.println("[高級檢索花費時間:" + c + "毫秒]");

4. Lucene wildcard queries

            BooleanQuery bQuery = new BooleanQuery();  // combined query
            if (!"".equals(title)) {
                WildcardQuery w1 = new WildcardQuery(new Term("title", title + "*"));
                bQuery.add(w1, BooleanClause.Occur.MUST);  // AND
            }
            int hm = start + pageSize;
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            searcher.search(bQuery, res);

5. Lucene nested queries

Equivalent SQL: (unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)

                    BooleanQuery bQuery = new BooleanQuery();
                    BooleanQuery b1 = new BooleanQuery();
                    WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*"));
                    WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*"));
                    b1.add(w1, BooleanClause.Occur.MUST);//AND
                    b1.add(w2, BooleanClause.Occur.MUST);//AND
                    bQuery.add(b1, BooleanClause.Occur.SHOULD);//OR
                    BooleanQuery b2 = new BooleanQuery();
                    WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*"));
                    WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*"));
                    WildcardQuery w5 = new WildcardQuery(new Term("tostate", "1"));
                    b2.add(w3, BooleanClause.Occur.MUST);//AND
                    b2.add(w4, BooleanClause.Occur.MUST);//AND
                    b2.add(w5, BooleanClause.Occur.MUST);//AND
                    bQuery.add(b2, BooleanClause.Occur.SHOULD);//OR

6. Sorting by date first, then paging

            int hm = start + pageSize;
            Sort sort = new Sort(new SortField("pdate", SortField.STRING, true));
            TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false);
            searcher.search(bQuery, res);
            this.rowCount = res.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1; // compute the total number of pages
            TopDocs tds =searcher.search(bQuery,rowCount,sort);// res.topDocs(start, pageSize);
            ScoreDoc[] sd = tds.scoreDocs;
            System.out.println("rowCount:" + rowCount);
            int i=0;
            for (ScoreDoc scoreDoc : sd) {
                i++;
                if (i <= start) { // skip the documents before the current page
                    continue;
                }
                if(i>hm){
                    break;
                }
                Document doc = searcher.doc(scoreDoc.doc);
                list.add(createObj(doc));
            }
This is not efficient; the proper approach is to sort when the index is created and then use the normal paging method, rather than running two searches like this.
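
Alternatively, the sorting and the paging can be folded into a single search with a TopFieldCollector, which avoids the second full query. A minimal sketch, reusing the bQuery, start, and pageSize variables from the snippet above:

            // Single-pass alternative: let the collector sort, then slice out the current page.
            Sort sort = new Sort(new SortField("pdate", SortField.STRING, true));
            TopFieldCollector collector = TopFieldCollector.create(
                    sort, start + pageSize, false, false, false, false);
            searcher.search(bQuery, collector);
            this.rowCount = collector.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1;
            ScoreDoc[] sd = collector.topDocs(start, pageSize).scoreDocs; // only the current page
            for (ScoreDoc scoreDoc : sd) {
                Document doc = searcher.doc(scoreDoc.doc);
                list.add(createObj(doc));
            }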
