使用Lucene4.8進行索引及搜索的基本操作

jopen 10年前發布 | 36K 次閱讀 Lucene 搜索引擎

在Lucene對文本進行處理的過程中,可以大致分為兩大部分:

1、索引文件:提取文檔內容並分析,生成索引

2、搜索內容:搜索索引內容,根據搜索關鍵字得出搜索結果

 

一、索引文件

基本步驟如下:

1、創建索引庫IndexWriter

2、根據文件創建文檔Document

 3、向索引庫中寫入文檔內容

    package com.ljh.search.index;

import java.io.File;  
import java.io.FileReader;  
import java.io.IOException;  

import org.apache.lucene.analysis.standard.StandardAnalyzer;  
import org.apache.lucene.document.Document;  
import org.apache.lucene.document.Field;  
import org.apache.lucene.document.LongField;  
import org.apache.lucene.document.StringField;  
import org.apache.lucene.document.TextField;  
import org.apache.lucene.index.IndexWriter;  
import org.apache.lucene.index.IndexWriterConfig;  
import org.apache.lucene.store.Directory;  
import org.apache.lucene.store.FSDirectory;  
import org.apache.lucene.util.Version;  

// 1、創建索引庫IndexWriter  
// 2、根據文件創建文檔Document  
// 3、向索引庫中寫入文檔內容  

public class IndexFiles {  

    /**
     * Command-line entry point: indexes every file found under DOCS_PATH into a
     * Lucene index located at INDEX_PATH.
     *
     * <p>Recognized arguments are {@code -index INDEX_PATH} and
     * {@code -docs DOCS_PATH}. Both are required: if either is missing, or a
     * flag is supplied without a following value, the usage text is printed to
     * stderr and the JVM exits with status 1. The same exit status is used when
     * the document directory does not exist or is not readable.
     *
     * @param args command-line arguments
     * @throws IOException kept in the signature for compatibility; I/O failures
     *                     raised while opening, writing or closing the index
     *                     are caught here and their stack trace is printed
     */
    public static void main(String[] args) throws IOException {

        String usage = "java IndexFiles"
                + " [-index INDEX_PATH] [-docs DOCS_PATH] \n\n"
                + "This indexes the documents in DOCS_PATH, creating a Lucene index "
                + "in INDEX_PATH that can be searched with SearchFiles";

        String indexPath = null;
        String docsPath = null;
        for (int i = 0; i < args.length; i++) {
            // A flag is only honored when a value follows it; this avoids
            // reading past the end of args when the flag is the last token.
            boolean hasValue = i + 1 < args.length;
            if ("-index".equals(args[i]) && hasValue) {
                indexPath = args[++i];
            } else if ("-docs".equals(args[i]) && hasValue) {
                docsPath = args[++i];
            }
        }

        // Both paths are needed: a null index path would otherwise surface
        // later as a NullPointerException from new File(null).
        if (indexPath == null || docsPath == null) {
            System.err.println("Usage: " + usage);
            System.exit(1);
        }

        final File docDir = new File(docsPath);
        if (!docDir.exists() || !docDir.canRead()) {
            System.err
                    .println("Document directory '"
                            + docDir.getAbsolutePath()
                            + "' does not exist or is not readable, please check the path");
            System.exit(1);
        }

        // 1. Create the index (IndexWriter).
        // try-with-resources closes the writer only if it was actually opened,
        // so a failure inside getIndexWriter no longer triggers a
        // NullPointerException from a finally block.
        try (IndexWriter writer = getIndexWriter(indexPath)) {
            index(writer, docDir);
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    /**
     * Opens an {@link IndexWriter} on the index stored at the given path.
     *
     * <p>The writer is configured for {@code Version.LUCENE_48} and uses a
     * {@link StandardAnalyzer} of the same version.
     *
     * @param indexPath filesystem location of the index directory
     * @return a new writer bound to that directory; the caller is responsible
     *         for closing it
     * @throws IOException if the directory or the writer cannot be opened
     */
    private static IndexWriter getIndexWriter(String indexPath)
            throws IOException {
        final File indexLocation = new File(indexPath);
        final Directory directory = FSDirectory.open(indexLocation);

        final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
        final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48,
                analyzer);

        return new IndexWriter(directory, config);
    }

    /**
     * Adds {@code file} to the index, descending recursively into directories.
     *
     * <p>For each regular file one document is written with three fields:
     * {@code path} (stored), {@code modified} (the last-modified timestamp,
     * not stored) and {@code contents} (the file's text, supplied through a
     * reader).
     *
     * @param writer the index writer that receives the documents
     * @param file   a file to index, or a directory whose entries are indexed
     * @throws IOException if a file cannot be opened or the document cannot be
     *                     written to the index
     */
    private static void index(IndexWriter writer, File file) throws IOException {

        if (file.isDirectory()) {
            // list() returns null when the directory cannot be listed.
            String[] files = file.list();
            if (files != null) {
                for (String child : files) {
                    index(writer, new File(file, child));
                }
            }
            return;
        }

        // The reader is opened in try-with-resources so the file handle is
        // released even when addDocument fails. The document must be added
        // inside the try block because the reader has to stay open until the
        // writer has processed the "contents" field.
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that matches the encoding of the indexed files.
        try (FileReader contents = new FileReader(file)) {
            // 2. Build a Document from the file.
            Document doc = new Document();
            doc.add(new StringField("path", file.getPath(), Field.Store.YES));
            doc.add(new LongField("modified", file.lastModified(),
                    Field.Store.NO));
            doc.add(new TextField("contents", contents));
            System.out.println("Indexing " + file.getName());

            // 3. Write the document into the index.
            writer.addDocument(doc);
        }

    }

}

(1)使用“java com.ljh.search.index.IndexFiles -index d:/index -docs d:/tmp”運行程序(類名區分大小寫),索引d:/tmp中的文件,並將索引文件放置到d:/index。

(2)上述生成的索引文件可以使用Luke進行查看。目前Luke已遷移至github進行托管。


二、搜索文件

1、打開索引庫IndexSearcher
2、根據關鍵詞進行搜索
3、遍歷結果並處理

package com.ljh.search.search;

//1、打開索引庫IndexSearcher
//2、根據關鍵詞進行搜索
//3、遍歷結果并處理
import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searcher {
/**
 * Command-line entry point: looks up a single term in the {@code contents}
 * field of an existing index and prints the hit count followed by the
 * document id and score of each hit (at most 20).
 *
 * <p>Recognized arguments are {@code -index INDEX_PATH} and
 * {@code -term TERM}. Both are required; if either is missing, or a flag is
 * supplied without a following value, a usage line is printed to stderr and
 * the JVM exits with status 1.
 *
 * @param args command-line arguments
 * @throws IOException if the index cannot be opened, searched or closed
 */
public static void main(String[] args) throws IOException {

    String indexPath = null;
    String term = null;
    for (int i = 0; i < args.length; i++) {
        // A flag is only honored when a value follows it; this avoids
        // reading past the end of args when the flag is the last token.
        boolean hasValue = i + 1 < args.length;
        if ("-index".equals(args[i]) && hasValue) {
            indexPath = args[++i];
        } else if ("-term".equals(args[i]) && hasValue) {
            term = args[++i];
        }
    }

    // Without these checks a missing argument would surface as a
    // NullPointerException from new File(null) or new Term(..., null).
    if (indexPath == null || term == null) {
        System.err.println("Usage: java Searcher -index INDEX_PATH -term TERM");
        System.exit(1);
    }

    System.out.println("Searching " + term + " in " + indexPath);

    // 1. Open the index. try-with-resources closes the reader and then the
    // directory, including when the search throws.
    try (Directory indexDir = FSDirectory.open(new File(indexPath));
            IndexReader ir = DirectoryReader.open(indexDir)) {
        IndexSearcher searcher = new IndexSearcher(ir);

        // 2. Search for the keyword.
        // NOTE(review): TermQuery matches the term exactly as given, without
        // analysis; if the index was built with an analyzer that lowercases
        // tokens, an upper-case term will presumably find nothing — confirm.
        TopDocs docs = searcher.search(
                new TermQuery(new Term("contents", term)), 20);

        // 3. Walk through the results.
        ScoreDoc[] hits = docs.scoreDocs;
        System.out.println(hits.length);
        for (ScoreDoc hit : hits) {
            System.out.println("doc: " + hit.doc + " score: " + hit.score);
        }
    }

}

}

來自:http://blog.csdn.net/jediael_lu/article/details/30035025

 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!