使用Lucene4.8進行索引及搜索的基本操作
在Lucene對文本進行處理的過程中,可以大致分為兩大部分:
1、索引文件:提取文檔內容并分析,生成索引
2、搜索內容:搜索索引內容,根據搜索關鍵字得出搜索結果
一、索引文件
基本步驟如下:
1、創建索引庫IndexWriter
2、根據文件創建文檔Document
3、向索引庫中寫入文檔內容
package com.ljh.search.index;import java.io.File; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; // 1、創建索引庫IndexWriter // 2、根據文件創建文檔Document // 3、向索引庫中寫入文檔內容 public class IndexFiles { public static void main(String[] args) throws IOException { String usage = "java IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] \n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = null; String docsPath = null; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out .println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } IndexWriter writer = null; try { // 1、創建索引庫IndexWriter writer = getIndexWriter(indexPath); index(writer, docDir); } catch (IOException e) { e.printStackTrace(); } finally { writer.close(); } } private static IndexWriter getIndexWriter(String indexPath) throws IOException { Directory indexDir = FSDirectory.open(new File(indexPath)); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48)); IndexWriter writer = new IndexWriter(indexDir, iwc); 
return writer; } private static void index(IndexWriter writer, File file) throws IOException { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { index(writer, new File(file, files[i])); } } } else { // 2、根據文件創建文檔Document Document doc = new Document(); Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); doc.add(new TextField("contents", new FileReader(file))); System.out.println("Indexing " + file.getName()); // 3、向索引庫中寫入文檔內容 writer.addDocument(doc); } } } </pre><a style="text-indent:0px;" title="派生到我的代碼片" href="/misc/goto?guid=4959554361015216275" target="_blank"></a></div>
</div> </div>
(1)使用“java com.ljh.search.index.IndexFiles -index d:/index -docs d:/tmp”運行程序(類名區分大小寫,且需帶上包名),索引d:/tmp中的文件,并將索引文件放置到d:/index。
(2)上述生成的索引文件可以使用Luke進行查看。目前Luke已遷移至github進行托管。
二、搜索文件
1、打開索引庫IndexSearcher
2、根據關鍵詞進行搜索
3、遍歷結果并處理

package com.ljh.search.search;

//1、打開索引庫IndexSearcher
//2、根據關鍵詞進行搜索
//3、遍歷結果并處理
import java.io.File;
import java.io.IOException;import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;public class Searcher {
public static void main(String[] args) throws IOException {String indexPath = null; String term = null; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-term".equals(args[i])) { term = args[i + 1]; i++; } } System.out.println("Searching " + term + " in " + indexPath); // 1、打開索引庫 Directory indexDir = FSDirectory.open(new File(indexPath)); IndexReader ir = DirectoryReader.open(indexDir); IndexSearcher searcher = new IndexSearcher(ir); // 2、根據關鍵詞進行搜索 TopDocs docs = searcher.search( new TermQuery(new Term("contents", term)), 20); // 3、遍歷結果并處理 ScoreDoc[] hits = docs.scoreDocs; System.out.println(hits.length); for (ScoreDoc hit : hits) { System.out.println("doc: " + hit.doc + " score: " + hit.score); } ir.close(); }
}

來自:http://blog.csdn.net/jediael_lu/article/details/30035025
本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!
相關經驗
相關文檔
目錄