lucene索引_加權操作、對日期和數字進行索引、IndexReader的設計
package com.dhb.index;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;

/**
 * JUnit-driven walkthrough of the Lucene 3.5 indexing API: document boosting,
 * indexing numbers and dates with {@link NumericField}, deleting / restoring
 * documents, and sharing one {@link IndexReader} between searches.
 *
 * <p>All tests work against the on-disk index at {@code D:/luceneData/index02}.
 * I/O failures are reported with {@code printStackTrace()} rather than failing
 * the test, as in the original tutorial code.
 */
public class IndexUtil {

    private String[] ids = {"1", "2", "3", "4", "5", "6"};
    private String[] emails = {"aa@csdn.org", "bb@csdn.org", "cc@sina.org", "dd@sina.org",
                               "ee@qq.com", "ff@qq.com"};
    private String[] contents = {"Welcome to my office ,I like surfing internet.",
                                 "hello boys like haha",
                                 "hello girls we like each other.",
                                 "I like football,you like too.",
                                 "I like basketball very much, how about you?",
                                 "bye-bye see you I don't like."};
    private int[] attachment = {2, 3, 1, 4, 5, 5};
    private String[] names = {"Victor", "Nancy", "Kitty", "Cindy", "Tom", "Tony"};

    /** Boost per e-mail domain; domains that are not listed get 0.5 in {@link #index()}. */
    private Map<String, Float> scores = new HashMap<String, Float>();
    private Date[] dates = null;

    /** Reader shared by {@link #getSearcher()} and {@link #delete2()}; static so it outlives a single test instance. */
    private static IndexReader reader = null;
    private Directory directory = null;

    /**
     * Prepares the fixture before each test: dates, boost table, index directory
     * and a writable shared reader.
     */
    @Before
    public void IndexUtilBefore() {
        try {
            setDates();
            scores.put("qq.com", 2.0f);
            scores.put("sina.org", 1.5f);
            directory = FSDirectory.open(new File("D:/luceneData/index02"));
            // The shared reader is static, so the one opened for the previous test is
            // still around; close it before replacing it instead of leaking it.
            closeQuietly(reader);
            reader = null;
            reader = IndexReader.open(directory, false);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a searcher over the shared reader, opening the reader on first use
     * and swapping in a fresh one when the index has changed since it was opened.
     *
     * @return a searcher on the current index state, or {@code null} if the index
     *         could not be opened (the exception is printed)
     */
    public IndexSearcher getSearcher() {
        try {
            if (reader == null) {
                reader = IndexReader.open(directory);
            } else {
                IndexReader refreshed = IndexReader.openIfChanged(reader);
                if (refreshed != null) {
                    // Publish the new reader first so a failure while closing the old
                    // one cannot leave the shared reference pointing at a closed reader.
                    IndexReader stale = reader;
                    reader = refreshed;
                    stale.close();
                }
            }
            return new IndexSearcher(reader);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Fills {@link #dates} with one date per document.
     *
     * @throws IllegalStateException if one of the hard-coded literals cannot be
     *         parsed; continuing would leave {@code null} entries that make
     *         {@link #index()} fail after it has already emptied the index
     */
    private void setDates() {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        String[] literals = {"2010-02-19", "2012-01-11", "2011-09-19",
                             "2010-12-22", "2012-01-01", "2011-05-19"};
        dates = new Date[ids.length];
        try {
            for (int i = 0; i < literals.length; i++) {
                dates[i] = sdf.parse(literals[i]);
            }
        } catch (ParseException e) {
            throw new IllegalStateException("Invalid hard-coded date literal", e);
        }
    }

    /** Rebuilds the whole index from the in-memory sample data. */
    @Test
    public void index() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            // Start from an empty index.
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                /*
                 * Field.Store.YES / NO (storage option)
                 *  YES - the field content is stored in full, so it can be read back (doc.get).
                 *  NO  - the content is not stored; it can still be indexed but not read back.
                 *
                 * Field.Index.* (indexing option)
                 *  ANALYZED              - tokenized and indexed; for titles and body text.
                 *  NOT_ANALYZED          - indexed as a single term; for exact-match values
                 *                          such as ID numbers, names, keys.
                 *  ANALYZED_NO_NORMS     - tokenized, but no norms (index-time boost
                 *                          information) are stored.
                 *  NOT_ANALYZED_NO_NORMS - neither tokenized nor given norms.
                 *  NO                    - not indexed at all.
                 *
                 * Typical combinations (Index / Store):
                 *  NOT_ANALYZED_NO_NORMS / YES - identifiers (primary key, file name), phone
                 *                                numbers, ID numbers, names, dates
                 *  ANALYZED / YES              - document title and summary
                 *  ANALYZED / NO               - document body
                 *  NO / YES                    - document type, database key (not searchable)
                 *  NOT_ANALYZED / NO           - hidden keywords
                 */
                Document doc = new Document();
                doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
                doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

                // Boosting: weight the document by the domain part of its e-mail address.
                String et = emails[i].substring(emails[i].indexOf("@") + 1);
                System.out.println(et);
                Float boost = scores.get(et);
                doc.setBoost(boost != null ? boost : 0.5f);

                // Numbers are indexed through NumericField.
                doc.add(new NumericField("attachment", Field.Store.YES, true).setIntValue(attachment[i]));
                // Dates are indexed as their epoch-millisecond long value.
                doc.add(new NumericField("dates", Field.Store.YES, true).setLongValue(dates[i].getTime()));
                writer.addDocument(doc);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
        }
    }

    /** Prints document counts obtained from a reader. */
    @Test
    public void query() {
        IndexReader localReader = null;
        try {
            localReader = IndexReader.open(directory);
            // The reader gives direct access to the document counts ...
            System.out.println("numDocs:" + localReader.numDocs());
            System.out.println("maxDocs:" + localReader.maxDoc());
            // ... and to the number of deleted documents.
            System.out.println("numDeletedDocs:" + localReader.numDeletedDocs());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(localReader);
        }
    }

    /**
     * Deletes the document with id "1" through an {@link IndexWriter}.
     *
     * <p>The argument of {@code deleteDocuments} may be a query or a term (an exact
     * value). The document is only marked as deleted and can still be restored
     * with {@link #unDelete()} until the deletions are merged away.
     */
    @Test
    public void delete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.deleteDocuments(new Term("id", "1"));
            writer.commit();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The writer must be closed: an open writer keeps the index write lock,
            // which makes every later writer / writable reader fail to obtain it.
            closeQuietly(writer);
        }
    }

    /** Deletes the document with id "1" through the shared writable reader. */
    @Test
    public void delete2() {
        // This method closes the reader, so detach it from the shared field first;
        // getSearcher() will then open a fresh one instead of reusing a closed reader.
        IndexReader writable = reader;
        reader = null;
        try {
            if (writable == null) {
                writable = IndexReader.open(directory, false);
            }
            writable.deleteDocuments(new Term("id", "1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close, even when deleteDocuments throws.
            closeQuietly(writable);
        }
    }

    /**
     * Permanently removes documents that are marked as deleted.
     *
     * <p>Before Lucene 3.5 this was done with {@code optimize()}, which is
     * expensive and has been deprecated.
     */
    @Test
    public void forceDelete() {
        IndexWriter writer = null;
        try {
            writer = openWriter();
            writer.forceMergeDeletes();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
        }
    }

    /** Restores all documents that are marked as deleted. */
    @Test
    public void unDelete() {
        IndexReader writable = null;
        try {
            // Restoring requires a reader opened with readOnly = false.
            writable = IndexReader.open(directory, false);
            writable.undeleteAll();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writable);
        }
    }

    /** Searches the "content" field for "like" using a reader private to this call. */
    @Test
    public void search() {
        IndexReader localReader = null;
        try {
            localReader = IndexReader.open(directory);
            IndexSearcher searcher = new IndexSearcher(localReader);
            TermQuery query = new TermQuery(new Term("content", "like"));
            printHits(searcher, searcher.search(query, 10));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(localReader);
        }
    }

    /**
     * Same search as {@link #search()}, but through the shared reader handed out
     * by {@link #getSearcher()}.
     *
     * @throws IllegalStateException if no searcher could be obtained
     */
    @Test
    public void search2() {
        IndexSearcher searcher = getSearcher();
        if (searcher == null) {
            throw new IllegalStateException("Index could not be opened; see the stack trace printed by getSearcher()");
        }
        try {
            TermQuery query = new TermQuery(new Term("content", "like"));
            printHits(searcher, searcher.search(query, 10));
            // A searcher built from an existing reader does not close that reader,
            // so the shared reader stays usable for the next call.
            searcher.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs {@link #search2()} five times, ten seconds apart, so that index changes
     * made in the meantime become visible through the refreshed shared reader.
     */
    @Test
    public void search3() {
        for (int i = 0; i < 5; i++) {
            search2();
            System.out.println("------------------");
            try {
                Thread.sleep(10000);
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop; sleeping again would just
                // throw immediately.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /** Opens a writer on {@link #directory} with the standard Lucene 3.5 analyzer. */
    private IndexWriter openWriter() throws IOException {
        return new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    }

    /** Prints one line per hit: doc number, boost, score and the stored fields. */
    private static void printHits(IndexSearcher searcher, TopDocs hits) throws IOException {
        for (ScoreDoc sd : hits.scoreDocs) {
            Document d = searcher.doc(sd.doc);
            // NOTE: per the Lucene 3.x Javadoc, getBoost() on a document loaded from
            // the index does not return the index-time boost (it is always 1.0).
            System.out.println("("+sd.doc+") "+"--權值:"+d.getBoost()+"--分數:"+sd.score+
                    d.get("name")+"["+d.get("email")+"] "+d.get("id")+",附件:"
                    +d.get("attachment")+",日期:"+d.get("dates"));
        }
    }

    /** Closes the reader if it is non-null, printing (not propagating) any I/O error. */
    private static void closeQuietly(IndexReader target) {
        if (target == null) {
            return;
        }
        try {
            target.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes the writer if it is non-null, printing (not propagating) any I/O error. */
    private static void closeQuietly(IndexWriter target) {
        if (target == null) {
            return;
        }
        try {
            target.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!