Lucene 索引的創建與域選項,以及索引的刪除、更新

jopen 9年前發布 | 13K 次閱讀 Lucene 搜索引擎

    package com.dhb.index;

import java.io.File;  
import java.io.IOException;  

import org.apache.lucene.analysis.standard.StandardAnalyzer;  
import org.apache.lucene.document.Document;  
import org.apache.lucene.document.Field;  
import org.apache.lucene.index.CorruptIndexException;  
import org.apache.lucene.index.IndexReader;  
import org.apache.lucene.index.IndexWriter;  
import org.apache.lucene.index.IndexWriterConfig;  
import org.apache.lucene.index.IndexReader.FieldOption;  
import org.apache.lucene.index.Term;  
import org.apache.lucene.store.Directory;  
import org.apache.lucene.store.FSDirectory;  
import org.apache.lucene.store.LockObtainFailedException;  
import org.apache.lucene.util.Version;  
import org.junit.Before;  
import org.junit.Test;  

public class IndexUtil {  
    // Sample data: element i of each array below describes the same document.
    // Document identifiers, written to the "id" field.
    private String[] ids = {"1","2","3","4","5","6"};  
    // E-mail addresses, written to the "email" field.
    private String[] emails = {"aa@csdn.org","bb@csdn.org","cc@sina.org","dd@sina.org",  
            "ee@qq.com","ff@qq.com"};  
    // Body texts, written to the "content" field.
    private String[] contents = {"Welcome to my office","hello boys","hello girls",  
            "I like football","I like basketball","bye-bye see you"};  
    // NOTE(review): not referenced by any method visible in this listing.
    private int[] attachment ={2,3,1,4,5,5};  
    // Display names, written to the "name" field.
    private String[] names = {"Victor","Nancy","Kitty","Cindy","Tom","Tony"};   

    // Index directory; assigned in IndexUtilBefore() and used by every test method.
    private Directory directory = null;  
    /**
     * JUnit fixture that opens the on-disk index directory before a test runs.
     * If opening fails, the stack trace is printed and {@code directory}
     * keeps its previous value.
     */
    @Before
    public void IndexUtilBefore() {
        File indexDir = new File("D:/luceneData/index02");
        try {
            directory = FSDirectory.open(indexDir);
        } catch (IOException openFailure) {
            openFailure.printStackTrace();
        }
    }
    /**
     * Rebuilds the index: removes every existing document, then adds one
     * document per entry of the sample arrays.
     *
     * <p>Store options ({@code Field.Store}):
     * <ul>
     *   <li>YES - the field content is stored in full, so the text can be restored later.</li>
     *   <li>NO - the content is not stored; it can still be indexed, but the original
     *       text cannot be restored (via {@code doc.get}).</li>
     * </ul>
     *
     * <p>Index options ({@code Field.Index}):
     * <ul>
     *   <li>ANALYZED - tokenized and indexed; suits titles and body text.</li>
     *   <li>NOT_ANALYZED - indexed without tokenizing (ID-card numbers, names, IDs);
     *       suits exact-match search.</li>
     *   <li>ANALYZED_NO_NORMS - tokenized, but no norms (index-time weighting data) are stored.</li>
     *   <li>NOT_ANALYZED_NO_NORMS - neither tokenized nor stored with norms.</li>
     *   <li>NO - not indexed at all.</li>
     * </ul>
     *
     * <p>Typical combinations (index option / store option / use):
     * <pre>
     * NOT_ANALYZED_NO_NORMS  YES  identifiers (primary key, file name), phone numbers,
     *                             ID-card numbers, names, dates
     * ANALYZED               YES  document title and summary
     * ANALYZED               NO   document body
     * NO                     YES  document type, database primary key (not indexed)
     * NOT_ANALYZED           NO   hidden keywords
     * </pre>
     */
    @Test
    public void index() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35,
                new StandardAnalyzer(Version.LUCENE_35));
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(directory, config);
            // Start from an empty index.
            indexWriter.deleteAll();
            for (int row = 0; row < ids.length; row++) {
                Document entry = new Document();
                entry.add(new Field("id", ids[row], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                entry.add(new Field("email", emails[row], Field.Store.YES, Field.Index.NOT_ANALYZED));
                entry.add(new Field("content", contents[row], Field.Store.NO, Field.Index.ANALYZED));
                entry.add(new Field("name", names[row], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
                indexWriter.addDocument(entry);
            }
        } catch (IOException failure) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            failure.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
    /**
     * Prints the index statistics exposed by {@link IndexReader}: the number of
     * live documents, the maximum document number and the number of deleted
     * documents.
     *
     * <p>The reader is closed in a {@code finally} block so it is released even
     * when reading the statistics fails.
     */
    @Test
    public void query() {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(directory);
            // The reader gives cheap access to the document counts.
            System.out.println("numDocs:"+reader.numDocs());
            System.out.println("maxDocs:"+reader.maxDoc());
            // ...and to the number of deleted documents.
            System.out.println("numDeletedDocs:"+reader.numDeletedDocs());
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    /**
     * Deletes the document whose "id" field equals "1".
     */
    @Test
    public void delete() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35,
                new StandardAnalyzer(Version.LUCENE_35));
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(directory, config);
            // The argument may be a Query or a Term; a Term is an exact-match value.
            // The document is not removed outright at this point: it is kept in a
            // kind of recycle bin and can still be restored.
            Term idOne = new Term("id", "1");
            indexWriter.deleteDocuments(idOne);
        } catch (IOException failure) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            failure.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
    /**
     * Forced delete: purges the documents that were previously marked as deleted.
     * Before Lucene 3.5 this was done with optimize(), which is expensive and
     * has been deprecated.
     */
    @Test
    public void forceDelete() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35,
                new StandardAnalyzer(Version.LUCENE_35));
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(directory, config);
            // Empties the "recycle bin" of deleted documents.
            indexWriter.forceMergeDeletes();
        } catch (IOException failure) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            failure.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
    /**
     * Restores all documents that are marked as deleted but not yet purged.
     *
     * <p>Restoring goes through an {@link IndexReader}, which must be opened with
     * {@code readOnly} set to {@code false}. The reader is closed in a
     * {@code finally} block so it is released even when {@code undeleteAll()} fails.
     */
    @Test
    public void unDelete() {
        IndexReader reader = null;
        try {
            // readOnly must be false, otherwise the reader cannot undelete.
            reader = IndexReader.open(directory, false);
            reader.undeleteAll();
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    /**
     * Optimizes the index by merging its segments.
     */
    @Test
    public void merge() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35,
                new StandardAnalyzer(Version.LUCENE_35));
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(directory, config);
            // The argument is maxNumSegments, the maximum number of segments:
            // the index is merged down to 2 segments and the deleted documents
            // in them are purged.
            // Note: discouraged since Lucene 3.5 because it is very costly;
            // Lucene merges automatically when appropriate.
            indexWriter.forceMerge(2);
        } catch (IOException failure) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            failure.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
    /**
     * Updates the index: replaces the document whose "id" is "1" with a new
     * document whose "id" is "11".
     *
     * <p>Lucene has no in-place update; an update is a delete followed by an add.
     *
     * <p>Statistics recorded by the author:
     * <pre>
     * before the update: numDocs:6  maxDocs:6  numDeletedDocs:0
     * after the update:  numDocs:6  maxDocs:7  numDeletedDocs:1
     * </pre>
     */
    @Test
    public void update() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35,
                new StandardAnalyzer(Version.LUCENE_35));
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(directory, config);
            Document replacement = new Document();
            replacement.add(new Field("id", "11", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            replacement.add(new Field("email", emails[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
            replacement.add(new Field("content", contents[0], Field.Store.NO, Field.Index.ANALYZED));
            replacement.add(new Field("name", names[0], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            // Delete whatever matches id=1, then add the replacement.
            Term idOne = new Term("id", "1");
            indexWriter.updateDocument(idOne, replacement);
        } catch (IOException failure) {
            // CorruptIndexException and LockObtainFailedException are IOExceptions too.
            failure.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
}

  補充:Lucene 索引的加權操作,修改代碼如下:

    // Replacement sample texts for the boosting example: every entry contains the
    // word "like", which is the term that search() queries for.
    private String[] contents = {"Welcome to my office ,I like surfing internet.",
"hello boys like haha",
"hello girls we like each other.",
"I like football,you like too.",
"I like basketball very much, how about you?",
"bye-bye see you I don't like."};
// Boost factor per e-mail domain; filled in IndexUtilBefore() and read in index().
// NOTE(review): Map and HashMap are not among the imports shown at the top of this listing.
private Map<String, Float> scores = new HashMap<String, Float>();
/**
 * JUnit fixture for the boosting example: registers the per-domain boost
 * factors and opens the on-disk index directory.
 *
 * <p>Annotated with {@code @Before} so that JUnit actually runs it ahead of
 * each test; without the annotation {@code directory} would stay unset and
 * {@code scores} would stay empty.
 */
@Before
public void IndexUtilBefore() {
    // Documents whose e-mail domain is listed here get the given boost in index().
    scores.put("qq.com", 2.0f);
    scores.put("sina.org", 1.5f);
    try {
        directory = FSDirectory.open(new File("D:/luceneData/index02"));
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Rebuilds the index with document boosts: removes every existing document,
 * then adds one document per entry of the sample arrays, boosted according
 * to the domain of its e-mail address.
 *
 * <p>Annotated with {@code @Test} so that JUnit runs it like the other
 * operations in this class.
 *
 * <p>Store options ({@code Field.Store}):
 * <ul>
 *   <li>YES - the field content is stored in full, so the text can be restored later.</li>
 *   <li>NO - the content is not stored; it can still be indexed, but the original
 *       text cannot be restored (via {@code doc.get}).</li>
 * </ul>
 *
 * <p>Index options ({@code Field.Index}):
 * <ul>
 *   <li>ANALYZED - tokenized and indexed; suits titles and body text.</li>
 *   <li>NOT_ANALYZED - indexed without tokenizing (ID-card numbers, names, IDs);
 *       suits exact-match search.</li>
 *   <li>ANALYZED_NO_NORMS - tokenized, but no norms (index-time weighting data) are stored.</li>
 *   <li>NOT_ANALYZED_NO_NORMS - neither tokenized nor stored with norms.</li>
 *   <li>NO - not indexed at all.</li>
 * </ul>
 *
 * <p>Typical combinations (index option / store option / use):
 * <pre>
 * NOT_ANALYZED_NO_NORMS  YES  identifiers (primary key, file name), phone numbers,
 *                             ID-card numbers, names, dates
 * ANALYZED               YES  document title and summary
 * ANALYZED               NO   document body
 * NO                     YES  document type, database primary key (not indexed)
 * NOT_ANALYZED           NO   hidden keywords
 * </pre>
 */
@Test
public void index() {
    IndexWriter writer = null;
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35,
            new StandardAnalyzer(Version.LUCENE_35));
    try {
        writer = new IndexWriter(directory, iwc);
        // Start from an empty index.
        writer.deleteAll();
        for (int i = 0; i < ids.length; i++) {
            Document doc = new Document();
            doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
            doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
            doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

            // Boosting: the part of the address after '@' selects the boost factor;
            // domains without an entry in scores get 0.5.
            String et = emails[i].substring(emails[i].indexOf("@") + 1);
            System.out.println(et);
            Float domainBoost = scores.get(et);
            doc.setBoost(domainBoost != null ? domainBoost.floatValue() : 0.5f);

            writer.addDocument(doc);
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

@Test  
    public void search() {  
        try {  
            IndexReader reader = IndexReader.open(directory);  
            IndexSearcher searcher = new IndexSearcher(reader);  
            TermQuery query = new TermQuery(new Term("content", "like"));  
            TopDocs tds = searcher.search(query, 10);  
            for(ScoreDoc sd : tds.scoreDocs) {  
                Document d = searcher.doc(sd.doc);  
                System.out.println("("+sd.doc+") "+d.get("name")+"["+d.get("email")+"] "+d.get("id"));  
            }  
        } catch (CorruptIndexException e) {  
            e.printStackTrace();  
        } catch (IOException e) {  
            e.printStackTrace();  
        }  

    }  </pre><br />

輸出結果如下:
(5) Tony[ff@qq.com] 6
(3) Cindy[dd@sina.org] 4
(4) Tom[ee@qq.com] 5
(2) Kitty[cc@sina.org] 3
(1) Nancy[bb@csdn.org] 2
(0) Victor[aa@csdn.org] 1

 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!