Lucene.net入門學習（結合盤古分詞）

jopen 11年前發布 | 97K 次閱讀 Lucene 搜索引擎 Lucene.net

Lucene簡介

Lucene是Apache軟件基金會Jakarta項目組的一個子項目，是一個開放源代碼的全文檢索引擎工具包，即它不是一個完整的全文檢索引擎，而是一個全文檢索引擎的架構，提供了完整的查詢引擎和索引引擎，部分文本分析引擎（英文與德文兩種西方語言）。Lucene的目的是為軟件開發人員提供一個簡單易用的工具包，以方便地在目標系統中實現全文檢索的功能，或者是以此為基礎建立起完整的全文檢索引擎。

Lucene.net簡介

Lucene.net是Lucene的.net移植版本，是一個開源的全文檢索引擎開發包，即它不是一個完整的全文檢索引擎，而是一個全文檢索引擎的架構，提供了完整的查詢引擎和索引引擎。開發人員可以基于Lucene.net實現全文檢索的功能。

Lucene.net工作原理

Lucene.net提供的服務需要兩部分：索引文件的寫入和索引文件的讀取。

1寫入流程
源數據字符串經過analyzer處理，將源中需要搜索的信息加入Document的各個字段中，并把需要索引的字段索引起來并存儲。
將索引寫入存儲器，存儲器可以是內存或磁盤。

2讀出流程
用戶提供搜索關鍵詞，經過analyzer處理。（我們下面的代碼采用的是盤古分詞，其相關分詞原理可以在它的官網上看到 http://pangusegment.codeplex.com/）
對處理后的關鍵詞搜索索引找出對應的Document，用戶根據需要從找到的Document中提取需要的Field。

Lucene.net安裝

大家可以去官網看下：https://www.nuget.org/packages/Lucene.Net/3.0.3

Lucene.net入門學習（結合盤古分詞）

盤古分詞安裝

盤古分詞主頁：http://pangusegment.codeplex.com/

下載：http://pangusegment.codeplex.com/downloads/get/144143

Lucene.net結合盤古分詞使用

http://pangusegment.codeplex.com/downloads/get/144145

大家可以看到相關使用的案例

Lucene.net創建索引（結合盤古分詞）

    /* Code by 釋迦苦僧 */
    /// <summary>
    /// Console tool that (re)builds the Lucene.net index for all posts,
    /// using the PanGu analyzer for the searchable text.
    /// </summary>
    class Program
    {
        /// <summary>Directory the Lucene.net index files are written to.</summary>
        private const string IndexDic = @"D:\Lucene\post\";

        /// <summary>
        /// Loads every post, writes one Lucene document per post into a freshly
        /// created index, and prints how long the indexing took.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Load the rows to index.
            PostBll bll = new PostBll();
            IList<PostInfo> posts = bll.GetAllPost();
            Console.WriteLine(posts.Count);

            // Time only the index build, not the data load.
            Stopwatch sw = Stopwatch.StartNew();

            // "using" guarantees the writer (and the write lock it holds on the
            // directory) is released even when adding a document throws.
            // create == true: any existing index in the directory is replaced.
            using (FSDirectory directory = FSDirectory.Open(IndexDic))
            using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED))
            {
                foreach (PostInfo item in posts)
                {
                    writer.AddDocument(BuildDocument(item));
                }

                writer.Optimize();
                writer.Commit();
            }

            sw.Stop();
            Console.Write("建立" + posts.Count + "索引,花費: " + sw.Elapsed);
            Console.ReadLine();
        }

        /// <summary>
        /// Maps one post to a Lucene document with the stored fields
        /// PostId, Title and PostScore.
        /// </summary>
        /// <param name="item">The post to index.</param>
        /// <returns>The document to add to the index.</returns>
        private static Document BuildDocument(PostInfo item)
        {
            Document doc = new Document();

            // PostId is an identifier and PostScore is a sort key: both are indexed
            // as a single un-tokenized term. Running them through the PanGu analyzer
            // could split them into several tokens, and Lucene expects a field used
            // for sorting to contain exactly one term per document.
            doc.Add(new Field("PostId", item.PostId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));

            // Title is the only full-text field, so it is the only one that is analyzed.
            // A null title is indexed as empty text instead of aborting the whole run.
            doc.Add(new Field("Title", item.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));

            doc.Add(new Field("PostScore", item.PostScore.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));

            return doc;
        }
    }

如代碼所示：
D:\Lucene\post\ 存儲Lucene.net生成的索引文件，如下圖

Lucene.net入門學習（結合盤古分詞）

這些索引存儲文件存儲了PostInfo表中 PostId，Title，PostScore 三個字段信息。

需要注意的是：使用盤古分詞操作時，需要將PanGu.xml和盤古分詞自帶的分詞文件放入項目中，如下圖：

Lucene.net入門學習（結合盤古分詞）

Lucene.net執行搜索（結合盤古分詞）

namespace LuceneNetStudy.Search
{
    /* Code by 釋迦苦僧 */
    /// <summary>
    /// Search form: runs a PanGu-tokenized query against the "Title" field of the
    /// Lucene.net index on a background thread and shows the best-scored posts.
    /// </summary>
    public partial class MainForm : Form
    {
        /// <summary>Number of hits fetched and displayed per search.</summary>
        private const int PageSize = 10;

        /// <summary>Directory that holds the Lucene.net index files.</summary>
        private string IndexDic = @"D:\Lucene\post\";

        public MainForm()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts a background search for the trimmed text of the keyword box.
        /// </summary>
        private void btnSearch_Click(object sender, EventArgs e)
        {
            // One worker per search; it is disposed in the completion handler.
            BackgroundWorker backWorker = new BackgroundWorker();
            backWorker.DoWork += new DoWorkEventHandler(backWorker_DoWork);
            backWorker.RunWorkerCompleted += new RunWorkerCompletedEventHandler(backWorker_RunWorkerCompleted);
            backWorker.RunWorkerAsync(txtKey.Text.Trim());
        }

        /// <summary>
        /// Background part of a search: queries the index and stores the outcome in
        /// <c>e.Result</c>. No UI control is touched here; exceptions are left to
        /// propagate so that they reach the completion handler via <c>e.Error</c>.
        /// </summary>
        /// <param name="sender">The worker that raised the event.</param>
        /// <param name="e">Carries the search keywords in <c>Argument</c>.</param>
        void backWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            string key = (e.Argument as string) ?? string.Empty;
            List<PostInfo> result = new List<PostInfo>();

            // Time the whole search, including opening the index.
            Stopwatch sw = Stopwatch.StartNew();

            // "using" releases the reader and directory even when the search throws.
            using (FSDirectory directory = FSDirectory.Open(IndexDic))
            using (IndexSearcher search = new IndexSearcher(directory, true))
            {
                // Split the user input into words with the PanGu tokenizer.
                key = GetKeyWordsSplitBySpace(key, new PanGuTokenizer());

                BooleanQuery bq = new BooleanQuery();
                if (!string.IsNullOrEmpty(key))
                {
                    // Search the keywords in the "Title" field only.
                    QueryParser queryParser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, new string[] { "Title" }, new PanGuAnalyzer());

                    // The words come from free user text: escape query-syntax characters
                    // (+ - ( ) : * ? ...) so they cannot cause a ParseException.
                    Query query = queryParser.Parse(QueryParser.Escape(key));
                    bq.Add(query, Occur.MUST);
                }

                // Sort by PostScore, highest first (reverse == true).
                Sort sort = new Sort(new SortField("PostScore", SortField.DOUBLE, true));

                // Collect only the hits that will be shown. At least 1 is requested
                // because MaxDoc is 0 for an empty index and a hit count of 0 is
                // expected to be rejected by the collector.
                int wanted = Math.Max(1, Math.Min(PageSize, search.MaxDoc));
                TopFieldDocs docs = search.Search(bq, null, wanted, sort);

                foreach (ScoreDoc sd in TopDocs(0, PageSize, docs))
                {
                    Document doc = search.Doc(sd.Doc);

                    // Best effort: a document whose stored values cannot be parsed is
                    // skipped instead of failing the whole search.
                    Guid postId;
                    double postScore;
                    if (!Guid.TryParse(doc.Get("PostId"), out postId) ||
                        !double.TryParse(doc.Get("PostScore"), out postScore))
                    {
                        continue;
                    }

                    PostInfo model = new PostInfo();
                    model.PostId = postId;
                    model.PostScore = postScore;
                    model.Title = doc.Get("Title");
                    result.Add(model);
                }
            }

            sw.Stop();
            e.Result = new SearchOutcome(result, sw.Elapsed);
        }

        /// <summary>
        /// Shows the outcome of a finished search, or the error that aborted it.
        /// BackgroundWorker raises this event through the synchronization context
        /// captured by RunWorkerAsync, which is the UI thread here, so no Invoke
        /// is needed.
        /// </summary>
        /// <param name="sender">The worker that finished.</param>
        /// <param name="e">Carries the <see cref="SearchOutcome"/> or the error.</param>
        void backWorker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
        {
            BackgroundWorker worker = sender as BackgroundWorker;
            if (worker != null)
            {
                worker.Dispose();
            }

            // The form may have been closed while the search was running.
            if (IsDisposed)
            {
                return;
            }

            // e.Result must not be read when the worker failed, so check the error first.
            if (e.Error != null)
            {
                lblRunTime.Text = "";
                txtResult.Text = "搜索失敗: " + e.Error.Message;
                return;
            }

            SearchOutcome outcome = (SearchOutcome)e.Result;
            lblRunTime.Text = "花費: " + outcome.Elapsed;

            // Build the text once instead of re-assigning the control per row.
            StringBuilder text = new StringBuilder();
            foreach (PostInfo info in outcome.Posts)
            {
                text.Append(info.PostScore).Append("\t").Append(info.Title).Append("\r\n");
            }
            txtResult.Text = text.ToString();
        }

        /// <summary>
        /// Returns one page of the collected hits.
        /// </summary>
        /// <param name="start">Zero-based index of the first hit; negative values are treated as 0.</param>
        /// <param name="limit">Maximum number of hits to return; values &lt;= 0 yield an empty page.</param>
        /// <param name="docs">Search result to page through.</param>
        /// <returns>
        /// At most <paramref name="limit"/> hits starting at <paramref name="start"/>;
        /// an empty array when the range lies outside the collected hits.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="docs"/> is null.</exception>
        public static ScoreDoc[] TopDocs(int start, int limit, TopFieldDocs docs)
        {
            if (docs == null)
            {
                throw new ArgumentNullException("docs");
            }

            ScoreDoc[] collected = docs.ScoreDocs;
            if (collected == null)
            {
                return new ScoreDoc[0];
            }

            // TotalHits counts every match, but ScoreDocs holds only the hits that
            // were actually collected, so the array length is the real upper bound.
            int available = Math.Min(docs.TotalHits, collected.Length);
            int first = Math.Max(start, 0);
            if (limit <= 0 || first >= available)
            {
                return new ScoreDoc[0];
            }

            int count = Math.Min(limit, available - first);
            ScoreDoc[] page = new ScoreDoc[count];
            Array.Copy(collected, first, page, 0, count);
            return page;
        }

        /// <summary>
        /// Segments the keywords with the given tokenizer and joins the resulting
        /// words with single spaces (one keyword may be split into several words
        /// and single characters).
        /// </summary>
        /// <param name="keywords">Raw user input; null or empty yields an empty string.</param>
        /// <param name="ktTokenizer">Tokenizer used for the segmentation.</param>
        /// <returns>The space-separated words, without leading or trailing whitespace.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="ktTokenizer"/> is null.</exception>
        public static string GetKeyWordsSplitBySpace(string keywords, PanGuTokenizer ktTokenizer)
        {
            if (ktTokenizer == null)
            {
                throw new ArgumentNullException("ktTokenizer");
            }

            if (string.IsNullOrEmpty(keywords))
            {
                return string.Empty;
            }

            ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keywords);
            if (words == null)
            {
                return string.Empty;
            }

            StringBuilder result = new StringBuilder();
            foreach (WordInfo word in words)
            {
                if (word == null || string.IsNullOrEmpty(word.Word))
                {
                    continue;
                }

                if (result.Length > 0)
                {
                    result.Append(' ');
                }
                result.Append(word.Word);
            }

            return result.ToString().Trim();
        }

        /// <summary>
        /// Result handed from the background search to the UI thread.
        /// </summary>
        private sealed class SearchOutcome
        {
            /// <summary>Posts to display, in result order.</summary>
            public readonly List<PostInfo> Posts;

            /// <summary>Wall-clock time the search took.</summary>
            public readonly TimeSpan Elapsed;

            public SearchOutcome(List<PostInfo> posts, TimeSpan elapsed)
            {
                Posts = posts;
                Elapsed = elapsed;
            }
        }
    }
}

作者：釋迦苦僧 出處：http://www.cnblogs.com/woxpp/p/3972233.html

本文由用戶 jopen 自行上傳分享，僅供網友學習交流。所有權歸原作者，若您的權利被侵害，請聯系管理員。

轉載本站原創文章，請注明出處，并保留原始鏈接、圖片水印。

本站是一個以用戶分享為主的開源技術平臺，歡迎各類分享！

本文地址：http://www.baiduhome.net/lib/view/open1411524503656.html

Lucene 搜索引擎 Lucene.net

Lucene.net入門學習（結合盤古分詞）

相關經驗

相關資訊

相關文檔

目錄