C#通過編輯距離計算兩個字符串的相似度
編輯距離的算法最早由俄國科學家 Levenshtein 提出,故又叫 Levenshtein Distance。一個字符串可以通過增加一個字符、刪除一個字符或替換一個字符得到另外一個字符串。假設我們要把字符串 A 轉換成字符串 B,上述 3 種操作所需執行的最少次數稱為 A 與 B 的編輯距離(下文簡稱「度」);度越小,兩個字符串越相似。
 如 abc adc 度為 1 
 ababababa babababab 度為 2 
 abcd acdb 度為2
using System;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace Levenshtein
{
    /// <summary>
    /// Signature of the callback raised when an asynchronous analysis finishes.
    /// </summary>
    /// <param name="sim">Similarity of the two texts, in the range [0, 1].</param>
    public delegate void AnalyzerCompletedHander(double sim);

    /// <summary>
    /// Computes how similar two texts are from their Levenshtein (edit) distance:
    /// <c>similarity = 1 - distance / max(length1, length2)</c>.
    /// Both texts are normalised first: every character that is not an ASCII
    /// letter, an ASCII digit, a CJK ideograph in U+4E00..U+9FA5 or whitespace
    /// is removed before comparing.
    /// </summary>
    public class LevenshteinDistance : IDisposable
    {
        // Matches every character that must be stripped before comparison.
        private static readonly Regex UnwantedCharacters =
            new Regex(@"[^a-zA-Z0-9\u4e00-\u9fa5\s]");

        // Never null, so the properties and the analysis can be used in any order
        // after the parameterless constructor.
        private string str1 = string.Empty;
        private string str2 = string.Empty;

        // Task of the most recent run; null until Start or StartAyns is called.
        private Task<double> task;

        /// <summary>
        /// Raised with the similarity once a run begun with <see cref="Start"/> completes.
        /// </summary>
        public event AnalyzerCompletedHander AnalyzerCompleted;

        /// <summary>
        /// Gets or sets the first text. The value is normalised when set;
        /// <c>null</c> is treated as an empty text.
        /// </summary>
        public string Str1
        {
            get { return str1; }
            set { str1 = Format(value); }
        }

        /// <summary>
        /// Gets or sets the second text. The value is normalised when set;
        /// <c>null</c> is treated as an empty text.
        /// </summary>
        public string Str2
        {
            get { return str2; }
            set { str2 = Format(value); }
        }

        /// <summary>
        /// Gets the number of character comparisons a run performs, i.e. the
        /// product of the lengths of the two normalised texts.
        /// </summary>
        public int TotalTimes
        {
            get { return str1.Length * str2.Length; }
        }

        /// <summary>
        /// Gets whether the most recent run has finished. Returns <c>false</c>
        /// when no run has been started yet.
        /// </summary>
        public bool IsCompleted
        {
            get { return task != null && task.IsCompleted; }
        }

        /// <summary>
        /// Creates an analyzer for the two given texts.
        /// </summary>
        /// <param name="str1">First text; <c>null</c> is treated as empty.</param>
        /// <param name="str2">Second text; <c>null</c> is treated as empty.</param>
        public LevenshteinDistance(string str1, string str2)
        {
            this.str1 = Format(str1);
            this.str2 = Format(str2);
        }

        /// <summary>
        /// Creates an analyzer with two empty texts; assign <see cref="Str1"/>
        /// and <see cref="Str2"/> before starting a run.
        /// </summary>
        public LevenshteinDistance()
        {
        }

        /// <summary>
        /// Starts the analysis on a background task and raises
        /// <see cref="AnalyzerCompleted"/> with the result when it finishes.
        /// </summary>
        public void Start()
        {
            Task<double> run = new Task<double>(Analyzer);
            task = run;
            run.ContinueWith(o => Completed(o.Result));
            run.Start();
        }

        /// <summary>
        /// Runs the analysis and blocks until the result is available.
        /// (Despite its name, this is the synchronous entry point.)
        /// </summary>
        /// <returns>Similarity of the two texts, in the range [0, 1].</returns>
        public double StartAyns()
        {
            Task<double> run = new Task<double>(Analyzer);
            task = run;
            run.Start();
            return run.Result; // Result blocks until the task has finished.
        }

        // Raises AnalyzerCompleted if anyone is subscribed.
        private void Completed(double s)
        {
            // Copy to a local so a concurrent unsubscribe cannot null the delegate
            // between the check and the call.
            AnalyzerCompleted​Hander handler = AnalyzerCompleted;
            if (handler != null)
            {
                handler(s);
            }
        }

        // Computes the similarity of str1 and str2 with the classic
        // dynamic-programming edit-distance recurrence, keeping only two rows.
        // Returns 0 when either text is empty (this also avoids dividing by zero).
        private double Analyzer()
        {
            // Snapshot the fields so a setter call during a background run
            // cannot change the lengths mid-computation.
            string source = str1;
            string target = str2;

            if (source.Length == 0 || target.Length == 0)
            {
                return 0;
            }

            // previous[j] = distance between the processed prefix of source and
            // the first j characters of target. Row 0 is the empty prefix, whose
            // distance to a j-character prefix is j insertions.
            int[] previous = new int[target.Length + 1];
            int[] current = new int[target.Length + 1];
            for (int j = 0; j <= target.Length; j++)
            {
                previous[j] = j;
            }

            for (int i = 1; i <= source.Length; i++)
            {
                // i deletions turn an i-character prefix into the empty string.
                current[0] = i;

                for (int j = 1; j <= target.Length; j++)
                {
                    int substitutionCost = source[i - 1] == target[j - 1] ? 0 : 1;
                    current[j] = Min(
                        current[j - 1] + 1,                    // insertion
                        previous[j] + 1,                       // deletion
                        previous[j - 1] + substitutionCost);   // match / substitution
                }

                int[] swap = previous;
                previous = current;
                current = swap;
            }

            int distance = previous[target.Length];
            int longest = Math.Max(source.Length, target.Length);
            return 1.0 - (double)distance / longest;
        }

        // Strips every character outside the supported set; null becomes "".
        private static string Format(string str)
        {
            if (str == null)
            {
                return string.Empty;
            }

            return UnwantedCharacters.Replace(str, "");
        }

        // Smallest of three integers.
        private static int Min(int a, int b, int c)
        {
            return Math.Min(Math.Min(a, b), c);
        }

        /// <summary>
        /// Releases the task of the most recent run. Does nothing when no run
        /// was started or the run is still in progress, so it never throws.
        /// </summary>
        public void Dispose()
        {
            // Task.Dispose throws InvalidOperationException on an unfinished task.
            if (task != null && task.IsCompleted)
            {
                task.Dispose();
            }
        }
    }
}
 本文由用戶 ybny  自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
                         轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
                         本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!