提取網頁中的超鏈接C#代碼
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// Console tool that downloads a web page, extracts the absolute http/https
/// links found in its HTML and writes them, grouped by domain suffix, to
/// HyperLinks.xml in the current directory.
/// </summary>
public class App
{
    // Absolute http/https URL: scheme, dotted host name, optional path/query.
    // The escapes must be backslashes (\w, \.); with forward slashes the
    // character classes only accept '/', 'w' and '-' and ordinary host names
    // never match.
    private static readonly Regex UrlRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    // Recognised top-level suffix at the end of the host part of a URL.
    // [^/]*? keeps the search inside the host so a ".com/" that appears later
    // in the path is not mistaken for the domain suffix.
    private static readonly Regex DomainSuffixRegex = new Regex(
        @"^https?://[^/]*?\.(com|net|cn|org|gov)(?=/|$)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Prompts for a page address, downloads the page, extracts its links and
    /// writes them to HyperLinks.xml.
    /// </summary>
    public static void Main()
    {
        Console.Write("請輸入一個網頁地址:");
        string strURL = Console.ReadLine();

        // ReadLine returns null when standard input is closed; an empty
        // answer cannot be turned into a usable address either.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("未輸入網頁地址。");
            return;
        }

        strURL = NormalizeUrl(strURL);

        Console.WriteLine("正在獲取頁面代碼,請稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超鏈接,請稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在寫入文件,請稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Trims the address and prefixes "http://" when it has no http/https scheme.
    /// </summary>
    /// <param name="address">Non-null address as typed by the user.</param>
    /// <returns>The address with an explicit scheme.</returns>
    private static string NormalizeUrl(string address)
    {
        string trimmed = address.Trim();

        // StartsWith is safe for inputs shorter than the prefix, and the
        // case-insensitive check avoids producing "http://HTTP://...".
        if (trimmed.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            trimmed.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return trimmed;
        }

        return "http://" + trimmed;
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute http/https address of the page.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    /// <exception cref="UriFormatException">The address is not a valid URI.</exception>
    /// <exception cref="WebException">The request failed.</exception>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request options have to be assigned before GetResponse() sends the
        // request; assigning them afterwards cannot influence it.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // NOTE(review): GB2312 is hard-coded, so pages in other encodings are
        // decoded incorrectly; on .NET Core / .NET 5+ this code page presumably
        // also needs the code-pages encoding provider registered - confirm
        // against the target runtime.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream stream = hwRes.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs found in the HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null or empty yields no links.</param>
    /// <returns>The unique URLs (as strings), sorted with the default comparer.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return al;
        }

        // The set gives constant-time duplicate detection instead of
        // rescanning the whole result list for every match.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match m in UrlRegex.Matches(htmlCode))
        {
            if (seen.Add(m.Value))
            {
                al.Add(m.Value);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the links to HyperLinks.xml, one element per link named after
    /// the link's domain suffix.
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from.</param>
    /// <param name="alHyperLinks">URLs (strings) to write.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // The using block closes the file even when writing fails part-way.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + MakeCommentSafe(strURL) + "的超鏈接");
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                writer.WriteElementString(GetDomain(str), null, str);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Breaks up runs of hyphens, because "--" is not allowed inside an XML
    /// comment and the writer rejects such text.
    /// </summary>
    /// <param name="text">Text destined for an XML comment; may be null.</param>
    /// <returns>The text with a space inserted between adjacent hyphens.</returns>
    private static string MakeCommentSafe(string text)
    {
        if (text == null)
        {
            return "";
        }

        return Regex.Replace(text, "-(?=-)", "- ");
    }

    /// <summary>
    /// Returns the domain suffix of a URL for use as an XML element name.
    /// </summary>
    /// <param name="strURL">Absolute http/https URL; may be null.</param>
    /// <returns>
    /// "com", "net", "cn", "org" or "gov" (lower case) when the host ends with
    /// that suffix; otherwise "other".
    /// </returns>
    static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        Match m = DomainSuffixRegex.Match(strURL);
        if (!m.Success)
        {
            return "other";
        }

        // Host names are case-insensitive; lower-casing keeps "COM" and "com"
        // under the same element name.
        return m.Groups[1].Value.ToLowerInvariant();
    }
}
本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!