提取網頁中的超鏈接C#代碼

jopen 12年前發布 | 1K 次閱讀

using System; 
using System.Xml; 
using System.Text; 
using System.Net; 
using System.IO; 
using System.Collections; 
using System.Text.RegularExpressions; 
/// <summary>
/// Console tool that downloads a web page, extracts the absolute http/https
/// links found in its HTML and writes them to "HyperLinks.xml" in the
/// current directory.
/// </summary>
public class App
{
    /// <summary>
    /// Prompts for a page address, downloads the page, extracts its
    /// hyperlinks and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("請輸入一個網頁地址:");
        string strURL = Console.ReadLine();

        // ReadLine returns null at end of input; an empty address is useless too.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("未輸入網頁地址。");
            return;
        }
        strURL = strURL.Trim();

        // StartsWith is safe for inputs shorter than the prefix (Substring(0,7)
        // was not), and an explicit https:// address must not get http:// prepended.
        bool hasScheme =
            strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = @"http://" + strURL;
        }

        Console.WriteLine("正在獲取頁面代碼,請稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超鏈接,請稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在寫入文件,請稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute http/https address of the page.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request properties must be configured before GetResponse() sends
        // the request; setting them afterwards cannot affect it.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response and reader so the connection is released
        // even when reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(
            hwRes.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs found in the HTML text.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null is treated as empty.</param>
    /// <returns>A sorted list of the unique URL strings that were found.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (htmlCode == null || htmlCode.Length == 0)
        {
            return al;
        }

        // The escapes must be backslashes (\w, \.); with forward slashes the
        // character classes only match the literal characters '/', 'w' and '-'.
        string strRegex = @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

        // Track URLs already added so duplicates are filtered without
        // rescanning the whole list for every match.
        Hashtable seen = new Hashtable();
        foreach (Match match in r.Matches(htmlCode))
        {
            string url = match.Value;
            if (!seen.ContainsKey(url))
            {
                seen[url] = true;
                al.Add(url);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the links to "HyperLinks.xml", one element per link, where the
    /// element name is the value returned by GetDomain for that link.
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from; written into an XML comment.</param>
    /// <param name="alHyperLinks">List of URL strings to write.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // The using block closes (and thereby flushes) the file even if a
        // write call throws part-way through.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + strURL + "的超鏈接");

            // The output nests a second HyperLinks element, carrying the
            // timestamp attribute, inside the root HyperLinks element.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                string title = GetDomain(str);
                writer.WriteElementString(title, null, str);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the recognised domain suffix of a URL.
    /// </summary>
    /// <param name="strURL">URL to inspect; null is treated as empty.</param>
    /// <returns>
    /// The first of com, net, cn, org or gov (in the casing used by the URL)
    /// that is followed by '/' or by the end of the string; "other" when
    /// none is found.
    /// </returns>
    static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        // Accept the suffix at the end of the string as well, so that
        // "http://example.com" is classified the same as "http://example.com/".
        Regex r = new Regex(@"\.(com|net|cn|org|gov)(/|$)", RegexOptions.IgnoreCase);
        Match m = r.Match(strURL);
        if (!m.Success)
        {
            return "other";
        }
        return m.Groups[1].Value;
    }
}

 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
 轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!