基於 Java 的 Web 爬蟲:Arachnid
Arachnid 是一個基於 Java 的 Web spider(網路爬蟲)框架。它包含一個簡單的 HTML 剖析器,能夠分析包含 HTML 內容的輸入流。只要實現 Arachnid 的子類,就能開發出一個簡單的 Web spider,並且只需增加幾行代碼,就能在網站上的每個頁面被解析之後執行自訂的處理。Arachnid 的下載包中包含兩個 spider 應用程序例子,用於演示如何使用該框架。
import java.io.; import java.net.; import java.util.; import bplatt.spider.;public class SimpleSiteMapGen { private String site; private final static String header = "<html><head><title>Site Map</title></head><body><ul>"; private final static String trailer = "</ul></body></html>";
public static void main(String[] args) { if (args.length != 1) { System.err.println("java SimpleSiteMapGen <url>"); System.exit(-1); } SimpleSiteMapGen s = new SimpleSiteMapGen(args[0]); s.generate(); }
public SimpleSiteMapGen(String site) { this.site = site; }
public void generate() { MySpider spider = null; try { spider = new MySpider(site); } catch(MalformedURLException e) { System.err.println(e); System.err.println("Invalid URL: "+site); return; } System.out.println(header); spider.traverse(); System.out.println(trailer); } }
class MySpider extends Arachnid { public MySpider(String base) throws MalformedURLException { super(base); }
protected void handleLink(PageInfo p) { String link = p.getUrl().toString(); String title = p.getTitle(); if (link == null || title == null || link.length() == 0 || title.length() ==0) return; else System.out.println("<li><a href=\""+link+"\">"+title+"</a></li>"); } protected void handleBadLink(URL url,URL parent, PageInfo p) { } protected void handleBadIO(URL url, URL parent) { } protected void handleNonHTMLlink(URL url, URL parent,PageInfo p) { } protected void handleExternalLink(URL url, URL parent) { } }</pre>