Fetching a specified web page and all links on it with Python


#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 yangyingchao@gmail.com
#
# Author: yangyingchao <yangyingchao@gmail.com>
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Emacs; see the file COPYING. If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from copy import deepcopy
from sgmllib import SGMLParser
from xml.dom.minidom import *
import os
import re
import sys
import urllib2

title = "Untitled"

class MyParser(SGMLParser):

    def __init__(self):
        self.data = ""
        self.links = []
        self.link = ""
        self.title = "Untitled"
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)

    def handle_data(self, data):
        # Collect text only while we are inside a <title> or <a> tag.
        if (self.TAG_BEG is True) and (self.TAG_END is False):
            self.data += data

    def start_title(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass

    def start_a(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        # Record the finished anchor: its text and its href target.
        self.TAG_BEG = False
        self.TAG_END = True
        tmp = {}
        tmp["name"] = self.data
        tmp["link"] = self.link
        self.links.append(deepcopy(tmp))

    # Ignore everything else on the page.
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()

def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string

def downURL(url, filename):
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1
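
# Hypothetical usage (not part of the original script): fetch one page into
# the current directory; the URL and file name below are placeholders.
#   downURL("http://example.com/page.html", "page.html")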

def reptile(base_url):
    """
    Download all articles from base_url.

    Arguments:
    - `base_url`: Url of website.
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()

    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open

    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    for item in content:
        parser.feed(item)

    for tmp in parser.links:
        page_list.append(tmp.get("link"))

    global title
    title = parser.title
    parser.close()

    item_list = list(set(page_list))

    for item in item_list:
        # Strip '#' from url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]

        # Add base_url to item if necessary.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item

        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue

        if os.access(local_file, os.F_OK):
            print "File: %s existed, skip ..." % local_file
        else:
            ret = downURL(item, local_file)

    # Remember to download the index file!
    downURL(base_url, "index.html")
    print "Total: %d articles." % (len(item_list))
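
# Hypothetical usage (not part of the original script); the URL is a
# placeholder. This downloads every page linked from the start page and then
# saves the start page itself as index.html in the current directory.
#   reptile("http://hi.baidu.com/SomeUser")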


def walk_dir(lst, dirname, filenames):
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or \
           not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        tmp = {}
        parser = MyParser()
        content = open(fn).read()
        for item in content:
            parser.feed(item)
        tmp["file"] = filename
        tmp["title"] = parser.title
        parser.close()
        lst.append(deepcopy(tmp))

def gen_index():
    """ Generate index of all htmls in this directory. """
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)

    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n<book author=""' + \
        ' language="c" link="index.html" name="" title="%s"' % title + \
        ' version="2" xmlns="http://www.devhelp.net/book">\n  <chapters>'
    for item in file_lists:
        link = item.get("file")
        try:
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '<sub link="%s" name="%s"/>\n' % (link, name)

    string += '\n</chapters>\n   </book>\n'
    fp.write(string)
    fp.close()
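
# For illustration only (file names and titles below are made up): with two
# pages downloaded, the generated <current-dir-name>.devhelp2 is a Devhelp 2
# book index with one <sub/> entry per HTML file, roughly:
#   <?xml version="1.0" encoding="utf-8"?>
#   <book author="" language="c" link="index.html" name="" title="..."
#         version="2" xmlns="http://www.devhelp.net/book">
#     <chapters>
#       <sub link="post1.html" name="First post"/>
#       <sub link="post2.html" name="Second post"/>
#     </chapters>
#   </book>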

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s url of baidu space" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
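
Note that sgmllib was removed in Python 3 and urllib2 was folded into urllib.request, so the script above runs on Python 2 only. Below is a minimal sketch of the same title-and-link extraction for Python 3, using html.parser.HTMLParser and urllib.request.urlopen; the LinkParser class, the example URL, and the small main block are my own illustration, not part of the original program.

from html.parser import HTMLParser
from urllib.request import urlopen


class LinkParser(HTMLParser):
    """Collect the page title and every <a href="..."> target."""

    def __init__(self):
        super().__init__()
        self.title = ""
        self.links = []
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True
        elif tag == "a":
            for key, val in attrs:
                if key == "href" and val:
                    self.links.append(val)

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data


if __name__ == "__main__":
    # The URL is a placeholder; pass any page you want to inspect.
    html = urlopen("http://example.com/").read().decode("utf-8", "replace")
    parser = LinkParser()
    parser.feed(html)
    print(parser.title.strip())
    for link in parser.links:
        print(link)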
