python抓取圖片示例

openocode 11年前發布 | 1K 次閱讀 Python

#!/usr/bin/python
# -*- coding:utf-8 -*-
import re
import os
import urllib, urllib2, cookielib
import shutil
from BeautifulSoup import BeautifulSoup
# ---- utils ----
def normalize_url(url):
    """Return *url* with an ``http://`` scheme prepended when it has none.

    URLs that already start with ``http://`` or ``https://`` are returned
    unchanged; anything else (including the empty string) gets ``http://``
    put in front of it.
    """
    # A tuple of prefixes keeps an https:// URL from being turned into
    # the invalid "http://https://...".
    if url.startswith(("http://", "https://")):
        return url
    return "http://" + url
def safeDir(dir):
    """Return *dir* with every forward slash removed.

    The result contains no '/' characters, so it can be used as a single
    directory name instead of being read as a nested path.
    """
    # Splitting on '/' and gluing the pieces back together drops the slashes.
    pieces = dir.split('/')
    return ''.join(pieces)
# ---- variable ----
# Forum listing pages are fetched from homepagePrefix + <page number> + homepageSuffix.
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
# Thread links found on a listing page are appended to this prefix.
threadPrefix = "http://60dxw.comww1.baisex.me/"
# Root directory that receives all downloads; it becomes the working directory.
homedir = "baixingge"

# The patterns never change, so compile them once instead of once per page/thread.
urlPattern = re.compile(r'href="([^"]*)"')
titlePattern = re.compile(r'>([^<]*)</a>')
imgPattern = re.compile(r'img src="([^"]*)" onload')


def _image_suffix(img_url):
    """Return the extension (dot included) of the last path segment of *img_url*, or ''."""
    # Drop any query string, then keep only the final path segment so that a
    # dot in the host name or in the query is never mistaken for an extension.
    basename = img_url.split('?', 1)[0].rsplit('/', 1)[-1]
    return os.path.splitext(basename)[1]


# ---- login ----
# NOTE(review): this cookie-aware opener is built but none of the fetches below
# use it (they go through urllib.urlopen / urllib.urlretrieve) -- confirm
# whether requests were meant to be sent through `opener`.
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)

# ---- file ----
if not os.path.exists(homedir):
    os.mkdir(homedir)
os.chdir(homedir)

# ---- crawl ----
# Layout produced under homedir: <page number>/<thread title>/<index><suffix>.
# Paths are joined explicitly instead of chdir-ing into each level, so the
# working directory stays at homedir for the whole run.
for page in range(1, 25):
    pageUrl = '{0}{1}{2}'.format(homepagePrefix, page, homepageSuffix)

    # ---- mkdir ----
    pageDir = str(page)
    if not os.path.exists(pageDir):
        os.mkdir(pageDir)
    print(pageUrl)

    # ---- download ----
    html_body = urllib.urlopen(pageUrl).read()
    soup = BeautifulSoup(html_body)

    # ---- extract ----
    urlRaws = soup.findAll('th', attrs={'class': ['new', 'common']})
    for urlRaw in urlRaws:
        cell = str(urlRaw)
        h = urlPattern.search(cell)
        t = titlePattern.search(cell)
        if h is None or t is None:
            # Cell without a link or a title: nothing to download.
            continue
        threadUrl = h.group(1)
        # Sanitize once and use the same name for both the existence check
        # and the mkdir, so the two can never disagree.
        threadTitle = safeDir(t.group(1))
        if not threadTitle:
            # An empty name cannot be used as a directory.
            continue
        threadDir = os.path.join(pageDir, threadTitle)
        if os.path.exists(threadDir):
            # An existing directory means the thread was handled on an earlier run.
            continue
        os.mkdir(threadDir)

        page_url = threadPrefix + threadUrl
        print("---->{0}".format(page_url))
        print("---->{0}".format(threadTitle))
        page_body = urllib.urlopen(page_url).read()
        page_soup = BeautifulSoup(page_body)

        # Images are saved as 0<suffix>, 1<suffix>, ... in order of appearance.
        for index, img in enumerate(imgPattern.findall(str(page_soup))):
            print("-------->{0}".format(img))
            imgName = "{0}{1}".format(index, _image_suffix(img))
            try:
                urllib.urlretrieve(img, os.path.join(threadDir, imgName), None)
            except IOError as err:
                # One unreachable image should not abort the whole crawl.
                print("-------->failed: {0} ({1})".format(img, err))

</pre>

本文由用戶 openocode 自行上傳分享，僅供網友學習交流。所有權歸原作者，若您的權利被侵害，請聯繫管理員。

轉載本站原創文章，請註明出處，並保留原始鏈接、圖片水印。

本站是一個以用戶分享為主的開源技術平臺，歡迎各類分享！

本文地址：http://www.baiduhome.net/code/view/1420710406125

Python

python抓取圖片示例

相關代碼

相關文檔

相關經驗

目錄