Python抓妹子圖+多進程

LasonyaHart 8年前發布 | 926 次閱讀 Python

Pic_downloader.py  

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015

@author: Dreace
"""
import urllib2
import sys
import time
import os
import random
from multiprocessing.dummy import Pool as ThreadPool 
# Console/filesystem encoding used to re-encode the UTF-8 status messages printed below.
type_ = sys.getfilesystemencoding()
def rename():
    """Return the current local time formatted as a YYYYMMDDHHMMSS string."""
    timestamp_format = "%Y%m%d%H%M%S"
    return time.strftime(timestamp_format)
def rename_2(name):
    """Zero-pad a 1- or 2-character numeric string to three digits and append '.jpg'.

    Strings of any other length (including empty) are left unpadded.
    """
    if len(name) in (1, 2):
        name = name.zfill(3)
    return name + '.jpg'
def download_pic(i):
    global count
    global time_out
    if Filter(i):
        try: 
            content = urllib2.urlopen(i,timeout = time_out)
            url_content = content.read()
            file_name = repr(random.randint(10000,999999999)) + "_" + rename_2(repr(count))
            f = open(file_name,"wb")
            f.write(url_content)
            f.close()
            if os.path.getsize(file_name) >= 1024*11:
                count += 1
            else:
                os.remove(file_name)
        except Exception, e:
            print e
def Filter(content):
    """Return True when `content` references none of the hosts in Filter_list.

    NOTE(review): the original returned True as soon as ANY single entry was
    absent, so a URL containing a listed host could still pass whenever some
    other entry was missing (and it fell through returning None when every
    entry matched).  This version rejects the URL if ANY listed host appears,
    which matches the apparent blacklist intent -- confirm against callers.
    """
    for line in Filter_list:
        if content.find(line) != -1:
            return False
    return True
def get_pic(url_address):
    global pic_list
    global time_out
    global headers
    try:
        req = urllib2.Request(url = url_address,headers = headers)
        str_ = urllib2.urlopen(req, timeout = time_out).read()
        url_content = str_.split("\'")
        for i in url_content:
            if i.find(".jpg") != -1:
                pic_list.append(i)   
    except Exception, e:
        print e
# ---- configuration ----
MAX = 100         # listing pages to walk per category
count = 0         # running count of images actually kept on disk (shared by threads)
time_out = 60     # socket timeout in seconds for every HTTP request
thread_num = 50   # worker threads in each pool
pic_list = []     # .jpg URLs harvested from the listing pages
page_list = []    # listing-page URLs to scrape
pic_kind = ["hot","share","mm","taiwan","japan","model"]  # category slugs appended to the base URL
# Hosts checked by Filter() against each candidate image URL.
Filter_list = ["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"]
# Create a fresh timestamp-named download directory and work inside it,
# so the relative filenames written by download_pic land there.
dir_name = "C:\Photos\\"+rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()
# NOTE(review): pages are built below as base + kind + "/page/" + n, so the
# "/model/page/" suffix here appears twice in every URL -- verify intended.
url_address = "http://www.mzitu.com/model/page/"
# Browser User-Agent so the server does not answer with 403.
headers = {"User-Agent":" Mozilla/5.0 (Windows NT 10.0; rv:39.0) Gecko/20100101 Firefox/39.0"}
# Build every listing-page URL for every category.
for pic_i in pic_kind:     
    for i in range(1,MAX + 1):  
        page_list.append(url_address + pic_i + "/page/" + repr(i))
# Stage 1: scrape all listing pages concurrently to collect image URLs.
page_pool = ThreadPool(thread_num)
page_pool.map(get_pic,page_list)
page_pool.close()
page_pool.join()
print "獲取到".decode("utf-8").encode(type_),len(pic_list),"張圖片,開始下載!".decode("utf-8").encode(type_)
# Stage 2: download all collected images concurrently.
pool = ThreadPool(thread_num) 
pool.map(download_pic,pic_list)
pool.close() 
pool.join()
print count,"張圖片保存在".decode("utf-8").encode(type_) + dir_name
print "共耗時".decode("utf-8").encode(type_),time.time() - start_time,"s"

在爬取過程中出現403的話可以采取如下加瀏覽器頭的方法    

headers = {'User-Agent':' Mozilla/5.0 (Windows NT 10.0; rv:39.0) Gecko/20100101 Firefox/39.0'}
req = urllib2.Request(url = url_address,headers = headers)
str_ = urllib2.urlopen(req,timeout = time_out).read()
 本文由用戶 LasonyaHart 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!