Python Scraping: Regular Expressions vs. PyQuery


Target page: http://ypk.39.net/2017019/manual

The goal is to extract the drug's instruction leaflet from that page.

First, the regular-expression approach:

# -*- coding: gbk -*-

import urllib2
import re
import random
import os

def ziduantiqu(page):
    # Extract the drug's approval number (批準文號), used as the file name,
    # and every field inside the tab_box block, used as the file content.
    pattern1 = re.compile('<div class="tab_box">(.*?)批準文號.*?<dd>(.*?)<.*?</div>',re.S)
    pattern2 = re.compile('<div class="tab_box">(.*?)<dt>(.*?)</div>',re.S)
    items1 = re.findall(pattern1,page)
    if not items1:
        return    # no approval number on this page, skip it
    for item1 in items1:
        # keep only capital letters and digits to form the file name
        filename = re.sub("[^A-Z0-9]","",item1[1])
        if (filename+'.txt') in os.listdir("E:/yaopintong1/"):
            filename = filename + '_1'    # avoid clobbering an existing file
        print filename
    items2 = re.findall(pattern2,page)
    for item2 in items2:
        content = re.sub("[\s]","",item2[1])      # strip all whitespace
        # turn block-closing tags into newlines before stripping the rest
        content = content.replace('</dt>','\n')
        content = content.replace('</strong><br/>','')
        content = content.replace('<br/>','\n')
        content = content.replace('</p>','\n')
        content = content.replace('</dd>','\n')
        content = content.replace('&nbsp;','')
        dr = re.compile(r'<[^>]+>',re.S)          # any remaining HTML tag
        dd = dr.sub('',content)
        print dd
        f1 = open('E:/yaopintong1/'+filename+'.txt','a')
        f1.write(dd)
        f1.close()

def proxy():
    # Install a randomly chosen HTTP proxy as the global urllib2 opener,
    # so each request may go out through a different address.
    proxylist = ('59.39.88.190:8080',
                '59.41.154.148:3128',
                '59.41.239.13:9797',
                '59.42.251.197:80',
                '59.42.251.214:80',
                '59.42.251.215:80',
                '59.42.251.216:80',
                '59.49.145.151:3128',
                '59.49.248.216:3128')
    proxy = random.choice(proxylist)
    print proxy
    proxies = {'http': proxy}
    proxy_support = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)

if __name__ == '__main__':

    # url2.txt holds one drug-page URL per line; appending 'manual'
    # points at the instruction-leaflet page.
    url_file = open("E:/url2.txt")
    i = 1
    while 1:
        line = url_file.readline().rstrip()
        if not line:
            break
        print 'Scraping page %d ...' % i
        proxy()
        url = line + 'manual'
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
        headers = { 'User-Agent' : user_agent }
        request = urllib2.Request(url,headers = headers)
        try:
            response = urllib2.urlopen(request, timeout = 30)
            page = response.read()
        except Exception, e:
            print Exception,":",e
            # log the failed URL so it can be retried later
            f1 = open('E:/url2_error.txt','a')
            f1.write(line+ '\n')
            f1.close()
        else:
            ziduantiqu(page)
        print 'Page %d done' % i
        i = i + 1
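
To see what pattern1 actually matches, here is a minimal sketch run against a hand-written HTML fragment. The fragment and the approval number in it are invented for illustration; the real page markup may differ:

# -*- coding: gbk -*-
import re

# invented fragment mimicking the tab_box structure (illustration only)
sample = ('<div class="tab_box"><dl><dt>批準文號</dt><dd>H20170190</dd>'
          '<dt>用法用量</dt><dd>口服。</dd></dl></div>')

# re.S lets '.' match newlines; the non-greedy '.*?' stops at the first
# delimiter that follows it, which keeps each capture group small.
pattern1 = re.compile('<div class="tab_box">(.*?)批準文號.*?<dd>(.*?)<.*?</div>', re.S)
for item in re.findall(pattern1, sample):
    print re.sub("[^A-Z0-9]", "", item[1])    # prints: H20170190

The same non-greedy idiom drives pattern2, which grabs everything from the first <dt> to the end of the block.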



The program above scrapes in batch, reading the page URLs from a plain-text file. Look at the pile of match-and-replace operations it needs; rather ugly, isn't it?

Now here is the pyquery version:

>>> from pyquery import PyQuery as pq
>>> import re
>>> v_source = pq(url = 'http://ypk.39.net/2017019/manual')
>>> for data in v_source('div').filter('.tab_box'):
...     for i in range(len(pq(data).find('dt'))):
...         f = open('yaopin.txt','a')
...         f.write(re.sub("[\s]","",pq(data).find('dt').eq(i).text().encode('utf8')))
...         f.write('\n')
...         f.write(pq(data).find('dd').eq(i).text().encode('utf8'))
...         f.write('\n')
...         f.close()
...         print pq(data).find('dt').eq(i).text()
...         print pq(data).find('dd').eq(i).text()
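
The loop can be tightened further. The sketch below assumes, as on the page above, that each tab_box holds parallel <dt>/<dd> lists; .items() and zip() replace the index-based pq(data).find(...).eq(i) calls:

from pyquery import PyQuery as pq

doc = pq(url='http://ypk.39.net/2017019/manual')
for box in doc('div.tab_box').items():
    # .items() yields each match already wrapped as a PyQuery object,
    # so there is no need to re-wrap with pq(data) on every access
    labels = [d.text() for d in box.find('dt').items()]
    values = [d.text() for d in box.find('dd').items()]
    for label, value in zip(labels, values):    # pair each field with its value
        print label, ':', value

Writing the pairs to yaopin.txt would work exactly as in the session above.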



Source: http://my.oschina.net/dfsj66011/blog/598826
