Python爬蟲之正則表達式 PK Pyquery
抓取的目標網頁:http://ypk.39.net/2017019/manual
主要抓取內容為藥品說明書內容
下面先給出正則表達式的抓取方式:
#-*- coding:gbk -*-
import urllib2
import re
import random
import os
def ziduantiqu(page, outdir='E:/yaopintong1/'):
    """Extract the manual text of one page and append it to a text file.

    The output file is named after the text captured next to the approval
    number label on the page, reduced to the characters A-Z and 0-9. When a
    file with that name already exists in *outdir*, ``_1`` is appended to the
    name. If the page yields several approval-number matches, the last one
    decides the file name.

    Args:
        page: HTML source of a drug "manual" page.
        outdir: Directory that receives the ``.txt`` files; defaults to the
            directory the script has always written to.

    Returns:
        None. The extracted text is printed and appended to the output file.
        When no approval number is found, nothing is written.
    """
    # NOTE(review): the label literal below has to match the page text in the
    # file's declared source encoding - confirm it against the live page.
    pattern1 = re.compile('<div class="tab_box">(.*?)批準文號.*?<dd>(.*?)<.*?</div>', re.S)
    pattern2 = re.compile('<div class="tab_box">(.*?)<dt>(.*?)</div>', re.S)
    # Compiled once instead of once per matched section.
    tag_pattern = re.compile(r'<[^>]+>', re.S)

    # Closing tags become line breaks so the text keeps its structure after
    # the remaining markup is stripped. Order matters: '</strong><br/>' has
    # to be handled before the plain '<br/>'.
    replacements = (
        ('</dt>', '\n'),
        ('</strong><br/>', ''),
        ('<br/>', '\n'),
        ('</p>', '\n'),
        ('</dd>', '\n'),
        # NOTE(review): a no-op for ASCII spaces, which the whitespace strip
        # has already removed; presumably this was the '&nbsp;' entity or a
        # non-breaking space in the original script - confirm.
        (' ', ''),
    )

    filename = None
    for item1 in pattern1.findall(page):
        filename = re.sub("[^A-Z0-9]", "", item1[1])
        if os.path.exists(os.path.join(outdir, filename + '.txt')):
            filename = filename + '_1'
        print(filename)

    if filename is None:
        # Without an approval number there is no file name to write to; the
        # old code died with a NameError here and took the whole crawl down.
        print('no approval number found, page skipped')
        return

    target = os.path.join(outdir, filename + '.txt')
    for item2 in pattern2.findall(page):
        content = re.sub(r"[\s]", "", item2[1])
        for old, new in replacements:
            content = content.replace(old, new)
        dd = tag_pattern.sub('', content)
        print(dd)
        # 'with' guarantees the handle is closed even if the write fails.
        with open(target, 'a') as f1:
            f1.write(dd)
def proxy():
    """Route subsequent urllib2 HTTP requests through a randomly picked proxy.

    One entry of the built-in proxy list is chosen, its index is printed, and
    an opener using that proxy for ``http`` is installed as the global
    urllib2 opener.
    """
    candidates = (
        '59.39.88.190:8080',
        '59.41.154.148:3128',
        '59.41.239.13:9797',
        '59.42.251.197:80',
        '59.42.251.214:80',
        '59.42.251.215:80',
        '59.42.251.216:80',
        '59.49.145.151:3128',
        '59.49.248.216:3128',
    )
    # randint is inclusive on both ends, hence the "- 1".
    index = random.randint(0, len(candidates) - 1)
    print(index)
    handler = urllib2.ProxyHandler({'http': candidates[index]})
    urllib2.install_opener(urllib2.build_opener(handler))
if __name__ == '__main__':
    # Driver: read base URLs from E://url2.txt (one per line), fetch the
    # "manual" page of each through a random proxy and hand the HTML to
    # ziduantiqu(). URLs that fail to download are appended to
    # E:/url2_error.txt.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    headers = {'User-Agent': user_agent}  # identical for every request

    with open("E://url2.txt") as url_file:
        for i, raw_line in enumerate(url_file, 1):
            line = raw_line.rstrip()
            if not line:
                # As before, the first empty line (or end of file) ends the run.
                break
            print('開始抓取第---' + str(i) + '------頁內容')
            proxy()
            url = line + 'manual'
            request = urllib2.Request(url, headers=headers)
            try:
                response = urllib2.urlopen(request, timeout=30)
                try:
                    page = response.read()
                finally:
                    # Release the connection whether or not the read succeeded.
                    response.close()
            except Exception as e:
                # Report the concrete exception type; the bare class
                # ``Exception`` printed before carried no information.
                print('%s : %s' % (type(e).__name__, e))
                with open('E:/url2_error.txt', 'a') as error_log:
                    error_log.write(line + '\n')
            else:
                ziduantiqu(page)
                print('第---' + str(i) + '------頁內容抓取完成')
下面給出pyquery模塊代碼
>>> from pyquery import PyQuery as pq
>>> from lxml import etree
>>> import re
>>> v_source = pq(url = 'http://ypk.39.net/2017019/manual')
>>> for data in v_source('div').filter('.tab_box'):
for i in range(len(pq(data).find('dt'))):
f = open('yaopin.txt','a')
f.write(re.sub("[\s]","",pq(data).find('dt').eq(i).text().encode('utf8')))
f.write('\n')
f.write(pq(data).find('dd').eq(i).text().encode('utf8'))
f.write('\n')
f.close()
print pq(data).find('dt').eq(i).text()
print pq(data).find('dd').eq(i).text()
本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!