簡單的電子郵件爬蟲Python代碼
import re
import sys
try:
    from urllib.parse import urljoin
except ImportError:  # Python 2 fallback
    from urlparse import urljoin

import requests

# regex
# Pattern for e-mail addresses: local part, "@", domain, then a literal dot
# and the top-level domain.  The dot before the TLD must be escaped; left
# bare it matches any character, so strings such as "user@hostXcom" were
# wrongly accepted.
# NOTE(review): the character classes also admit commas, as in the original
# pattern -- confirm whether that is intentional.
email_re = re.compile(r'([\w.,]+@[\w.,]+\.\w+)')

# Pattern capturing the (non-greedy) value of every double-quoted href
# attribute.
link_re = re.compile(r'href="(.*?)"')
def crawl(url, follow_links=False, timeout=10):
    """Collect the e-mail addresses found on a web page.

    The page at ``url`` is downloaded and scanned once with ``email_re``.
    When ``follow_links`` is true, every distinct http(s) link found on the
    page (via ``link_re``) is downloaded as well, one level deep, and scanned
    the same way.

    Args:
        url: Address of the page to start from.
        follow_links: If true, also fetch and scan each page linked from
            ``url``.  Defaults to False, so a plain ``crawl(url)`` still
            issues a single request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the crawl.

    Returns:
        A set of the matched address strings.  The set is empty when the
        start page does not answer with HTTP 200.

    Raises:
        requests.RequestException: If the request for ``url`` itself fails.
            Failures on linked pages are skipped instead.
    """
    response = requests.get(url, timeout=timeout)

    # Anything but a plain 200 is treated as "nothing to scan".  An empty set
    # keeps the return type the same on every path.
    if response.status_code != 200:
        return set()

    links = link_re.findall(response.text)
    print("\nFound {} links".format(len(links)))

    # Scan the start page exactly once.  Doing this inside the link loop
    # repeated identical work per link and missed every address on pages
    # that contain no links at all.
    result = set(email_re.findall(response.text))

    if follow_links:
        # Resolve relative links against the start URL and de-duplicate so
        # each target is requested at most once.
        targets = {urljoin(url, link) for link in links}
        for target in targets:
            # Skip mailto:, javascript:, etc. -- requests cannot fetch them.
            if not target.startswith(('http://', 'https://')):
                continue
            try:
                linked = requests.get(target, timeout=timeout)
            except requests.RequestException:
                # Best effort: one unreachable link must not abort the crawl.
                continue
            if linked.status_code == 200:
                result.update(email_re.findall(linked.text))

    return result
# Script entry point: crawl the URL given on the command line and print every
# address found.
if __name__ == '__main__':
    # NOTE(review): the start URL that was hard-coded here is truncated in
    # this copy of the script, so it is read from the command line instead.
    if len(sys.argv) != 2:
        sys.exit("usage: {} URL".format(sys.argv[0]))
    emails = crawl(sys.argv[1])

    print("\nScrapped e-mail addresses:")
    for email in emails:
        print(email)
    print("\n")
本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!