Python採集百度地圖數據
百度利用其強大的中文搜索引擎數據,結合地圖應用,包含了海量的公司聯繫方式,比Google要強,更別說什麼黃頁網站了。
因為一些業務需要,寫了這個行業公司地址採集程序,使用方便,直接運行,支持命令行設定查詢參數。
使用方法:
把代碼保存成bmap.py
python bmap.py
或
python bmap.py 服飾廠
運行後會自動採集百度地圖中所有的結果,保存為以tab分隔的txt文件,方便導入各種數據庫。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2012 Channing Wong
#
# @mail: channing.wong@yahoo.com
# @home: http://blog.3363.me/
# @date: Mar 3, 2012
#
"""Collect Baidu Map search results for a keyword into a tab-separated file.

Usage::

    python bmap.py [keyword]

One row per result is written to ``KEYWORD.txt`` with the columns
city name, place name, address and telephone, separated by tabs.
The script runs on both Python 2 and Python 3.
"""

import io
import json
import sys
import time
import types

try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode, urlopen

if sys.version_info[0] == 2:
    # Python 2 only: keep the original default-encoding switch so implicit
    # str/unicode conversions (URL encoding, console output) keep using UTF-8.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')


class BaiduMap:
    """Query map.baidu.com for a keyword and dump every hit to a text file.

    Construction performs the initial nationwide query (network I/O) to
    discover which cities have results; ``get_all`` then walks every city
    page by page and writes the rows.
    """

    def __init__(self, keyword):
        """Run the city discovery query and open ``KEYWORD.txt`` for writing.

        Exits the process (status 1) when the service does not return a
        city list for ``keyword``.
        """
        self.keyword = keyword
        # Parameters of the nationwide search used to discover the cities.
        self.query = [
            ('b', '(-1599062.039999999,811604.75;24779177.96,8168020.75)'),
            ('c', '1'),
            ('from', 'webmap'),
            ('ie', 'utf-8'),
            ('l', '4'),
            ('newmap', '1'),
            ('qt', 's'),
            ('src', '0'),
            ('sug', '0'),
            ('t', int(time.time())),
            ('tn', 'B_NORMAL_MAP'),
            ('wd', keyword),
            ('wd2', '')
        ]
        self.mapurl = 'http://map.baidu.com/'
        self.count = 0      # rows written so far, over all cities
        self.count_c = 0    # rows written so far for the current city
        self.total_num = 0  # sum of the per-city 'num' fields
        self.file = None
        self._get_city()
        # Open the output only after the keyword was accepted, so a rejected
        # keyword (or a failed request) does not leave an empty file behind.
        self.file = io.open('%s.txt' % keyword, 'w', encoding='utf-8')

    def _fetch(self, query=None, json=True):
        """GET ``mapurl`` with ``query`` as the query string.

        Returns the decoded JSON object when ``json`` is true (``None`` if
        the body is not valid JSON), otherwise the raw response body.
        Network errors are not caught here and propagate to the caller.
        """
        # NOTE: the ``json`` parameter shadows the json module inside this
        # method only; the decoding itself lives in ``_tojson``.
        url = self.mapurl + '?' + urlencode(query or [])
        response = urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()
        if json:
            return self._tojson(data)
        return data

    def _tojson(self, data):
        """Decode a UTF-8 JSON payload; return ``None`` when it is invalid."""
        try:
            if isinstance(data, bytes):
                data = data.decode('utf-8')
            return json.loads(data)
        except (TypeError, ValueError):
            # ValueError covers both malformed JSON and invalid UTF-8.
            return None

    def _get_city(self):
        """Populate ``self.city`` and ``self.total_num`` from the first query.

        Prints an error and exits with status 1 when the response has no
        list under ``'content'`` (including an unparsable response).
        """
        data = self._fetch(self.query)
        content = data.get('content') if isinstance(data, dict) else None
        if not isinstance(content, list):
            print('keyworld error.')
            sys.exit(1)
        self.city = content
        for c in data.get('more_city') or []:
            self.city.extend(c['city'])
        for city in self.city:
            self.total_num += city['num']

    def _get_data(self, city, page=0):
        """Fetch one result page (``page`` is zero-based) for ``city``."""
        query = [
            ('addr', '0'),
            # The second '|'-separated field of 'geo' is used as the bounds.
            ('b', '(%s)' % city['geo'].split('|')[1]),
            ('c', city['code']),
            ('db', '0'),
            ('gr', '3'),
            ('ie', 'utf-8'),
            ('l', '9'),
            ('newmap', '1'),
            ('on_gel', '1'),
            ('pn', page),
            ('qt', 'con'),
            ('src', '7'),
            ('sug', '0'),
            ('t', int(time.time())),
            ('tn', 'B_NORMAL_MAP'),
            ('wd', self.keyword),
            ('wd2', ''),
        ]
        return self._fetch(query)

    def _save(self, content, city):
        """Write one tab-separated row per entry of ``content``.

        A missing ``'addr'`` or ``'tel'`` is written as an empty column.
        Progress is printed after every row.
        """
        for c in content:
            self.count += 1
            self.count_c += 1
            # Unicode format string: the output file is opened in text mode.
            row = u'%s\t%s\t%s\t%s\n' % (
                city['name'], c['name'], c.get('addr', ''), c.get('tel', ''))
            self.file.write(row)
            print('(%s/%s) %s[%s/%s]' % (self.count, self.total_num,
                                         city['name'], self.count_c,
                                         city['num']))

    @staticmethod
    def _page_count(num, per_page=10):
        """Return how many pages of ``per_page`` rows hold ``num`` results."""
        # Ceiling division that does not depend on Python 2 '/' semantics.
        return max(0, -(-num // per_page))

    def get(self, city):
        """Download and save every result page of a single ``city``."""
        self.count_c = 0
        for page in range(self._page_count(city['num'])):
            data = self._get_data(city, page)
            content = data.get('content') if isinstance(data, dict) else None
            # Pages that failed to parse or carry no result list are skipped.
            if isinstance(content, list):
                self._save(content, city)

    def get_all(self):
        """Process every discovered city, then close the output file."""
        try:
            for city in self.city:
                self.get(city)
        finally:
            # Close the file even when a request fails part-way through.
            self.file.close()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    else:
        keyword = '鉆石'
    baidumap = BaiduMap(keyword)
    print('_' * 20)
    print('CITY: %s' % len(baidumap.city))
    print('DATA: %s' % baidumap.total_num)
    baidumap.get_all()
本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯繫管理員。
轉載本站原創文章,請註明出處,並保留原始鏈接、圖片水印。
本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!