Python采集百度地圖數據

jopen 12年前發布 | 124K 次閱讀 Python Python開發

百度利用其強大的中文搜索引擎數據,結合地圖應用,包含了海量的公司聯系方式,比Google要強,更別說什么黃頁網站了。
因為一些業務需要,寫了這個行業公司地址采集程序,使用方便,直接運行,支持命令行設定查詢參數。

使用方法:
把代碼保存成bmap.py
python bmap.py

python bmap.py 服飾廠

運行后會自動采集百度地圖中所有的結果,保存為以tab分割的txt文件,方便導入各種數據庫。 10164653_enfn.png

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2012 Channing Wong
#
# @mail: channing.wong@yahoo.com
# @home: http://blog.3363.me/
# @date: Mar 3, 2012
#

import json
import sys
import time
import types
import urllib

reload(sys)
sys.setdefaultencoding('utf-8')


class BaiduMap:
    """
    """
    def __init__(self, keyword):
        self.keyword = keyword
        self.query = [
                ('b', '(-1599062.039999999,811604.75;24779177.96,8168020.75)'),
                ('c', '1'),
                ('from', 'webmap'),
                ('ie', 'utf-8'),
                ('l', '4'),
                ('newmap', '1'),
                ('qt', 's'),
                ('src', '0'),
                ('sug', '0'),
                ('t', time.time().__int__()),
                ('tn', 'B_NORMAL_MAP'),
                ('wd', keyword),
                ('wd2', '')
                 ]
        self.mapurl = 'http://map.baidu.com/'
        self.file = open('%s.txt' % keyword, 'w')
        self.count = 0
        self.count_c = 0
        self.total_num = 0

        self._get_city()

    def _fetch(self, query=None, json=True):
        data = urllib.urlencode(query)
        url = self.mapurl + '?' + data
        opener = urllib.FancyURLopener()
        data = opener.open(url).read()

        if json:
            return self._tojson(data)
        else:
            return data

    def _tojson(self, data):
        try:
            js = json.loads(data, 'utf-8')
        except:
            js = None

        return js

    def _get_city(self):
        data = self._fetch(self.query)

        if type(data['content']) is not types.ListType:
            print 'keyworld error.'
            sys.exit()

        self.city = data['content']

        if data.has_key('more_city'):
            for c in data['more_city']:
                self.city.extend(c['city'])

        for city in self.city:
            self.total_num += city['num']

    def _get_data(self, city, page=0):
        query = [
                ('addr', '0'),
                ('b', '(%s)' % city['geo'].split('|')[1]),
                ('c', city['code']),
                ('db', '0'),
                ('gr', '3'),
                ('ie', 'utf-8'),
                ('l', '9'),
                ('newmap', '1'),
                ('on_gel', '1'),
                ('pn', page),
                ('qt', 'con'),
                ('src', '7'),
                ('sug', '0'),
                ('t', time.time().__int__()),
                ('tn', 'B_NORMAL_MAP'),
                ('wd', self.keyword),
                ('wd2', ''),
                 ]
        data = self._fetch(query)
        return data

    def _save(self, content, city):
        for c in content:
            self.count += 1
            self.count_c += 1
            if c.has_key('tel'):
                tel = c['tel']
            else:
                tel = ''

            _data = '%s\t%s\t%s\t%s\n' % (city['name'], c['name'], c['addr'], tel)
            self.file.write(_data)
            print '(%s/%s) %s[%s/%s]' % (self.count, self.total_num, city['name'], self.count_c, city['num'])

    def get(self, city):
        self.count_c = 0
        pages = abs(-city['num'] / 10)
        for page in range(0, pages):
            data = self._get_data(city, page)
            if data.has_key('content'):
                self._save(data['content'], city)

    def get_all(self):
        for city in self.city:
            self.get(city)

        self.file.close()


if __name__ == '__main__':
    if sys.argv.__len__() > 1:
        keyword = sys.argv[1]
    else:
        keyword = '鉆石'

    baidumap = BaiduMap(keyword)
    print '_' * 20
    print 'CITY: %s' % baidumap.city.__len__()
    print 'DATA: %s' % baidumap.total_num
    baidumap.get_all()
 本文由用戶 jopen 自行上傳分享,僅供網友學習交流。所有權歸原作者,若您的權利被侵害,請聯系管理員。
 轉載本站原創文章,請注明出處,并保留原始鏈接、圖片水印。
 本站是一個以用戶分享為主的開源技術平臺,歡迎各類分享!