The following Python crawler can be used to scrape the results returned by "Weibo search" (微博搜索):
# Author: Wei Meng-Xin
# E-mail: weimengxin2012@hotmail.com
# Time: 10/12/2018
# Tool: Pycharm 2018.2
# This file is the main program file
# How to run:
# 1. Install the Python requests library before running, e.g. by typing pip3 install requests at a command prompt (assuming Python and pip are already configured);
# 2. Run the whole program via the weibo_search_run.py file;
# 3. Make sure weibo_search.py and weibo_search_run.py sit in the same directory; alternatively, create two empty files in your IDE and paste the two pieces of code into them.
def quote_func(location, keywords):
    '''Percent-encode the Chinese characters into a UTF-8 URL that HTTP can carry'''
from urllib.request import quote
url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D" + location + "+" + keywords + "%26t%3D0&page_type=searchall"
http = quote(url, safe=";/?:@&=+$,", encoding='utf-8')
return http
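# A quick sanity check of quote_func (hypothetical sample values; uncomment to try):
# print(quote_func("贵阳市", "地铁"))
# quote() keeps the characters listed in safe=";/?:@&=+$," and percent-encodes the
# rest, so the Chinese keywords become UTF-8 escapes such as %E8%B4%B5%E9%98%B3%E5%B8%82;
# note that '%' itself is not in the safe list, so the %3D/%26 already present in the
# template are escaped once more.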
def login_weibo(location, keywords):
    '''The main workhorse: crawls the Weibo search results for one location/keyword pair'''
import requests
import time
import random
res = []
res_dict = dict()
addr = quote_func(location, keywords)
# headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
# " Chrome/69.0.3497.100 Safari/537.36",
# 'Accept': 'text/html;q=0.9,*/*;q=0.8',
# 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
# 'Connection': 'close'}]
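    # A pool of desktop and mobile User-Agent headers; random.choice picks one per
    # request below, which makes the traffic look less like a single client.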
headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
" like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
"like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'}]
    # Proxy server (Abuyun dynamic HTTP proxy)
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
    # Proxy tunnel credentials (replace these with your own Abuyun account)
proxyUser = "H8J5995E1K5NF88D"
proxyPass = "2E09DAB9F476C071"
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host": proxyHost,
"port": proxyPort,
"user": proxyUser,
"pass": proxyPass,
}
proxies = {
"http": proxyMeta,
"https": proxyMeta,
}
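    # Passing this dict via get(..., proxies=proxies) routes both http and
    # https requests through the tunnel.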
k = 1
try:
while True:
urls = addr + "&page=" + str(k)
# print(urls)
s = requests.Session()
s.keep_alive = False
            ob = s.get(urls, headers=random.choice(headers), proxies=proxies,
                       allow_redirects=False).json()
time.sleep(1 + random.random() * 3.5)
if ob["ok"] == 1:
try:
if ob["data"]["cards"][0]["desc"] == "抱歉,未找到相关结果。":
res.append("NaN")
except KeyError:
texts = ob["data"]["cards"]
for i in texts:
try:
for w in i["card_group"]:
if w["card_type"] == 9 and w["mblog"]["user"]["verified_type"] != 1:
time_obj = w["mblog"]["created_at"]
# print(time_obj)
res.append(time_obj)
except KeyError:
if i["card_type"] == 9 and i["mblog"]["user"]["verified_type"] != 1:
time_obj = i["mblog"]["created_at"]
res.append(time_obj)
k += 1
s.close()
elif ob["msg"] == "这里还没有内容":
# print("这里没有内容")
s.close()
break
res_dict[keywords] = res
return res_dict
except requests.exceptions.ProxyError:
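        # The proxy tunnel dropped the connection: retry this location/keyword from the start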
return login_weibo(location, keywords)
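# Calling login_weibo directly (hypothetical values; uncomment to try):
# res = login_weibo("贵阳市", "地铁")
# res maps the keyword to all created_at strings collected across the result
# pages, e.g. {"地铁": [...]}; the list contains "NaN" when nothing was found.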
def search_weibo(local):
    '''Walk through all the keywords for one location; the keyword list is customizable'''
res = []
result = dict()
# keys = ["电子政务", "政务公开", "政务服务", "行政职权", "腐败", "反腐", "监督", "政务平台", "行政体制", "服务型政府",
# "依法行政", "阳光施政", "政务服务试点", "行政权力", "公开", "透明", "电子政务平台", "信息共享", "政府信息",
# "便民服务", "职权法定", "权责一致", "监察", "政务网络", "政府网站", "廉政"]
# keys = ["教育", "基础教育", "在校学生", "中学", "小学", "医院", "医疗", "床位", "社会福利", "福利院", "基础设施投资",
# "固定资产投资", "工业用地", "土地拍卖", "协议转让", "土地价格", "政企合谋", "管理费用", "官商勾结"]
keys = ["基本建设", "基础设施", "交通", "公共交通", "公路", "铁路", "机场", "航空", "桥梁", "轻轨", "公共汽车",
"地铁", "水运", "港口", "邮政", "通信", "电信", "电话", "网络", "电网", "园林", "绿化", "垃圾清除",
"污水处理", "防卫防灾", "安全系统", "电力", "煤气", "热力", "自来水", "供水", "供电", "水利", "公共服务", "入学率",
"环保", "环卫", "生态环境", "科技", "卫生", "文化", "体育"]
for word in keys:
keyword = login_weibo(local, word)
res.append(keyword)
print("-{ %s }-关键字-{ %s }-检索完毕:共%d个" % (local, word, len(keyword[word])))
result[local] = res
return result
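# For a single (hypothetical) location, search_weibo("贵阳市") returns a dict of the
# form {"贵阳市": [{"基本建设": [...]}, {"基础设施": [...]}, ...]}, with one inner
# dict per keyword in keys.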
# def output_data(all_data, output_file_name):
# '''Write the final data out to a CSV file'''
# import pandas as pd
# name = ["contents"]
# table = pd.DataFrame(columns=name, data=all_data)
# table.to_csv("C:\\Users\\viemax\\Desktop\\" + output_file_name + ".csv")
# return table
#
#
# def read_csv(name):
# '''Read data back from a CSV file'''
# import csv
# csv_file = csv.reader(open("C:\\Users\\viemax\\Desktop\\" + name + ".csv", "r"))
# object_website = []
# for i in csv_file:
# object_website.append(i)
# # print(i)
# return object_website
The following code uses multiprocessing to make the crawler faster:
# Author: Wei Meng-Xin
# E-mail: weimengxin2012@hotmail.com
# Time: 10/13/2018
# Tool: Pycharm 2018.2
# This file is the runner
import multiprocessing as mp
import weibo_search
import time
def read_csv(name):
    '''Read data back from a CSV file'''
import csv
csv_file = csv.reader(open("C:\\Users\\viemax\\Desktop\\" + name + ".csv", "r"))
object_website = []
for i in csv_file:
object_website.append(i)
# print(i)
return object_website
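# The main block below skips the header row of weibo.csv and keeps column 1 (the
# second column) of each remaining row as a location name. A minimal sketch of the
# assumed layout (hypothetical values):
# index,location
# 0,贵阳市
# 1,遵义市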
if __name__ == "__main__":
    # Fan the locations out across a pool of worker processes
object_location = []
location = read_csv("weibo")
for i in location[1:]:
object_location.append(i[1])
# obj = object_location[6::8]
# obj_1 = obj[1::4][0::2]
# object_location.remove("习水县")
start = time.time()
pool = mp.Pool()
res_uid = pool.map(weibo_search.search_weibo, object_location)
end = time.time()
print("总共耗时:%f" % (end - start))