The following Python crawler can be used to scrape the results returned by Weibo search:

    # Author: Wei Meng-Xin
    # E-mail: weimengxin2012@hotmail.com
    # Time: 10/12/2018
    # Tool: Pycharm 2018.2
    # This is the main program file (weibo_search.py).
    # How to run:
    # 1. The Python requests library must be installed first; with Python and pip already
    #    configured, run `pip3 install requests` at a command prompt.
    # 2. Run the whole program through weibo_search_run.py.
    # 3. Make sure weibo_search.py and weibo_search_run.py sit in the same directory, or
    #    create two empty files in your IDE and paste the two pieces of code into them.


    def quote_func(location, keywords):
        '''Convert the Chinese search terms into a percent-encoded URL usable over HTTP.'''
        from urllib.request import quote
        url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D" + location + "+" + keywords + "%26t%3D0&page_type=searchall"
        http = quote(url, safe=";/?:@&=+$,", encoding='utf-8')
        return http


    def login_weibo(location, keywords):
        '''The (fairly involved) main function: crawls the Weibo search results for one keyword.'''
        import requests
        import time
        import random

        res = []
        res_dict = dict()
        addr = quote_func(location, keywords)
        # headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
        #                           " Chrome/69.0.3497.100 Safari/537.36",
        #             'Accept': 'text/html;q=0.9,*/*;q=0.8',
        #             'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        #             'Connection': 'close'}]
        # A pool of request headers; one is picked at random per request to vary the User-Agent.
        headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
                                  " Chrome/69.0.3497.100 Safari/537.36",
                    'Accept': 'text/html;q=0.9,*/*;q=0.8',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Connection': 'close'},
                   {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
                                  " like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
                    'Accept': 'text/html;q=0.9,*/*;q=0.8',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Connection': 'close'},
                   {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
                                  "like Gecko) Version/7.0.3 Safari/7046A194A",
                    'Accept': 'text/html;q=0.9,*/*;q=0.8',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Connection': 'close'},
                   {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                    'Accept': 'application/json, text/plain, */*',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Connection': 'close'},
                   {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                    'Accept': 'application/json, text/plain, */*',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Connection': 'close'}]

        # Proxy server
        proxyHost = "http-dyn.abuyun.com"
        proxyPort = "9020"
        # Proxy tunnel credentials
        proxyUser = "H8J5995E1K5NF88D"
        proxyPass = "2E09DAB9F476C071"
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }
        # Pass proxies=proxies to s.get() below to route requests through the proxy tunnel.
        proxies = {
            "http": proxyMeta,
            "https": proxyMeta,
        }

        k = 1
        try:
            while True:
                urls = addr + "&page=" + str(k)
                # print(urls)
                s = requests.Session()
                s.keep_alive = False
                ob = s.get(urls, headers=random.choice(headers), allow_redirects=False).json()
                time.sleep(1 + random.random() * 3.5)  # random delay to avoid hammering the API
                if ob["ok"] == 1:
                    try:
                        # No results at all for this keyword.
                        if ob["data"]["cards"][0]["desc"] == "抱歉,未找到相关结果。":
                            res.append("NaN")
                    except KeyError:
                        texts = ob["data"]["cards"]
                        for i in texts:
                            try:
                                for w in i["card_group"]:
                                    # keep card_type 9 (a weibo post) and filter out accounts with verified_type == 1
                                    if w["card_type"] == 9 and w["mblog"]["user"]["verified_type"] != 1:
                                        time_obj = w["mblog"]["created_at"]
                                        # print(time_obj)
                                        res.append(time_obj)
                            except KeyError:
                                if i["card_type"] == 9 and i["mblog"]["user"]["verified_type"] != 1:
                                    time_obj = i["mblog"]["created_at"]
                                    res.append(time_obj)
                    k += 1
                    s.close()
                elif ob["msg"] == "这里还没有内容":
                    # print("这里没有内容")
                    # No more pages for this keyword: stop paging.
                    s.close()
                    break
            res_dict[keywords] = res
            return res_dict
        except requests.exceptions.ProxyError:
            # Retry the whole keyword if the proxy drops the connection.
            return login_weibo(location, keywords)


    def search_weibo(local):
        '''Loop over all keywords (the list can be customized) for one location.'''
        res = []
        result = dict()
        # keys = ["电子政务", "政务公开", "政务服务", "行政职权", "腐败", "反腐", "监督", "政务平台", "行政体制", "服务型政府",
        #         "依法行政", "阳光施政", "政务服务试点", "行政权力", "公开", "透明", "电子政务平台", "信息共享", "政府信息",
        #         "便民服务", "职权法定", "权责一致", "监察", "政务网络", "政府网站", "廉政"]
        # keys = ["教育", "基础教育", "在校学生", "中学", "小学", "医院", "医疗", "床位", "社会福利", "福利院", "基础设施投资",
        #         "固定资产投资", "工业用地", "土地拍卖", "协议转让", "土地价格", "政企合谋", "管理费用", "官商勾结"]
        keys = ["基本建设", "基础设施", "交通", "公共交通", "公路", "铁路", "机场", "航空", "桥梁", "轻轨", "公共汽车",
                "地铁", "水运", "港口", "邮政", "通信", "电信", "电话", "网络", "电网", "园林", "绿化", "垃圾清除",
                "污水处理", "防卫防灾", "安全系统", "电力", "煤气", "热力", "自来水", "供水", "供电", "水利", "公共服务", "入学率",
                "环保", "环卫", "生态环境", "科技", "卫生", "文化", "体育"]
        for word in keys:
            keyword = login_weibo(local, word)
            res.append(keyword)
            print("-{ %s }- keyword -{ %s }- finished: %d items in total" % (local, word, len(keyword[word])))
        result[local] = res
        return result


    # def output_data(all_data, output_file_name):
    #     '''Write the final data out to a CSV file.'''
    #     import pandas as pd
    #     name = ["contents"]
    #     table = pd.DataFrame(columns=name, data=all_data)
    #     table.to_csv("C:\\Users\\viemax\\Desktop\\" + output_file_name + ".csv")
    #     return table
    #
    #
    # def read_csv(name):
    #     '''Read data from a CSV file.'''
    #     import csv
    #     csv_file = csv.reader(open("C:\\Users\\viemax\\Desktop\\" + name + ".csv", "r"))
    #     object_website = []
    #     for i in csv_file:
    #         object_website.append(i)
    #         # print(i)
    #     return object_website

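Before handing everything to the multiprocessing runner below, it can help to sanity-check a single location/keyword pair. The following minimal sketch assumes the code above is saved as weibo_search.py, as the run instructions describe; the location 北京 is an illustrative input, and 地铁 is one of the keywords from the keys list.

    # Quick single-keyword check (illustrative inputs; assumes the code above is saved as weibo_search.py).
    import weibo_search

    # quote_func builds the percent-encoded m.weibo.cn search URL for one location/keyword pair.
    url = weibo_search.quote_func("北京", "地铁")
    print(url)

    # login_weibo pages through the search results and returns {keyword: [created_at, ...]}.
    result = weibo_search.login_weibo("北京", "地铁")
    print(len(result["地铁"]))
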
The following code uses multiprocessing to improve the crawler's efficiency:

    # Author: Wei Meng-Xin
    # E-mail: weimengxin2012@hotmail.com
    # Time: 10/13/2018
    # Tool: Pycharm 2018.2
    # This is the run file (weibo_search_run.py).

    import multiprocessing as mp
    import weibo_search
    import time


    def read_csv(name):
        '''Read data from a CSV file.'''
        import csv
        csv_file = csv.reader(open("C:\\Users\\viemax\\Desktop\\" + name + ".csv", "r"))
        object_website = []
        for i in csv_file:
            object_website.append(i)
            # print(i)
        return object_website


    if __name__ == "__main__":
        # Run the per-location searches concurrently in a process pool.
        object_location = []
        location = read_csv("weibo")
        for i in location[1:]:  # skip the header row; the second column holds the location name
            object_location.append(i[1])

        # obj = object_location[6::8]
        # obj_1 = obj[1::4][0::2]
        # object_location.remove("习水县")

        start = time.time()
        pool = mp.Pool()
        res_uid = pool.map(weibo_search.search_weibo, object_location)
        pool.close()
        pool.join()
        end = time.time()
        print("Total time elapsed: %f" % (end - start))