import time

import requests
from pyquery import PyQuery as pq
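    # The main twist in this scraper is the pagination parameter since_id: the
    # value for the next page is carried in the previous page's response, so it
    # must be read from each response and updated. The rest is straightforward.
    # Fetch the response data.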
    def get_page(url, timeout=10):
        """Fetch one page of the Weibo container API and return it as parsed JSON.

        The pagination cursor is read from the module-level ``since_id``
        variable and sent as the ``since_id`` query parameter. If that variable
        has not been defined yet, an empty string is used instead.

        Args:
            url: Endpoint to request.
            timeout: Seconds to wait for the server before giving up. Without a
                timeout ``requests`` may block indefinitely.

        Returns:
            The decoded JSON body when the server answers with status 200 and
            valid JSON; ``None`` in every other case (request error, timeout,
            non-200 status, undecodable body).
        """
        # Look the cursor up defensively so the function also works when the
        # module is imported and the script entry point never ran.
        cursor = globals().get('since_id', '')
        print('since_id----', cursor)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'Referer': 'https://m.weibo.cn/u/2830678474',
            'X-Requested-With': 'XMLHttpRequest',
        }
        params = {
            'type': 'uid',
            'value': '2830678474',
            'containerid': '1076032830678474',
            'since_id': cursor,
        }
        try:
            response = requests.get(
                url, headers=headers, params=params, timeout=timeout
            )
            if response.status_code == 200:
                return response.json()
            # Report the failure instead of silently returning None.
            print('Error', f'HTTP {response.status_code}')
        except (requests.RequestException, ValueError) as e:
            # RequestException covers connection errors and timeouts;
            # ValueError covers a body that is not valid JSON.
            print('Error', e)
        return None
    # Parse the response data, including updating the since_id parameter.
    def parse_page(json_data):
        """Yield the posts contained in one API response and advance the cursor.

        This is a generator, so nothing happens until it is iterated. On the
        first iteration step the module-level ``since_id`` is replaced with the
        value found under ``data -> cardlistInfo -> since_id`` so that the next
        request asks for the following page. If the response has no such value,
        ``since_id`` becomes ``None``. If the response has no ``data`` at all
        (for example because the fetch failed), ``since_id`` is left untouched
        and nothing is yielded.

        Args:
            json_data: Decoded JSON response as a dict, or ``None``.

        Yields:
            One dict per card that has an ``mblog`` entry, with the keys
            ``id``, ``created_at``, ``text`` and ``bid``. ``text`` is the
            ``mblog`` text passed through ``pq(...).text()`` (presumably to
            strip HTML markup -- confirm against pyquery).
        """
        global since_id
        data = json_data.get('data') if json_data else None
        if not data:
            return
        # The cursor is per response, not per card, so update it once up front.
        since_id = (data.get('cardlistInfo') or {}).get('since_id')
        for item in data.get('cards') or []:
            mblog = item.get('mblog')
            if not mblog:
                # Some cards carry no post; skip them instead of crashing.
                continue
            yield {
                'id': mblog.get('id'),
                'created_at': mblog.get('created_at'),
                'text': pq(mblog.get('text')).text(),
                'bid': mblog.get('bid'),
            }
    # Run the main program.
    def run(page):
        """Fetch, parse and print one page of posts.

        Args:
            page: Page number, used only in the progress messages.
        """
        url = 'https://m.weibo.cn/api/container/getIndex'
        json_data = get_page(url)
        if json_data is None:
            # get_page returns None when the request fails or the status is
            # not 200; report it rather than handing None to the parser.
            print(f'第{page}页获取失败')
            return
        for result in parse_page(json_data):
            print(result)
        print(f'第{page}页完成')
    # Pagination cursor shared by get_page (which reads it) and parse_page
    # (which updates it). It lives at module level, outside the entry-point
    # guard, so both functions also work when this file is imported.
    # The empty string is the starting value used for the first request.
    since_id = ''

    if __name__ == '__main__':
        # Fetch the first three pages; each run() call advances since_id.
        for page in range(1, 4):
            run(page)