import requests, time
from pyquery import PyQuery as pq
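# The one unusual part of this scraper is the pagination parameter since_id:
# the value needed to request the next page is returned in the previous page's
# response, so it must be read from every response and updated before the next
# request. Everything else is straightforward.
# Fetch the response data.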
def get_page(url):
    """Request one page of the timeline and return the decoded JSON.

    The pagination cursor is read from the module-level ``since_id`` global
    and sent as the ``since_id`` query parameter.

    Args:
        url: Endpoint to send the GET request to.

    Returns:
        The decoded JSON body when the server answers with status 200,
        otherwise ``None`` (non-200 status, request failure, or a body that
        is not valid JSON).
    """
    print('since_id----', since_id)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Referer': 'https://m.weibo.cn/u/2830678474',
        'X-Requested-With': 'XMLHttpRequest',
    }
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'since_id': since_id,
    }
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, params=params, timeout=10)
        if response.status_code != 200:
            print('Error', 'unexpected status code', response.status_code)
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        print('Error', e)
        return None
# Parse the response data, including updating the since_id cursor.
def parse_page(json_data):
    """Yield one dict per post contained in a decoded JSON response.

    As a side effect, the value found at ``data.cardlistInfo.since_id`` is
    stored in the module-level ``since_id`` global so that the next request
    can use it. Because this is a generator, nothing happens (including the
    ``since_id`` update) until the result is iterated.

    Args:
        json_data: Decoded JSON response (a dict), or ``None`` when no
            response was obtained.

    Yields:
        dict with the keys ``id``, ``created_at``, ``text`` and ``bid`` taken
        from each card's ``mblog`` entry; ``text`` is the ``mblog`` text
        passed through PyQuery's ``.text()``.
    """
    global since_id
    data = (json_data or {}).get('data')
    if not data:
        # No usable payload: yield nothing and leave the cursor untouched.
        return
    # The cursor is a property of the page, not of an item, so update it once
    # up front; this also covers pages whose card list is empty.
    since_id = (data.get('cardlistInfo') or {}).get('since_id')
    for card in data.get('cards') or []:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without an 'mblog' entry carry no post data; skip them.
            continue
        raw_text = mblog.get('text')
        yield {
            'id': mblog.get('id'),
            'created_at': mblog.get('created_at'),
            'text': pq(raw_text).text() if raw_text else '',
            'bid': mblog.get('bid'),
        }
# Fetch, parse and print a single page.
def run(page):
    """Fetch one page, print every parsed post, then report completion.

    Args:
        page: Page number, used only in the progress messages.
    """
    url = 'https://m.weibo.cn/api/container/getIndex'
    json_data = get_page(url)
    if json_data is None:
        # get_page returns None when the request failed; there is nothing to
        # parse, and reporting the page as finished would be misleading.
        print(f'第{page}页获取失败')
        return
    for result in parse_page(json_data):
        print(result)
    print(f'第{page}页完成')
# Pagination cursor shared by get_page (reader) and parse_page (writer).
# Initialised at module level so the functions also work when this file is
# imported rather than executed; the empty string requests the first page.
since_id = ''

if __name__ == '__main__':
    # Fetch three consecutive pages, numbered from 1 in the progress output.
    for page in range(3):
        run(page + 1)