1. #coding:utf-8
    2. import requests
    3. from lxml import etree
    4. class Tieba(object):
    5. def __init__(self, name):
    6. self.name = name
    7. self.url = 'http://tieba.baidu.com/f?kw={}'.format(self.name)
    8. self.headers = {
    9. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    10. # 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1) '
    11. }
    12. def get_data(self, url):
    13. response = requests.get(url,headers=self.headers)
    14. return response.content
    15. def parse_list_page(self, data):
    16. """
    17. 解析贴吧帖子列表页面的响应
    18. :param data: 帖子列表页面的响应
    19. :return: 帖子标题和链接列表 与 下一页url
    20. """
    21. data = data.decode().replace("<!--","").replace("-->","")
    22. html = etree.HTML(data)
    23. el_list = html.xpath('//li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a')
    24. # print(len(el_list))
    25. data_list = []
    26. for el in el_list:
    27. temp = {}
    28. temp['title'] = el.xpath('./text()')[0]
    29. temp['link'] = 'http://tieba.baidu.com' + el.xpath('./@href')[0]
    30. data_list.append(temp)
    31. try:
    32. next_url = 'http:' + html.xpath('//*[contains(text(),"下一页>")]/@href')[0]
    33. except:
    34. next_url = None
    35. return data_list, next_url
    36. def parse_detail_page(self, data):
    37. html = etree.HTML(data)
    38. return html.xpath('//*[contains(@id,"post_content_")]/img/@src')
    39. def run(self):
    40. # url
    41. # headers
    42. next_url = self.url
    43. while True:
    44. # 发送列表请求,获取响应
    45. list_page_data = self.get_data(next_url)
    46. # 解析列表页面的响应,提取帖子列表数据和下一页url
    47. data_list, next_url = self.parse_list_page(list_page_data)
    48. # 遍历帖子列表,获取每一个详细url
    49. for data in data_list:
    50. # 发起请求,获取到详情页面的响应
    51. data = self.get_data(data['link'])
    52. # 从响应中提取图片地址
    53. image_list = self.parse_detail_page(data)
    54. # 保存
    55. print(image_list)
    56. # 翻页&循环终止条件
    57. if next_url == None:
    58. break
    59. if __name__ == '__main__':
    60. tieba = Tieba("传智播客")
    61. tieba.run()