import json

import requests
from bs4 import BeautifulSoup


class QiuSpiker(object):
    def __init__(self):
        self.baseURL = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.URL = 'https://www.qiushibaike.com'
        self.list = []
        self.page = '1'

    def get_URL(self):
        # Build the list of page URLs from 1 up to the total page count
        url_List = []
        for i in range(1, int(self.page) + 1):
            url_List.append(self.baseURL.format(str(i)))
        return url_List

    def get_Request(self, url):
        data = requests.get(url).content.decode()
        return data

    def xpath(self, data):
        soup = BeautifulSoup(data, 'lxml')
        html_xpath = soup.select('.item')
        for xpath in html_xpath:
            xpathDic = {}
            # 1. Detail link
            xpathDic['detailURL'] = self.URL + xpath.select_one(
                '.recmd-left').get('href')
            # 2. Title
            xpathDic['title'] = xpath.select_one('.recmd-content').get_text()
            # 3. Number of "funny" votes
            xpathDic['goodsNum'] = xpath.select_one(
                '.recmd-num').get_text().replace('\n', '').replace('\r', '')
            # 4. Author
            xpathDic['nick'] = xpath.select_one('.recmd-name').get_text()
            # 5. Avatar
            xpathDic['headURL'] = 'https:' + xpath.select_one(
                '.recmd-user img').get('src')
            # 6. Image
            xpathDic['URL'] = 'https:' + xpath.select_one('.recmd-left img').get(
                'src')
            self.list.append(xpathDic)

    def get_Page(self):
        # Read the last pagination link to find the total page count
        url = self.baseURL.format(self.page)
        data = self.get_Request(url)
        soup = BeautifulSoup(data, 'lxml')
        self.page = soup.select('.page-numbers')[-1].get_text().replace('\n', '')

    def save_data(self, data):
        with open('qiushibaike.json', 'w') as f:
            json.dump(data, f)

    def start(self):
        # Get the total page count
        self.get_Page()
        url_list = self.get_URL()
        for url in url_list:
            print(url)
            data = self.get_Request(url)
            self.xpath(data)
        self.save_data(self.list)


if __name__ == '__main__':
    QiuSpiker().start()
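As a quick sanity check after running the script (assuming it completed and wrote qiushibaike.json to the working directory), the saved items can be loaded back and inspected; the variable names below are only illustrative:

import json

# Load the items written by QiuSpiker.save_data
with open('qiushibaike.json') as f:
    items = json.load(f)

print(len(items), 'items scraped')
if items:
    # Each item is a dict with the keys built in xpath(): detailURL, title,
    # goodsNum, nick, headURL, URL
    print(items[0]['title'], '-', items[0]['nick'])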