import requests
from bs4 import BeautifulSoup
import json


class QiuVideoSpider(object):
    def __init__(self):
        self.baseURL = 'https://www.qiushibaike.com/video/page/{}/'
        self.URL = 'https://www.qiushibaike.com'
        self.list = []
        self.page = '1'

    def get_URL(self):
        # Build the list of page URLs from page 1 up to the last page number.
        url_List = []
        for i in range(1, int(self.page) + 1):
            url_List.append(self.baseURL.format(str(i)))
        return url_List

    def get_Request(self, url):
        # Fetch a page and decode the response body to text.
        data = requests.get(url).content.decode()
        return data

    def xpath(self, data):
        # Parse every post block on the page and collect its fields.
        soup = BeautifulSoup(data, 'lxml')
        html_xpath = soup.select('.article')
        for xpath in html_xpath:
            xpathDic = {}
            # 1. Avatar
            xpathDic['headerURL'] = 'https:' + xpath.select_one(
                '.author img').get('src')
            # 2. Nickname
            xpathDic['nick'] = xpath.select_one(
                '.author h2').get_text().replace('\n', '')
            # 3. Title
            xpathDic['title'] = xpath.select_one('.content').get_text().replace(
                '\n', '')
            # 4. Gender/age badge
            xpathDic['age'] = xpath.select_one('.articleGender').get_text()
            # 5. Video link
            xpathDic['video'] = 'https:' + xpath.select_one('source').get('src')
            self.list.append(xpathDic)

    def get_Page(self):
        # Read the last page number from the pagination bar of the first page.
        url = self.baseURL.format(self.page)
        data = self.get_Request(url)
        soup = BeautifulSoup(data, 'lxml')
        self.page = soup.select('.page-numbers')[-1].get_text().replace('\n', '')

    def save_data(self, data):
        # Dump the collected records to a JSON file.
        json.dump(data, open('qiushibaikeVideo.json', 'w'))

    def start(self):
        self.get_Page()
        url_list = self.get_URL()
        for url in url_list:
            print(url)
            data = self.get_Request(url)
            self.xpath(data)
        self.save_data(self.list)


if __name__ == '__main__':
    QiuVideoSpider().start()
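Note: if a post happens to have no video (or no avatar), select_one() returns None and the field extraction above raises an AttributeError. A minimal defensive lookup helper, as a sketch only (safe_field is a hypothetical name, not part of the original spider):

# Sketch of a tolerant lookup for a BeautifulSoup node; returns None instead of crashing.
def safe_field(node, selector, attr=None):
    tag = node.select_one(selector)
    if tag is None:
        return None
    return tag.get(attr) if attr else tag.get_text()

# Example use inside the loop of xpath(), skipping posts without a <source> tag:
# video_src = safe_field(xpath, 'source', 'src')
# if video_src is None:
#     continue
# xpathDic['video'] = 'https:' + video_src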