import json

import requests
from bs4 import BeautifulSoup


class QiuSpiker(object):
    """Scraper for qiushibaike.com hot-list pages.

    Crawls every list page, extracts per-item metadata with CSS
    selectors, and dumps the collected records to ``qiushibaike.json``.
    """

    def __init__(self):
        # BUG FIX: was `def init(self)`, which Python never calls on
        # construction, so none of these attributes existed and
        # start() crashed with AttributeError.
        self.baseURL = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.URL = 'https://www.qiushibaike.com'
        self.list = []   # accumulated item dicts
        self.page = '1'  # total page count as a string; updated by get_Page()

    def get_URL(self):
        """Return the list-page URLs for pages 1..int(self.page)."""
        url_List = []
        for i in range(1, (int(self.page) + 1)):
            url_List.append(self.baseURL.format(str(i)))
        return url_List

    def get_Request(self, url):
        """Fetch *url* and return the response body decoded as text."""
        data = requests.get(url).content.decode()
        return data

    def xpath(self, data):
        """Parse one list page and append one dict per '.item' entry.

        NOTE: despite the name, this uses BeautifulSoup CSS selectors,
        not XPath.
        """
        soup = BeautifulSoup(data, 'lxml')
        html_xpath = soup.select('.item')
        for xpath in html_xpath:
            xpathDic = {}
            # 1. detail-page link (site-relative href -> absolute URL)
            xpathDic['detailURL'] = self.URL + xpath.select_one(
                '.recmd-left').get('href')
            # 2. title
            xpathDic['title'] = xpath.select_one('.recmd-content').get_text()
            # 3. "funny" count (strip embedded newlines/carriage returns)
            xpathDic['goodsNum'] = xpath.select_one(
                '.recmd-num').get_text().replace('\n', '').replace('\r', '')
            # 4. author nickname
            xpathDic['nick'] = xpath.select_one('.recmd-name').get_text()
            # 5. avatar (protocol-relative src -> https)
            xpathDic['headURL'] = 'https:' + xpath.select_one(
                '.recmd-user img').get('src')
            # 6. item image
            xpathDic['URL'] = 'https:' + xpath.select_one('.recmd-left img').get(
                'src')
            self.list.append(xpathDic)

    def get_Page(self):
        """Read the last pagination number from page 1 into self.page."""
        url = self.baseURL.format(self.page)
        data = self.get_Request(url)
        soup = BeautifulSoup(data, 'lxml')
        self.page = soup.select('.page-numbers')[-1].get_text().replace('\n', '')

    def save_data(self, data):
        """Serialize *data* to qiushibaike.json.

        BUG FIX: the original handed an open() handle straight to
        json.dump and never closed it, so the write could stay
        unflushed; a context manager guarantees close.
        ensure_ascii=False keeps the Chinese text human-readable.
        """
        with open('qiushibaike.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

    def start(self):
        """Crawl every list page and save all extracted items."""
        # Discover the total page count before building the URL list.
        self.get_Page()
        url_list = self.get_URL()
        for url in url_list:
            print(url)
            data = self.get_Request(url)
            self.xpath(data)
        self.save_data(self.list)


if __name__ == '__main__':
    # BUG FIX: was `if name == 'main'` — NameError at runtime, and the
    # guard could never have matched anyway.
    QiuSpiker().start()