import requests
from bs4 import BeautifulSoup
import json
class QiuSpiker(object):
    """Scraper for the qiushibaike.com hot list.

    Discovers the total page count, fetches every hot-list page, extracts
    each ``.item``'s detail link, title, funny-count, author, avatar and
    image, and dumps the collected records to ``qiushibaike.json``.
    """

    def __init__(self):
        # Fixed: the original defined ``init`` instead of ``__init__``,
        # so these attributes were never set and start() raised
        # AttributeError.
        self.baseURL = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.URL = 'https://www.qiushibaike.com'
        self.list = []   # accumulated item dicts across all pages
        self.page = '1'  # total page count as a string; updated by get_Page()

    def get_URL(self):
        """Return the list of hot-list page URLs for pages 1..self.page."""
        url_List = []
        for i in range(1, (int(self.page) + 1)):
            url_List.append(self.baseURL.format(str(i)))
        return url_List

    def get_Request(self, url):
        """GET *url* and return the response body decoded as text."""
        data = requests.get(url).content.decode()
        return data

    def xpath(self, data):
        """Parse one page of HTML and append one dict per ``.item``.

        NOTE: despite the name, this uses CSS selectors, not XPath.
        Results accumulate on ``self.list``.
        """
        soup = BeautifulSoup(data, 'lxml')
        html_xpath = soup.select('.item')
        for xpath in html_xpath:
            xpathDic = {}
            # 1. detail link (site-relative href -> absolute URL)
            xpathDic['detailURL'] = self.URL + xpath.select_one(
                '.recmd-left').get('href')
            # 2. title
            xpathDic['title'] = xpath.select_one('.recmd-content').get_text()
            # 3. "funny" count (strip embedded CR/LF)
            xpathDic['goodsNum'] = xpath.select_one(
                '.recmd-num').get_text().replace('\n', '').replace('\r', '')
            # 4. author nickname
            xpathDic['nick'] = xpath.select_one('.recmd-name').get_text()
            # 5. avatar (protocol-relative src -> https)
            xpathDic['headURL'] = 'https:' + xpath.select_one(
                '.recmd-user img').get('src')
            # 6. item image
            xpathDic['URL'] = 'https:' + xpath.select_one(
                '.recmd-left img').get('src')
            self.list.append(xpathDic)

    def get_Page(self):
        """Fetch page 1 and read the total page count from the pager widget."""
        url = self.baseURL.format(self.page)
        data = self.get_Request(url)
        soup = BeautifulSoup(data, 'lxml')
        # Last .page-numbers entry is the highest page index.
        self.page = soup.select('.page-numbers')[-1].get_text().replace(
            '\n', '')

    def save_data(self, data):
        """Serialize *data* to ``qiushibaike.json`` in the working directory.

        Fixed: the original leaked the file handle (open() without close);
        a context manager now guarantees the file is closed.  Chinese text
        is kept readable via ``ensure_ascii=False``.
        """
        with open('qiushibaike.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

    def start(self):
        """Scrape every hot-list page and save the results."""
        # Discover the total page count first.
        self.get_Page()
        url_list = self.get_URL()
        for url in url_list:
            print(url)
            data = self.get_Request(url)
            self.xpath(data)
        self.save_data(self.list)
if __name__ == '__main__':
    # Fixed: the original tested ``name == 'main'``, which raises
    # NameError at import time; the standard entry-point guard compares
    # the module's __name__ against '__main__'.
    QiuSpiker().start()