import requests
from bs4 import BeautifulSoup
import json
class QiuVideoSpider(object):
    """Scrape video posts from qiushibaike.com and dump them to a JSON file.

    Each scraped record is a dict with keys: headerURL (avatar), nick,
    title, age (gender/age badge text), and video (media URL).
    """

    def __init__(self):
        # NOTE(review): original source had `def init` — the dunder
        # underscores were evidently stripped; restored here.
        self.baseURL = 'https://www.qiushibaike.com/video/page/{}/'
        self.URL = 'https://www.qiushibaike.com'
        self.list = []   # accumulated record dicts, filled by xpath()
        self.page = '1'  # total page count as a string; updated by get_Page()

    def get_URL(self):
        """Return the listing-page URLs for pages 1 .. int(self.page)."""
        return [self.baseURL.format(i) for i in range(1, int(self.page) + 1)]

    def get_Request(self, url):
        """GET *url* and return the response body decoded as text.

        A timeout is set so a stalled server cannot hang the spider forever.
        """
        return requests.get(url, timeout=15).content.decode()

    def xpath(self, data):
        """Parse one listing page and append one dict per post to self.list.

        (Name kept for backward compatibility; it actually uses CSS selectors.)
        Posts missing an expected element are skipped instead of raising
        AttributeError on a None select_one() result.
        """
        soup = BeautifulSoup(data, 'lxml')
        for article in soup.select('.article'):
            avatar = article.select_one('.author img')
            nick = article.select_one('.author h2')
            title = article.select_one('.content')
            age = article.select_one('.articleGender')
            video = article.select_one('source')
            # Skip incomplete posts (e.g. ads or posts without a video tag).
            if not all((avatar, nick, title, age, video)):
                continue
            self.list.append({
                # 1. avatar — src is protocol-relative, so prepend the scheme
                'headerURL': 'https:' + avatar.get('src'),
                # 2. nickname
                'nick': nick.get_text().replace('\n', ''),
                # 3. title
                'title': title.get_text().replace('\n', ''),
                # 4. gender/age badge
                'age': age.get_text(),
                # 5. video URL — also protocol-relative
                'video': 'https:' + video.get('src'),
            })

    def get_Page(self):
        """Fetch page 1 and set self.page to the last page number shown."""
        data = self.get_Request(self.baseURL.format(self.page))
        soup = BeautifulSoup(data, 'lxml')
        self.page = soup.select('.page-numbers')[-1].get_text().replace('\n', '')

    def save_data(self, data):
        """Write *data* to qiushibaikeVideo.json (UTF-8, non-ASCII preserved).

        Uses a context manager so the file handle is always closed
        (the original leaked it via json.dump(data, open(...))).
        """
        with open('qiushibaikeVideo.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

    def start(self):
        """Discover the page count, scrape every page, and save the results."""
        self.get_Page()
        for url in self.get_URL():
            print(url)
            self.xpath(self.get_Request(url))
        self.save_data(self.list)
# Script entry point. Original read `if name == 'main'`, which raises
# NameError — the dunder underscores were stripped; restored here.
if __name__ == '__main__':
    QiuVideoSpider().start()