# -*- coding: utf-8 -*-
# @Time : 2021/4/14 10:15
# @Author : PHC
# @File : lastGoodGoal.py
# @Software : PyCharm
from bs4 import BeautifulSoup
import xlwt
import re
import urllib.request,urllib.error
import sqlite3
def main():
    """Entry point: scrape the NBA clip listing page and persist the rows.

    Fix: the original called askURl(link) and then getDate(link), which
    fetches the same page a second time internally — the redundant request
    is removed.  The unused Excel save path is gone with the disabled
    saveData() call.
    """
    link = "https://v.qq.com/channel/nba?channel=nba&feature=4&iplayer=1&listpage=1"
    # getDate() performs the HTTP fetch itself via askURl().
    datalist = getDate(link)
    # Excel export (saveData) is currently disabled; only SQLite is written.
    dbpath = "KoBeShow.db"
    saveDB(datalist, dbpath)
# Scraping rules: regexes applied to the stringified HTML of each list_item.
# 1. Video page link (lazy capture of the anchor's href)
getVideoLink=re.compile(r'<a.*href="(.*?)".*>')
# 2. Cover image URL — greedy (.*) capture; assumes one src attribute per
#    fragment, otherwise it grabs up to the last quote — TODO confirm
getImgSrc=re.compile(r'<img.*src="(.*)".')
# 3. Video duration (text inside the figure_caption div)
getTime=re.compile(r'<div class="figure_caption">(.*?)</div>')
# 4. Video title (text of the title-bearing anchor)
getName=re.compile(r'<a.*target="_blank" title=.*>(.*?)</a>')
# 1. Fetch a web page and return its HTML text
def askURl(url):
    """Request *url* with a desktop-browser User-Agent header.

    Returns the UTF-8 decoded page body, or an empty string when the
    request fails (the HTTP status code and/or failure reason are printed).
    """
    browser_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76"
    }
    req = urllib.request.Request(url, headers=browser_headers)
    html = ""
    try:
        resp = urllib.request.urlopen(req)
        html = resp.read().decode("utf-8")
    except urllib.error.URLError as err:
        # URLError may carry an HTTP code, a reason, or both — report each.
        for attr in ("code", "reason"):
            if hasattr(err, attr):
                print(getattr(err, attr))
    return html
# 2. Download the listing page and extract one record per video
def getDate(link):
    """Scrape *link* and return a list of rows, each row being
    [video link, cover image URL, duration text, title].

    Raises IndexError if a list_item fragment lacks any of the four
    expected attributes (same behavior as before).
    """
    datalist = []
    soup = BeautifulSoup(askURl(link), "html.parser")
    # Each video card on the page lives in a <div class="list_item">.
    for node in soup.find_all('div', class_="list_item"):
        fragment = str(node)
        # Apply the four module-level patterns in fixed column order.
        row = [
            re.findall(pattern, fragment)[0]
            for pattern in (getVideoLink, getImgSrc, getTime, getName)
        ]
        datalist.append(row)
    return datalist
#3、保存数据
# def saveData(datalist,savepath):
# # print("数据已保存")
# table=xlwt.Workbook(encoding="utf-8",style_compression=0)
# sheet=table.add_sheet("科比绝杀锦集",cell_overwrite_ok=True)
# colum=("视频链接","视频封面","视频时长","视频名称")
# for i in range(0,4):
# sheet.write(0,i,colum[i])
# for i in range(len(datalist)):
# print("第%s条"%(i+1))
# data=datalist[i]
# for j in range(0,4):
# sheet.write(i+1,j,data[j])
# table.save(savepath)
# Save the scraped data into the SQLite database
def saveDB(datalist, dbpath):
    """Insert every [videolink, img, time, name] row of *datalist* into the
    kobeshow table of the SQLite database at *dbpath*.

    Fixes over the original:
    - Uses a parameterized executemany() instead of string-built SQL, so
      values containing quotes no longer break the statement (and no SQL
      can be injected through scraped content).
    - No longer mutates the caller's rows by wrapping them in quote chars.
    - try/finally guarantees the connection is closed even on error.
    """
    create_db(dbpath)
    conn = sqlite3.connect(dbpath)
    try:
        conn.executemany(
            "insert into kobeshow (videolink,img,time,name) values (?,?,?,?)",
            (row[:4] for row in datalist),
        )
        conn.commit()
    finally:
        conn.close()
    print("已保存到数据库")
# Create the database schema used by saveDB
def create_db(dbpath):
    """Ensure the kobeshow table exists in the SQLite database at *dbpath*."""
    ddl = '''
    create table if not exists kobeshow (
        Id integer primary key autoincrement,
        videolink text,
        img text,
        time varchar (10),
        name text
    )
    '''
    connection = sqlite3.connect(dbpath)
    try:
        # Connection.execute is a shortcut that creates a cursor internally.
        connection.execute(ddl)
        connection.commit()
    finally:
        connection.close()
    print("数据库已创建")
if __name__ == "__main__":
    # Run the full scrape-and-store pipeline when executed as a script.
    main()
    # create_db("test1.db")