# -*- coding: utf-8 -*-
# @Time     : 2021/4/14 10:15
# @Author   : PHC
# @File     : lastGoodGoal.py
# @Software : PyCharm
from bs4 import BeautifulSoup
import xlwt          # only needed if the Excel export (saveData) is re-enabled
import re
import urllib.request, urllib.error
import sqlite3


def main():
    link = "https://v.qq.com/channel/nba?channel=nba&feature=4&iplayer=1&listpage=1"
    datalist = getDate(link)          # fetch the page and extract the video info
    savepath = "科比个人锦集.xls"
    # saveData(datalist, savepath)    # optional: save to an Excel file
    dbpath = "KoBeShow.db"
    saveDB(datalist, dbpath)          # save to a SQLite database


# Regular expressions for extracting the fields of each video
# 1. video link
getVideoLink = re.compile(r'<a.*href="(.*?)".*>')
# 2. cover image (non-greedy so the capture stops at the closing quote)
getImgSrc = re.compile(r'<img.*?src="(.*?)"')
# 3. duration
getTime = re.compile(r'<div class="figure_caption">(.*?)</div>')
# 4. title
getName = re.compile(r'<a.*target="_blank" title=.*>(.*?)</a>')


# 1. Request the page and return its HTML
def askURl(url):
    head = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.76"
    }
    request = urllib.request.Request(url, headers=head)   # build the request
    html = ""
    try:
        response = urllib.request.urlopen(request)        # send the request
        html = response.read().decode("utf-8")            # decode the response body
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# 2. Parse the page and collect the data of every video
def getDate(link):
    datalist = []
    html = askURl(link)
    soup = BeautifulSoup(html, "html.parser")
    # alternative container: class_="mod_figure mod_figure_h_default mod_figure_list_box"
    for item in soup.find_all('div', class_="list_item"):
        data = []                                   # holds the info of one video
        item = str(item)
        videoLink = re.findall(getVideoLink, item)[0]
        data.append(videoLink)
        img = re.findall(getImgSrc, item)[0]
        data.append(img)
        time = re.findall(getTime, item)[0]
        data.append(time)
        name = re.findall(getName, item)[0]
        data.append(name)
        datalist.append(data)
    return datalist


# 3. Save the data to an Excel file (currently disabled)
# def saveData(datalist, savepath):
#     table = xlwt.Workbook(encoding="utf-8", style_compression=0)
#     sheet = table.add_sheet("科比绝杀锦集", cell_overwrite_ok=True)
#     colum = ("视频链接", "视频封面", "视频时长", "视频名称")
#     for i in range(0, 4):
#         sheet.write(0, i, colum[i])
#     for i in range(len(datalist)):
#         print("第%s条" % (i + 1))
#         data = datalist[i]
#         for j in range(0, 4):
#             sheet.write(i + 1, j, data[j])
#     table.save(savepath)


# 4. Save the data to a SQLite database
def saveDB(datalist, dbpath):
    create_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    # use a parameterised query instead of string concatenation,
    # so quotes in a title cannot break the SQL statement
    sql = "insert into kobeshow (videolink, img, time, name) values (?, ?, ?, ?)"
    for data in datalist:
        cursor.execute(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
    print("Saved to database")


# Create the database table
def create_db(dbpath):
    sql = '''
        create table if not exists kobeshow
        (
            Id integer primary key autoincrement,
            videolink text,
            img text,
            time varchar(10),
            name text
        )
    '''
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    cur.execute(sql)
    conn.commit()
    conn.close()
    print("Database table is ready")


if __name__ == "__main__":
    main()
    # create_db("test1.db")
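

# --- Optional helper (a sketch, not part of the original script) ------------
# Quick way to check what saveDB() actually wrote. It assumes the default
# "KoBeShow.db" path and the kobeshow table created by create_db() above;
# call it manually, e.g. preview_db() or preview_db(limit=10).
def preview_db(dbpath="KoBeShow.db", limit=5):
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    cur.execute("select videolink, img, time, name from kobeshow limit ?", (limit,))
    for row in cur.fetchall():
        print(row)                 # one tuple per saved video
    conn.close()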