Python news crawler in practice
- The code is as follows; it has not been debugged yet.
```python
# Crawl every news article linked from the Tencent News homepage.
'''
1. Crawl the news homepage
2. Extract the link of each news article
3. Crawl each news link
4. Check whether the page contains a frame
5. If it does, crawl the page the frame points to
6. If not, crawl the current page directly
'''
import urllib.request
import re

url = "https://news.qq.com/"
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat1 = '<a target="_blank" class="linkto" href="(.*?)"'
alllink = re.compile(pat1).findall(data)
for i in range(0, len(alllink)):
    thislink = alllink[i]
    thispage = urllib.request.urlopen(thislink).read().decode("gb2312", "ignore")
    pat2 = "<frame src=(.*?)>"
    isframe = re.compile(pat2).findall(thispage)
    if len(isframe) == 0:
        # No frame: save the current page directly
        print(i)
        urllib.request.urlretrieve(thislink, "F:\\python-file\\data\\" + str(i) + ".html")
    else:
        # A frame was found: save the page the frame points to
        flink = isframe[0]
        urllib.request.urlretrieve(flink, "F:\\python-file\\data\\" + str(i) + ".html")
```
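Since the crawl has not been debugged, one obvious failure mode is that a single dead link or decode error aborts the whole loop. Below is a minimal hardening sketch of the same frame-or-direct logic; the helper names `save_page` and `crawl` and the more tolerant frame regex are my own assumptions, not part of the original code.

```python
# A hedged sketch: same frame-detection idea, but each fetch is wrapped
# in try/except so one dead link does not abort the whole crawl.
# save_page/crawl and the tolerant regex are illustrative assumptions.
import re
import urllib.request

def save_page(link, path):
    try:
        urllib.request.urlretrieve(link, path)
    except Exception as err:
        print("failed to save", link, "-", err)

def crawl(links, outdir):
    for i, link in enumerate(links):
        try:
            page = urllib.request.urlopen(link).read().decode("utf-8", "ignore")
        except Exception as err:
            print("failed to open", link, "-", err)
            continue
        # Tolerate both <frame src=...> and <frame src="...">
        frames = re.findall(r'<frame\s+src=["\']?([^"\'> ]+)', page)
        target = frames[0] if frames else link
        save_page(target, outdir + str(i) + ".html")
```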
CSDN blog post crawler
- The code is as follows; not yet debugged.
```python
# CSDN blog post crawler
import urllib.request
import re

url = "https://blog.csdn.net/"
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener globally
urllib.request.install_opener(opener)
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
pat = '''<div class="list_con" data-track-view='{"mod":"popu_775","con":",(.*?)"}'>'''
alllink = re.compile(pat).findall(data)
print(alllink)
for i in range(0, len(alllink)):
    localpath = "D:\\python-file\\rst\\" + str(i) + ".html"
    thislink = alllink[i]
    urllib.request.urlretrieve(thislink, filename=localpath)
    print("Article " + str(i) + " crawled successfully!")
```
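Installing a global opener changes every subsequent `urlopen` call in the process. If only one request needs the header, the same effect can be had per request with `urllib.request.Request`; a minimal sketch, assuming the same User-Agent string:

```python
# A hedged sketch: attach the User-Agent per request instead of
# installing a global opener.
import urllib.request

req = urllib.request.Request(
    "https://blog.csdn.net/",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/75.0.3770.80 Safari/537.36"},
)
data = urllib.request.urlopen(req).read().decode("utf-8", "ignore")
print(len(data))
```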
Qiushibaike joke crawler
- The code is as follows; not yet debugged, and it has some flaws.
```python
# Qiushibaike joke crawler
import urllib.request
import re

# url = "https://www.qiushibaike.com/"
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener globally
urllib.request.install_opener(opener)
for i in range(0, 35):
    thisurl = "https://www.qiushibaike.com/8hr/page/" + str(i + 2) + "/article/121219914/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '''<div class="content">.*?<span>(.*?)</span>.*?</div>'''
    rst = re.compile(pat, re.S).findall(data)
    # rst = rst.translate(non_bmp_map)
    for j in range(0, len(rst)):
        print(rst[j])
        print("---------")
```
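The `re.S` flag is what lets `.*?` cross line breaks; without it the pattern silently misses any joke whose `<div class="content">` and `<span>` sit on different lines. A self-contained demonstration:

```python
# Demonstrates why re.S matters: '.' does not match '\n' by default.
import re

html = '<div class="content">\n<span>joke text</span>\n</div>'
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
print(re.compile(pat).findall(html))        # [] -- '.' stops at the newline
print(re.compile(pat, re.S).findall(html))  # ['joke text']
```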
Building a user-agent pool
- The code is as follows:
```python
# Building a user-agent pool
import urllib.request
import re
import random

uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
]

def ua(uapools):
    # Pick a random user agent and install it as the global opener
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)

for i in range(0, 35):
    ua(uapools)
    thisurl = "https://www.qiushibaike.com/8hr/page/" + str(i + 2) + "/article/121219914/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '''<div class="content">.*?<span>(.*?)</span>.*?</div>'''
    rst = re.compile(pat, re.S).findall(data)
    # rst = rst.translate(non_bmp_map)
    for j in range(0, len(rst)):
        print(rst[j])
        print("---------")
```
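Reinstalling the global opener on every iteration works, but it is a side effect on the whole process. The same rotation can be done with a local opener; a minimal sketch under that assumption, with a shortened pool for brevity:

```python
# A hedged sketch: rotate User-Agents with a local opener, avoiding
# the global install_opener side effect. The shortened pool is for brevity.
import random
import urllib.request

uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
]

def fetch(url):
    # Pick a random UA for this one request only
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", random.choice(uapools))]
    return opener.open(url).read().decode("utf-8", "ignore")
```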
Building an IP proxy
- The code is as follows:
```python
# Using an IP proxy in practice
import urllib.request

# ip = "218.6.145.67:8082"
ip = "218.6.145.113:8082"
proxy = urllib.request.ProxyHandler({"http": ip})
# print(proxy)
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url = "http://www.sjpopc.net"
# data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
data1 = urllib.request.urlopen(url).read()
data = data1.decode("utf-8", "ignore")
print(len(data))
fh = open("F:\\python-file\\data\\ip_baidu.html", "wb")
fh.write(data1)
fh.close()
```
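Whether the proxy is actually in effect is easiest to confirm against an IP echo service, which reports the address it sees; a minimal sketch, assuming httpbin.org/ip as the echo endpoint and reusing the (likely expired) proxy above:

```python
# A hedged sketch: ask an IP echo service which address it sees.
# If the proxy works, the reported IP should be the proxy's, not yours.
import urllib.request

ip = "218.6.145.113:8082"  # same, likely expired, proxy as above
proxy = urllib.request.ProxyHandler({"http": ip})
opener = urllib.request.build_opener(proxy)
try:
    print(opener.open("http://httpbin.org/ip", timeout=10).read().decode())
except Exception as err:
    print("proxy check failed:", err)
```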
First approach to building an IP proxy pool
- Suitable when the proxy IPs are stable; the code is as follows:
```python
# First approach to building an IP proxy pool (suitable when the proxy IPs are stable)
import random
import urllib.request

ippools = [
    "112.247.66.254:9999",
    "112.247.100.200:9999",
    "112.247.5.22:9999",
    "218.6.145.67:8081",
]

def ip(ippools):
    # Pick a random proxy IP and install it globally
    thisip = random.choice(ippools)
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(0, 5):
    try:
        ip(ippools)
        url = "http://www.baidu.com"
        data1 = urllib.request.urlopen(url).read()
        data = data1.decode("utf-8", "ignore")
        print(len(data))
        fh = open("F:\\python-file\\data\\ip_baidu_" + str(i) + ".html", "wb")
        fh.write(data1)
        fh.close()
    except Exception as err:
        print(err)
```
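Free proxies die quickly, so even a "stable" pool benefits from a liveness filter before the crawl starts. A minimal sketch; the test URL and the 5-second timeout are assumptions:

```python
# A hedged sketch: keep only the proxies that answer within a short
# timeout. The test URL and the 5-second limit are assumptions.
import urllib.request

def alive(thisip, testurl="http://www.baidu.com", timeout=5):
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy)
    try:
        opener.open(testurl, timeout=timeout)
        return True
    except Exception:
        return False

ippools = ["112.247.66.254:9999", "218.6.145.67:8081"]
print([p for p in ippools if alive(p)])
```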
Second approach to building an IP proxy pool
- Builds the pool through an API call, which suits unstable proxy IPs; the code is as follows, although the API endpoint is no longer available.
```python
# Second approach to building an IP proxy pool (API call, suitable when proxy IPs are unstable)
import urllib.request

def api():
    # Fetch a fresh batch of proxy IPs from the provider's API
    print("Called the proxy API for a fresh batch")
    thisall = urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only")
    ippools = []
    for item in thisall:
        # print(item.decode("utf-8", "ignore"))
        ippools.append(item.decode("utf-8", "ignore").strip())  # strip the trailing newline
    return ippools

def ip(ippools, time):
    # Install the proxy at position `time` of the current batch
    thisip = ippools[time]
    print("Current IP in use: " + ippools[time])
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

x = 0
for i in range(0, 5):
    try:
        if x % 10 == 0:
            # Refresh the pool every 10 requests
            time = x % 10
            ippools = api()
            ip(ippools, time)
        else:
            time = x % 10
            ip(ippools, time)
        url = "http://www.baidu.com"
        data1 = urllib.request.urlopen(url).read()
        data = data1.decode("utf-8", "ignore")
        print(len(data))
        fh = open("F:\\python-file\\data\\ip_baidu_" + str(i) + ".html", "wb")
        fh.write(data1)
        fh.close()
        x += 1
    except Exception as err:
        print(err)
        x += 1
```
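Although that endpoint is dead, the refresh-every-ten-requests pattern is independent of any particular provider. A minimal sketch of the same pattern; `API_URL` is a hypothetical placeholder to be replaced with a working provider:

```python
# A hedged sketch of the refresh pattern: fetch a new batch of proxies
# every BATCH requests. API_URL is a hypothetical placeholder.
import urllib.request

API_URL = "http://example.com/api/ip?num=10"  # hypothetical endpoint
BATCH = 10
_pool = []

def get_proxy(count):
    # Refresh the pool on every BATCH-th request (or if it is empty)
    global _pool
    if count % BATCH == 0 or not _pool:
        with urllib.request.urlopen(API_URL) as resp:
            _pool = [line.decode("utf-8", "ignore").strip() for line in resp]
    return _pool[count % BATCH]
```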
Taobao product image crawler
- The code is as follows; it has some flaws.
```python
# Taobao product image crawler
import urllib.request
import re
import random

keyname = "dress"
key = urllib.request.quote(keyname)
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
]

def ua(uapools):
    # Pick a random user agent and install it globally
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)

for i in range(1, 10):
    # Taobao paginates search results 44 items at a time via the `s` parameter
    url = "https://s.taobao.com/search?q=" + key + "&s=" + str((i - 1) * 44)
    ua(uapools)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    pat = '''"pic_url":"//(.*?)"'''
    imglist = re.compile(pat, re.S).findall(data)
    # print(imglist)
    for j in range(0, len(imglist)):
        thisimg = imglist[j]
        thisimgurl = "http://" + thisimg
        localfile = "F:\\python-file\\data\\taobao\\" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimgurl, filename=localfile)
```
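One of the flaws: a single failed image download (a dead `pic_url`, for instance) raises and aborts the whole run. A minimal sketch of a tolerant download loop; the helper name and the underscore-separated file naming are my own assumptions:

```python
# A hedged sketch: skip individual failed downloads instead of letting
# one bad image URL abort the crawl. Helper name and file naming are
# illustrative assumptions.
import urllib.request

def download_images(imglist, page, outdir):
    for j, img in enumerate(imglist):
        imgurl = "http://" + img
        localfile = outdir + str(page) + "_" + str(j) + ".jpg"
        try:
            urllib.request.urlretrieve(imgurl, filename=localfile)
        except Exception as err:
            print("skipped", imgurl, "-", err)
```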
