Python News Crawler in Practice
- The code is as follows; it has not yet been debugged.
# Crawl every news article linked from the Tencent News front page
'''
1. Fetch the news front page.
2. Extract the link of each news article.
3. Fetch each news link.
4. Check whether the page contains a frame.
5. If it does, fetch the page the frame points to.
6. If it does not, fetch the current page directly.
'''
import urllib.request
import re
url="https://news.qq.com/"
data=urllib.request.urlopen(url).read().decode("UTF-8","ignore")
pat1='<a target="_blank" class="linkto" href="(.*?)"'
alllink=re.compile(pat1).findall(data)
for i in range(0,len(alllink)):
    thislink=alllink[i]
    thispage=urllib.request.urlopen(thislink).read().decode("gb2312","ignore")
    pat2="<frame src=(.*?)>"
    isframe=re.compile(pat2).findall(thispage)
    if(len(isframe)==0):
        # No frame: save the current page directly
        print(i)
        urllib.request.urlretrieve(thislink,"F:\\python-file\\data\\"+str(i)+".html")
    else:
        # A frame exists: save the page the frame points to
        flink=isframe[0]
        urllib.request.urlretrieve(flink,"F:\\python-file\\data\\"+str(i)+".html")
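- Because the loop above is undebugged, one dead link or decoding failure aborts the whole run. A minimal defensive sketch of the same loop (my own variant, not the original code) wraps each fetch in try/except so failing articles are skipped, and strips any quotes the frame regex may capture:
# Defensive variant of the crawl loop (sketch): one failed article no longer stops the run
for i in range(0,len(alllink)):
    try:
        thislink=alllink[i]
        thispage=urllib.request.urlopen(thislink).read().decode("gb2312","ignore")
        isframe=re.compile("<frame src=(.*?)>").findall(thispage)
        if len(isframe)==0:
            urllib.request.urlretrieve(thislink,"F:\\python-file\\data\\"+str(i)+".html")
        else:
            # the non-greedy group may include the quotes around src, so strip them
            flink=isframe[0].strip('"\'')
            urllib.request.urlretrieve(flink,"F:\\python-file\\data\\"+str(i)+".html")
        print(i)
    except Exception as err:
        print("Skipped article "+str(i)+": "+str(err))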
CSDN Blog Post Crawler
- The code is as follows; it has not yet been debugged.
# CSDN blog post crawler
import urllib.request
import re
url="https://blog.csdn.net/"
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
# Install the opener globally
urllib.request.install_opener(opener)
data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
pat='''<div class="list_con" data-track-view='{"mod":"popu_775","con":",(.*?)"}'>'''
alllink=re.compile(pat).findall(data)
print(alllink)
for i in range(0,len(alllink)):
    localpath="D:\\python-file\\rst\\"+str(i)+".html"
    thislink=alllink[i]
    urllib.request.urlretrieve(thislink,filename=localpath)
    print("Article "+str(i)+" crawled successfully!")
Qiushibaike Joke Crawler
- The code is as follows; it has not yet been debugged and still has flaws.
# Qiushibaike joke crawler
import urllib.request
import re
#url="https://www.qiushibaike.com/"
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
# Install the opener globally
urllib.request.install_opener(opener)
for i in range(0,35):
    thisurl="https://www.qiushibaike.com/8hr/page/"+str(i+2)+"/article/121219914/"
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    pat='''<div class="content">.*?<span>(.*?)</span>.*?</div>'''
    rst=re.compile(pat,re.S).findall(data)
    # rst=rst.translate(non_bmp_map)
    for j in range(0,len(rst)):
        print(rst[j])
        print("---------")
Building a User-Agent Pool
- The code is as follows.
# Building a user-agent pool
import urllib.request
import re
import random
uapools=[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
]
def ua(uapools):
    thisua=random.choice(uapools)
    print(thisua)
    headers=("User-Agent",thisua)
    opener=urllib.request.build_opener()
    opener.addheaders=[headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
for i in range(0,35):
    ua(uapools)
    thisurl="https://www.qiushibaike.com/8hr/page/"+str(i+2)+"/article/121219914/"
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    pat='''<div class="content">.*?<span>(.*?)</span>.*?</div>'''
    rst=re.compile(pat,re.S).findall(data)
    # rst=rst.translate(non_bmp_map)
    for j in range(0,len(rst)):
        print(rst[j])
        print("---------")
Building an IP Proxy Pool
- The code is as follows.
# Building an IP proxy in practice
import urllib.request
#ip="218.6.145.67:8082"
ip="218.6.145.113:8082"
proxy=urllib.request.ProxyHandler({"http":ip})
#print(proxy)
opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url="http://www.sjpopc.net"
#data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
data1=urllib.request.urlopen(url).read()
data=data1.decode("utf-8","ignore")
print(len(data))
fh=open("F:\\python-file\\data\\ip_baidu.html","wb")
fh.write(data1)
fh.close()
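- To confirm that traffic really goes through the proxy rather than the local connection, one can fetch a service that echoes the caller's IP. The sketch below uses httpbin.org for that check (my choice of endpoint, not from the original); with the opener installed above, the printed origin should be the proxy's address:
# Verify the proxy is in effect (sketch): httpbin.org/ip echoes the IP it sees
check=urllib.request.urlopen("http://httpbin.org/ip").read().decode("utf-8","ignore")
print(check)  # should report 218.6.145.113 if the proxy above is working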
First Approach to Building an IP Proxy Pool
- Suited to the case where the proxy IPs are stable. The code is as follows.
# First approach to building an IP proxy pool (suited to stable proxy IPs)
import random
import urllib.request
ippools=[
"112.247.66.254:9999",
"112.247.100.200:9999",
"112.247.5.22:9999",
"218.6.145.67:8081",
]
def ip(ippools):
    thisip=random.choice(ippools)
    print(thisip)
    proxy=urllib.request.ProxyHandler({"http":thisip})
    opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
for i in range(0,5):
    try:
        ip(ippools)
        url="http://www.baidu.com"
        data1=urllib.request.urlopen(url).read()
        data=data1.decode("utf-8","ignore")
        print(len(data))
        fh=open("F:\\python-file\\data\\ip_baidu_"+str(i)+".html","wb")
        fh.write(data1)
        fh.close()
    except Exception as err:
        print(err)
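- The loop above only prints the error and moves on, so a page is simply lost whenever the chosen proxy happens to be dead. A retry variant (a sketch built on the same ip() helper) switches to another random proxy and tries again a few times before giving up:
# Retry with a different proxy on failure (sketch)
def fetch_with_retry(url,ippools,tries=3):
    for attempt in range(tries):
        try:
            ip(ippools)  # install a freshly chosen proxy
            return urllib.request.urlopen(url).read()
        except Exception as err:
            print("Attempt "+str(attempt+1)+" failed: "+str(err))
    return None
# usage: data1=fetch_with_retry("http://www.baidu.com",ippools)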
Second Approach to Building an IP Proxy Pool
- The IP proxy pool is built by calling a proxy API, as shown below; note that the API endpoint is no longer available.
# Second approach to an IP proxy pool (API-call method, suited to unstable proxy IPs)
import urllib.request
import re
def api():
    print("The proxy API was called")
    thisall=urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only")
    ippools=[]
    for item in thisall:
        #print(item.decode("utf-8","ignore"))
        # strip() removes the trailing newline so ProxyHandler gets a clean "ip:port"
        ippools.append(item.decode("utf-8","ignore").strip())
    return ippools
def ip(ippools,time):
    thisip=ippools[time]
    print("Current proxy IP: "+ippools[time])
    proxy=urllib.request.ProxyHandler({"http":thisip})
    opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
x=0
for i in range(0,5):
    try:
        if(x%10==0):
            # Refresh the pool from the API every 10 requests
            time=x%10
            ippools=api()
            ip(ippools,time)
        else:
            time=x%10
            ip(ippools,time)
        url="http://www.baidu.com"
        data1=urllib.request.urlopen(url).read()
        data=data1.decode("utf-8","ignore")
        print(len(data))
        fh=open("F:\\python-file\\data\\ip_baidu_"+str(i)+".html","wb")
        fh.write(data1)
        fh.close()
        x+=1
    except Exception as err:
        print(err)
        x+=1
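- Since the endpoint above no longer responds, the rotation logic can still be exercised by swapping api() for any source that yields one "ip:port" per line. A stand-in that reads proxies from a local text file (the file name is hypothetical) would look like this:
# Stand-in for api() (sketch): read proxies from a local file, one "ip:port" per line
def api():
    print("Loading proxies from file")
    with open("F:\\python-file\\data\\proxies.txt") as f:
        return [line.strip() for line in f if line.strip()]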
Taobao Product Image Crawler
- The code is as follows; it still has flaws.
# Taobao product image crawler
import urllib.request
import re
import random
keyname="dress"
key=urllib.request.quote(keyname)
uapools=[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
]
def ua(uapools):
    thisua=random.choice(uapools)
    print(thisua)
    headers=("User-Agent",thisua)
    opener=urllib.request.build_opener()
    opener.addheaders=[headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
for i in range(1,10):
    url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
    ua(uapools)
    data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
    pat='''"pic_url":"//(.*?)"'''
    imglist=re.compile(pat,re.S).findall(data)
    #print(imglist)
    for j in range(0,len(imglist)):
        thisimg=imglist[j]
        thisimgurl="http://"+thisimg
        localfile="F:\\python-file\\data\\taobao\\"+str(i)+str(j)+".jpg"
        urllib.request.urlretrieve(thisimgurl,filename=localfile)
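- One of the flaws mentioned above is that a single unreachable image URL raises an exception and aborts the rest of the page. A defensive version of the inner download loop (a sketch, same paths and variables as above) skips failures instead:
# Defensive image download loop (sketch): one failed image no longer aborts the page
for j in range(0,len(imglist)):
    try:
        thisimgurl="http://"+imglist[j]
        localfile="F:\\python-file\\data\\taobao\\"+str(i)+str(j)+".jpg"
        urllib.request.urlretrieve(thisimgurl,filename=localfile)
    except Exception as err:
        print("Image "+str(i)+"-"+str(j)+" failed: "+str(err))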