Python News Crawler in Practice

  • The code is as follows; it has not been debugged yet.
  # Crawl all of the news articles linked from the Tencent News homepage
  '''
  1. Fetch the news homepage
  2. Extract the link of each news article
  3. Fetch each news link
  4. Check whether the page contains a frame
  5. If it does, fetch the page the frame points to
  6. If it does not, fetch the current page directly
  '''
  import urllib.request
  import re
  url="https://news.qq.com/"
  data=urllib.request.urlopen(url).read().decode("UTF-8","ignore")
  pat1='<a target="_blank" class="linkto" href="(.*?)"'
  alllink=re.compile(pat1).findall(data)
  for i in range(0,len(alllink)):
      thislink=alllink[i]
      thispage=urllib.request.urlopen(thislink).read().decode("gb2312","ignore")
      # capture only the URL inside the quotes; otherwise the quotes end up in the link
      pat2='<frame src="(.*?)"'
      isframe=re.compile(pat2).findall(thispage)
      if(len(isframe)==0):
          # no frame: save the current page directly
          print(i)
          urllib.request.urlretrieve(thislink,"F:\\python-file\\data\\"+str(i)+".html")
      else:
          # a frame exists: save the page its src points to
          flink=isframe[0]
          urllib.request.urlretrieve(flink,"F:\\python-file\\data\\"+str(i)+".html")
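
  • Because the block above has not been debugged, a single dead link or decoding problem will raise an exception and stop the whole loop. A minimal, more defensive sketch of the per-link loop, reusing the alllink list, the imports, and the save directory from the code above:

  for i in range(0,len(alllink)):
      try:
          thislink=alllink[i]
          thispage=urllib.request.urlopen(thislink).read().decode("gb2312","ignore")
          isframe=re.compile('<frame src="(.*?)"').findall(thispage)
          # fall back to the page itself when no frame is present
          target=isframe[0] if len(isframe)>0 else thislink
          urllib.request.urlretrieve(target,"F:\\python-file\\data\\"+str(i)+".html")
      except Exception as err:
          # skip links that fail instead of aborting the crawl
          print(i,err)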

CSDN Blog Post Crawler

  • The code is as follows; not yet debugged.
  # CSDN blog post crawler
  import urllib.request
  import re
  url="https://blog.csdn.net/"
  headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36")
  opener=urllib.request.build_opener()
  opener.addheaders=[headers]
  # install the opener globally so urlopen and urlretrieve both send the custom header
  urllib.request.install_opener(opener)
  data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
  pat='''<div class="list_con" data-track-view='{"mod":"popu_775","con":",(.*?)"}'>'''
  alllink=re.compile(pat).findall(data)
  print(alllink)
  for i in range(0,len(alllink)):
      localpath="D:\\python-file\\rst\\"+str(i)+".html"
      thislink=alllink[i]
      urllib.request.urlretrieve(thislink,filename=localpath)
      print("Article "+str(i)+" downloaded successfully!")

Qiushibaike Joke Crawler

  • The code is as follows; not yet debugged, and it has some flaws.

  # Qiushibaike joke crawler
  import urllib.request
  import re
  #url="https://www.qiushibaike.com/"
  headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36")
  opener=urllib.request.build_opener()
  opener.addheaders=[headers]
  # install the opener globally
  urllib.request.install_opener(opener)
  for i in range(0,35):
      # list page URL; the original mistakenly appended a single-article path here
      thisurl="https://www.qiushibaike.com/8hr/page/"+str(i+2)+"/"
      data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
      pat='''<div class="content">.*?<span>(.*?)</span>.*?</div>'''
      rst=re.compile(pat,re.S).findall(data)
      # rst=rst.translate(non_bmp_map)
      for j in range(0,len(rst)):
          print(rst[j])
          print("---------")
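
  • The commented-out translate call above refers to a mapping that replaces characters outside the Basic Multilingual Plane (e.g. emoji), which otherwise raise UnicodeEncodeError when printed on a GBK Windows console. A minimal sketch of such a map, applied per extracted string inside the page loop above (using U+FFFD as the replacement is an assumption):

  import sys
  # map every non-BMP code point to U+FFFD so print() on a GBK console cannot fail
  non_bmp_map=dict.fromkeys(range(0x10000,sys.maxunicode+1),0xfffd)
  for j in range(0,len(rst)):
      print(rst[j].translate(non_bmp_map))
      print("---------")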

Building a User-Agent Pool

  • The code is as follows:

  # Building a user-agent pool
  import urllib.request
  import re
  import random
  uapools=[
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
      "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
  ]
  def ua(uapools):
      # pick a random user agent and install it as the global opener
      thisua=random.choice(uapools)
      print(thisua)
      headers=("User-Agent",thisua)
      opener=urllib.request.build_opener()
      opener.addheaders=[headers]
      urllib.request.install_opener(opener)
  for i in range(0,35):
      # switch to a random user agent before each request
      ua(uapools)
      thisurl="https://www.qiushibaike.com/8hr/page/"+str(i+2)+"/"
      data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
      pat='''<div class="content">.*?<span>(.*?)</span>.*?</div>'''
      rst=re.compile(pat,re.S).findall(data)
      # rst=rst.translate(non_bmp_map)
      for j in range(0,len(rst)):
          print(rst[j])
          print("---------")

Building an IP Proxy Pool

  • The code is as follows:
  # Using an IP proxy in practice
  import urllib.request
  #ip="218.6.145.67:8082"
  ip="218.6.145.113:8082"
  proxy=urllib.request.ProxyHandler({"http":ip})
  #print(proxy)
  opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
  urllib.request.install_opener(opener)
  url="http://www.sjpopc.net"
  #data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
  data1=urllib.request.urlopen(url).read()
  data=data1.decode("utf-8","ignore")
  print(len(data))
  fh=open("F:\\python-file\\data\\ip_baidu.html","wb")
  fh.write(data1)
  fh.close()
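
  • A quick way to confirm that requests really go through the proxy rather than the local connection is to request an IP-echo service and compare the returned address with the proxy's. A minimal sketch, assuming the proxy above is still alive (replace it with a working one if not) and that httpbin.org is reachable:

  import urllib.request
  ip="218.6.145.113:8082"   # same proxy as above
  proxy=urllib.request.ProxyHandler({"http":ip})
  opener=urllib.request.build_opener(proxy)
  urllib.request.install_opener(opener)
  # httpbin echoes the origin IP of the request; it should match the proxy's address
  print(urllib.request.urlopen("http://httpbin.org/ip").read().decode("utf-8","ignore"))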

The First Approach to Building an IP Proxy Pool

  • Suitable when the proxy IPs are stable; the code is as follows:
  # The first approach to building an IP proxy pool (suitable when the proxy IPs are stable)
  import random
  import urllib.request
  ippools=[
      "112.247.66.254:9999",
      "112.247.100.200:9999",
      "112.247.5.22:9999",
      "218.6.145.67:8081",
  ]
  def ip(ippools):
      # pick a random proxy and install it as the global opener
      thisip=random.choice(ippools)
      print(thisip)
      proxy=urllib.request.ProxyHandler({"http":thisip})
      opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
      urllib.request.install_opener(opener)
  for i in range(0,5):
      try:
          ip(ippools)
          url="http://www.baidu.com"
          data1=urllib.request.urlopen(url).read()
          data=data1.decode("utf-8","ignore")
          print(len(data))
          fh=open("F:\\python-file\\data\\ip_baidu_"+str(i)+".html","wb")
          fh.write(data1)
          fh.close()
      except Exception as err:
          print(err)
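
  • Hard-coded proxies like the ones above go stale quickly, so it can help to filter the pool once at startup and keep only proxies that still answer. A minimal sketch, reusing the ippools list above; the test URL and 3-second timeout are assumptions:

  import urllib.request
  def check_ip(thisip,testurl="http://www.baidu.com",timeout=3):
      # return True if the proxy answers the test URL within the timeout
      proxy=urllib.request.ProxyHandler({"http":thisip})
      opener=urllib.request.build_opener(proxy)
      try:
          opener.open(testurl,timeout=timeout)
          return True
      except Exception:
          return False
  ippools=[p for p in ippools if check_ip(p)]
  print("Usable proxies:",ippools)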

The Second Approach to Building an IP Proxy Pool

  • Builds the IP proxy pool by calling a provider API, as follows. The API is no longer available.
  # The second approach to building an IP proxy pool (API call, suitable when the proxy IPs are unstable)
  import urllib.request
  import re
  def api():
      # fetch a fresh batch of proxy IPs from the provider's API
      print("The API was called this time")
      thisall=urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only")
      ippools=[]
      for item in thisall:
          #print(item.decode("utf-8","ignore"))
          # strip the trailing newline so the entry can be used directly as "ip:port"
          ippools.append(item.decode("utf-8","ignore").strip())
      return ippools
  def ip(ippools,time):
      # install the time-th proxy of the current batch as the global opener
      thisip=ippools[time]
      print("The IP currently in use is: "+ippools[time])
      proxy=urllib.request.ProxyHandler({"http":thisip})
      opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
      urllib.request.install_opener(opener)
  x=0
  for i in range(0,5):
      try:
          if(x%10==0):
              # refresh the batch every 10 requests
              time=x%10
              ippools=api()
              ip(ippools,time)
          else:
              time=x%10
              ip(ippools,time)
          url="http://www.baidu.com"
          data1=urllib.request.urlopen(url).read()
          data=data1.decode("utf-8","ignore")
          print(len(data))
          fh=open("F:\\python-file\\data\\ip_baidu_"+str(i)+".html","wb")
          fh.write(data1)
          fh.close()
          x+=1
      except Exception as err:
          print(err)
          x+=1
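
  • Since the API above is no longer available, the same loop structure still works if api() is replaced by any other source of fresh proxies. A hypothetical variant that reads one "ip:port" entry per line from a local text file (the file path is an assumption):

  def api(path="F:\\python-file\\data\\proxies.txt"):
      # read one "ip:port" entry per line from a local file instead of the dead API
      ippools=[]
      with open(path,"r") as fh:
          for line in fh:
              line=line.strip()
              if line:
                  ippools.append(line)
      return ippools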

Taobao Product Image Crawler

  • The code is as follows; it has some flaws.
  # Taobao product image crawler
  import urllib.request
  import re
  import random
  keyname="dress"
  # URL-encode the search keyword
  key=urllib.request.quote(keyname)
  uapools=[
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763",
      "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
  ]
  def ua(uapools):
      # pick a random user agent and install it as the global opener
      thisua=random.choice(uapools)
      print(thisua)
      headers=("User-Agent",thisua)
      opener=urllib.request.build_opener()
      opener.addheaders=[headers]
      urllib.request.install_opener(opener)
  for i in range(1,10):
      # each result page holds 44 items, so page i starts at s=(i-1)*44
      url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
      ua(uapools)
      data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
      pat='''"pic_url":"//(.*?)"'''
      imglist=re.compile(pat,re.S).findall(data)
      #print(imglist)
      for j in range(0,len(imglist)):
          thisimg=imglist[j]
          thisimgurl="http://"+thisimg
          localfile="F:\\python-file\\data\\taobao\\"+str(i)+str(j)+".jpg"
          urllib.request.urlretrieve(thisimgurl,filename=localfile)
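
  • One of the flaws in the block above is that a single failed image download (a dead pic_url or a network hiccup) raises an exception and stops the whole crawl. A minimal sketch of the inner loop with per-image error handling, reusing imglist, i, and the save directory from the code above:

  for j in range(0,len(imglist)):
      try:
          thisimgurl="http://"+imglist[j]
          localfile="F:\\python-file\\data\\taobao\\"+str(i)+str(j)+".jpg"
          urllib.request.urlretrieve(thisimgurl,filename=localfile)
      except Exception as err:
          # skip images that fail to download instead of aborting the page
          print(err)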

Using a User-Agent Pool and an IP Proxy Pool Together