Part 1: Setting up the proxy pool server

https://github.com/jhao104/proxy_pool

Once the service is up and running, it looks like this:
(screenshots of the running proxy pool web interface omitted)
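
Before wiring the pool into Scrapy, it is worth checking that the service is actually handing out proxies. A minimal sanity check against its HTTP API (assuming the server listens on proxy_pool's default port 5010; adjust the address if your deployment differs):

# check_pool.py: one-off sanity check of the proxy pool API
import requests

PROXY_URL = 'http://127.0.0.1:5010'   # assumed address of the proxy_pool server

# the same /get/ endpoint is used by proxies/pool.py later in this post
res = requests.get(PROXY_URL + '/get/').json()
print(res)   # expect a dict containing a 'proxy' key, e.g. '1.2.3.4:8080'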

Part 2: Adding a proxy middleware to the Scrapy project

1 Writing the middleware classes

Example directory layout (the original screenshot is omitted):
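
The layout assumed in the rest of this post is roughly the following (reconstructed from the file-path comments in the code and the DOWNLOADER_MIDDLEWARES entries; the exact names in the screenshot may differ):

omim/
├── settings.py          # Scrapy settings, including PROXY_URL
└── proxies/
    ├── __init__.py      # ProxyMiddleware (path assumed from the settings entry)
    ├── user_agent.py    # UA and RandomUserAgentMiddleware
    └── pool.py          # Pool, a small client for the proxy_pool API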

Random User-Agent middleware
# proxies/user_agent.py
import random


class UA(object):

    @classmethod
    def random_ua(cls):
        ua_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        ]
        return random.choice(ua_list)


class RandomUserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # the hook must be named process_request (the original process_requests is never called);
        # picking the UA here gives a fresh value per request instead of one fixed UA per crawl
        request.headers.setdefault('User-Agent', UA.random_ua())
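
The middleware can be exercised outside of a full crawl. A standalone sketch (not part of the project code; the import path assumes the layout shown above and that Scrapy is installed):

# quick check that the middleware fills in a User-Agent header
from scrapy.http import Request
from omim.proxies.user_agent import RandomUserAgentMiddleware

mw = RandomUserAgentMiddleware(crawler=None)        # the crawler is not used here
req = Request('https://www.omim.org/statistics/entry')
mw.process_request(req, spider=None)
print(req.headers.get('User-Agent'))                # one of the entries from ua_list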

Fetching a usable proxy IP
# proxies/pool.py
import time

import requests

from omim import settings
from .user_agent import UA


class Pool(object):
    """Thin client for the proxy_pool HTTP API at settings.PROXY_URL."""

    proxy_url = settings.PROXY_URL

    @classmethod
    def get(cls):
        """Fetch one proxy from the pool; wait and retry if the pool is empty."""
        url = cls.proxy_url + '/get/'
        res = requests.get(url).json()
        if not res or 'proxy' not in res:
            print('proxy pool is empty, wait a minute ...')
            time.sleep(60)
            return cls.get()
        return res['proxy']

    @classmethod
    def delete(cls, ip):
        """Remove a dead proxy from the pool."""
        url = '{}/delete/?proxy={}'.format(cls.proxy_url, ip)
        requests.get(url)

    @classmethod
    def check(cls, ip):
        """Return True if the target site is reachable through the proxy."""
        print('checking ip: {} ...'.format(ip))
        url = 'https://www.omim.org/statistics/entry'
        proxies = {'https': 'https://{}'.format(ip)}
        headers = {'User-Agent': UA.random_ua()}
        try:
            resp = requests.get(url, proxies=proxies, headers=headers, timeout=15)
            if '<title>Error 403</title>' in resp.text:
                return False
            return True
        except Exception as e:
            print(e)
            return False

    @classmethod
    def get_good_proxy(cls):
        """Keep asking the pool until a proxy passes the check; drop the ones that fail."""
        ip = cls.get()
        if cls.check(ip):
            return 'https://{}'.format(ip)
        else:
            print('delete ip: {}'.format(ip))
            cls.delete(ip)
            return cls.get_good_proxy()
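
Pool can also be tried on its own before the middleware is enabled. A minimal sketch (assumes the proxy_pool server is running and PROXY_URL is defined in settings.py as shown in the last section):

# standalone check of the Pool helper
from omim.proxies.pool import Pool

proxy = Pool.get_good_proxy()      # loops until a proxy passes Pool.check()
print('usable proxy:', proxy)      # e.g. 'https://1.2.3.4:8080'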

Writing the proxy pool middleware
# proxies/__init__.py  (location assumed from the 'omim.proxies.ProxyMiddleware' entry in settings)
from .pool import Pool


class ProxyMiddleware(object):
    """Downloader middleware that routes every request through a checked proxy."""

    def process_request(self, request, spider):
        proxy = Pool.get_good_proxy()
        print('>>> crawling with proxy: {}'.format(proxy))
        request.meta['proxy'] = proxy
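
The middleware above only assigns a proxy. If you also want to discard a proxy as soon as a request fails through it, one possible extension (not in the original post) is a process_exception hook that hands the dead address back to Pool.delete:

    # optional addition to ProxyMiddleware: drop the proxy that just failed
    def process_exception(self, request, exception, spider):
        proxy = request.meta.get('proxy', '')
        if proxy:
            # Pool.delete expects a bare ip:port, so strip the scheme first
            Pool.delete(proxy.replace('https://', '').replace('http://', ''))
        # returning None lets Scrapy's retry middleware reschedule the request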

2 Enabling the middlewares

# settings.py

# base URL of the proxy_pool API used by proxies/pool.py;
# 5010 is proxy_pool's default port, adjust to your own deployment
PROXY_URL = 'http://127.0.0.1:5010'

DOWNLOADER_MIDDLEWARES = {
    # 'omim.middlewares.OmimDownloaderMiddleware': 543,
    'omim.proxies.ProxyMiddleware': 543,
    'omim.proxies.user_agent.RandomUserAgentMiddleware': 542,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

Reference: https://blog.csdn.net/jjjndk1314/article/details/80250146