1、编写爬虫文件
import scrapy


class CookiespiderSpider(scrapy.Spider):
    """Spider that requests the Douban home page and prints the account name.

    Cookies are not set here; the spider only issues the request and reads
    the logged-in account name out of the returned page.
    """

    name = 'cookieSpider'
    allowed_domains = ['douban.com']
    start_urls = ['http://www.douban.com']

    def start_requests(self):
        """Yield a single request for the first entry of ``start_urls``."""
        first_url = self.start_urls[0]
        request = scrapy.Request(url=first_url, callback=self.parse)
        yield request

    def parse(self, response):
        """Print the account name found in the navigation bar.

        Prints ``None`` when the XPath matches nothing.
        """
        # Print the account name shown after logging in.
        account_name = response.xpath(
            './/li[@class="nav-user-account"]/a/span[1]/text()'
        ).get()
        print(account_name)
2、在middlewares.py文件中,定义用于格式化与设置Cookies的中间件。
class CookiesdemoMiddleware(object):
    """Downloader middleware that attaches browser cookies to each request.

    The cookies come from a raw cookie string (``name=value`` pairs separated
    by ``;``) read from the ``COOKIES_DEMO`` setting.
    """

    def __init__(self, cookies_str):
        """Store the raw cookie string and parse it once.

        Args:
            cookies_str: Cookie text such as ``"a=1; b=2"``. ``None`` or an
                empty string yields no cookies.
        """
        self.cookies_str = cookies_str
        # Per-instance dict: a class-level dict would be shared by every
        # instance and keep entries from earlier cookie strings.
        self.cookies = self._parse_cookies(cookies_str)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the ``COOKIES_DEMO`` crawler setting."""
        return cls(cookies_str=crawler.settings.get('COOKIES_DEMO'))

    @staticmethod
    def _parse_cookies(cookies_str):
        """Turn ``"a=1; b=2"`` into ``{"a": "1", "b": "2"}``.

        Whitespace around names and values is stripped so that both ``;`` and
        ``; `` separators give clean cookie names. Empty segments (for example
        after a trailing ``;``) and segments without ``=`` are skipped.
        """
        parsed = {}
        for segment in (cookies_str or '').split(';'):
            # Split on the first '=' only: values may contain '=' themselves.
            name, separator, value = segment.partition('=')
            name = name.strip()
            if not separator or not name:
                continue
            parsed[name] = value.strip()  # store the split pair in the dict
        return parsed

    def process_request(self, request, spider):
        """Set the parsed cookies on ``request``.

        Each request gets its own copy, so later changes to one request's
        cookies cannot leak into other requests.
        """
        request.cookies = dict(self.cookies)


# Note: from_crawler() is a class method. Inside it, ``cls`` is the class
# itself, whereas ``self`` is an instance of the class. Calling ``cls(...)``
# creates an instance and runs __init__, so the cookie string is already set
# by the time process_request() handles a request.
3、在middlewares.py文件中,定义随机设置请求头的中间件
from fake_useragent import UserAgent


class RandomHeaderMiddleware(object):
    """Downloader middleware that fills in a User-Agent request header.

    The header value is read from a ``fake_useragent.UserAgent`` object; the
    attribute to read is named by the ``RANDOM_UA_TYPE`` setting and defaults
    to ``'chrome'``.
    """

    def __init__(self, crawler):
        """Create the UserAgent source and read the configured type."""
        self.ua = UserAgent()
        self.type = crawler.settings.get('RANDOM_UA_TYPE', 'chrome')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware, handing the crawler to the constructor."""
        middleware = cls(crawler)
        return middleware

    def process_request(self, request, spider):
        """Set ``User-Agent`` on the request unless one is already present."""
        user_agent = getattr(self.ua, self.type)
        request.headers.setdefault('User-Agent', user_agent)


# Note: from_crawler() passes the crawler object to the constructor, which
# reads its parameters from settings.py. process_request() is overridden to
# set the request header.
4、打开settings.py文件,将DOWNLOADER_MIDDLEWARES配置信息中的默认信息禁用。然后添加用于处理Cookies与随机请求头的配置信息并激活,最后定义从浏览器中获取的Cookies信息。
# Downloader middleware configuration: dotted class path -> order value
# (None disables the entry).
DOWNLOADER_MIDDLEWARES = {
    # Enable the custom cookies middleware.
    'cookies.middlewares.CookiesdemoMiddleware': 201,
    # Enable the custom random request-header middleware.
    'cookies.middlewares.RandomHeaderMiddleware': 202,
    # Disable the project's default downloader middleware.
    'cookies.middlewares.CookiesDownloaderMiddleware': None,
}

# Cookies copied from the browser.
# NOTE(review): this looks like a real logged-in session; consider loading it
# from an environment variable or an untracked file instead of committing it.
COOKIES_DEMO = 'bid=tE6HN3Ew1o4;douban-fav-remind=1;__utmc=30149280;ll="118172";__gads=ID=57783c64c61b95f0-222169cb8ece00a5:T=1635952049:RT=1635952049:S=ALNI_MY6y5oy9OjMOtd8IX0CuXxdSF4SXQ; __yadk_uid=D4nsK7yTmxlHJtWl3jpeF49o8FSF0SMl;gr_user_id=d4f2576e-6cd8-4af1-b939-2c8645bbdd3c;__utmz=30149280.1642507321.17.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _ga=GA1.1.483069360.1648042716; _ga_RXNMP372GL=GS1.1.1648042715.1.1.1648045498.60; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1650805129%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DR75LUNuCjy8xpMTQz6Nn6l6FeId2G5FBbzIoYEl-bY7arUYpCLVCA4S_-enQc62w%26wd%3D%26eqid%3Df6c97a9c0002b34500000005618fa6dd%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.453644371.1620303194.1648042711.1650805130.21; push_noty_num=0; push_doumail_num=0;ap_v=0,6.0;dbcl2="76448342:kj7z+0x3R68";ck=hRfi; __utmv=30149280.7644; _pk_id.100001.8cb4=ffe3c5e109f5d223.1636804819.7.1650805362.1648044149.; __utmb=30149280.7.10.1650805130'
