Original post: https://blog.csdn.net/weixin_41779359/article/details/86162674
Because the post is old, its code no longer works; this version restores the original functionality and additionally scrapes the hotel image URLs.
After logging in, copy the cookie from the browser and paste it into the request headers.
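To avoid pasting the long cookie string straight into the source, one option is to keep it in a local text file and read it at startup; a minimal sketch, where the file name cookie.txt and the helper name load_cookie are assumptions, not part of the original script.

# A minimal sketch: read the browser-copied cookie from a local file
# (the file name cookie.txt is an assumption) instead of hard-coding it.
def load_cookie(path="cookie.txt"):
    with open(path, "r", encoding="utf-8") as f:
        return f.read().strip()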
import re
import time
import requests  # only needed for the commented-out alternative inside get_page
from urllib import request
import xlwt
def get_page(url):
    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36",
            "cookie": "_abtest_userid=32f3b780-d932-42d8-bb4e-c2a4b09d2bae; ibulanguage=CN; ibulocale=zh_cn; cookiePricesDisplayed=CNY; IBU_TRANCE_LOG_P=27275857214; _RF1=222.128.58.224; _RSG=Tfq6O3wWMy2JZGT7K1re29; _RDG=28405636752cde207800d434a21e92c6f4; _RGUID=2f1683a0-d81b-4a5d-90a8-d415077508d3; _ga=GA1.2.1586531925.1613975034; _gid=GA1.2.1164383882.1613975034; Session=SmartLinkCode=csdn&SmartLinkKeyWord=&SmartLinkQuary=_UTF.&SmartLinkHost=link.csdn.net&SmartLinkLanguage=zh; MKT_CKID=1613975034222.o0wmp.s437; MKT_CKID_LMT=1613975034223; MKT_Pagesource=PC; _ctm_t=ctrip; Customer=HAL=ctrip_cn; StartCity_Pkg=PkgStartCity=2; GUID=09031125311912901733; librauuid=5oZmuXbMsAAGCg4L; cticket=163D247556DA45CD15D82927349EEE19E23CB036E730209A59D0D604E3D69074; AHeadUserInfo=VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=1; ticket_ctrip=bJ9RlCHVwlu1ZjyusRi+ypZ7X2r4+yojXN5UTMe2Bf1JKH8i3Drs9Pzu3Z0AIlCxJBdgVTo45f9ZEaRO6asDQ78Hql76TCWs5OAf2oQ328Lso/sAtvnT4OSfHdvLLyBXWl2HYpF5Gnt1c1IQD9AhgqrxC5RmxSfCWuwWVV25lURAvaLBnG9RecjqZK3FDi/WH3p8zJvY7ctaIHyjdYCuzOpiGNEwPi7N2rPl2Zkx/UxubtL5Nmpg8T0pd1QaKBgrpusXeDep2itU5ysDcocu/B/LA7TmK+JPoovNZWC5bi+1NFXVPiWPwg==; DUID=u=54C3F49AA58CC4ACE398230C18CC817FFC6B7346868BC43A98766B342D31186A&v=0; IsNonUser=F; IsPersonalizedLogin=F; UUID=59B664D9059D49D0A748DD0EC78E7926; _bfs=1.9; _bfa=1.1613975030415.2jx4dg.1.1613975030415.1613977027656.2.17; hotelhst=1164390341; _uetsid=87d4286074d611ebb15f89f05388a1db; _uetvid=87d44e7074d611eb987f693b286d26c4; _bfi=p1%3D102002%26p2%3D100101991%26v1%3D17%26v2%3D16; _jzqco=%7C%7C%7C%7C1613975034459%7C1.786017307.1613975034235.1613978187508.1613978198001.1613978187508.1613978198001.0.0.0.14.14; __zpspc=9.2.1613977031.1613978198.7%233%7Clink.csdn.net%7C%7C%7C%7C%23; appFloatCnt=12"
        }
        req = request.Request(url=url, headers=headers)
        rsq = request.urlopen(req)
        r = rsq.read().decode()
        # r = requests.get(url, headers=headers)
        # r.raise_for_status()  # raise an exception if the request fails, so the except branch runs
        # r.encoding = r.apparent_encoding  # avoid mis-decoded (garbled) text
        return r  # return the page text
    except Exception as e:  # print the reason for the failure
        print(e)

basic_url = 'http://hotels.ctrip.com/hotel/beijing1/p'
urls = [basic_url + str(i) for i in range(1, 10)]  # all page URLs go into urls
def get_info(page):
    names = re.findall(r'"name font-bold">(.*?)</span>', page, re.S)  # hotel names
    locations = re.findall(r'<span class="position">(.*?)</span>', page, re.S)  # hotel addresses
    for i in range(len(locations)):  # clean the data
        if '】' in locations[i]:
            locations[i] = locations[i].split('】')[1]
    scores = re.findall(r'"real font-bold">(.*?)</span>', page, re.S)  # hotel ratings
    prices = re.findall(r'"real-price font-bold">(.*?)</span>', page, re.S)  # hotel prices (unused, see note at the end)
    # recommends = re.findall(r'009933.*?>(.*?)</span>用户推荐', page, re.S)  # recommendation ratio
    peoples = re.findall(r'class="count"><a>(\d+).*?</a>', page, re.S)  # number of reviewers
    """Debugging helper: dump the fetched page to a file for inspection.
    with open("xiecheng".replace('/', '_') + ".html", "wb") as f:
        # a file opened in binary mode takes bytes rather than str, so encode first
        f.write(bytes(page, encoding="utf8"))
    with open('xiecheng.html', 'r', encoding='utf-8') as f:
        content = f.read()"""
    pattern = re.compile(r'//dimg04\.c-ctrip\.com/images/.*?\.jpg')  # hotel image URLs (dots escaped)
    imgs = re.findall(pattern, page)
    for name, location, score, img, people in zip(names, locations, scores, imgs, peoples):
        data = {}  # store each hotel's information as a dict
        data['name'] = name
        data['score'] = score
        # data['price'] = price
        # data['recommend_ratio'] = recommend
        data['img'] = img
        data['people_num'] = people
        data['location'] = location
        print(data)
        yield data
if __name__ == '__main__':
    DATA = []  # collect all extracted hotel records here for the saving step
    for i in range(1, 2):
        url = basic_url + str(i)
        page = get_page(url)
        print('request data from:' + url)  # mainly to show progress
        time.sleep(1)  # pause 1 s between requests; requesting too fast gets flagged as a crawler
        datas = get_info(page)
        for data in datas:
            DATA.append(data)  # append each record to DATA for the saving step
    f = xlwt.Workbook(encoding='utf-8')
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    # write the header row (column 2 is reserved for the unavailable price)
    sheet01.write(0, 0, 'name')  # row 1, column 1
    sheet01.write(0, 1, 'score')
    # sheet01.write(0, 2, 'price')
    # sheet01.write(0, 3, 'recommand_ratio')
    sheet01.write(0, 3, 'img')
    sheet01.write(0, 4, 'people_num')
    sheet01.write(0, 5, 'location')
    # write the data rows
    for i in range(len(DATA)):
        sheet01.write(i + 1, 0, DATA[i]['name'])
        sheet01.write(i + 1, 1, DATA[i]['score'])
        # sheet01.write(i + 1, 2, DATA[i]['price'])
        sheet01.write(i + 1, 3, DATA[i]['img'])
        sheet01.write(i + 1, 4, DATA[i]['people_num'])
        sheet01.write(i + 1, 5, DATA[i]['location'])
        print('$', end='')
    f.save(u'携程酒店2.xls')
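For reference, the requests-based alternative hinted at in the commented-out lines of get_page can be completed as below; a minimal sketch that assumes the headers dict (user-agent plus cookie) is lifted out of get_page to module level so both functions can share it.

# A minimal sketch of the commented-out requests alternative; assumes the
# headers dict defined inside get_page has been moved to module level.
def get_page_requests(url):
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()                # raise on HTTP errors so the except branch runs
        r.encoding = r.apparent_encoding    # avoid mis-decoded text
        return r.text
    except Exception as e:
        print(e)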
Because Ctrip's anti-scraping measures are so obnoxious, the price could not be obtained; the price does not even appear when viewing the page source in the browser. Unresolved.
To scrape more pages, change the page range in the main loop (e.g. range(1, 2) to range(1, 10)).
Follow-up idea: how can Python scrape elements that show up under Elements in the browser inspector but are missing from the page source?
https://blog.csdn.net/weixin_41931602/article/details/81711190
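That situation usually means the data is rendered by JavaScript after the initial HTML loads, so a plain HTTP fetch never sees it. Below is a minimal sketch of one common answer, letting Selenium drive a real browser that executes the page's JS before extraction; the selector span.real-price is only a guess derived from the class name the price regex above matches, and both it and the function name are assumptions, not part of the original script.

# A minimal sketch, assuming Selenium 4 and a matching ChromeDriver are installed.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

def get_rendered_prices(url):
    driver = webdriver.Chrome()  # launches a real Chrome that executes the page's JS
    try:
        driver.get(url)
        time.sleep(3)  # crude wait for the JS-rendered prices to appear
        # 'span.real-price' is a guessed selector based on the price regex above;
        # inspect the rendered page and adjust as needed
        elements = driver.find_elements(By.CSS_SELECTOR, "span.real-price")
        return [e.text for e in elements]
    finally:
        driver.quit()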