requests: a Python module for making network requests; powerful and very efficient.
Purpose: simulate a browser sending requests.

How to use it (the requests coding workflow, sketched below):

  1. Specify the URL
  2. Send the request
  3. Get the response data
  4. Persist the data
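A minimal sketch of those four steps (https://www.example.com and the output file name are placeholders chosen only for illustration):

```python
import requests

# 1. Specify the URL (placeholder target)
url = 'https://www.example.com'

# 2. Send the request (a plain GET here)
response = requests.get(url)

# 3. Get the response data as text
page_text = response.text

# 4. Persist the data to a local file
with open('example.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
```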

Installation:
pip install requests
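A quick way to confirm the install worked (prints the installed version):

```python
import requests

print(requests.__version__)
```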

Basic examples

1、Scrape the Sogou search results page for a given keyword (a simple web page collector)

Potential issues

  • Garbled text: the response data is not UTF-8 encoded (cause); set `response.encoding = 'utf-8'` before reading it (fix).
  • Missing data (less than what you see when browsing manually): the request is treated as abnormal because it carries no spoofed User-Agent header (fix: add a `headers` dict with a browser UA).

```python
# encoding: utf-8
"""
@author: linpions
@software: PyCharm
@file: 案例1:爬取搜狗词条结果.py
@time: 2021-12-26 20:10
"""

import requests

url = 'https://www.sogou.com/web'
keywords = input('enter a key word:')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}

# Parameterize the request dynamically:
# params (a dict) holds the query parameters carried by the URL
params = {
    'query': keywords,
}

response = requests.get(url=url, params=params, headers=headers)
response.encoding = 'utf-8'   # avoid garbled Chinese in the saved page
page_text = response.text

file_Name = '搜狗词条:' + keywords + '.html'
with open(file_Name, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(file_Name, '爬取完毕!')
```

### 2、Cracking Baidu Translate

- URL: https://fanyi.baidu.com/v2transapi?from=en&to=zh
- The page refreshes partially; the translation is fetched by an Ajax (POST) request

```python
def baidu_fanyi(keyword=None):
    url = 'https://fanyi.baidu.com/v2transapi'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                      '92.0.4515.131 Safari/537.36',
        'cookie': 'BIDUPSID=2437C970398B3648E3DCEFC3DA3F453F; PSTM=1588134641; BDUSS=2FNVTNXWFREN3lOS2VZWlh6eHJvcjdUWGN4d3RueWRKaGVCYWtJWHV6QnVDdjFmRVFBQUFBJCQAAAAAAAAAAAEAAADdjgBCwe7GvdfTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG591V9ufdVfN; BDUSS_BFESS=2FNVTNXWFREN3lOS2VZWlh6eHJvcjdUWGN4d3RueWRKaGVCYWtJWHV6QnVDdjFmRVFBQUFBJCQAAAAAAAAAAAEAAADdjgBCwe7GvdfTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG591V9ufdVfN; H_WISE_SIDS=110085_127969_128701_131423_144966_154214_156927_160878_164955_165135_165328_166148_167069_167112_167300_168029_168542_168626_168748_168763_168896_169308_169536_169882_170154_170240_170244_170330_170475_170548_170578_170581_170583_170588_170632_170754_170806_171446_171463_171567_171581; __yjs_duid=1_583e334168fb61ad031df70449fa28b11620848721011; BAIDUID=88DF09015FADA78B679498D7716BC9F3:FG=1; BAIDUID_BFESS=9AC90C484D04DC1AD589BE66E2DC00C5:FG=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1629183095; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1629184241; __yjs_st=2_ZDRiNjhmMDM0ZWI2MTRmM2MyZDYyZDg3NTg5NGFhZmJkNmQzODZjZmUxNGQ5NmYxZmIyNTRiOTg1Y2Y5NjYzMDM0NzMzNWVjYTYyYjNjMjlkMThmZWRhYjZhZWYzNzliZjI1ZWM2YjExZmJlODUyMTI3YjFjNTU4Y2I5OGM1ZGFjNzNmNDA0N2Q2NjAzYzY4ZDZiYzUzZTcxYzE1ZjA5OTAwMmVkNWM2YjlmYTFjY2U3ZTQwOWU4NzVhZTlmMDEyYmU3NDVhYTVmOWEyYTVjOGUxNzUzNjI2Y2U3NTRkNmQyYTExMjUzNGJhYWVkMzgyOTMzZDFjOTVhOGVhODM3OV83XzI0MTkxNGNi; ab_sr=1.0.1_ZDg0NDMwZGY4ZWZiMTA5YjgzMmVlMDU0M2MxZDRkMzY4NDQ3MGI4YThlNjNhYTFiMGJhZTUzNzRkZTI5NjI5NzUwMGNkYjBmZTQ3MTBhN2FkMjgyMDg5OGNkMTkyOTg5M2UxYmNjYmE3NzUyZGM1ZGM4N2M5NzliNzBlYTg3ZTJlNjMwNTU3OTdjMmFkNTRjMDg3OTEyMDJiMjg2MmU5NGMzZjdiM2U3NjUyZTBjNjg1ODEyYTc5Yzk5MTI5NTAw',
    }
    data = {
        'from': 'en',
        'to': 'zh',
        'query': keyword,
        'transtype': 'realtime',
        'simple_means_flag': '3',
        'sign': '871501.634748',
        'token': '6567483e2686ce76cd8bbdb797a1a5bd',
        'domain': 'common',
    }
    response = requests.post(url=url, data=data, headers=headers)
    page_text = response.json()
    # page_text = page_text['data']
    return page_text
```

When the text to be translated changes, the response is no longer the expected result; the API returns errno 997/998.
Cause: Baidu Translate added anti-crawling checks, so the crawler no longer receives translation results. In general, sending proper headers (above all the User-Agent), using proxy IPs, and handling the dynamic verification are what get requests through again (a rough sketch of the headers/cookie/proxy side follows below).
Reference: 【Python】关于爬取百度翻译以及"errno":998&"errno":997_RedMaple-程序员宅基地

  • sign and cookie are dynamic: they change from request to request
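A sketch of those mitigations, not the notes' own fix: reuse one requests.Session so cookies the site sets persist across calls, send a full User-Agent, and optionally route through a proxy. None of this recomputes the dynamic sign; the proxy address below is a placeholder.

```python
import requests

# A Session keeps any cookies the server sets and reuses them on later requests
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
})

# Optional proxy routing (placeholder address; enable only with a real proxy)
# session.proxies.update({'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'})

# Visiting the home page first lets the site set fresh cookies on the session,
# so later POSTs to the translate endpoint carry them automatically
session.get('https://fanyi.baidu.com')
```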

Solution (code below): switch to the https://fanyi.baidu.com/sug suggestion endpoint, which does not require the dynamic sign or cookie:


```python
# Work around the dynamically changing cookie and sign by using the sug endpoint
def baidu_fanyi2(keyword=None):
    url = 'https://fanyi.baidu.com/sug'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                      '92.0.4515.131 Safari/537.36',
        # The cookie is no longer needed for this endpoint:
        # 'cookie': 'BIDUPSID=2437C970398B3648E3DCEFC3DA3F453F; PSTM=1588134641; BDUSS=2FNVTNXWFREN3lOS2VZWlh6eHJvcjdUWGN4d3RueWRKaGVCYWtJWHV6QnVDdjFmRVFBQUFBJCQAAAAAAAAAAAEAAADdjgBCwe7GvdfTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG591V9ufdVfN; BDUSS_BFESS=2FNVTNXWFREN3lOS2VZWlh6eHJvcjdUWGN4d3RueWRKaGVCYWtJWHV6QnVDdjFmRVFBQUFBJCQAAAAAAAAAAAEAAADdjgBCwe7GvdfTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG591V9ufdVfN; H_WISE_SIDS=110085_127969_128701_131423_144966_154214_156927_160878_164955_165135_165328_166148_167069_167112_167300_168029_168542_168626_168748_168763_168896_169308_169536_169882_170154_170240_170244_170330_170475_170548_170578_170581_170583_170588_170632_170754_170806_171446_171463_171567_171581; __yjs_duid=1_583e334168fb61ad031df70449fa28b11620848721011; BAIDUID=88DF09015FADA78B679498D7716BC9F3:FG=1; BAIDUID_BFESS=9AC90C484D04DC1AD589BE66E2DC00C5:FG=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1629183095; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1629184241; __yjs_st=2_ZDRiNjhmMDM0ZWI2MTRmM2MyZDYyZDg3NTg5NGFhZmJkNmQzODZjZmUxNGQ5NmYxZmIyNTRiOTg1Y2Y5NjYzMDM0NzMzNWVjYTYyYjNjMjlkMThmZWRhYjZhZWYzNzliZjI1ZWM2YjExZmJlODUyMTI3YjFjNTU4Y2I5OGM1ZGFjNzNmNDA0N2Q2NjAzYzY4ZDZiYzUzZTcxYzE1ZjA5OTAwMmVkNWM2YjlmYTFjY2U3ZTQwOWU4NzVhZTlmMDEyYmU3NDVhYTVmOWEyYTVjOGUxNzUzNjI2Y2U3NTRkNmQyYTExMjUzNGJhYWVkMzgyOTMzZDFjOTVhOGVhODM3OV83XzI0MTkxNGNi; ab_sr=1.0.1_ZDg0NDMwZGY4ZWZiMTA5YjgzMmVlMDU0M2MxZDRkMzY4NDQ3MGI4YThlNjNhYTFiMGJhZTUzNzRkZTI5NjI5NzUwMGNkYjBmZTQ3MTBhN2FkMjgyMDg5OGNkMTkyOTg5M2UxYmNjYmE3NzUyZGM1ZGM4N2M5NzliNzBlYTg3ZTJlNjMwNTU3OTdjMmFkNTRjMDg3OTEyMDJiMjg2MmU5NGMzZjdiM2U3NjUyZTBjNjg1ODEyYTc5Yzk5MTI5NTAw',
    }
    data = {
        'kw': keyword,
    }
    response = requests.post(url=url, data=data, headers=headers)
    page_text = response.json()
    # page_text = page_text['data']
    return page_text
```

3、Scrape Douban movie detail data

https://movie.douban.com/chart

The hard part: dynamically loaded data.
Dynamically loaded data:

  • what you see in the browser is not necessarily what a plain request to the page URL returns
  • the data is not returned by the fixed page URL itself, but by separate requests

A quick way to tell whether data is dynamically loaded: open the browser's capture tool (dev tools, Network panel) and search for the page data in the page request's response preview; if it cannot be found there, it is loaded dynamically (see the sketch below).
Locating the data packet is not always possible: if the dynamically loaded data is encrypted, the plaintext you are searching for will not appear in any response.
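A programmatic version of that check (a sketch; the `needle` string is a placeholder for text you actually saw on the rendered page, e.g. a movie title):

```python
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}

# Fetch the page URL directly, the way a naive crawler would
html = requests.get('https://movie.douban.com/chart', headers=headers).text

# Replace with a piece of text you can see on the page in the browser
needle = '一部你在页面上看到的电影名'
print(needle in html)   # False suggests the data arrives later via Ajax, not in the static HTML
```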
From that data packet you can obtain:

  • the request URL
  • the request method
  • the parameters carried by the request
  • the response data

Error when parsing the JSON data: 【Python】JSONDecodeError: Expecting value: line 1 column 1 (char 0)
https://blog.csdn.net/qq_29757283/article/details/98252728
Cause: the value passed to json.loads does not conform to JSON format, so the exception is raised. Typical cases:

  • key/value pairs use single quotes instead of double quotes;
  • the value is (or contains) a plain string or HTML rather than JSON.
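A defensive pattern for this (a sketch): try to decode, and on failure print the start of the raw body so you can see whether the server returned HTML (an error or login page) instead of JSON. JSONDecodeError is a subclass of ValueError, so catching ValueError works across requests versions.

```python
import requests

response = requests.get(
    'https://movie.douban.com/j/new_search_subjects',
    params={'sort': 'U', 'range': '0,10', 'tags': '', 'start': '0'},
    headers={'User-Agent': 'Mozilla/5.0'},
)
try:
    data = response.json()
except ValueError:
    # Not valid JSON: inspect the status and the first part of the body
    print(response.status_code, response.text[:200])
else:
    print(type(data))
```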

Creating an empty DataFrame with pandas
Adding a row/column to a pandas DataFrame (see the sketch below)
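The pandas pattern the following code relies on, as a small standalone sketch: create an empty DataFrame with named columns, then write each row through `df.loc[label]`.

```python
import pandas as pd

df = pd.DataFrame(columns=['电影名', '评分'])   # empty DataFrame with columns only
df.loc[0] = ['Movie A', '9.0']                 # add a row at label 0
df.loc[len(df)] = ['Movie B', '8.5']           # or always append at the next integer label
print(df)
```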

```python
import pandas as pd   # used for the DataFrame below


def douban_movies():
    url = 'https://movie.douban.com/j/new_search_subjects'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                      '92.0.4515.131 Safari/537.36'
    }
    params = {
        'sort': 'U',
        'range': '0,10',
        'tags': '',
        'start': '0',
    }
    response = requests.get(url=url, params=params, headers=headers)
    page_text = response.json()  # .json() deserializes the JSON response body into Python objects
    df = pd.DataFrame(columns=['电影名', '评分'])  # collect the results in a DataFrame
    # Parse out each movie's title and rating
    id = 0
    for movie in page_text['data']:
        df.loc[id] = [movie['title'], movie['rate']]
        id += 1
    return df
    # After many requests Douban's anti-crawling kicks in and a login is required
```
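Example usage, assuming the function above is defined in the same file:

```python
movies_df = douban_movies()
print(movies_df.head())
```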

4、Pagination: scrape KFC restaurant locations

  • URL: http://www.kfc.com.cn/kfccda/storelist/index.aspx
  • The location info only loads after you enter a keyword and click search: an Ajax request is issued
  • Use the capture tool to locate that Ajax request's data packet, and from it obtain:
    • the request URL
    • the request method (GET or POST)
    • the parameters carried by the request (usually at the bottom of the headers panel)
    • the response data
  • Unlike the dynamic parameters of a GET request, here the parameters are packed into `data`, not `params`
  • Scraping multiple pages: change the page parameter inside a loop, as in the code below

```python
def kfc_info():
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                      '92.0.4515.131 Safari/537.36'
    }
    df = pd.DataFrame(columns=['餐厅名', '详细位置'])
    id = 0
    for page in range(1, 9):
        data = {
            'cname': '',
            'pid': '',
            'keyword': '广州',
            'pageIndex': str(page),
            'pageSize': '10',
        }
        response = requests.post(url=url, data=data, headers=headers)
        page_text = response.json()
        for dic in page_text['Table1']:
            df.loc[id] = [dic['storeName'], dic['addressDetail']]
            id += 1
    return df
```
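Example usage (the CSV file name is just an illustration):

```python
kfc_df = kfc_info()
kfc_df.to_csv('kfc_guangzhou.csv', encoding='utf-8', index=False)
print(len(kfc_df), 'stores saved')
```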

### 5、NMPA (药监局) data

Scrape company detail data from the [NMPA licensing site](http://scxk.nmpa.gov.cn:81/xk/).

- No HTML parsing is needed
- All the data is loaded dynamically (Ajax requests)
- Key insight: the `id` parameter of each company's detail request comes from the list-page response
```python
def get_canpanysid(pageNum=None):
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                      '92.0.4515.131 Safari/537.36'
    }
    df = pd.DataFrame(columns=['企业名称', 'ID'])
    id = 0
    for page in range(1, pageNum + 1):
        data = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        response = requests.post(url=url, data=data, headers=headers)
        page_text = response.json()
        for dic in page_text['list']:
            df.loc[id] = [dic['EPS_NAME'], dic['ID']]
            id += 1
    return df


def get_canpanysinfo(pageNum=None):
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                      '92.0.4515.131 Safari/537.36'
    }
    df = pd.DataFrame(columns=['企业名称', '许可证编号', '许可项目', '企业住所', '生产地址', '社会信用代码', '法定代表人',
                               '企业负责人', '质量负责人', '发证机关', '签发人', '日常监督管理机构', '日常监督管理人员',
                               '有效期至', '发证日期', '状态', '投诉举报电话'])
    ID_df = get_canpanysid(pageNum=pageNum)
    id = 0
    for ID in ID_df['ID']:
        data = {
            'id': ID,
        }
        response = requests.post(url=url, data=data, headers=headers)
        response.encoding = 'utf-8'
        page_text = response.json()
        df.loc[id, '企业名称'] = page_text['epsName']
        df.loc[id, '许可证编号'] = page_text['productSn']
        df.loc[id, '许可项目'] = page_text['certStr']
        df.loc[id, '企业住所'] = page_text['epsAddress']
        df.loc[id, '生产地址'] = page_text['epsProductAddress']
        df.loc[id, '社会信用代码'] = page_text['businessLicenseNumber']
        df.loc[id, '法定代表人'] = page_text['legalPerson']
        df.loc[id, '企业负责人'] = page_text['businessPerson']
        df.loc[id, '质量负责人'] = page_text['qualityPerson']
        df.loc[id, '发证机关'] = page_text['qfManagerName']
        df.loc[id, '签发人'] = page_text['xkName']
        df.loc[id, '日常监督管理机构'] = page_text['rcManagerDepartName']
        df.loc[id, '日常监督管理人员'] = page_text['rcManagerUser']
        df.loc[id, '有效期至'] = page_text['xkDate']
        df.loc[id, '发证日期'] = page_text['xkDateStr']
        df.loc[id, '状态'] = '正常'
        df.loc[id, '投诉举报电话'] = '12331'
        id += 1
    df.to_csv('canpanysinfo.csv', encoding='utf-8')
    return df
```
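Example usage (fetches the first 2 list pages; `canpanysinfo.csv` is written by the function itself):

```python
info_df = get_canpanysinfo(pageNum=2)
print(info_df[['企业名称', '许可证编号']].head())
```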