关键词:python 爬虫

GET 示例

  1. log_in_url = "http://www.uimaker.com/member/ubi.php?action=qd&_=1526202830017"
  2. # headers
  3. headers = {
  4. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
  5. "Referer": "http://www.uimaker.com/member/index.php"
  6. }
  7. # 参数使用params参数
  8. cookies = {}
  9. print(os.path.join(os.path.abspath("."), "cookies"))
  10. with open(os.path.join(os.path.abspath("."), "cookies"), 'r', encoding='utf-8') as f:
  11. for item in f.read().split(';'):
  12. name, value = item.split('=', 1)
  13. cookies[name] = value
  14. # 请求时保留cookies
  15. session = requests.session()
  16. response = session.get(log_in_url, headers=headers, cookies=cookies)
  17. print(response.json())
  18. with open('./log.txt', 'a+', encoding='utf-8') as f:
  19. f.write(str(datetime.datetime.now()) + "\t")
  20. f.write(str(response.status_code) + os.linesep)

POST 示例

jq22 签到示例,其实把 cookies 直接放入 headers 里面即可

  1. import requests
  2. from bs4 import BeautifulSoup as bs
  3. import os
  4. import datetime
  5. session = requests.session()
  6. log_in_url = "http://www.jq22.com/signIn.aspx"
  7. # headers
  8. headers = {
  9. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
  10. "Referer":"http://www.jq22.com/signIn.aspx"
  11. }
  12. # 参数
  13. data = {
  14. "Button1": "签 到",
  15. "__VIEWSTATE": "OQz0gcxiE2uemW6coqrtvy4sKAViSr6o8Xxn4QjA2WoOiQFGhlAYIOEl9am94jQqkvf4+19RlxfIFvNagglpiyBGz4smPulqXEeQ7q4kU0VsdhfifzVlfGrj+DTYHKTR",
  16. "__VIEWSTATEGENERATOR": "ECDA716A", # 不变
  17. "__EVENTVALIDATION": "/Up6CRbE1Igwin/Bz2lx2qJo9+keEisPxD70frbfphQPhghu+ViAae9BGgf/NgO1peWGTpB1g0dHGJUyuCP9R53m201mCW3WyGvL6fjdFH58Ds7ZxC7HP3GlDan4Jl1C"
  18. }
  19. cookies = {}
  20. print(os.path.join(os.path.abspath("."),"cookies"))
  21. with open(os.path.join(os.path.abspath("."),"cookies"), 'r', encoding='utf-8') as f:
  22. for item in f.read().split(';'):
  23. name, value = item.split('=', 1)
  24. cookies[name] = value
  25. cookie
  26. response = session.post(log_in_url, headers=headers, data=data, cookies=cookies)
  27. print(response.status_code)
  28. with open('./log.txt','a+',encoding='utf-8') as f:
  29. f.write(str(datetime.datetime.now()) + "\t")
  30. f.write(str(response.status_code) + os.linesep)

JSON 请求示例

  1. import requests
  2. import json
  3. headers = {
  4. "Content-Type": "application/json; charset=UTF-8",
  5. "Referer": "http://jinbao.pinduoduo.com/index?page=5",
  6. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
  7. }
  8. url = "http://jinbao.pinduoduo.com/network/api/common/goodsList"
  9. pyload = {"keyword": "", "sortType": 0, "withCoupon": 0, "categoryId": 16, "pageNumber": 1, "pageSize": 60}
  10. response = requests.post(url, data=json.dumps(pyload), headers=headers).text
  11. print(response

获取代理

  1. import requests
  2. import re
  3. """
  4. 可能是歧梦谷系统不完善,每天访问推广链接添加的梦感不超过10,这里面以不同的ip访问一百次
  5. """
  6. proxies_list = []
  7. # 获取proxies
  8. url = "http://www.xicidaili.com/wn"
  9. headers = {
  10. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
  11. }
  12. response = requests.get(url, headers=headers)
  13. re_list = re.findall(
  14. r'<td>\d+\.\d+\.\d+\.\d+</td>[\s\S]+?<td>\d+</td>', response.text, re.S)
  15. for x in re_list:
  16. ip = re.search('<td>([\s\S]+)</td>[\s\S]+?<td>', x).group(1)
  17. port = re.search('<td>[\s\S]+</td>[\s\S]+?<td>([\s\S]+)</td>', x).group(1)
  18. proxies_list.append("http://" + ip + ":" + port)
  19. url = "http://www.68m.com/?fromuid=566043"
  20. headers = {
  21. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
  22. }
  23. proxies = {}
  24. i = 0
  25. for x in proxies_list:
  26. proxies["https"] = x
  27. response = requests.get(url, proxies=proxies, headers=headers)
  28. print(response)
  29. i += 1
  30. if i > 10:
  31. break

其他问题

cookie问题参考