关键词:python 爬虫
GET 请求示例(携带本地保存的 cookies 完成每日签到)
# GET sign-in example for uimaker.com: load saved cookies from a local file
# and hit the daily check-in endpoint, then append the result to a log.
import requests
import os
import datetime

log_in_url = "http://www.uimaker.com/member/ubi.php?action=qd&_=1526202830017"
# Browser-like headers; the Referer matches the member page the site expects.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Referer": "http://www.uimaker.com/member/index.php"
}
# Load cookies from a local "cookies" file whose content is the raw
# browser cookie string: "name1=value1; name2=value2; ...".
cookies = {}
print(os.path.join(os.path.abspath("."), "cookies"))
with open(os.path.join(os.path.abspath("."), "cookies"), 'r', encoding='utf-8') as f:
    for item in f.read().split(';'):
        # strip() removes the blank that follows each ';' separator;
        # without it every cookie name after the first gets a leading
        # space and the server will not match it.
        name, value = item.strip().split('=', 1)
        cookies[name] = value
# Use a session so any cookies set by the server persist across requests.
session = requests.session()
response = session.get(log_in_url, headers=headers, cookies=cookies)
print(response.json())
# Append a timestamped status line to the local log file.
with open('./log.txt', 'a+', encoding='utf-8') as f:
    f.write(str(datetime.datetime.now()) + "\t")
    f.write(str(response.status_code) + os.linesep)
POST 请求示例
jq22 签到示例;其实也可以把 cookies 直接放进 headers 的 Cookie 字段里,效果相同。
# POST sign-in example for jq22.com: submit the ASP.NET sign-in form with
# saved cookies, then append the HTTP status to a local log.
import requests
from bs4 import BeautifulSoup as bs  # NOTE(review): unused in this example; kept in case later file sections rely on it
import os
import datetime

session = requests.session()
log_in_url = "http://www.jq22.com/signIn.aspx"
# Browser-like headers; Referer matches the sign-in page itself.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Referer": "http://www.jq22.com/signIn.aspx"
}
# ASP.NET WebForms POST body: the hidden __VIEWSTATE / __EVENTVALIDATION
# fields are copied from the rendered page and must accompany the submit.
data = {
    "Button1": "签 到",
    "__VIEWSTATE": "OQz0gcxiE2uemW6coqrtvy4sKAViSr6o8Xxn4QjA2WoOiQFGhlAYIOEl9am94jQqkvf4+19RlxfIFvNagglpiyBGz4smPulqXEeQ7q4kU0VsdhfifzVlfGrj+DTYHKTR",
    "__VIEWSTATEGENERATOR": "ECDA716A",  # constant across requests
    "__EVENTVALIDATION": "/Up6CRbE1Igwin/Bz2lx2qJo9+keEisPxD70frbfphQPhghu+ViAae9BGgf/NgO1peWGTpB1g0dHGJUyuCP9R53m201mCW3WyGvL6fjdFH58Ds7ZxC7HP3GlDan4Jl1C"
}
# Parse the raw browser cookie string ("name=value; name2=value2") from a
# local "cookies" file into a dict.
cookies = {}
print(os.path.join(os.path.abspath("."), "cookies"))
with open(os.path.join(os.path.abspath("."), "cookies"), 'r', encoding='utf-8') as f:
    for item in f.read().split(';'):
        # strip() drops the space after each ';' so cookie names are clean.
        name, value = item.strip().split('=', 1)
        cookies[name] = value
# BUG FIX: the original had a stray bare identifier `cookie` on its own
# line here, which raised NameError at runtime; removed.
response = session.post(log_in_url, headers=headers, data=data, cookies=cookies)
print(response.status_code)
# Append a timestamped status line to the local log file.
with open('./log.txt', 'a+', encoding='utf-8') as f:
    f.write(str(datetime.datetime.now()) + "\t")
    f.write(str(response.status_code) + os.linesep)
JSON 请求体示例(POST 一个 JSON body 到接口)
# JSON POST example: query Pinduoduo's goods-list API with a JSON body.
import requests
import json

# Content-Type must declare JSON since the body is a serialized dict.
headers = {
    "Content-Type": "application/json; charset=UTF-8",
    "Referer": "http://jinbao.pinduoduo.com/index?page=5",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
}
url = "http://jinbao.pinduoduo.com/network/api/common/goodsList"
# Query parameters sent as the JSON request body.
# (Fixed the original's typo: "pyload" -> "payload".)
payload = {"keyword": "", "sortType": 0, "withCoupon": 0, "categoryId": 16, "pageNumber": 1, "pageSize": 60}
response = requests.post(url, data=json.dumps(payload), headers=headers).text
print(response)
获取免费代理并通过不同代理重复访问推广链接的示例
# Scrape free proxies from xicidaili, then visit a referral link through a
# different proxy each time so repeated visits come from different IPs.
import requests
import re

# The target site caps the referral bonus per IP per day (at most 10), so
# rotate through proxies to make each visit appear to come from a new address.
proxies_list = []
# Step 1: scrape IP:port pairs from xicidaili's listing page.
url = "http://www.xicidaili.com/wn"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}
response = requests.get(url, headers=headers)
# Each match is an IP <td> followed (non-greedily) by its port <td>.
re_list = re.findall(
    r'<td>\d+\.\d+\.\d+\.\d+</td>[\s\S]+?<td>\d+</td>', response.text, re.S)
for row in re_list:
    ip = re.search(r'<td>([\s\S]+)</td>[\s\S]+?<td>', row).group(1)
    port = re.search(r'<td>[\s\S]+</td>[\s\S]+?<td>([\s\S]+)</td>', row).group(1)
    proxies_list.append("http://" + ip + ":" + port)

# Step 2: visit the referral link through each proxy.
url = "http://www.68m.com/?fromuid=566043"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}
# BUG FIX: the target URL is http://, but the original set only the
# "https" proxy key, so requests never actually routed through the proxy.
# Register the proxy for both schemes. Also add a timeout and error
# handling, since free proxies are frequently dead.
for proxy in proxies_list[:11]:  # original counter logic allowed up to 11 visits
    proxies = {"http": proxy, "https": proxy}
    try:
        response = requests.get(url, proxies=proxies, headers=headers, timeout=10)
        print(response)
    except requests.RequestException as exc:
        # Best effort: skip dead/slow proxies and move on to the next one.
        print("proxy failed:", proxy, exc)