# Web crawler
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import json
import os
import time
import urllib

import requests
def geturltoutf8(srcString):
    """Percent-decode a URL-quoted string and return it as text.

    Escape sequences such as ``%E8%B4%B5`` are turned back into bytes and
    those bytes are interpreted as UTF-8. Characters that are not escaped
    are passed through unchanged.

    Args:
        srcString: The percent-encoded string.

    Returns:
        The decoded text (``str`` on Python 3, ``unicode`` on Python 2).

    Raises:
        UnicodeDecodeError: If the escaped bytes are not valid UTF-8.
    """
    try:
        # Python 3 moved unquote() into urllib.parse.
        from urllib.parse import unquote
    except ImportError:
        # Python 2: unquote() returns a byte string that still has to be
        # decoded explicitly.
        from urllib import unquote
        return unquote(srcString).decode('utf-8')
    # Python 3's unquote() decodes by itself; errors='strict' keeps the
    # "raise on invalid UTF-8" behaviour of the explicit decode above
    # instead of silently inserting replacement characters.
    return unquote(srcString, encoding='utf-8', errors='strict')
# Search keyword sent to the site. The literal is already plain text, so it
# does not need to go through geturltoutf8(). The u prefix keeps it a text
# string on Python 2 as well, where calling .encode() on a non-ASCII byte
# string raises UnicodeDecodeError (implicit ASCII decode).
guiZhou = u"贵州科技"
# UTF-8 bytes of the keyword.
sEncodeMsg = guiZhou.encode('utf-8')
# headers={'cookie':'mysession=aliyungf_tc=AQAAABqomXD6gQ4AwQEN3ayZFay1Ythz; ssuid=5083515037; bannerFlag=undefined; csrfToken=kLXc2RY4psjXXbdf_moqrET0; TYCID=6a1bd9b08f7011e9bb0ae1f5d8c7846a; undefined=6a1bd9b08f7011e9bb0ae1f5d8c7846a; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1560604857; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1560604964; _ga=GA1.2.733340950.1560604857; _gid=GA1.2.1066415593.1560604857; RTYCID=99c213b00d934cfcad6533f10fcf616a; CT_TYCID=4b21c036c56d4536a8e2c8d9ecab1fe3; cloud_token=4176018aad674c48a5f52fead2177574; cloud_utm=35924b8b851248f3af36f292e3308fc5; tyc-user-info=%257B%2522claimEditPoint%2522%253A%25220%2522%252C%2522myAnswerCount%2522%253A%25220%2522%252C%2522myQuestionCount%2522%253A%25220%2522%252C%2522signUp%2522%253A%25220%2522%252C%2522explainPoint%2522%253A%25220%2522%252C%2522privateMessagePointWeb%2522%253A%25220%2522%252C%2522nickname%2522%253A%2522%25E7%2590%25BC%25C2%25B7%25E6%259D%25B0%25E7%2589%25B9%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522privateMessagePoint%2522%253A%25220%2522%252C%2522state%2522%253A%25220%2522%252C%2522announcementPoint%2522%253A%25220%2522%252C%2522isClaim%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522discussCommendCount%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%252247%2522%252C%2522onum%2522%253A%25220%2522%252C%2522claimPoint%2522%253A%25220%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU4NTEyOTU5MiIsImlhdCI6MTU2MDYwNDk1MiwiZXhwIjoxNTkyMTQwOTUyfQ.SUzI3jVP7P9EW-Qb_kpE8GQSb_ZIV46VdfmOMSHqCgOoRDtJN8oRkc9fiZ5LvuvKtav3xfNRwfboEUtnNX0eaw%2522%252C%2522pleaseAnswerCount%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252215585129592%2522%257D; 
# auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU4NTEyOTU5MiIsImlhdCI6MTU2MDYwNDk1MiwiZXhwIjoxNTkyMTQwOTUyfQ.SUzI3jVP7P9EW-Qb_kpE8GQSb_ZIV46VdfmOMSHqCgOoRDtJN8oRkc9fiZ5LvuvKtav3xfNRwfboEUtnNX0eaw'}
# HTTP headers sent with every request. The browser session cookie is a
# secret and expires, so it is read from the TIANYANCHA_COOKIE environment
# variable instead of being stored in the source: copy the full "Cookie"
# request-header value of a logged-in browser session into that variable
# before running the script.
# NOTE(review): when the variable is unset an empty cookie is sent; the site
# presumably answers with a login or verification page in that case.
headers = {'cookie': os.environ.get('TIANYANCHA_COOKIE', '')}
# req=requests.get('https://www.tianyancha.com/search?key=贵州科技&rnd=',headers=headers)
# with open('page1', 'w+') as f:
# f.write(req.text)
# Download search result pages 2..250 and store each one in its own file
# named "22page<N>" in the current working directory.
# NOTE: requests are sent back to back; add a time.sleep() call at the top of
# the loop body if the site starts throttling.
for i in range(2, 251):
    urlPage = 'https://www.tianyancha.com/search/p' + str(i) + '?key=贵州科技&rnd='
    filename = "22page" + str(i)
    print(urlPage)
    try:
        # Without a timeout a stalled connection would block the run forever.
        req = requests.get(urlPage, headers=headers, timeout=30)
        # Treat 4xx/5xx answers as failures instead of saving the error page
        # as if it were a search result.
        req.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failed page and carry on with the next one.
        print('page %d skipped: %s' % (i, exc))
        continue
    # Echo every 100th page to the console as a spot check.
    if i % 100 == 0:
        print(req.text)
    # The page body is stored as one JSON string (escaped, ASCII-only), so it
    # has to be read back with json.load() rather than as raw HTML.
    with open(filename, 'w') as f:
        json.dump(req.text, f)
#
# with open(filename, 'r+') as f:
# json.dump(req.text, f)
# headers={'cookie':'mysession=aliyungf_tc=AQAAABqomXD6gQ4AwQEN3ayZFay1Ythz; ssuid=5083515037; bannerFlag=undefined; csrfToken=kLXc2RY4psjXXbdf_moqrET0; TYCID=6a1bd9b08f7011e9bb0ae1f5d8c7846a; undefined=6a1bd9b08f7011e9bb0ae1f5d8c7846a; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1560604857; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1560604964; _ga=GA1.2.733340950.1560604857; _gid=GA1.2.1066415593.1560604857; RTYCID=99c213b00d934cfcad6533f10fcf616a; CT_TYCID=4b21c036c56d4536a8e2c8d9ecab1fe3; cloud_token=4176018aad674c48a5f52fead2177574; cloud_utm=35924b8b851248f3af36f292e3308fc5; tyc-user-info=%257B%2522claimEditPoint%2522%253A%25220%2522%252C%2522myAnswerCount%2522%253A%25220%2522%252C%2522myQuestionCount%2522%253A%25220%2522%252C%2522signUp%2522%253A%25220%2522%252C%2522explainPoint%2522%253A%25220%2522%252C%2522privateMessagePointWeb%2522%253A%25220%2522%252C%2522nickname%2522%253A%2522%25E7%2590%25BC%25C2%25B7%25E6%259D%25B0%25E7%2589%25B9%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522privateMessagePoint%2522%253A%25220%2522%252C%2522state%2522%253A%25220%2522%252C%2522announcementPoint%2522%253A%25220%2522%252C%2522isClaim%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522discussCommendCount%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%252247%2522%252C%2522onum%2522%253A%25220%2522%252C%2522claimPoint%2522%253A%25220%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU4NTEyOTU5MiIsImlhdCI6MTU2MDYwNDk1MiwiZXhwIjoxNTkyMTQwOTUyfQ.SUzI3jVP7P9EW-Qb_kpE8GQSb_ZIV46VdfmOMSHqCgOoRDtJN8oRkc9fiZ5LvuvKtav3xfNRwfboEUtnNX0eaw%2522%252C%2522pleaseAnswerCount%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252215585129592%2522%257D; 
# auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU4NTEyOTU5MiIsImlhdCI6MTU2MDYwNDk1MiwiZXhwIjoxNTkyMTQwOTUyfQ.SUzI3jVP7P9EW-Qb_kpE8GQSb_ZIV46VdfmOMSHqCgOoRDtJN8oRkc9fiZ5LvuvKtav3xfNRwfboEUtnNX0eaw'}
# req=requests.get(urlPage,headers=headers)
# print(req.text)
# :https://www.tianyancha.com/search/p2?key=贵州科技&rnd=
# https://www.tianyancha.com/search?key=贵州科技&rnd=