1.作业一
- 目标网站:https://www.dmzj.com/
- 爬取要求:
- 1、到这个网站上面找一张自己喜欢的漫画里面的随便一张图片的url
- 2、把图片爬取下来,保存到本地
代码:
from fake_useragent import UserAgent
import re
import requests
import urllib.request
# Fetch the dmzj.com front page and download every lazy-loaded comic image.
url = 'https://www.dmzj.com/'
headers = {
    # Random desktop-Chrome UA so the request looks like a normal browser.
    'User-Agent': UserAgent().chrome
}
resp = requests.get(url, headers=headers)
print(resp.text)
# Grab the src of every lazily-loaded <img>; (.*?) is non-greedy so each
# match stops at the closing quote of its own attribute.
urls = re.findall(r'class="lazy" src="(.*?)"', resp.text)
for index, img_url in enumerate(urls):
    try:
        # Save as 0.jpg, 1.jpg, ... in the current working directory.
        urllib.request.urlretrieve(img_url, f'{index}.jpg')
    except OSError:
        # Best-effort download: skip any image that fails to retrieve.
        pass
2.作业二
- 目标网站:https://www.cnblogs.com/
- 爬取要求:
- 1、输入要搜索的内容
- 2、翻页爬取相关页面html代码
- 3、保存到本地
代码:
from fake_useragent import UserAgent
import re
import requests
import urllib.request
# Search zzk.cnblogs.com for a user-supplied keyword and save each result
# page's HTML to <page>.html in the current directory.
headers = {
    'User-Agent': UserAgent().chrome,
    # Session cookie copied from a logged-in browser; the search endpoint
    # rejects anonymous/robot requests without it.
    'cookie': '_ga=GA1.2.523319485.1628591907; gads=ID=118a91cb5c641f9e:T=1628591906:S=ALNI_MapEtp4QhQrLiO4DU0oVCn7KR6EuA; UM_distinctid=17cf50e9c70c08-08566e116b026-561a135a-144000-17cf50e9c71bd5; gpi=00000000-0000-0000-0000-000000000000; sc_is_visitor_unique=rx10890287.1643861383.DD3E4393E4034FCB677B1FBBC8CFAB4D.1.1.1.1.1.1.1.1.1-12123033.1637376428.1.1.1.1.1.1.1.1.1; _gid=GA1.2.1545364938.1644808997; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1644499169,1644548173,1644588972,1644845942; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1644845942; .AspNetCore.Session=CfDJ8GsLOKiGtk1Au0UP1SouGdUvpA980ss2riFovCgWH6iklD8ykgMJwZG7U4Xfo69ck8WnRP%2F9HFiwpiHkQhRNRK07HdzVVgg5ZcFSOb8cwbRoY4gsXX4%2F47B%2FonNq3PmkoFhf9tZvuBdw9tJISr2DtXxtbnM3%2FXDaRECZJoCmYUAz; Hm_lvt_eaa57ca47dacb4ad4f5a257001a3457c=1644845962; utma=59123430.523319485.1628591907.1644845962.1644845962.1; utmc=59123430; utmz=59123430.1644845962.1.1.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; NotRobot=CfDJ8GsLOKiGtk1Au0UP1SouGdX5NcTiyWMo9Pd8pxu4uYUp7-nXO1w49RxvsR0kqxG1huN5aQVrM31-w7xxu8CkbR9gppSiyaOs3QxWVUjcWgdHZ4O3h0TuWAAQhbJRQaO7iw; utmb=59123430.5.10.1644845962; Hm_lpvt_eaa57ca47dacb4ad4f5a257001a3457c=1644846507'
}
startpage = int(input('请输入爬取起始页'))
endpage = int(input('请输入爬取终止页'))
inp = input("请输入")
# endpage is the inclusive "终止页" the user typed, so range stops at
# endpage + 1 to actually fetch that last page.
for i in range(startpage, endpage + 1):
    # Let requests build and percent-encode the query string, so keywords
    # containing spaces or non-ASCII characters are transmitted correctly.
    resp = requests.get(
        'https://zzk.cnblogs.com/s/blogpost',
        params={'Keywords': inp, 'pageindex': i},
        headers=headers,
    )
    with open(f'{i}.html', 'w', encoding='utf-8') as f:
        f.write(resp.text)