1. Homework 1

  • Target site: https://www.dmzj.com/
  • Scraping requirements:
    • 1. On the site, find the URL of any image from a comic you like
    • 2. Download that image and save it locally


Code
import requests
from lxml import etree
import os

url = 'https://www.dmzj.com/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36',
    'Referer': 'https://www.dmzj.com/'
}
response = requests.get(url, headers=headers).content
tree = etree.HTML(response)
img_list = tree.xpath('//ul[@class="update_con"]/li')
# Create a folder for the downloads
if not os.path.exists('./comics'):
    os.mkdir('./comics')
for img in img_list:
    # Extract the image URL (protocol-relative, so prepend https:)
    src = 'https:' + img.xpath('./a/img/@src')[0]
    # Extract the title to use as the filename
    name = img.xpath('./a/@title')[0] + '.jpg'
    # Request the image bytes and write them to disk
    img_data = requests.get(url=src, headers=headers).content
    img_path = 'comics/' + name
    with open(img_path, 'wb') as fp:
        fp.write(img_data)
    print(name, 'downloaded successfully')
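One weakness of the loop above: a 404 or an anti-hotlinking error page gets written to disk as if it were a valid image, and a hung connection blocks forever. A minimal hardening sketch (the download_image helper name and the timeout value are my own additions, not part of the original assignment):

import os
import requests

def download_image(src: str, path: str, headers: dict) -> None:
    """Fetch one image and write it to disk, failing loudly on HTTP errors."""
    resp = requests.get(src, headers=headers, timeout=10)  # assumed timeout
    resp.raise_for_status()  # surface 4xx/5xx instead of saving an error page
    with open(path, 'wb') as fp:
        fp.write(resp.content)

os.makedirs('comics', exist_ok=True)  # replaces the exists()/mkdir() pair

Calling download_image(src, 'comics/' + name, headers) inside the for loop then gives each image the same behavior as before, minus the silent failure modes.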

2. Homework 2

  • Target site: https://www.cnblogs.com/
  • Scraping requirements:
    • 1. Read a search keyword from user input
    • 2. Scrape the HTML of the search result pages, page by page
    • 3. Save each page locally

Code
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36',
    'cookie': '_ga=GA1.2.523319485.1628591907; '
              '__gads=ID=118a91cb5c641f9e:T=1628591906:S=ALNI_MapEtp4QhQrLiO4DU0oVCn7KR6EuA; '
              'UM_distinctid=17cf50e9c70c08-08566e116b026-561a135a-144000-17cf50e9c71bd5; '
              '__gpi=00000000-0000-0000-0000-000000000000; '
              'sc_is_visitor_unique=rx10890287.1643861383.DD3E4393E4034FCB677B1FBBC8CFAB4D.1.1.1.1.1.1.1.1.1-12123033'
              '.1637376428.1.1.1.1.1.1.1.1.1; _gid=GA1.2.1545364938.1644808997; '
              'Hm_lvt_866c9be12d4a814454792b1fd0fed295=1644499169,1644548173,1644588972,1644845942; '
              'Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1644845942; '
              '.AspNetCore.Session=CfDJ8GsLOKiGtk1Au0UP1SouGdUvpA980ss2riFovCgWH6iklD8ykgMJwZG7U4Xfo69ck8WnRP'
              '%2F9HFiwpiHkQhRNRK07HdzVVgg5ZcFSOb8cwbRoY4gsXX4%2F47B%2FonNq3PmkoFhf9tZvuBdw9tJISr2DtXxtbnM3'
              '%2FXDaRECZJoCmYUAz; Hm_lvt_eaa57ca47dacb4ad4f5a257001a3457c=1644845962; '
              '__utma=59123430.523319485.1628591907.1644845962.1644845962.1; __utmc=59123430; '
              '__utmz=59123430.1644845962.1.1.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; '
              'NotRobot=CfDJ8GsLOKiGtk1Au0UP1SouGdX5NcTiyWMo9Pd8pxu4uYUp7-nXO1w49RxvsR0kqxG1huN5aQVrM31'
              '-w7xxu8CkbR9gppSiyaOs3QxWVUjcWgdHZ4O3h0TuWAAQhbJRQaO7iw; __utmb=59123430.5.10.1644845962; '
              'Hm_lpvt_eaa57ca47dacb4ad4f5a257001a3457c=1644846507'
}
word = input('Enter the search keyword: ')
start = int(input('Enter the start page: '))
end = int(input('Enter the end page: '))
for i in range(start, end + 1):
    url = f'https://zzk.cnblogs.com/s/blogpost?Keywords={word}&pageindex={i}'
    # .text already decodes the response body into a string
    response = requests.get(url, headers=headers).text
    # print(response)
    with open(f'{word}_page_{i}.html', 'w', encoding='utf-8') as f:
        f.write(response)
    print('Scraped successfully')
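One caveat in the loop above: the keyword is interpolated straight into the URL, so a keyword with spaces or non-ASCII characters relies on requests to escape it on the way out. A safer variant (a sketch, not part of the original homework; the timeout is my own addition) passes the query as a dict via the params argument and lets requests build and percent-encode the query string itself:

import requests

for i in range(start, end + 1):
    response = requests.get(
        'https://zzk.cnblogs.com/s/blogpost',
        params={'Keywords': word, 'pageindex': i},  # encoded automatically
        headers=headers,
        timeout=10,  # assumed timeout, not in the original
    )
    response.raise_for_status()  # fail loudly on HTTP errors
    with open(f'{word}_page_{i}.html', 'w', encoding='utf-8') as f:
        f.write(response.text)

This produces the same requests as the f-string version for plain ASCII keywords, but stays correct for anything the user might type.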