# 爬取图片作业 (image-scraping assignment)
# 代码 (code)
import requests
from bs4 import BeautifulSoup
# Pagination URL pattern: page 1 is the bare listing URL; requesting
# 'https://sc.chinaz.com/tupian/index_1.html' does not work. Pages 2+ use index_N.html:
'https://sc.chinaz.com/tupian/'
'https://sc.chinaz.com/tupian/index_2.html'
'https://sc.chinaz.com/tupian/index_3.html'
class TuPian:
    """Download the images shown on the sc.chinaz.com/tupian listing pages.

    Workflow (see ``main``): ask for a page count, fetch each listing page,
    collect ``{'name', 'url'}`` records from the page markup, then download
    every collected image into the current working directory.
    """

    # Browser-like User-Agent sent with every request (pages and images).
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    }
    # Page 1 lives at the bare listing URL; pages 2+ use the index_N template.
    FIRST_PAGE_URL = 'https://sc.chinaz.com/tupian/'
    PAGE_URL_TEMPLATE = 'https://sc.chinaz.com/tupian/index_{}.html'
    # Seconds to wait for a response; without it a stalled server hangs the run.
    TIMEOUT = 10
    # Characters that are invalid in file names on common file systems.
    _UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    @classmethod
    def _page_url(cls, page):
        """Return the listing URL for 1-based page number *page*."""
        if page == 1:
            return cls.FIRST_PAGE_URL
        return cls.PAGE_URL_TEMPLATE.format(page)

    @classmethod
    def _safe_filename(cls, name, fallback):
        """Return *name* with unsafe characters replaced, or *fallback* if empty."""
        cleaned = name.translate(cls._UNSAFE_FILENAME_CHARS).strip(' .')
        return cleaned or fallback

    def res_url(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-success HTTP status.
        """
        res = requests.get(url, headers=self.HEADERS, timeout=self.TIMEOUT)
        # Fail loudly instead of handing an error page to the parser.
        res.raise_for_status()
        return res.content.decode('utf-8')

    def parse_html(self, html, li):
        """Extract image records from a listing page and append them to *li*.

        Each appended record is a dict with keys ``'name'`` (the image's
        ``alt`` text) and ``'url'`` (built from the ``src2`` attribute).
        Entries that lack an ``<img>`` or a ``src2`` value are skipped, and a
        page without the ``div#container`` element contributes nothing.
        """
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', id="container")
        if container is None:
            return
        anchors = container.find_all('a')
        # Only every third anchor is inspected — presumably each gallery entry
        # contributes three anchors; verify against the current page markup.
        for anchor in anchors[::3]:
            img = anchor.find('img')
            if img is None:
                continue
            # NOTE(review): 'src2' looks like the site's lazy-load attribute — confirm.
            src = img.get('src2')
            if not src:
                continue
            # Protocol-relative values need a scheme; leave absolute URLs alone.
            if src.startswith(('http://', 'https://')):
                url = src
            else:
                url = 'https:' + src
            li.append({'name': img.get('alt', ''), 'url': url})

    def write_data(self, li):
        """Download every record in *li* to ``<name>.png`` in the working directory.

        A failed download is reported and skipped so that one bad image does
        not abort the rest of the batch. Records sharing the same name
        overwrite each other, as before.
        """
        for index, item in enumerate(li, start=1):
            filename = self._safe_filename(item['name'], f'image_{index}') + '.png'
            try:
                res = requests.get(item['url'], headers=self.HEADERS, timeout=self.TIMEOUT)
                res.raise_for_status()
            except requests.RequestException as exc:
                print(f"第{index}张爬取失败: {exc}")
                continue
            with open(filename, 'wb') as f:
                f.write(res.content)
            print(f"第{index}张爬取完成!")

    def main(self):
        """Prompt for a page count, scrape that many listing pages, save the images."""
        li = []
        num = int(input("请输入你要爬取的页数:"))
        for page in range(1, num + 1):
            self.parse_html(self.res_url(self._page_url(page)), li)
        self.write_data(li)
# Script entry point: build the scraper and start the interactive crawl.
if __name__ == '__main__':
    TuPian().main()