爬取图片作业
爬取图片作业

代码

import requests
from bs4 import BeautifulSoup
# Pagination URL pattern: the first listing page has no index suffix.
# 'https://sc.chinaz.com/tupian/index_1.html' does not work; the page URLs are:
'https://sc.chinaz.com/tupian/'
'https://sc.chinaz.com/tupian/index_2.html'
'https://sc.chinaz.com/tupian/index_3.html'
class TuPian:
    """Download pictures from the sc.chinaz.com picture listing pages.

    Workflow (see ``main``): fetch each listing page, collect the picture
    name and URL of every entry, then download each picture into the
    current working directory as ``<name>.png``.
    """

    # The first listing page has no index suffix; later pages use index_N.html.
    BASE_URL = 'https://sc.chinaz.com/tupian/'
    PAGE_URL = 'https://sc.chinaz.com/tupian/index_{}.html'
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    }
    # Seconds to wait for the server; without a timeout requests can block forever.
    TIMEOUT = 10
    # Characters that are not allowed in file names on common file systems.
    _INVALID_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    @staticmethod
    def _page_url(page):
        """Return the listing URL for the 1-based page number *page*."""
        if page == 1:
            return TuPian.BASE_URL
        return TuPian.PAGE_URL.format(page)

    @staticmethod
    def _absolute_url(src):
        """Turn an image reference into an absolute https URL.

        Scheme-relative references (``//host/path``) get the ``https``
        scheme, already-absolute URLs are returned unchanged, and relative
        paths are resolved against the listing base URL.
        """
        from urllib.parse import urljoin

        return urljoin(TuPian.BASE_URL, src)

    @staticmethod
    def _safe_filename(name, fallback='image'):
        """Replace characters that are invalid in file names with ``_``.

        Returns *fallback* when nothing usable is left of *name*.
        """
        cleaned = name.translate(TuPian._INVALID_FILENAME_CHARS).strip()
        return cleaned or fallback

    def res_url(self, url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Raises:
            requests.RequestException: on connection problems, timeouts or
                an HTTP error status.
        """
        res = requests.get(url, headers=self.HEADERS, timeout=self.TIMEOUT)
        # Fail loudly here instead of handing an error page to the parser.
        res.raise_for_status()
        return res.content.decode('utf-8')

    def parse_html(self, html, li):
        """Extract picture entries from a listing page and append them to *li*.

        Each appended item is a dict with the keys ``'name'`` (the image's
        ``alt`` text) and ``'url'`` (absolute URL built from the image's
        ``src2`` attribute). *li* is modified in place; nothing is returned.
        Entries without an image, name or ``src2`` attribute are skipped.
        """
        soup = BeautifulSoup(html, 'lxml')
        container = soup.find('div', id="container")
        if container is None:
            # Page layout not as expected (or an empty page): nothing to collect.
            return
        # Only every third anchor is inspected, as in the original layout
        # assumption that each entry contributes three <a> tags.
        for anchor in container.find_all('a')[::3]:
            img = anchor.find('img')
            if img is None:
                continue
            name = img.get('alt')
            src = img.get('src2')
            if not name or not src:
                continue
            li.append({'name': name, 'url': self._absolute_url(src)})

    def write_data(self, li):
        """Download every item in *li* and save it as ``<name>.png``.

        A failed download is reported and skipped so that one bad picture
        does not abort the remaining ones.
        """
        for count, item in enumerate(li, start=1):
            try:
                res = requests.get(
                    item['url'], headers=self.HEADERS, timeout=self.TIMEOUT
                )
                res.raise_for_status()
            except requests.RequestException as exc:
                print(f"第{count}张爬取失败：{exc}")
                continue
            # NOTE: the '.png' extension is fixed regardless of the real format.
            filename = self._safe_filename(item['name']) + '.png'
            with open(filename, 'wb') as f:
                f.write(res.content)
            print(f"第{count}张爬取完成！")

    def main(self):
        """Ask for the number of pages, scrape them and download the pictures."""
        li = []
        num = int(input("请输入你要爬取的页数："))
        for page in range(1, num + 1):
            html = self.res_url(self._page_url(page))
            self.parse_html(html, li)
        self.write_data(li)
if __name__ == '__main__':
    # Start the scraper only when this file is run as a script.
    TuPian().main()
爬虫作业

王家驹-第六次作业-第十二期爬虫

爬取图片作业

代码

爬取的内容

图片实例