1. import requests
    2. from bs4 import BeautifulSoup
    3. from hashlib import md5
    4. import os
    class JXAriUniversity(object):
        """Download the images embedded in the articles of a news list page.

        The crawler reads the list page at ``baseURL``, follows every article
        link found under ``.news_list li`` and stores each image found under
        ``.wp_articlecontent`` in a folder named after the article title.
        """

        def __init__(self):
            """Set the entry URL, site root, request headers and timeout."""
            self.baseURL = 'http://www.jxau.edu.cn/10/list.htm'
            self.edu = 'http://www.jxau.edu.cn'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Mobile Safari/537.36'
            }
            # Seconds to wait on a single HTTP request; without a timeout
            # requests can block forever on an unresponsive server.
            self.timeout = 30

        def respose(self, url):
            """Fetch *url* and return the response body decoded as UTF-8 text.

            Raises:
                requests.RequestException: if the request fails or times out.
                UnicodeDecodeError: if the body is not valid UTF-8.
            """
            response = requests.get(url, headers=self.headers,
                                    timeout=self.timeout)
            return response.content.decode()

        def _list_articles(self):
            """Return ``{'href', 'title'}`` dicts for the entries of the list page."""
            from urllib.parse import urljoin

            soup = BeautifulSoup(self.respose(self.baseURL), 'lxml')
            articles = []
            for entry in soup.select('.news_list li'):
                anchor = entry.select_one('a')
                # Skip list items that carry no usable link.
                if anchor is None or not anchor.get('href'):
                    continue
                articles.append({
                    # urljoin resolves relative links against the list page
                    # and leaves already-absolute links untouched.
                    'href': urljoin(self.baseURL, anchor.get('href')),
                    'title': anchor.get('title'),
                })
            return articles

        def _article_image_urls(self, article_url):
            """Return the absolute URLs of the images inside one article page."""
            from urllib.parse import urljoin

            soup = BeautifulSoup(self.respose(article_url), 'lxml')
            sources = (img.get('src')
                       for img in soup.select('.wp_articlecontent img'))
            # <img> tags without a src attribute are ignored.
            return [urljoin(article_url, src) for src in sources if src]

        def getRequest(self):
            """Crawl the list page and save every image of every listed article.

            An article that cannot be fetched is reported and skipped so the
            remaining articles are still processed.
            """
            for article in self._list_articles():
                try:
                    image_urls = self._article_image_urls(article['href'])
                except requests.RequestException as e:
                    print('错误:' + str(e))
                    continue
                folder = 'edu/江西农业大学/{}'.format(article['title'])
                for image_url in image_urls:
                    self.saveImage(image_url, folder)

        def _store_image(self, data, image_url, file_path):
            """Write *data* into *file_path*, named by its MD5 digest.

            The extension is taken from the path component of *image_url*, so
            a query string never ends up in the file name.

            Returns:
                The path of the written file, or ``None`` if writing failed
                (the error is printed, not raised).
            """
            from urllib.parse import urlparse

            try:
                if not os.path.exists(file_path):
                    print('文件夹', file_path, '不存在,重新建立')
                    os.makedirs(file_path, exist_ok=True)
                # Image extension, e.g. ".jpg".
                file_suffix = os.path.splitext(urlparse(image_url).path)[1]
                # Full file name including the target folder.
                filename = '{}/{}{}'.format(file_path, md5(data).hexdigest(),
                                            file_suffix)
                with open(filename, 'wb') as f:
                    f.write(data)
                return filename
            except IOError as e:
                print('文件操作失败:' + str(e))
            except Exception as e:
                print('错误:' + str(e))
            return None

        def saveImage(self, image_url, file_path):
            """Download *image_url* and store it inside the folder *file_path*.

            Returns:
                The path of the saved file, or ``None`` when the download
                failed, the server did not answer 200, or the file could not
                be written.
            """
            try:
                response = requests.get(image_url, headers=self.headers,
                                        timeout=self.timeout)
            except requests.RequestException as e:
                print('错误:' + str(e))
                return None
            if response.status_code != 200:
                return None
            return self._store_image(response.content, image_url, file_path)

        def run(self):
            """Run the whole crawl."""
            self.getRequest()
    if __name__ == '__main__':
        # Start the crawl only when the file is executed as a script,
        # not when it is imported as a module.
        JXAriUniversity().run()

    截屏2020-11-11 15.48.50.png