使用正则表达式爬取网页图片

  import re
  from urllib.parse import urljoin

  import requests

  # Download every root-relative .jpg referenced by an <img> tag on PAGE_URL
  # and save the images in the current working directory as 1.jpg, 2.jpg, ...

  PAGE_URL = "http://www.tipdm.com/tipdm/index.html"
  SITE_ROOT = "http://www.tipdm.com"  # the matched src paths start with "/"
  REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without one

  # Capture only the src value. The previous approach matched the whole tag
  # prefix and removed it with str.lstrip('<img src="'), which strips a *set
  # of characters* rather than a prefix and only worked because the path
  # happens to start with "/". Using [^"]* also keeps the match inside a
  # single attribute value instead of running on to a later ".jpg".
  IMG_SRC_PATTERN = re.compile(r'<img src="(/[^"]*\.jpg)"')

  page = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
  page.raise_for_status()  # fail loudly instead of scraping an error page
  image_paths = IMG_SRC_PATTERN.findall(page.text)
  print(image_paths)

  # Fetch each image and store it locally; the file number is the image's
  # 1-based position in the list of matches.
  for index, image_path in enumerate(image_paths, start=1):
      image_url = urljoin(SITE_ROOT, image_path)
      print(image_url)
      image = requests.get(image_url, timeout=REQUEST_TIMEOUT)
      if not image.ok:
          # Do not write an HTML error body to disk under a .jpg name.
          print(f"skipped {image_url}: HTTP {image.status_code}")
          continue
      file_name = f"{index}.jpg"
      with open(file_name, "wb") as f:
          f.write(image.content)