一、采集公众号文章中的图片

思路:

  1. 使用 requests 下载图片,使用 requests_html 获取并解析文章页面
  2. 使用 DHash 算法计算图片哈希,据此判断相似图片并去重(如重复出现的二维码),图片以 PIL.Image 格式装载
  import os
  import time
  from io import BytesIO

  import requests
  from PIL import Image
  from requests_html import HTMLSession
  # Base directory: default save location for images, parent of the per-article
  # folders, and the place where all.txt is looked up.
  ROOT_FOLDER = ''
  # DHash of every image saved so far; for large or complex workloads this
  # could be kept in redis or a database instead.
  DHASH_LIST = []
  def downimg(url, folder=None):
      '''
      Download one image and save it unless its DHash has been seen before.

      url: download link of the image
      folder: directory to save into; ROOT_FOLDER is used when it is falsy

      The file is named after the current time in milliseconds with a
      ``.jpg`` suffix. Nothing is written for a duplicate image; only a debug
      message is logged. Returns None in both cases.

      Raises whatever requests raises on a network/HTTP failure and whatever
      PIL raises when the payload is not a readable image.
      '''
      # A timeout keeps one stalled connection from blocking the whole run.
      r = requests.get(url, timeout=30)
      # Fail here with a clear HTTP error instead of trying to decode an
      # error page as an image.
      r.raise_for_status()

      image = Image.open(BytesIO(r.content))
      # The hash is taken from the image as downloaded, before any conversion.
      dh = DHash.calculate_hash(image)
      if dh in DHASH_LIST:
          logger.debug('Duplicated image.')
          return

      tmp_name = '{}.jpg'.format(int(time.time() * 1000))
      img_name = os.path.join(folder or ROOT_FOLDER, tmp_name)
      # The target format is JPEG (chosen by the .jpg suffix), which cannot
      # store alpha or palette images such as typical PNG/GIF sources, so
      # those are converted to RGB first.
      if image.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
          image = image.convert('RGB')
      image.save(img_name)
      DHASH_LIST.append(dh)
  def get_img_links(url):
      '''
      Fetch an article page and collect its title and image links.

      url: link of the article page

      Returns a ``(title, down_img_urls)`` tuple:
        - title: text of the first ``#activity-name`` element, or
          DEFAULT_TITLE when that element is missing or its text is empty
        - down_img_urls: the ``data-src`` attribute of every ``img`` inside
          ``#js_article`` that has one, in page order; an empty list when the
          page has no ``#js_article`` element
      '''
      # The session is closed as soon as the page has been parsed.
      with HTMLSession() as session:
          r = session.get(url, timeout=30)

          # first=True yields None instead of a list when nothing matches.
          heading = r.html.find('#activity-name', first=True)
          title = DEFAULT_TITLE
          if heading is not None and heading.text:
              title = heading.text

          content = r.html.find('#js_article', first=True)
          if content is None:
              # Not an article page (removed, blocked, ...): no images.
              return title, []

          down_img_urls = [
              img.attrs['data-src']
              for img in content.find('img')
              if 'data-src' in img.attrs
          ]
      return title, down_img_urls
  def download_article(url):
      '''
      Download every image of one article, synchronously (blocking).

      url: link of the article page

      Images go into a folder under ROOT_FOLDER named after the article
      title. When that folder already exists and the title is not
      DEFAULT_TITLE, the article is treated as already downloaded and
      skipped. Articles that fell back to DEFAULT_TITLE share one folder and
      are always downloaded.
      '''
      title, images = get_img_links(url)
      logger.debug('Total {} images found for {}'.format(len(images), title))

      # A path separator inside the title would turn it into a nested path,
      # so separators are replaced before the title is used as a folder name.
      folder_name = title
      for sep in (os.sep, os.altsep):
          if sep:
              folder_name = folder_name.replace(sep, '_')
      folder = os.path.join(ROOT_FOLDER, folder_name)

      if os.path.exists(folder):
          if title != DEFAULT_TITLE:
              # Already downloaded.
              logger.debug('Pass already downloaded.')
              return
      else:
          # makedirs also creates ROOT_FOLDER itself when it is missing.
          os.makedirs(folder)

      for img in images:
          downimg(img, folder)
  def download_all():
      '''
      Download the images of every article listed in ROOT_FOLDER/all.txt.

      The file is read as one article link per line. Surrounding whitespace
      (including the trailing newline) is stripped and blank lines are
      skipped. Articles are processed one after another in file order.
      '''
      txt = os.path.join(ROOT_FOLDER, 'all.txt')
      with open(txt, 'r') as f:
          # Read everything first so the file is closed before downloading.
          urls = [line.strip() for line in f]
      for u in urls:
          if u:
              download_article(u)