一、采集公众号文章中的图片
思路:
- 使用requests下载图片,使用requests_html抓取并解析公众号文章页面
- 使用DHash算法计算图片指纹,指纹完全相同的图片视为重复并跳过(如二维码),图片以PIL.Image格式装载
import os
import time
from io import BytesIO

import requests
from requests_html import HTMLSession
from PIL import Image

# NOTE(review): `logger`, `DHash` and `DEFAULT_TITLE` are used below but are
# not defined in this snippet; presumably they are provided elsewhere in the
# file -- confirm before running this section on its own.

ROOT_FOLDER = ''
# Seconds to wait on an unresponsive server before giving up on a request.
REQUEST_TIMEOUT = 30
# DHash of every image saved so far. For large or complex workloads this
# could be kept in redis or a database instead.
DHASH_LIST = []
# Image modes written to JPEG unchanged; anything else (e.g. RGBA or
# palette images from PNG/GIF sources) is converted to RGB before saving.
_JPEG_MODES = ('L', 'RGB', 'CMYK')


def downimg(url, folder=None):
    """Download one image and save it as a JPEG unless it is a duplicate.

    An image counts as a duplicate when its DHash is already present in
    ``DHASH_LIST``; duplicates are logged and not written to disk.

    url: download link of the image
    folder: destination folder; falls back to ``ROOT_FOLDER`` when empty

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and whatever ``Image.open`` raises for non-image content.
    """
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of trying to decode an error page.
    r.raise_for_status()
    image = Image.open(BytesIO(r.content))
    dh = DHash.calculate_hash(image)
    if dh in DHASH_LIST:
        logger.debug('Duplicated image.')
        return
    # Millisecond timestamp is used as the file name.
    tmp_name = '{}.jpg'.format(int(time.time() * 1000))
    img_name = os.path.join(folder or ROOT_FOLDER, tmp_name)
    if image.mode not in _JPEG_MODES:
        # JPEG cannot store alpha/palette data; saving would raise otherwise.
        image = image.convert('RGB')
    image.save(img_name)
    DHASH_LIST.append(dh)


def get_img_links(url):
    """Fetch an article page and collect the image links it contains.

    url: link of the official-account article

    Returns ``(title, down_img_urls)``. ``title`` is the text of the
    ``#activity-name`` element, or ``DEFAULT_TITLE`` when that element is
    missing or empty. ``down_img_urls`` holds the ``data-src`` attribute of
    every ``img`` inside ``#js_article``; it is empty when the page has no
    such container.
    """
    # The context manager closes the session's connections when done.
    with HTMLSession() as session:
        r = session.get(url, timeout=REQUEST_TIMEOUT)
        title_node = r.html.find('#activity-name', first=True)
        content = r.html.find('#js_article', first=True)

    title = DEFAULT_TITLE
    if title_node is not None and title_node.text:
        title = title_node.text

    if content is None:
        # Page without an article body: nothing to download.
        return title, []

    down_img_urls = [
        img.attrs['data-src']
        for img in content.find('img')
        if 'data-src' in img.attrs
    ]
    return title, down_img_urls


def _folder_name(title):
    """Return *title* with path separators replaced so it names one folder."""
    for sep in (os.sep, os.altsep):
        if sep:
            title = title.replace(sep, '_')
    return title


def download_article(url):
    """Download every image of one article (synchronous, blocking).

    url: link of the official-account article

    Images go into a folder named after the article title under
    ``ROOT_FOLDER``. If that folder already exists the article is treated
    as already downloaded and skipped, except for the shared
    ``DEFAULT_TITLE`` folder, which is always downloaded into.
    """
    title, images = get_img_links(url)
    logger.debug('Total {} images found for {}'.format(len(images), title))
    folder = os.path.join(ROOT_FOLDER, _folder_name(title))
    if os.path.exists(folder):
        if title != DEFAULT_TITLE:
            # Already downloaded.
            logger.debug('Pass already downloaded.')
            return
    else:
        # makedirs also creates ROOT_FOLDER itself when it is missing.
        os.makedirs(folder)
    for img in images:
        downimg(img, folder)


def download_all():
    """Download the images of every article listed in ``ROOT_FOLDER/all.txt``.

    The file holds one article URL per line; surrounding whitespace is
    stripped and blank lines are ignored.
    """
    txt = os.path.join(ROOT_FOLDER, 'all.txt')
    with open(txt, 'r') as f:
        # Strip the trailing newline that would otherwise end up in the URL.
        urls = [line.strip() for line in f]
    for u in urls:
        if u:
            download_article(u)
