一、采集公众号文章中的图片
思路:
- 使用requests下载图片数据,使用requests_html(HTMLSession)获取并解析图文页面
- 使用DHash算法计算图片相似度去重(如二维码),图片以PIL.Image格式装载
import os
import time
from io import BytesIO

import requests
from PIL import Image
from requests_html import HTMLSession
# Base directory: article folders are created under it, all.txt is read from
# it, and downimg() saves into it when no folder is given. An empty string
# makes those paths relative to the current working directory.
ROOT_FOLDER: str = ''
# Records the DHash of every image saved so far; for large or complex
# workloads this could be kept in redis or a database instead.
DHASH_LIST: list = []
def downimg(url, folder=None, timeout=30):
    """Download one image and save it unless its DHash was already recorded.

    The hash of the downloaded image is looked up in DHASH_LIST; a hit means
    the image is treated as a duplicate and nothing is written. Otherwise the
    image is saved as ``<millisecond timestamp>.jpg`` and its hash is
    appended to DHASH_LIST.

    Args:
        url: Download link of the image.
        folder: Directory to save into; ROOT_FOLDER is used when falsy.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps one stalled connection from blocking the whole batch.
    r = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here rather than handing an error page to PIL.
    r.raise_for_status()

    image = Image.open(BytesIO(r.content))
    dh = DHash.calculate_hash(image)
    if dh in DHASH_LIST:
        logger.debug('Duplicated image.')
        return

    tmp_name = '{}.jpg'.format(int(time.time() * 1000))
    img_name = os.path.join(folder or ROOT_FOLDER, tmp_name)
    # The file is always written as JPEG, which cannot store alpha or palette
    # modes (typical for PNG/GIF sources), so convert those to RGB first.
    if image.mode not in ('RGB', 'L'):
        image = image.convert('RGB')
    image.save(img_name)
    DHASH_LIST.append(dh)
def get_img_links(url):
    """Fetch an article page and collect its title and image links.

    Args:
        url: Link of the official-account article.

    Returns:
        A ``(title, urls)`` tuple. ``title`` is the text of the
        ``#activity-name`` element, or DEFAULT_TITLE when that element is
        missing or empty. ``urls`` lists the non-empty ``data-src`` values of
        the ``img`` elements inside ``#js_article``; it is empty when the page
        has no such container.
    """
    # The context manager closes the session once the page has been parsed.
    with HTMLSession() as session:
        r = session.get(url)

        title = DEFAULT_TITLE
        title_node = r.html.find('#activity-name', first=True)
        if title_node is not None and title_node.text:
            title = title_node.text

        content = r.html.find('#js_article', first=True)
        if content is None:
            # No article container on the page: nothing to download.
            return title, []

        # Only images carrying a non-empty data-src attribute are collected.
        down_img_urls = [
            img.attrs['data-src']
            for img in content.find('img')
            if img.attrs.get('data-src')
        ]
    return title, down_img_urls
def _safe_folder_name(title):
    """Return title with path separators replaced so it names a single folder."""
    for sep in (os.sep, os.altsep):
        if sep:
            title = title.replace(sep, '_')
    return title


def download_article(url):
    """Download every image of one article, synchronously (blocking).

    Images are saved into a folder named after the article title under
    ROOT_FOLDER. If that folder already exists and the title is not
    DEFAULT_TITLE, the article is treated as already downloaded and skipped.
    Articles that fell back to DEFAULT_TITLE share one folder, so they are
    always downloaded.

    Args:
        url: Link of the official-account article.
    """
    title, images = get_img_links(url)
    logger.debug('Total {} images found for {}'.format(len(images), title))

    # A separator inside the title would otherwise turn it into a nested path.
    folder = os.path.join(ROOT_FOLDER, _safe_folder_name(title))
    if os.path.exists(folder):
        if title != DEFAULT_TITLE:
            # Already downloaded.
            logger.debug('Pass already downloaded.')
            return
    else:
        # makedirs also creates ROOT_FOLDER itself when it does not exist yet.
        os.makedirs(folder, exist_ok=True)

    for img in images:
        downimg(img, folder)
def download_all():
    """Download the images of every article listed in ROOT_FOLDER/all.txt.

    The file is read as one article URL per line. Surrounding whitespace is
    stripped and blank lines are skipped.
    """
    txt = os.path.join(ROOT_FOLDER, 'all.txt')
    with open(txt, 'r') as f:
        # Iterating a file keeps each line's trailing newline; strip it so it
        # is not sent as part of the URL.
        urls = [line.strip() for line in f]
    for u in urls:
        if u:
            download_article(u)