抖音分享url 获取,和签名算法分析见: https://www.yuque.com/dakuohao/initit/dqbv72
页面分析
- 抖音的签名算法经常变,不适合使用api去爬取数据
 - 既然是js版的网页,可以使用google的无头浏览器(headless browser)技术,渲染页面,自动拿到数据,等于是模拟了一个浏览器的全部行为。
 - 该方式模拟的是真实浏览器的行为,网站很难对其做反爬虫处理
爬取过程
 
- 启动浏览器打开网址 https://www.iesdouyin.com/share/user/97241835037
 - 渲染页面,然后请求用户信息数据,请求用户视频数据,请求用户喜欢的视频数据
 - 这里选用户的视频数据接口,做个例子
 - ajax请求https://www.iesdouyin.com/web/api/v2/aweme/post这个接口,得到的是json数据
 

 
响应数据分析
{
  "status_code": 0, //状态码 0表示请求正常
  "aweme_list": [ //视频列表
    {
      "aweme_id": "6708340636640218379", //视频id
      "desc": "一定要看到最后系列 #弹指变身术 #变身", //描述
      "cha_list": null,
      "video": {
        "play_addr": { //视频播放地址
          "uri": "v0200fe00000bkct20j6j2qrpmnl55tg",
          "url_list": [
            "https://aweme.snssdk.com/aweme/v1/play/?video_id=v0200fe00000bkct20j6j2qrpmnl55tg&line=0&ratio=540p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1",
            "https://api.amemv.com/aweme/v1/play/?video_id=v0200fe00000bkct20j6j2qrpmnl55tg&line=1&ratio=540p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1"
          ]
        },
        "cover": { //动态图地址
          "uri": "2943900017ae4444916c5",
          "url_list": [
            "https://p9-dy.byteimg.com/aweme/300x400/2943900017ae4444916c5.jpeg",
            "https://p1-dy.byteimg.com/aweme/300x400/2943900017ae4444916c5.jpeg",
            "https://p3-dy.byteimg.com/aweme/300x400/2943900017ae4444916c5.jpeg"
          ]
        },
        "height": 1280,
        "width": 720,
        "dynamic_cover": { //1280*720动态图
          "uri": "2935e0004a520262ef032",
          "url_list": [
            "https://p9-dy.byteimg.com/obj/2935e0004a520262ef032",
            "https://p1-dy.byteimg.com/obj/2935e0004a520262ef032",
            "https://p3-dy.byteimg.com/obj/2935e0004a520262ef032"
          ]
        },
        "origin_cover": { //封面图 大图
          "uri": "large/29299000a5bd1adde07fb",
          "url_list": [
            "http://p9-dy.byteimg.com/large/29299000a5bd1adde07fb.jpeg",
            "http://p1-dy.byteimg.com/large/29299000a5bd1adde07fb.jpeg",
            "http://p3-dy.byteimg.com/large/29299000a5bd1adde07fb.jpeg"
          ]
        },
        "ratio": "540p",
        "download_addr": { //下载地址
          "uri": "v0200fe00000bkct20j6j2qrpmnl55tg",
          "url_list": [
            "https://aweme.snssdk.com/aweme/v1/play/?video_id=v0200fe00000bkct20j6j2qrpmnl55tg&line=0&ratio=540p&watermark=1&media_type=4&vr_type=0&improve_bitrate=0&logo_name=aweme",
            "https://api.amemv.com/aweme/v1/play/?video_id=v0200fe00000bkct20j6j2qrpmnl55tg&line=1&ratio=540p&watermark=1&media_type=4&vr_type=0&improve_bitrate=0&logo_name=aweme"
          ]
        },
        "has_watermark": true,
        "bit_rate": null,
        "duration": 10700
      },
      "statistics": {
        "aweme_id": "6708340636640218379",
        "comment_count": 43,
        "digg_count": 1316,
        "play_count": 0,
        "share_count": 22,
        "forward_count": 0
      },
      "text_extra": [ //参与的活动或专题
        {"start": 10, "end": 16, "type": 1, "hashtag_name": "弹指变身术", "hashtag_id": 1636478480196621},
        {"start": 17, "end": 20, "type": 1, "hashtag_name": "变身", "hashtag_id": 1555318033810433}
      ],
      "video_labels": null,
      "aweme_type": 4,
      "image_infos": null,
      "position": null,
      "uniqid_position": null,
      "comment_list": null,
      "geofencing": null,
      "video_text": null,
      "label_top_text": null,
      "promotions": null,
      "long_video": null
    },
    //...多个对象
  ],
  "max_cursor": 1558603429000, //最大游标,默认是0,然后本次请求得到的值是下一次请求的值
  "min_cursor": 1561972997000, //暂时无用
  "has_more": true //true表示还有更多数据,继续请求下一页
}
代码实现
1. 安装puppeteer
--环境要求 nodejs 8版本以上
npm init 项目名
cd 项目名
npm install --save puppeteer
安装过程会自动下载一个chrome浏览器到node_modules目录,大概140M
网速慢的可以使用
yarn add puppeteer
或者使用
cnpm install --save puppeteer
无头浏览器 Puppeteer 教程和说明参考: https://www.yuque.com/dakuohao/initit/cvf5gl
就是可以用代码模拟浏览器的所有行为
2.爬取数据
const puppeteer = require('puppeteer');

// Share page of the Douyin user whose video list is scraped.
const url = 'https://www.iesdouyin.com/share/user/97241835037';

/**
 * Opens a Douyin user share page in a Puppeteer-controlled browser and logs
 * every JSON payload returned by the user's video-list endpoint, scrolling
 * the page to trigger loading of further pages.
 */
class GetDouYin {
  constructor() {
    this.page = null;
    this.browser = null;
  }

  /**
   * Launches the browser, configures the page, starts capturing video-list
   * responses and scrolls a few times to trigger lazy loading.
   *
   * The browser is intentionally left open when this method returns.
   *
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await puppeteer.launch({
      // `false` shows the browser window and every action performed in it;
      // call `puppeteer.launch()` without options to run without a window.
      headless: false,
    });
    this.page = await this.browser.newPage();

    // Present the page with a mobile Safari user agent.
    const UA = 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A365 Safari/600.1.4';

    // Each setter returns a promise; await them so the configuration is in
    // place before navigation starts.
    await this.page.setUserAgent(UA);
    await this.page.setJavaScriptEnabled(true);
    await this.page.setViewport({ width: 400, height: 900 });

    // Wait for navigation to finish before scrolling; scrolling a page that
    // has not loaded yet does nothing useful.
    await this.getVideo();

    // Scroll several times so that paged data gets requested.
    for (let i = 0; i < 5; i++) {
      await this.scrollPage();
    }
  }

  /**
   * Scrolls the page to the bottom of the document body.
   *
   * @returns {Promise<void>}
   */
  async scrollPage() {
    console.log('鼠标滚动,加载数据');
    // Runs inside the browser context.
    await this.page.evaluate(() => {
      window.scrollTo(100, document.body.offsetHeight);
    });
  }

  /**
   * Registers a listener that logs every video-list response, then navigates
   * to the user page.
   *
   * @returns {Promise<void>} Resolves once the navigation has completed.
   */
  async getVideo() {
    const page = this.page;

    // Attach the listener BEFORE navigating so that responses produced
    // during the initial page load are not missed.
    page.on('response', async (response) => {
      // Only the video-list endpoint is of interest.
      if (!response.url().includes('https://www.iesdouyin.com/web/api/v2/aweme/post')) {
        return;
      }
      try {
        const data = await response.json();
        // TODO: print the data, or persist it to storage.
        console.log(JSON.stringify(data));
        if (data.has_more) {
          await this.scrollPage();
        }
      } catch (err) {
        // A body that is not valid JSON or a failed scroll must not surface
        // as an unhandled rejection.
        console.error('处理视频列表响应失败:', err);
      }
    });

    await page.goto(url);
  }
}

const getDouYin = new GetDouYin();
getDouYin.init().catch((err) => {
  console.error('爬取失败:', err);
});
