cainiao天地数据爬取

获取token和其他参数

首先分析并获取该网站的隐藏链接：必须在下图界面中点击“查询”按钮之后，cainiao 的后端才会返回一个 url，其中包含验证参数 token 以及其他请求参数。
image.png

伪代码示例

对于网页点击的控制，利用 webdriver 类工具 pyppeteer 库提供的 api，通过 js 控制页面并获取上述参数。

  1. import asyncio
  2. import urllib3
  3. from pyppeteer import launch, launcher
  4. from app.constant import WIDTH, HEIGHT, HEADLESS
  5. from utils.chromium import intercept_request, get_cookies, set_user_agent
  # Drop Chromium's '--enable-automation' switch from pyppeteer's launch defaults.
  # The original note kept two variants (AUTOMATION_ARGS labelled "linux",
  # DEFAULT_ARGS labelled "Windows"); which list actually holds the flag
  # presumably depends on the installed pyppeteer build -- TODO confirm.
  # Both lists are checked and the flag is removed only where it is present,
  # because a bare list.remove() raises ValueError when the item is missing.
  for _args_name in ('DEFAULT_ARGS', 'AUTOMATION_ARGS'):
      _launch_args = getattr(launcher, _args_name, None)
      if _launch_args and '--enable-automation' in _launch_args:
          _launch_args.remove('--enable-automation')

  # Login page URL; the fromUrl query parameter presumably sends the browser on
  # to the branch overview page after login -- verify against the site.
  POINT_DOMAIN_URL = 'https://fly.cainiao.com/login.htm?fromUrl=https://fly.cainiao.com/branch/branchOverallForBranch.htm'
  async def get_cookies_token(username, password, stat_date):
      """Log in to fly.cainiao.com and click the search button with interception on.

      Launches Chromium through pyppeteer, fills in the login form, closes any
      popup overlay, then enables request interception and clicks
      ``#J_SearchBtn`` so that every request the page issues is handed to
      ``intercept_request`` together with the session cookies.

      Args:
          username: Account name typed into ``#fm-login-id``.
          password: Password typed into ``#fm-login-password``.
          stat_date: Passed through unchanged to ``intercept_request``.

      Returns:
          ``True`` when the search button was clicked, ``None`` when that
          click raised (the error is logged).

      Raises:
          Exception: Failures during navigation or login propagate to the
              caller after the browser has been closed.
      """
      import logging  # local import: the listing never brings a logger into scope

      # Reuse a module-level ``logger`` if the surrounding file defines one;
      # otherwise fall back to a standard-library logger so the log calls
      # below cannot raise NameError.
      logger = globals().get('logger') or logging.getLogger(__name__)

      # Upper bound on how long to wait for in-flight intercept handlers.
      intercept_wait_seconds = 10

      browser = await launch(
          headless=HEADLESS, slowMo=15, autoClose=False,
          args=[
              '--disable-infobars', '--disable-extensions', '--hide-scrollbars', '--mute-audio',
              '--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox',
              '--disable-translate', '--safebrowsing-disable-auto-update',
              '--disable-bundled-ppapi-flash', '--window-size={},{}'.format(WIDTH, HEIGHT),
          ],
          dumpio=True,
      )
      # autoClose=False means nothing else will stop Chromium, so every exit
      # path (including unexpected exceptions) must go through browser.close().
      try:
          page = await browser.newPage()
          await page.setUserAgent(set_user_agent())
          await page.setViewport({'width': WIDTH, 'height': HEIGHT})
          await page.goto(POINT_DOMAIN_URL)

          # Mask common automation fingerprints on the loaded page.
          # NOTE(review): these run after navigation, so they presumably do not
          # survive the page.reload() calls below -- confirm whether
          # evaluateOnNewDocument is needed instead.
          stealth_scripts = (
              '''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => undefined } }) }''',
              '''() =>{ window.navigator.chrome = { runtime: {}, }; }''',
              '''() =>{ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); }''',
              '''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''',
          )
          for script in stealth_scripts:
              await page.evaluate(script)

          await _login(page, username, password)
          await _dismiss_popups(page, logger)

          # The original note describes this step as saving cookies and token
          # to redis; that happens inside get_cookies / intercept_request,
          # which are not visible here.
          cookies = await get_cookies(page)
          await asyncio.sleep(2)
          await page.setRequestInterception(True)

          # Keep a reference to every handler task so it can be awaited before
          # the browser goes away instead of being dropped mid-execution.
          pending = []
          page.on('request', lambda req: pending.append(
              asyncio.ensure_future(intercept_request(req, cookies, stat_date))))

          try:
              await page.click('#J_SearchBtn', {'delay': 30})
          except Exception as e:
              logger.error(f'{e}')
              return None

          if pending:  # asyncio.wait() rejects an empty collection
              done, not_done = await asyncio.wait(pending, timeout=intercept_wait_seconds)
              for task in done:
                  if not task.cancelled() and task.exception() is not None:
                      logger.error(f'{task.exception()}')
              if not_done:
                  logger.warning(f'{len(not_done)} intercept handler(s) still running at close')
          return True
      finally:
          await browser.close()


  async def _login(page, username, password):
      """Type the credentials into the login form and submit it."""
      await asyncio.sleep(1)
      # The form is looked up in the third frame of the page, as in the
      # original flow.
      form_frame = page.frames[2]
      user = await form_frame.querySelector('#fm-login-id')
      await user.type(username, {'delay': 100})
      await asyncio.sleep(2)
      passwd = await form_frame.querySelector('#fm-login-password')
      await passwd.type(password, {'delay': 100})
      await asyncio.sleep(2)
      log_in = await form_frame.querySelector('#login-form > div.fm-btn > button')
      await log_in.click({'delay': 50})
      await asyncio.sleep(3)


  async def _dismiss_popups(page, logger, max_attempts=10):
      """Click the popup close control and reload until the click fails."""
      # Bounded so a popup that reappears after every reload cannot hang the run.
      for _ in range(max_attempts):
          try:
              await page.click(
                  'body > div.edpl-wrap > div > div.edpl-mask > div > span',
                  delay=200)
          except Exception as warning:
              # A failed click is the normal exit: no popup is left to close.
              logger.info(f'popup windows warning: {warning}')
              return
          await asyncio.sleep(2)
          await page.reload({'delay': 80})
  if __name__ == '__main__':
      # Manual run: silence urllib3's InsecureRequestWarning first.
      urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

      from datetime import datetime, timedelta
      from app.constant import ACCOUNT

      # Use yesterday's date and the credentials stored at ACCOUNT[1].
      account = ACCOUNT[1]
      stat_date = str(datetime.today().date() - timedelta(days=1))
      event_loop = asyncio.get_event_loop()
      logged_in = event_loop.run_until_complete(
          get_cookies_token(account['username'], account['password'], stat_date))

      # get_cookies_token returns True on success and None on failure.
      print('success' if logged_in else 'failed')