主要参数

  • 环境:Python 2.7 + Windows,需安装 selenium、Chrome 浏览器及对应版本的 chromedriver
  • 以爬取 readhub 为例,稍加修改即可爬取其他网站的资料(但请不要用于爬取敏感信息,切勿用于违法用途)
  #coding=utf-8
  """Scrape the headlines listed on readhub.me and append them to a text file.

  Targets Python 2.7 and the Selenium 3 WebDriver API
  (``find_elements_by_xpath``, ``chrome_options=``).
  """
  import os
  import sys
  import time

  from selenium import webdriver
  from selenium.common.exceptions import TimeoutException
  from selenium.webdriver.support.wait import WebDriverWait

  # Python 2 workaround: make implicit unicode -> str conversions (such as
  # file.write() of element text below) use UTF-8 instead of ASCII.
  reload(sys)
  sys.setdefaultencoding('utf-8')

  READHUB_URL = "https://readhub.me/"
  OUTPUT_PATH = "D:/readhub.txt"
  # Number of scroll-to-bottom rounds; each round is followed by a 1 s pause.
  SCROLL_ROUNDS = 6
  SCROLL_TO_BOTTOM_JS = "var q=document.documentElement.scrollTop=1000000"
  TITLE_XPATH = '//*[@id="itemList"]//h2'
  LINK_XPATH = '//*[@id="itemList"]//h2/a'


  def _collect_headlines(driver):
      """Load the page, scroll it, and return a list of (title, href) pairs."""
      driver.get(READHUB_URL)
      for round_index in range(SCROLL_ROUNDS):
          driver.execute_script(SCROLL_TO_BOTTOM_JS)
          # Pause after each scroll before scrolling again.
          time.sleep(1)
          print(round_index)

      titles = driver.find_elements_by_xpath(TITLE_XPATH)
      links = driver.find_elements_by_xpath(LINK_XPATH)
      # zip() stops at the shorter list, so a mismatch between the two XPath
      # results can no longer raise IndexError.  get_attribute() may return
      # None when the attribute is absent; fall back to an empty string.
      return [
          (title.text, link.get_attribute("href") or "")
          for title, link in zip(titles, links)
      ]


  def _append_headlines(entries, path):
      """Append today's date and one "title - href" line per entry to *path*."""
      with open(path, "a") as fp:
          fp.write(time.strftime('%Y-%m-%d'))
          # Text mode already translates "\n" to the platform line ending;
          # writing "\r\n" here would produce "\r\r\n" on Windows.
          fp.write("\n")
          for title, href in entries:
              print(title + "-" + href)
              fp.write(title)
              fp.write(" - ")
              fp.write(href)
              fp.write("\n")


  def main():
      """Scrape readhub.me with Chrome and append the headlines to OUTPUT_PATH.

      The browser is always shut down, even if scraping fails, and the output
      file is only opened once all entries have been read from the page.
      """
      options = webdriver.ChromeOptions()
      # Uncomment to run Chrome without a visible window.
      #options.add_argument('--headless')
      driver = webdriver.Chrome(chrome_options=options)
      try:
          entries = _collect_headlines(driver)
      finally:
          driver.quit()
      _append_headlines(entries, OUTPUT_PATH)


  if __name__ == "__main__":
      main()