主要参数
- 环境:Python 2.7 + Windows
- 以爬取 readhub 为例,稍作修改即可爬取其他网站的资料(但请不要用于爬取敏感信息,切勿用于违法用途)
#coding=utf-8
from selenium import webdriver
import os,time,sys
import time
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
# Python 2 only: reload(sys) re-exposes sys.setdefaultencoding (it is removed
# from the module at interpreter startup), which is then used to make UTF-8
# the process-wide default for implicit str/unicode conversions.
# NOTE(review): main() presumably relies on this when it writes scraped text
# to the output file -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')
def main(url="https://readhub.me/", output_path="D:/readhub.txt",
         scroll_rounds=6):
    """Scrape headline titles and links from a page and append them to a file.

    Opens ``url`` in Chrome, scrolls to the bottom ``scroll_rounds`` times
    (presumably so the page loads more entries), then appends today's date
    followed by one ``<title> - <href>`` line for every ``<h2>`` found under
    the element with id ``itemList``. Each entry is also echoed to stdout.

    Args:
        url: Page to open. Defaults to the Readhub front page.
        output_path: Text file the results are appended to.
        scroll_rounds: Number of scroll-to-bottom passes, one second apart.
    """
    options = webdriver.ChromeOptions()
    # Uncomment to run Chrome without a visible window.
    # options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(url)
        # "with" closes the file and "finally" quits the browser even when a
        # step in between raises, so neither resource is leaked.
        with open(output_path, "a+") as fp:
            for round_no in range(scroll_rounds):
                # Set scrollTop far beyond any realistic page height to jump
                # to the bottom, then give the page a moment to react.
                driver.execute_script(
                    "var q=document.documentElement.scrollTop=1000000")
                time.sleep(1)
                print(round_no)

            headings = driver.find_elements_by_xpath(
                '//*[@id="itemList"]//h2')

            # Plain "\n": the file is opened in text mode, where Windows
            # already expands "\n" to "\r\n"; writing "\r\n" would produce
            # "\r\r\n" on disk.
            fp.write(time.strftime('%Y-%m-%d'))
            fp.write("\n")

            for heading in headings:
                title = heading.text
                # Look the link up inside its own heading so a heading
                # without exactly one <a> cannot shift the title/link pairing
                # or run off the end of a separate links list.
                anchors = heading.find_elements_by_xpath('./a')
                href = anchors[0].get_attribute("href") if anchors else None
                # get_attribute returns None when the attribute is absent.
                href = href or ""
                print(title + "-" + href)
                fp.write(title)
                fp.write(" - ")
                fp.write(href)
                fp.write("\n")
    finally:
        driver.quit()
# Run the scraper. There is no __main__ guard, so this also fires on import.
main()