主要参数
- 环境:Python 2.7 + Windows
- 以爬取 readhub 为例,稍作修改即可爬取其他网站的资料(但请不要用于爬取敏感信息,切勿用于违法用途)
# coding=utf-8
"""Scrape the headlines shown on readhub.me and append them to a text file.

Targets Python 2.7 and the legacy Selenium calls used below
(``chrome_options=`` and ``find_elements_by_xpath``).
"""
import os
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait

# Python 2 only: switch the implicit codec to UTF-8. Presumably needed so
# that non-ASCII headline text can be written without an explicit encode.
reload(sys)
sys.setdefaultencoding('utf-8')

START_URL = "https://readhub.me/"
OUTPUT_PATH = "D:/readhub.txt"
# Number of scroll-to-bottom rounds performed before reading the list.
SCROLL_ROUNDS = 6
# Seconds to wait after each scroll.
SCROLL_PAUSE_SECONDS = 1
SCROLL_TO_BOTTOM_JS = "var q=document.documentElement.scrollTop=1000000"
TITLE_XPATH = '//*[@id="itemList"]//h2'
LINK_XPATH = '//*[@id="itemList"]//h2/a'


def main():
    """Open the page in Chrome, scroll it, and append the headlines to a file.

    A line with the current date (YYYY-MM-DD) is appended to ``OUTPUT_PATH``,
    followed by one ``<title> - <href>`` line per headline. Each headline is
    also printed to stdout, as is the index of every scroll round.

    The browser is always shut down and the file always closed, even when
    scraping or writing raises.
    """
    options = webdriver.ChromeOptions()
    # Uncomment to run Chrome without a visible window.
    # options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(START_URL)

        # Scroll far past the end of the document several times, pausing
        # after each scroll before the item list is read.
        for round_no in range(SCROLL_ROUNDS):
            driver.execute_script(SCROLL_TO_BOTTOM_JS)
            time.sleep(SCROLL_PAUSE_SECONDS)
            print(round_no)

        titles = driver.find_elements_by_xpath(TITLE_XPATH)
        links = driver.find_elements_by_xpath(LINK_XPATH)

        with open(OUTPUT_PATH, "a+") as fp:
            # The file is in text mode, so a plain "\n" is written; an
            # explicit "\r\n" would come out as "\r\r\n" on Windows.
            fp.write(time.strftime('%Y-%m-%d'))
            fp.write("\n")
            # zip() pairs headings with anchors and stops at the shorter
            # list, so a heading without an anchor cannot raise IndexError.
            for title, link in zip(titles, links):
                text = title.text
                href = link.get_attribute("href")
                print(text + "-" + href)
                fp.write(text + " - " + href + "\n")
    finally:
        driver.quit()


# Left unguarded on purpose: the script has always run on import/execution.
main()