import os
import re
import time
import urllib
import urllib.request

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
    # Start Chrome through the local chromedriver binary. The path is a raw
    # string so the backslashes of the Windows path are never read as escapes.
    # NOTE(review): passing the driver path positionally is the older Selenium
    # calling convention - confirm it matches the installed Selenium version.
    browser = webdriver.Chrome(r"E:\soft\chromedriver\chromedriver_win32\chromedriver.exe")
    # Shared explicit-wait helper bound to the browser, with a 10 second timeout.
    wait = WebDriverWait(browser, 10)
    # Listing page the crawl starts from.
    url = 'http://www.mafengwo.cn/jd/10206/gonglve.html'
    browser.get(url)
    # Module-level record that parse_page() fills in for each listing entry.
    aomeng = {}
    def index_page():
        """Page through the listing and pass each page's HTML to ``parse_page``.

        Reads the page count from the first ``span.count`` element, then for
        each iteration clicks the "next page" link and parses the resulting
        page source. A ``TimeoutException`` from the explicit wait is reported
        with ``print``. The browser session is always shut down on exit.

        Returns:
            None.
        """
        try:
            # Read the total number of pages.
            counters = browser.find_elements(By.CSS_SELECTOR, 'span.count')
            # Take the first run of digits instead of a fixed [1:3] slice, so
            # one- and three-digit page counts are read correctly as well.
            found = re.search(r'\d+', counters[0].text) if counters else None
            if found is None:
                print('未找到总页数')
                return
            total_page = int(found.group())
            # NOTE(review): the "- 4" bound is kept from the original; its
            # reason is not visible here. The landing page itself is never
            # parsed, because "next" is clicked before the first parse_page().
            for i in range(1, total_page - 4):
                print('正在爬取第', str(i), '页')
                # Move on to the next page.
                submit = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.m-pagination > a.pi.pg-next'))
                )
                submit.click()
                html = browser.page_source
                parse_page(html)
        except TimeoutException:
            print('超时')
        finally:
            # quit() ends the whole driver session; close() only closes the
            # current window.
            browser.quit()
    def parse_page(html):
        """Extract the listing entries from one page of HTML and save each one.

        Selects the parents of the ``.img`` nodes under
        ``.bd .scenic-list.clearfix``. For each parent it stores the ``h3``
        text, the ``href`` attribute and the nested ``img`` ``src`` in the
        module-level ``aomeng`` dict, sleeps three seconds, then calls
        ``save_img`` with that dict.

        Args:
            html: Page source to parse.

        Returns:
            None.
        """
        doc = pq(html, parser="html")
        items = doc('.bd .scenic-list.clearfix .img').parent().items()
        for item in items:
            # Title.
            aomeng['title'] = item.find('h3').text()
            # Link to the detail page.
            aomeng['href'] = item.attr('href')
            # Image URL.
            aomeng['img'] = item.find('.img').find('img').attr('src')
            # Wait three seconds between entries.
            time.sleep(3)
            # Store the entry. The same dict object is reused every iteration.
            save_img(aomeng)
    def save_img(param):
        """Store one scraped entry: its link as text and its picture as a file.

        A directory named after the title is created under the current working
        directory. ``<name>/<name>.txt`` gets ``param['href']`` appended, and
        ``<name>/<name>.jpg`` is fetched from ``param['img']`` unless that file
        already exists. ``<name>`` is the title with path separators and
        characters that Windows rejects in file names replaced by ``_``.

        Args:
            param: Mapping with the key ``'title'`` and, optionally, ``'href'``
                and ``'img'``. A missing or ``None`` href is written as an
                empty string; a missing or ``None`` image URL skips the
                download.

        Returns:
            None. Progress is reported with ``print``. Download errors are
            reported and swallowed so one bad image does not stop the caller.

        Raises:
            KeyError: If ``param`` has no ``'title'`` key.
        """
        title = param['title']
        # The title comes from a web page and is used as a directory and file
        # name, so neutralise anything that would change or break the path.
        unsafe = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
        name = title.translate(unsafe) if title else ''
        if not name or name in ('.', '..'):
            print('title 无效，跳过')
            return

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(name, exist_ok=True)

        # Write the link to the text file.
        # NOTE(review): append mode without a separator means repeated runs
        # concatenate links on one line - confirm whether that is intended.
        with open(os.path.join(name, '{0}.{1}'.format(name, 'txt')), 'a', encoding='utf-8') as f:
            f.write(param.get('href') or '')
        print('写入txt文件成功')

        file_path = os.path.join(name, '{0}.{1}'.format(name, 'jpg'))
        if os.path.exists(file_path):
            print('Download Complete')
            return

        img_url = param.get('img')
        if not img_url:
            print('img 存入fail!')
            return

        try:
            urllib.request.urlretrieve(img_url, file_path)
        except Exception as exc:
            # Deliberately best-effort, but unlike a bare except this lets
            # KeyboardInterrupt and SystemExit through and shows the cause.
            print('img 存入fail!', exc)
        else:
            print('img 存入success!')
    # Start the crawl.
    index_page()