import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree

class IP(object):
    """Scrape high-anonymity proxies from xiladaili.com, test each one, and save the working ones."""

    def __init__(self):
        self.url = 'http://www.xiladaili.com/gaoni/{}/'
        self.ua = UserAgent()  # build the User-Agent pool once and reuse it
    def get_html(self, url):
        """Fetch one listing page with a random User-Agent."""
        headers = {'User-Agent': self.ua.random}
        # timeout so a dead page cannot hang the whole crawl
        html = requests.get(url=url, headers=headers, timeout=10).text
        return html
    def xpath_func(self, html, xpath_bds):
        obj = etree.HTML(html)
        ip_lists = obj.xpath(xpath_bds)
        return ip_lists
    def parse_html(self, url):
        html = self.get_html(url)
        xpath_bds = '//tr/td/text()'
        ip_lists = self.xpath_func(html, xpath_bds)
        print(ip_lists)
        # The table flattens to 8 text cells per row; the first cell of each row is the "ip:port" string.
        for i in range(0, len(ip_lists), 8):
            # Check the proxy before saving it.
            try:
                self.fangwen(ip_lists[i])
                self.save_ip(ip_lists[i])
                print('usable')
            except Exception:
                print('this IP is not usable')
            # Alternative: check the status code explicitly instead of relying on the exception:
            # if self.fangwen(ip_lists[i]) == 200:
            #     self.save_ip(ip_lists[i])
            # else:
            #     print('this IP is not usable')
    def save_ip(self, L):
        """Append one working proxy to ip.txt."""
        with open('ip.txt', 'a', encoding='utf-8') as f:
            f.write(L + '\n')
        print('save success')
    def fangwen(self, ip):
        """Visit Baidu through the proxy; raise on timeout or a bad status code."""
        proxies = {
            'http': 'http://{}'.format(ip),
            'https': 'https://{}'.format(ip)
        }
        headers = {'User-Agent': self.ua.random}
        res = requests.get('http://www.baidu.com', proxies=proxies, headers=headers, timeout=5)
        res.raise_for_status()  # a non-2xx answer also marks the proxy as unusable
        code = res.status_code
        print(code)
        return code
    def run(self):
        # Walk listing pages 1-499, sleeping a random interval between pages to avoid a ban.
        for i in range(1, 500):
            url = self.url.format(i)
            self.parse_html(url)
            time.sleep(random.randint(1, 10))

if __name__ == '__main__':
    spider = IP()
    spider.run()
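
# A minimal sketch of how the proxies collected in ip.txt might be consumed
# afterwards. The file name and target URL come from the script above; the
# pick_proxy helper itself is an assumption, not part of the original code.
#
# def pick_proxy():
#     with open('ip.txt', encoding='utf-8') as f:
#         pool = [line.strip() for line in f if line.strip()]
#     ip = random.choice(pool)
#     return {'http': 'http://{}'.format(ip), 'https': 'https://{}'.format(ip)}
#
# requests.get('http://www.baidu.com', proxies=pick_proxy(), timeout=5)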