import csv
import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree


class IP(object):
    """Crawl free high-anonymity proxies from xiladaili.com, verify each
    one against baidu.com, and append the working ones to ``ip.txt``."""

    def __init__(self):
        # Paginated listing of high-anonymity ("gaoni") proxies.
        self.url = 'http://www.xiladaili.com/gaoni/{}/'

    def get_html(self, url):
        """Fetch *url* with a randomized User-Agent and return the body text."""
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers).text
        return html

    def xpath_func(self, html, xpath_bds):
        """Apply the XPath expression *xpath_bds* to *html*; return the match list."""
        obj = etree.HTML(html)
        ip_lists = obj.xpath(xpath_bds)
        return ip_lists

    def parse_html(self, url):
        """Scrape one listing page and persist every proxy that passes the check.

        The proxy table appears to have 8 ``<td>`` cells per row with the
        ``ip:port`` string in the first cell, hence the stride-8 iteration
        over the flat text-node list.
        """
        html = self.get_html(url)
        xpath_bds = '//tr/td/text()'
        ip_lists = self.xpath_func(html, xpath_bds)
        print(ip_lists)
        for i in range(0, len(ip_lists), 8):
            # Verify the proxy before saving it; any request failure or a
            # non-2xx probe response marks the proxy as unusable.
            try:
                self.fangwen(ip_lists[i])
                self.save_ip(ip_lists[i])
                print("可用")
            except requests.RequestException:
                print("此ip不可用")

    def save_ip(self, L):
        """Append one ``ip:port`` line (CRLF-terminated) to ip.txt."""
        with open('ip.txt', 'a', encoding='utf-8', newline='') as f:
            f.write(L)
            f.write('\r\n')
            print('save success')

    def fangwen(self, ip):
        """Probe *ip* (an ``ip:port`` string) as an HTTP proxy.

        Returns the HTTP status code on success; raises
        ``requests.RequestException`` (timeout, connection error, or
        non-2xx via ``raise_for_status``) when the proxy is unusable.
        """
        # These are plain HTTP proxies, so both schemes route through an
        # 'http://' proxy URL — an 'https://{}' proxy URL would make
        # requests attempt a TLS handshake with the proxy itself.
        proxies = {
            'http': 'http://{}'.format(ip),
            'https': 'http://{}'.format(ip),
        }
        headers = {'User-Agent': UserAgent().random}
        res = requests.get('http://www.baidu.com', proxies=proxies,
                           headers=headers, timeout=5)
        # Treat non-2xx responses as an unusable proxy (the original
        # commented-out code intended a status == 200 check).
        res.raise_for_status()
        code = res.status_code
        print(code)
        return code

    def run(self):
        """Crawl listing pages 1-499 with a random 1-10 s pause between pages."""
        for i in range(1, 500):
            url = self.url.format(i)
            self.parse_html(url)
            time.sleep(random.randint(1, 10))


if __name__ == '__main__':
    spider = IP()
    spider.run()