import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
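
# Scrape high-anonymity proxies from xiladaili.com: each listing page is
# parsed for "ip:port" strings, every candidate is test-driven against
# baidu.com, and the working ones are appended to ip.txt.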


class IP:
    def __init__(self):
        self.url = 'http://www.xiladaili.com/gaoni/{}/'

    def get_html(self, url):
        # Fetch a listing page, rotating the User-Agent on every request;
        # the timeout keeps one dead page from stalling the whole crawl.
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers, timeout=10).text
        return html

    def xpath_func(self, html, xpath_bds):
        obj = etree.HTML(html)
        ip_lists = obj.xpath(xpath_bds)
        return ip_lists

    def parse_html(self, url):
        html = self.get_html(url)
        xpath_bds = '//tr/td/text()'
        ip_lists = self.xpath_func(html, xpath_bds)
        # Each table row contributes 8 <td> text nodes, the first of which
        # is the "ip:port" string, so step through the flat list 8 at a time.
        for i in range(0, len(ip_lists), 8):
            # Only save a proxy once a test request through it succeeds.
            try:
                self.fangwen(ip_lists[i])
                self.save_ip(ip_lists[i])
                print('proxy works')
            except Exception:
                print('proxy unusable')

    def save_ip(self, L):
        # Append a verified proxy to ip.txt, one per line.
        with open('ip.txt', 'a', encoding='utf-8') as f:
            f.write(L + '\n')
        print('save success')

    def fangwen(self, ip):
        # "fangwen" means "visit": request baidu.com through the candidate
        # proxy and raise if it times out or answers with a non-200 status.
        proxies = {
            'http': 'http://{}'.format(ip),
            # These free proxies speak plain HTTP, so both schemes use it.
            'https': 'http://{}'.format(ip),
        }
        headers = {'User-Agent': UserAgent().random}
        res = requests.get('http://www.baidu.com', proxies=proxies,
                           headers=headers, timeout=5)
        res.raise_for_status()
        print(res.status_code)
        return res.status_code

    def run(self):
        # Walk the paginated listing, pausing a random interval between
        # pages to avoid hammering the site.
        for i in range(1, 500):
            url = self.url.format(i)
            self.parse_html(url)
            time.sleep(random.randint(1, 10))


if __name__ == '__main__':
    spider = IP()
    spider.run()