"""Collect HTTP proxies, verify them concurrently and persist the working ones.

Pipeline (run as a script):
    1. Load the proxies saved by a previous run.
    2. Scrape new high-anonymity proxies from xiladaili.com.
    3. Probe every proxy in its own thread.
    4. Save the proxies that answered with HTTP 200.
"""
import os
import threading
import time

import numpy as np
import requests
from lxml import etree

# NOTE(review): "Agent_Pood" may be a misspelling of the package name (the data
# directory is "IP_pond") -- confirm against the project layout before renaming.
from Agent_Pood.Agent import AGENT as user_agent

# File in which verified proxies are persisted between runs.
IP_FILE = 'IP_pond/IP_http.npy'
# Seconds to wait for any single HTTP request; without a timeout one dead
# proxy could block its thread (and therefore the final join) forever.
REQUEST_TIMEOUT = 10
# Value of the "anonymity" column that marks a high-anonymity proxy.
HIGH_ANONYMITY_LABEL = '高匿代理服务'

# Proxies that passed verification; appended to by the checker threads.
check_ip_ok = []
# Candidate proxies (previously saved + freshly scraped) awaiting verification.
ip_list = []


def _random_headers():
    """Return request headers carrying a randomly chosen User-Agent."""
    return {'User-Agent': np.random.choice(user_agent)}


def _parse_proxies(html):
    """Extract ``http://host:port`` URLs of high-anonymity proxies from *html*.

    Column 1 of each table row holds the address and column 3 the anonymity
    label; only rows whose label equals ``HIGH_ANONYMITY_LABEL`` are kept.
    Returns an empty list when the page is blank or cannot be parsed.
    """
    if not html or not html.strip():
        return []
    tree = etree.HTML(html)  # parse once and reuse for both columns
    if tree is None:
        return []
    addresses = tree.xpath('//tbody/tr/td[1]/text()')
    labels = tree.xpath('//tbody/tr/td[3]/text()')
    return [
        f'http://{address.strip()}'
        for address, label in zip(addresses, labels)
        if label.strip() == HIGH_ANONYMITY_LABEL
    ]


# ---01. Load the proxies saved by a previous run---
def get_old_ip():
    """Return the previously saved proxies as a list of plain strings.

    Returns an empty list when no save file exists yet (e.g. on first run).
    """
    try:
        return np.load(IP_FILE).tolist()
    except FileNotFoundError:
        return []


# ---02. Scrape xiladaili.com, keep high-anonymity proxies, build proxy URLs---
def get_new_ip(endpage):
    """Scrape pages ``1..endpage`` and add the proxies found to ``ip_list``.

    Args:
        endpage: Number of the last listing page to fetch (inclusive).

    Returns:
        The list of proxy URLs collected by this call. They are also appended
        to the module-level ``ip_list``.
    """
    global ip_list
    collected = []
    for page in range(1, endpage + 1):
        try:
            response = requests.get(
                f'http://www.xiladaili.com/http/{page}/',
                headers=_random_headers(),
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f'第{page}页采集失败: {exc}')
            continue
        collected.extend(_parse_proxies(response.text))
        print(f'完成第{page}页的ip代理采集')
    # Concatenate rather than extend in place: ip_list may have been rebound
    # to a NumPy array (np.unique result) by the caller.
    ip_list = list(ip_list) + collected
    return collected


# ---03. Verify that a proxy is usable---
def check_ip_list(ip):
    """Probe *ip* and record it in ``check_ip_ok`` if it answers with HTTP 200.

    Intended to be run as a thread target; any request failure simply marks
    the proxy as unusable.

    Args:
        ip: Proxy URL of the form ``http://host:port``.
    """
    ip = str(ip)  # np.unique yields numpy.str_; normalise to a plain str
    try:
        response = requests.get(
            'http://example.org',
            headers=_random_headers(),
            proxies={'http': ip},
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException:
        print(f'{ip}不可用')
        return
    if response.status_code == 200:
        print(f'{ip}可用')
        # list.append is a single operation; relied upon here for use from
        # multiple checker threads under CPython.
        check_ip_ok.append(ip)
    else:
        print(f'{ip}不可用')


# ---04. Save the usable proxies---
def save_ip():
    """Write the verified proxies in ``check_ip_ok`` to ``IP_FILE``."""
    print(f'共获得{len(check_ip_ok)}个可用ip代理,现在开始保存...')
    directory = os.path.dirname(IP_FILE)
    if directory:
        os.makedirs(directory, exist_ok=True)
    np.save(IP_FILE, check_ip_ok)


if __name__ == '__main__':
    start_time = time.time()

    # ---01. Load the proxies saved by a previous run
    ip_list = get_old_ip()
    print('获取已有的ip代理成功')
    print(ip_list)

    # ---02. Scrape xiladaili.com, keep high-anonymity proxies, build proxy URLs
    get_new_ip(100)
    ip_list = np.unique(ip_list)  # np.unique() removes duplicate entries
    print(f'下载完成,准备验证{len(ip_list)}个ip代理')

    # ---03. Verify the proxies with one thread per proxy
    wait_thread = []
    for ip in ip_list:
        # args must be a tuple; a bare string would be unpacked per character.
        t = threading.Thread(target=check_ip_list, args=(ip,))
        wait_thread.append(t)
        t.start()

    # Wait for every checker to finish so that saving happens after verification.
    for w in wait_thread:
        w.join()
    print('全部验证完成')

    # ---04. Save the usable proxies
    save_ip()
    print('保存完毕')

    end_time = time.time()
    # round() keeps one decimal place
    print(f'全部完成共耗时{round(end_time - start_time, 1)}')