import os
import threading
import time

import numpy as np
import requests
from lxml import etree

from Agent_Pood.Agent import AGENT as user_agent
    # Shared list of proxies that passed validation. check_ip_list() appends
    # to it and save_ip() persists it, and both refer to it as `check_ok`.
    check_ok = []
    # Alias kept so the original name of this list still resolves.
    check_ip_ok = check_ok
    # ---01. Load the proxy data saved by a previous run---
    def get_old_ip():
        """Return the proxies stored in ``IP_pond/IP_http.npy``.

        Returns:
            list: The saved entries converted to plain Python values, or an
            empty list when the file does not exist yet (for example on the
            first run, before anything has been saved).
        """
        try:
            saved = np.load('IP_pond/IP_http.npy')
        except FileNotFoundError:
            # No pool has been saved yet; start empty instead of crashing.
            return []
        # tolist() yields built-in str items rather than numpy scalar types.
        return saved.tolist()
    # ---02. Scrape proxy data from xiladaili.com, filter it and build proxy URLs
    def _high_anonymity_proxies(addresses, kinds):
        """Return ``http://`` URLs for the addresses whose kind is high-anonymity."""
        return [
            f'http://{address}'
            for address, kind in zip(addresses, kinds)
            if kind == '高匿代理服务'
        ]


    def get_new_ip(endpage):
        """Scrape listing pages 1..``endpage`` and extend the global ``ip_list``.

        For every page, the text of the first table column is taken as the
        proxy address and the text of the third column as the proxy kind.
        Only rows whose kind equals '高匿代理服务' are kept; each kept address
        is prefixed with ``http://`` and appended to the module-level
        ``ip_list``.

        Args:
            endpage: Number of the last page to fetch (inclusive, 1-based).

        A page whose request fails or times out is reported and skipped so
        that one bad page does not abort the whole crawl.
        """
        global ip_list
        for page in range(1, endpage + 1):
            headers = {'User-Agent': np.random.choice(user_agent)}
            try:
                response = requests.get(
                    f'http://www.xiladaili.com/http/{page}/',
                    headers=headers,
                    timeout=10,  # never hang forever on an unresponsive server
                )
            except requests.RequestException as exc:
                print(f'第{page}页采集失败: {exc}')
                continue
            # Parse the document once and run both queries against the same tree.
            tree = etree.HTML(response.text)
            ip_number = tree.xpath('//tbody/tr/td[1]/text()')
            ip_name = tree.xpath('//tbody/tr/td[3]/text()')
            ip_list.extend(_high_anonymity_proxies(ip_number, ip_name))
            print(f'完成第{page}页的ip代理采集')
    # ---03. Check whether a proxy is usable
    def check_ip_list(ip):
        """Probe ``http://example.org`` through ``ip`` and record it if it works.

        Args:
            ip: Proxy URL to test, e.g. ``'http://1.2.3.4:8080'``.

        A proxy counts as usable when the request completes with HTTP status
        200; it is then appended to the module-level ``check_ok`` list. Any
        request failure (connection error, timeout, ...) or other status code
        marks the proxy as unusable. The outcome is printed either way.
        """
        try:
            response = requests.get(
                'http://example.org',
                headers={'User-Agent': np.random.choice(user_agent)},
                # str() because callers may pass numpy string scalars.
                proxies={'http': str(ip)},
                timeout=10,  # dead proxies would otherwise block the thread
            )
        except requests.RequestException:
            print(f'{ip}不可用')
            return
        if response.status_code == 200:
            print(f'{ip}可用')
            check_ok.append(ip)
        else:
            print(f'{ip}不可用')
    # ---04. Save the usable proxies
    def save_ip():
        """Write the validated proxies in ``check_ok`` to ``IP_pond/IP_http.npy``.

        The ``IP_pond`` directory is created when missing, so the results of
        a validation run are not lost just because the folder does not exist.
        """
        print(f'共获得{len(check_ok)}个可用ip代理,现在开始保存...')
        # np.save does not create missing parent directories.
        os.makedirs('IP_pond', exist_ok=True)
        np.save('IP_pond/IP_http.npy', check_ok)
    if __name__ == '__main__':
        # Script entry point: load the saved pool, scrape new proxies,
        # validate every candidate in its own thread, then save the survivors.
        start_time = time.time()
        # ---01. Load the proxy data saved by a previous run
        ip_list = get_old_ip()
        print('获取已有的ip代理成功')
        print(ip_list)
        # ---02. Scrape proxy data from xiladaili.com, filter it and build proxy URLs
        get_new_ip(100)
        ip_list = np.unique(ip_list)  # np.unique() removes duplicate entries
        print(f'下载完成,准备验证{len(ip_list)}个ip代理')
        # ---03. Validate the proxies with multiple threads
        # Start one thread per proxy.
        wait_thread = []
        for ip in ip_list:
            # args must be a one-element tuple; a bare (ip) is just the string
            # itself and would be unpacked character by character.
            t = threading.Thread(target=check_ip_list, args=(ip,))
            wait_thread.append(t)
            t.start()
        # Block until every check has finished so saving happens after validation.
        for w in wait_thread:
            w.join()
        print('全部验证完成')
        # ---04. Save the usable proxies
        save_ip()
        print('保存完毕')
        end_time = time.time()
        # round() keeps one decimal place of the elapsed seconds.
        print(f'全部完成共耗时{round(end_time - start_time, 1)}')