Multithreaded Baidu URL crawler:


    import requests
    import re
    import threading
    from bs4 import BeautifulSoup as bs
    from queue import Queue
    import datetime

    class UrlCollector(threading.Thread):
        def __init__(self, queue):  # constructor
            threading.Thread.__init__(self)
            self._queue = queue

        def run(self):  # thread entry point: drain the queue
            while not self._queue.empty():
                url = self._queue.get()
                self.spider(url)

        def spider(self, url):
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"}
            r = requests.get(url=url, headers=headers)
            soup = bs(r.content, 'lxml')
            # Baidu result links carry a data-click attribute and no class
            links = soup.find_all('a', attrs={"data-click": re.compile('.'), "class": None})
            for i in links:
                url_real = i['href']
                try:
                    # follow the Baidu redirect to resolve the real target URL
                    r_link_real = requests.get(url_real, headers=headers, timeout=8)
                    if r_link_real.status_code == 200:
                        print(r_link_real.url)
                except Exception as e:
                    print(e)

    def main(keyword):
        start_time = datetime.datetime.now()
        queue_ = Queue()
        threads = []
        thread_num = 8
        for i in range(0, 760, 10):  # build the search result page URLs
            url_ = "https://www.baidu.com/s?wd=%s&pn=%s" % (keyword, str(i))
            queue_.put(url_)
        for t in range(thread_num):  # instantiate the worker threads
            threads.append(UrlCollector(queue_))
        for i in threads:  # start the threads
            i.start()
        for i in threads:  # wait for the threads to finish
            i.join()
        end_time = datetime.datetime.now()
        print("%d threads took %s" % (thread_num, end_time - start_time))

    if __name__ == '__main__':
        main("吴亦凡")
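
The Thread-subclass-plus-Queue pattern above is the classic hand-rolled fan-out; the standard library's concurrent.futures can express the same idea more compactly. A minimal sketch for comparison (fetch_one is a hypothetical stand-in for the per-URL work done in spider(), not part of the tool above):

    from concurrent.futures import ThreadPoolExecutor
    import requests

    def fetch_one(url):
        # Hypothetical per-URL worker: fetch and report the status code.
        headers = {"User-Agent": "Mozilla/5.0"}
        try:
            r = requests.get(url, headers=headers, timeout=8)
            return url, r.status_code
        except Exception as e:
            return url, e

    urls = ["https://www.baidu.com/s?wd=test&pn=%d" % i for i in range(0, 50, 10)]
    with ThreadPoolExecutor(max_workers=8) as pool:  # 8 workers, like thread_num above
        for url, status in pool.map(fetch_one, urls):
            print(url, status)

The executor owns the queue and the join logic, so the worker shrinks to a plain function.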

Multithreaded website directory scanning

    import requests
    import sys
    import threading
    from queue import Queue

    requests.packages.urllib3.disable_warnings()

    class DirScan(threading.Thread):
        # constructor
        def __init__(self, queue):
            threading.Thread.__init__(self)
            self._queue = queue

        # thread entry point
        def run(self):
            while not self._queue.empty():  # paths left to test?
                url = self._queue.get()  # take one candidate path off the queue
                try:
                    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}
                    proxies = {"https": "http://127.0.0.1:8080"}  # e.g. route through a local Burp proxy
                    r = requests.get(url=url, headers=headers, timeout=4, proxies=proxies, verify=False)
                    if r.status_code == 200:  # the path exists
                        print("[*]" + url)
                except Exception:
                    pass  # ignore timeouts and connection errors, keep scanning

    def start(url, ext, count):
        queue_ = Queue()
        with open('./%s.txt' % ext, 'r', encoding='UTF-8') as f:  # build paths from the wordlist
            for i in f:
                queue_.put(url + i.rstrip('\n'))  # push each candidate URL onto the queue
        threads = []
        thread_count = int(count)
        for i in range(thread_count):  # create the thread objects and collect them
            threads.append(DirScan(queue_))
        for i in threads:  # start the threads
            i.start()
        for i in threads:  # wait for the threads to finish
            i.join()

    if __name__ == '__main__':
        if len(sys.argv) != 4:
            print("Not enough arguments. Usage: python web_scanner.py <base url> <wordlist name> <thread count>")
            sys.exit(-1)
        else:
            start(sys.argv[1], sys.argv[2], sys.argv[3])
        # python web_scanner.py https://www.baidu.com php_dic 20
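
One caveat about the 200-only check: paths that answer 403 (forbidden) or 301/302 (redirect) usually exist too, so filtering on a single status code silently drops them. A hedged sketch of a broader probe (the INTERESTING set is my assumption, tune it to taste):

    import requests
    requests.packages.urllib3.disable_warnings()

    INTERESTING = {200, 301, 302, 401, 403}  # assumed set of codes worth reporting

    def probe(url):
        # Report any status that suggests the path exists, not just 200.
        # allow_redirects=False keeps 301/302 visible instead of following them.
        try:
            r = requests.get(url, timeout=4, allow_redirects=False, verify=False)
            if r.status_code in INTERESTING:
                print("[%d] %s" % (r.status_code, url))
        except Exception:
            pass

    probe("https://www.baidu.com/robots.txt")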

Take care that the path concatenation is correct: whether the base URL ends with a slash and whether each wordlist entry starts with one decides whether the joined URLs are valid.
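
A standalone sketch of the slash pitfall and two safe ways to join (urljoin is from the standard library; the sample base and entry are made up):

    from urllib.parse import urljoin

    base = "https://www.baidu.com"
    entry = "admin/login.php"  # a wordlist line without a leading slash

    print(base + entry)                                # https://www.baidu.comadmin/login.php -- broken
    print(base.rstrip('/') + '/' + entry.lstrip('/'))  # exactly one slash, always
    print(urljoin(base + '/', entry))                  # https://www.baidu.com/admin/login.php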

C-segment (/24) web address scanning:

    import requests
    import time
    import threading
    import sys
    from queue import Queue
    from IPy import IP
    from bs4 import BeautifulSoup

    requests.packages.urllib3.disable_warnings()
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"}
    proxies = {"https": "http://127.0.0.1:8080"}

    class DirScan(threading.Thread):
        def __init__(self, queue):  # thread class constructor
            threading.Thread.__init__(self)
            self.queue_ = queue

        def run(self):  # thread main loop
            while not self.queue_.empty():  # URLs left in the queue?
                url = self.queue_.get()  # take one host URL
                try:
                    requests.adapters.DEFAULT_RETRIES = 5  # raise the retry count
                    s = requests.session()
                    s.keep_alive = False  # close surplus connections
                    r = s.get(url=url, timeout=6, headers=headers, proxies=proxies, verify=False)
                    r.encoding = r.apparent_encoding
                    if r.status_code == 200:  # a web service answered on this port
                        soup = BeautifulSoup(r.text, 'lxml')
                        heads = soup.find_all('head')
                        for i in heads:
                            title = i.find("title")
                            if title is not None and title.string is not None:
                                print("[*] Web Service Found! %s\t%s" % (url, title.string))
                except Exception:
                    pass  # unreachable hosts are expected, keep going

    def create(ips):  # build the candidate URLs for the subnet
        queue_ = Queue()
        ip = IP(ips, make_net=True)
        ports = ["80"]
        for i in ip:
            for j in ports:
                queue_.put('http://' + str(i) + ":" + j)
        return queue_

    def main(ips):
        queue_ = create(ips)
        threads = []
        thread_count = 100
        for i in range(thread_count):
            threads.append(DirScan(queue_))
        for i in threads:
            i.start()
        for i in threads:
            i.join()

    if __name__ == "__main__":
        if len(sys.argv) == 2:
            start = time.time()
            main(sys.argv[1])
            print(time.time() - start)
            sys.exit()
        else:
            # main("35.229.181.0/24")  # debugging fallback
            print("Usage: %s 192.168.1.1/24" % sys.argv[0])
            sys.exit(-1)
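
For reference, this is what the IPy expansion inside create() produces (a standalone sketch; IPy comes from pip install IPy):

    from IPy import IP

    # make_net=True normalizes a host address such as 192.168.1.1/24
    # to its network, so any address inside the range is accepted.
    net = IP("192.168.1.1/24", make_net=True)
    print(net)                       # 192.168.1.0/24
    print(len(net))                  # 256 addresses, network and broadcast included
    print(net[0], net[1], net[255])  # 192.168.1.0 192.168.1.1 192.168.1.255

Adding more entries to the ports list ("8080", "8443", and so on) multiplies the queue accordingly, one URL per address-port pair.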