1. #!/usr/bin/python
    2. # -*- coding: UTF-8 -*-
    3. # @date: 2020/2/27 16:31
    4. # @name: Fofa_Spider
    5. # @author:Mke2fs
    6. from tld import get_tld
    7. import requests,re,time,base64,urllib
    8. import requests
    9. import random
    10. import re
    11. import time
    12. from threading import Thread
    13. from selenium import webdriver
    14. from selenium.webdriver.chrome.options import Options
    15. Host = 'https://fofa.so/'
    16. #cookies每次都要新加进去,修改cookie就能跑
    17. data= {\
    18. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    19. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    20. 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    21. 'Accept-Encoding': 'gzip, deflate',
    22. '_fofapro_ars_session': '3a96d0e7e6caf3b25d06723f682ef807',
    23. 'Connection': 'close',
    24. 'Upgrade-Insecure-Requests': '1',
    25. 'If-None-Match':'W/"fc89a1c5bc61e3b8e515db61cef74ac0',
    26. 'Cache-Control': 'max-age=0'
    27. }
    28. zhanzhang_headers = {
    29. 'Host': 'rank.chinaz.com',
    30. 'Cache-Control': 'max-age=0',
    31. 'Upgrade-Insecure-Requests': '1',
    32. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    33. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    34. 'Accept-Encoding': 'gzip, deflate',
    35. 'Accept-Language': 'zh-CN,zh;q=0.9',
    36. 'Cookie': 'BDTUJIAID=febc82b216a29e116730505bc1e471a9; inputbox_urls=%5b%22passivcashincome.com%22%2c%22feifeizuida.com%22%5d; UM_distinctid=16e63892b4e3b1-031b6053dcfc9f-7711b3e-100200-16e63892b4fa8a; Hm_lvt_aecc9715b0f5d5f7f34fba48a3c511d6=1579746706; CNZZDATA433095=cnzz_eid%3D297046501-1578041490-null%26ntime%3D1583974744; CNZZDATA5082706=cnzz_eid%3D902178444-1578044637-null%26ntime%3D1583975389; qHistory=aHR0cDovL3Rvb2wuY2hpbmF6LmNvbV/nq5nplb/lt6Xlhbd8aHR0cDovL3JhbmsuY2hpbmF6LmNvbV/nmb7luqbmnYPph43mn6Xor6J8aHR0cDovL3Rvb2wuY2hpbmF6LmNvbS90b29scy9lc2NhcGUuYXNweF9Fc2NhcGXliqDlr4Yv6Kej5a+GfGh0dHA6Ly93aG9pcy5jaGluYXouY29tL3JldmVyc2UrV2hvaXPlj43mn6V8aHR0cDovL3dob2lzLmNoaW5hei5jb20vK1dob2lz5p+l6K+i',
    37. 'Connection': 'close'
    38. }
    39. cookies = {'_fofapro_ars_session': '3a96d0e7e6caf3b25d06723f682ef807'}
    40. def getdata(Host):
    41. html = requests.get(Host, headers=data, cookies=cookies).content
    42. # print(requests.get(Host,data=data).cookies)
    43. #print(html.decode("utf-8"))
    44. IP=re.findall \
    45. ('<a target="_blank" href="(.*)">', html.decode('utf-8'))
    46. #print(IP)
    47. aa=[]
    48. for ii in IP:
    49. pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') # 匹配模式只匹配web服务
    50. #pattern = r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b" # 匹配IP
    51. if re.findall(pattern, ii) !=[]:
    52. reallist=re.findall(pattern, ii)
    53. reallist=sorted(set(reallist), key=reallist.index) #去除列表重复项
    54. reallist="".join(reallist) #列表转字符,二层嵌套转换为单层列表
    55. aa.append(reallist)
    56. return aa
    57. def init(search_content,pages):
    58. Host = 'https://fofa.so/'
    59. for page in range(3,pages+1): #页数控制
    60. quary = 'result?qbase64=' + str(base64.b64encode(search_content.encode("utf-8")), "utf-8") + '&page=' + str(page)
    61. Hosts = Host + quary
    62. print(Hosts,'剩余查询次数'+str(500-page))
    63. getlist=getdata(str(Hosts))
    64. time.sleep(random.randint(1,3))
    65. getlist=" ".join(getlist)
    66. getlist=getlist.replace('http://beian.miit.gov.cn','')
    67. #print(getlist)
    68. print(list(getlist.split(' ')))
    69. ###暂时关闭权重查询模块
    70. #IRank(list(getlist.split(' ')))
    71. write2file(list(getlist.split(' ')))
    72. def write2file(sites):
    73. for ii in sites:
    74. with open('Coremail-2020.txt', 'a', encoding='utf-8') as l:
    75. l.write(ii + '\n')
    76. def IRank(sub):
    77. print('[+] 正在后台打开谷歌浏览器...')
    78. chrome_option = Options()
    79. chrome_option.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度
    80. chrome_option.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
    81. chrome_option.add_experimental_option('excludeSwitches', ['enable-logging'])#关闭控制台日志,看着太乱
    82. driver=webdriver.Chrome(options=chrome_option)
    83. driver.set_page_load_timeout(5000)
    84. print('[+] 正在查询中,请稍等 ~')
    85. num=0
    86. for line in sub:
    87. try:
    88. quanzhong=line.strip('\n')
    89. site=quanzhong.strip('https://')
    90. driver.get('https://www.aizhan.com/seo/{domain}'.format(domain=site))
    91. baidurank_pattern = re.compile(r'<img src="//statics.aizhan.com/images/br/(.*?).png')
    92. try:
    93. html_text = driver.page_source.encode('utf-8')
    94. baidurank = re.findall(baidurank_pattern,html_text.decode('utf-8'))[0]
    95. except:
    96. time.sleep(random.randint(1,3))
    97. html_text = driver.page_source.encode('utf-8')
    98. baidurank = re.findall(baidurank_pattern,html_text.decode('utf-8'))[0]
    99. num=num+1
    100. print("[+] 正在查询第"+str(num)+"条"+" 百度权重:"+str(baidurank)+" url: "+site)
    101. if int(baidurank) > 0:
    102. with open('iRank_Thinkcmf.txt','a',encoding='utf-8') as l:
    103. l.write(site+'\n')
    104. except Exception as e:
    105. pass
    106. driver.close()
    107. if __name__ == "__main__":
    108. rule='app="Coremail-邮件系统" && host=".com"' #输入查询参数跑到第700页
    109. p2=999
    110. init(rule, p2)
    111. """
    112. 规则库:
    113. Struts2:
    114. app="struts2" && country="CN" && host=".com"
    115. app="ThinkPHP" && region="Shanghai" && host=".com"
    116. app="thinkcmf" && region="Zhejiang" && host=".com"
    117. app="Coremail-邮件系统" && country="CN" && host=".com"
    118. app="Coremail-邮件系统" && host=".com"
    119. """