Fofa普通会员爬虫-2020-12-6 - 《杂七杂八安全相关》

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# @date: 2020/2/27 16:31
# @name: Fofa_Spider
# @author：Mke2fs
from tld import get_tld
import requests,re,time,base64,urllib
import requests
import random
import re
import time
from threading import Thread
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Base URL of the FOFA site.
Host = 'https://fofa.so/'

# The session cookie has to be refreshed for every run; updating it here is
# all that is needed to make the crawler work again.
cookies = {
    '_fofapro_ars_session': '3a96d0e7e6caf3b25d06723f682ef807',
}

# Sent by getdata() as the HTTP *headers* of each FOFA request.
# NOTE(review): the session id is listed here as a header entry as well as in
# `cookies` above - confirm whether the header entry is actually needed.
data = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    '_fofapro_ars_session': '3a96d0e7e6caf3b25d06723f682ef807',
    'Connection': 'close',
    'Upgrade-Insecure-Requests': '1',
    'If-None-Match': 'W/"fc89a1c5bc61e3b8e515db61cef74ac0',
    'Cache-Control': 'max-age=0',
}

# Browser-like headers for rank.chinaz.com (not referenced by the functions
# visible in this file).
zhanzhang_headers = {
    'Host': 'rank.chinaz.com',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'BDTUJIAID=febc82b216a29e116730505bc1e471a9; inputbox_urls=%5b%22passivcashincome.com%22%2c%22feifeizuida.com%22%5d; UM_distinctid=16e63892b4e3b1-031b6053dcfc9f-7711b3e-100200-16e63892b4fa8a; Hm_lvt_aecc9715b0f5d5f7f34fba48a3c511d6=1579746706; CNZZDATA433095=cnzz_eid%3D297046501-1578041490-null%26ntime%3D1583974744; CNZZDATA5082706=cnzz_eid%3D902178444-1578044637-null%26ntime%3D1583975389; qHistory=aHR0cDovL3Rvb2wuY2hpbmF6LmNvbV/nq5nplb/lt6Xlhbd8aHR0cDovL3JhbmsuY2hpbmF6LmNvbV/nmb7luqbmnYPph43mn6Xor6J8aHR0cDovL3Rvb2wuY2hpbmF6LmNvbS90b29scy9lc2NhcGUuYXNweF9Fc2NhcGXliqDlr4Yv6Kej5a+GfGh0dHA6Ly93aG9pcy5jaGluYXouY29tL3JldmVyc2UrV2hvaXPlj43mn6V8aHR0cDovL3dob2lzLmNoaW5hei5jb20vK1dob2lz5p+l6K+i',
    'Connection': 'close',
}
# Matches http:// and https:// URLs only, i.e. web services.
_WEB_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Captures the href text of anchors that open in a new tab.  The capture is
# greedy, so it may run past the closing quote; _WEB_URL_PATTERN is applied
# afterwards to pick the actual URLs out of the captured text.
_RESULT_LINK_PATTERN = re.compile('<a target="_blank" href="(.*)">')


def _extract_urls(page_text):
    """Return the web URLs found in the result links of *page_text*.

    Duplicates inside a single captured link are dropped (first occurrence
    wins) and every URL is returned as its own list item.
    """
    urls = []
    for href in _RESULT_LINK_PATTERN.findall(page_text):
        unique = []
        for url in _WEB_URL_PATTERN.findall(href):
            if url not in unique:
                unique.append(url)
        # Extend rather than concatenate: joining distinct URLs into one
        # string would produce an unusable entry.
        urls.extend(unique)
    return urls


def getdata(Host, timeout=30):
    """Fetch one result page and return the web URLs it links to.

    Args:
        Host: Full URL of the page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the crawl forever.

    Returns:
        A list of http/https URL strings, in page order.

    Raises:
        requests.exceptions.RequestException: on network errors or timeout.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    response = requests.get(Host, headers=data, cookies=cookies, timeout=timeout)
    return _extract_urls(response.content.decode('utf-8'))
def init(search_content, pages, start_page=3):
    """Crawl FOFA result pages for a query and append the found sites to disk.

    Args:
        search_content: FOFA search expression (plain text; it is base64
            encoded here before being sent).
        pages: Number of the last page to fetch (inclusive).
        start_page: Number of the first page to fetch.  Defaults to 3, the
            value that used to be hard-coded.

    Each page is fetched with getdata(), followed by a random 1-3 second
    pause; the resulting URLs are printed and handed to write2file().
    """
    # Imported locally so the function does not depend on `urllib.parse`
    # having been loaded by some other module.
    from urllib.parse import quote

    base_url = 'https://fofa.so/'
    # Base64 output may contain '+', '/' and '=', which must be
    # percent-encoded to survive as a query-string value.
    encoded_query = quote(
        str(base64.b64encode(search_content.encode("utf-8")), "utf-8"), safe='')
    for page in range(start_page, pages + 1):
        page_url = base_url + 'result?qbase64=' + encoded_query + '&page=' + str(page)
        # NOTE(review): 500 looks like an assumed query quota - confirm.
        print(page_url, '剩余查询次数' + str(500 - page))
        found = getdata(str(page_url))
        time.sleep(random.randint(1, 3))
        # Strip the ICP filing link and drop entries that end up empty, so no
        # blank lines are written to the output file.
        cleaned = (url.replace('http://beian.miit.gov.cn', '') for url in found)
        sites = [url for url in cleaned if url]
        print(sites)
        # The rank lookup module is temporarily disabled: IRank(sites)
        write2file(sites)
def write2file(sites, path='Coremail-2020.txt'):
    """Append every entry of *sites* to *path*, one per line.

    Args:
        sites: Iterable of strings to store.
        path: Output file, opened in append mode with UTF-8 encoding.
            Defaults to the file name that used to be hard-coded.

    Nothing is written (and no file is created) when *sites* is empty.
    """
    entries = list(sites)
    if not entries:
        return
    # Open the file once for the whole batch instead of once per entry.
    with open(path, 'a', encoding='utf-8') as out:
        out.writelines(entry + '\n' for entry in entries)
def _strip_scheme(url):
    """Return *url* without a leading http(s):// and without trailing slashes."""
    cleaned = url.strip()
    for scheme in ('https://', 'http://'):
        if cleaned.startswith(scheme):
            # Remove the prefix only; str.strip('https://') would eat any of
            # those characters from both ends of the host name.
            cleaned = cleaned[len(scheme):]
            break
    return cleaned.rstrip('/')


def IRank(sub):
    """Look up the Baidu rank of each site on aizhan.com and keep the ranked ones.

    Args:
        sub: Iterable of site URLs or host names (one per item).

    Sites whose rank is greater than 0 are appended to 'iRank_Thinkcmf.txt'.
    A site whose lookup fails is reported and skipped; the browser is always
    shut down at the end, even if an unexpected error occurs.
    """
    print('[+] 正在后台打开谷歌浏览器...')
    chrome_option = Options()
    # Do not load images, to speed things up.
    chrome_option.add_argument('blink-settings=imagesEnabled=false')
    # No visible browser window; required on Linux systems without a display.
    chrome_option.add_argument('--headless')
    # Silence the console log noise.
    chrome_option.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=chrome_option)
    baidurank_pattern = re.compile(r'<img src="//statics.aizhan.com/images/br/(.*?).png')
    num = 0
    try:
        # NOTE(review): Selenium takes this value in seconds, so 5000 is
        # roughly 83 minutes - confirm this is intended.
        driver.set_page_load_timeout(5000)
        print('[+] 正在查询中，请稍等 ~')
        for line in sub:
            site = _strip_scheme(line)
            if not site:
                # Blank input lines carry no domain to query.
                continue
            try:
                driver.get('https://www.aizhan.com/seo/{domain}'.format(domain=site))
                ranks = baidurank_pattern.findall(driver.page_source)
                if not ranks:
                    # The rank image may not be rendered yet; wait and retry once.
                    time.sleep(random.randint(1, 3))
                    ranks = baidurank_pattern.findall(driver.page_source)
                baidurank = ranks[0]
                num = num + 1
                print("[+] 正在查询第"+str(num)+"条"+"   百度权重:"+str(baidurank)+"   url: "+site)
                if int(baidurank) > 0:
                    with open('iRank_Thinkcmf.txt', 'a', encoding='utf-8') as out:
                        out.write(site + '\n')
            except Exception as e:
                # Best effort: report the failure and move on to the next site.
                print("[-] 查询失败   url: " + site + "   " + repr(e))
    finally:
        # quit() ends the whole browser session, not just the current window.
        driver.quit()
if __name__ == "__main__":
    # Search expression to crawl and the last page to fetch
    # (original note: the query had been run up to page 700).
    init('app="Coremail-邮件系统"  && host=".com"', 999)
"""
规则库：
Struts2:
app="struts2" && country="CN" && host=".com"
app="ThinkPHP" && region="Shanghai" && host=".com"
app="thinkcmf" && region="Zhejiang" && host=".com"
app="Coremail-邮件系统" && country="CN" && host=".com"
app="Coremail-邮件系统"  && host=".com"
"""