import requests
from bs4 import BeautifulSoup
import time
# --- One-off probe request against the SIPO public patent search API ---
url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"

# Browser-like request headers.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Connection': 'keep-alive',
}

# Session cookies captured from a logged-in browser session.
# NOTE(review): the whole cookie string is sent as the value of one cookie
# named "cookies_are" — presumably the server parses it leniently; confirm.
cookie = {"cookies_are": "WEE_SID=7ppLQ6_O6fNNT1kUEhhdjNme5Dd6XpTxHdD5AqOHXnJ_R3VkjJK2!-1371704578!792269866!1543155986382; IS_LOGIN=true; wee_username=d2VpbWVuZ3hpbjIwMTI%3D; wee_password=YWR3YWhhZHM2MTM2ODc5; JSESSIONID=7ppLQ6_O6fNNT1kUEhhdjNme5Dd6XpTxHdD5AqOHXnJ_R3VkjJK2!-1371704578!792269866"}

# Search form fields: first page (start=0, 12 rows) of patents applied for on
# 2009-01-01 in HK/MO/TW, or in CN for invention/utility/design types.
info = {
    "searchCondition.searchExp": "(申请日='20090101') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))",
    "searchCondition.executableSearchExp": "VDB:((APD='20090101' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))",
    "resultPagination.start": 0,
    "resultPagination.limit": 12,
    "searchCondition.literatureSF": "(申请日='20090101') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))",
}

# Paid proxy tunnel (abuyun dynamic HTTP proxy) and its credentials.
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
proxyUser = "HX7YIG4D7IR9907D"
proxyPass = "D9A0153CCBC8081F"
proxyMeta = "http://%s:%s@%s:%s" % (proxyUser, proxyPass, proxyHost, proxyPort)
proxies = {"http": proxyMeta, "https": proxyMeta}

# Fire one search request through the proxy; keep the raw response body.
s = requests.session()
html = s.post(url, headers=head, proxies=proxies, data=info, cookies=cookie).content
def patent_requests(date):
    """Crawl every result page for patents applied for on *date* via a proxy.

    Pages through the SIPO search endpoint 12 records at a time until the
    server returns an empty ``searchResultRecord`` list.

    :param date: application date as a ``'YYYYMMDD'`` string.
    :return: list of 19-element lists, one per patent record
             (lawStatus, patentType, then 17 fieldMap values).
    """
    result = []
    url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
            'Connection': 'keep-alive'}
    # Session cookies captured from a logged-in browser session.
    cookie = dict(cookies_are="WEE_SID=O1xKu2UpRWGCOPDH_QJnZv-Qcx-EBjY-Z64OGBBgv3x1s4UoRg5Y!792269866!-627030802!1543147054377; IS_LOGIN=true; JSESSIONID=O1xKu2UpRWGCOPDH_QJnZv-Qcx-EBjY-Z64OGBBgv3x1s4UoRg5Y!792269866!-627030802")
    # Proxy tunnel and credentials — hoisted out of the page loop (invariant).
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    proxyUser = "HX7YIG4D7IR9907D"
    proxyPass = "D9A0153CCBC8081F"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {"http": proxyMeta, "https": proxyMeta}
    # fieldMap keys in output column order, appended after lawStatus/patentType:
    # title, doc numbers, ids, application/publication dates, IPC/CPC classes,
    # applicant, inventor, address, agent, agency.
    field_keys = ["TIVIEW", "FNUM", "CPNUM", "PNUM", "VID", "APSN", "APD",
                  "PN_BAK", "PD", "IC", "CPC", "PAVIEW", "INVIEW", "AA",
                  "AGT", "AGY",
                  # NOTE(review): the original OWNER_STATUS column also read the
                  # "AGY" key — almost certainly a copy-paste slip; behavior is
                  # preserved here, but confirm the intended key.
                  "AGY"]
    pages = 1
    while True:
        # Dates are quoted ('%s') for consistency with the other queries in
        # this file; the original left them unquoted here only.
        info = {"searchCondition.searchExp": "(申请日='%s') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))" % date,
                "searchCondition.executableSearchExp": "VDB:((APD='%s' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))" % date,
                "resultPagination.start": (pages - 1) * 12,
                "resultPagination.limit": 12,
                "searchCondition.literatureSF": "(申请日='%s') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))" % date}
        html = requests.post(url, proxies=proxies, data=info, cookies=cookie, headers=head).json()
        page = html["searchResultDTO"]["pagination"]["totalCount"]
        print("共%d条记录" % page)
        records = html["searchResultDTO"]["searchResultRecord"]
        if not records:
            # Empty page: the server has no more results.
            print("检索完毕!")
            return result
        for rec in records:
            row = [rec["lawStatus"], rec["patentType"]]
            row.extend(rec["fieldMap"][k] for k in field_keys)
            result.append(row)
        pages += 1
# Run a full crawl for 2009-01-01 (network call; executes at import time).
test = patent_requests("20090101")
# No-op string literal left over from an interactive session: a stale cookie value.
"IS_LOGIN=true; WEE_SID=oi5Kv5x04H_asaak8hnaCfGhSkgSyurNL47sRSKG8RCihySrFPjj!792269866!-627030802!1543147330676; JSESSIONID=oi5Kv5x04H_asaak8hnaCfGhSkgSyurNL47sRSKG8RCihySrFPjj!792269866!-627030802"
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import selenium.webdriver.support.ui as ui
from bs4 import BeautifulSoup
import time
import json
import random
import requests
def get_cookies():
    """Log into the SIPO portal with Firefox and harvest the session cookies.

    Opens a visible Firefox window, fills in the stored credentials, asks the
    operator to type the CAPTCHA on stdin, submits the login form, and joins
    the resulting browser cookies into a single header-style string.

    :return: dict ``{"cookies_are": "<name=value; name=value; ...>"}`` in the
             shape the ``requests`` calls in this file expect.
    """
    options = Options()
    # options.add_argument('-headless')  # headless breaks manual CAPTCHA entry
    driver = webdriver.Firefox(executable_path=r'C:\Users\viemax\Desktop\geckodriver.exe', options=options)
    try:
        url = "http://www.pss-system.gov.cn/sipopublicsearch/portal/uiIndex.shtml"
        driver.get(url)
        time.sleep(1)
        # find_element_by_* was removed in Selenium 4; use By locators
        # (By is already imported at the top of this file).
        driver.find_element(By.ID, "j_username").send_keys("weimengxin2012")
        driver.find_element(By.ID, "j_password_show").send_keys("adwahads6136879")
        # The CAPTCHA cannot be automated: the operator reads it off the
        # browser window and types it here.
        valid = input("请手动输入验证码:")
        driver.find_element(By.ID, "j_validation_code").send_keys(valid)
        driver.find_element(By.LINK_TEXT, "登录").click()
        time.sleep(1)
        # Join every cookie instead of hard-indexing exactly three — the
        # original raised IndexError when the site set a different number.
        # NOTE(review): cookie order differs from the original hand-built
        # string; confirm the server does not depend on ordering.
        cookie_all = "; ".join("%s=%s" % (c["name"], c["value"]) for c in driver.get_cookies())
        return {"cookies_are": cookie_all}
    finally:
        # Always release the browser, even if login fails midway.
        driver.quit()
# Interactive login at import time; the cookie dict is reused by the requests below.
cookies_headless = get_cookies()
def _parse_record(record):
    """Flatten one ``searchResultRecord`` dict into a 19-element list.

    Output order: lawStatus, patentType, then 17 fieldMap values (title, doc
    numbers, ids, application/publication dates, IPC/CPC classes, applicant,
    inventor, address, agent, agency, and the trailing OWNER_STATUS column).
    """
    keys = ["TIVIEW", "FNUM", "CPNUM", "PNUM", "VID", "APSN", "APD",
            "PN_BAK", "PD", "IC", "CPC", "PAVIEW", "INVIEW", "AA",
            "AGT", "AGY",
            # NOTE(review): the original OWNER_STATUS column also read the
            # "AGY" key — almost certainly a copy-paste slip; behavior is
            # preserved, but confirm the intended key.
            "AGY"]
    field_map = record["fieldMap"]
    return [record["lawStatus"], record["patentType"]] + [field_map[k] for k in keys]


def requests_login_pss_system(date):
    """Crawl every result page for patents applied for on *date*.

    Uses the cookies harvested by :func:`get_cookies` (module-level
    ``cookies_headless``). Pages through the search endpoint 12 records at a
    time, sleeping between requests to stay under rate limits, until the
    server returns an empty record list.

    :param date: application date as a ``'YYYYMMDD'`` string.
    :return: list of 19-element lists, one per patent record.
    """
    result = []
    pages = 1
    fail_streak = 0  # consecutive totalCount == -1 responses
    url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Connection': 'keep-alive',
            'Origin': 'http://www.pss-system.gov.cn'}
    while True:
        info = {"searchCondition.searchExp": "(申请日='%s') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))" % date,
                "searchCondition.executableSearchExp": "VDB:((APD='%s' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))" % date,
                "resultPagination.start": (pages - 1) * 12,
                "resultPagination.limit": 12,
                "searchCondition.literatureSF": "(申请日='%s') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))" % date}
        s = requests.session()
        s.keep_alive = False
        try:
            html = s.post(url, headers=head, data=info, cookies=cookies_headless).json()
        finally:
            # Close the session even when the POST/JSON decode raises.
            s.close()
        # Randomized delay to avoid hammering the server.
        time.sleep(3 + random.random() * 2.5)
        page = html["searchResultDTO"]["pagination"]["totalCount"]
        if page == -1:
            # Server refused this request (throttling / stale session).
            # NOTE(review): the original skipped ahead on failure, losing the
            # failed page's records; that behavior is preserved here.
            fail_streak += 1
            print("page=-1,第%d页" % pages)
            pages += 1
            if fail_streak >= 10:
                # The original looped forever on persistent refusals; give up
                # after 10 in a row and return what was collected so far.
                print("连续%d次page=-1,提前结束" % fail_streak)
                return result
        else:
            fail_streak = 0
            print("共%d条记录" % page)
            records = html["searchResultDTO"]["searchResultRecord"]
            if not records:
                print("检索完毕!")
                return result
            for rec in records:
                row = _parse_record(rec)
                print(row)
                result.append(row)
            pages += 1
            time.sleep(2)
# Run a full crawl for 2016-01-01 (network call; executes at import time).
test = requests_login_pss_system("20160101")
import requests
import random
# --- One-off manual probe: fetch a single result page (page 4) directly ---
date = '20090101'
res = []
result = []
pages = 4
url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
    'Connection': 'keep-alive'}
# NOTE(review): cookies_js is built but never used — the request below sends
# cookies_headless (from get_cookies()); confirm which credential set is intended.
cookies_js = dict(cookies_are="IS_LOGIN=true; WEE_SID=iNxQozE_ihlq-6IwM2XQb6F3SKSTY30wNrb7e8YS9I7QMfu6QRax!1250085850!1820912863!1543246131519; avoid_declare=declare_pass; JSESSIONID=iNxQozE_ihlq-6IwM2XQb6F3SKSTY30wNrb7e8YS9I7QMfu6QRax!1250085850!1820912863")
# The three query expressions now interpolate `date` (previously a defined but
# unused variable while the queries hard-coded '20090101'); the resulting
# payload is byte-identical for the default date.
info = {
    "searchCondition.searchExp": "(申请日='%s') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))" % date,
    "searchCondition.executableSearchExp": "VDB:((APD='%s' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))" % date,
    "searchCondition.dbId": "VDB",
    "searchCondition.searchType": "Sino_foreign",
    "searchCondition.sortFields": "-APD,+PD",
    "searchCondition.extendInfo['MODE']": "MODE_TABLE",
    "resultPagination.start": (pages - 1) * 12,
    "resultPagination.limit": 12,
    "resultPagination.sumLimit": 10,
    "searchCondition.literatureSF": "(申请日='%s') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))" % date}
html = requests.post(url, headers=head, data=info, cookies=cookies_headless).json()