# -*- coding: utf-8 -*-
# @Time : 11/19/18 8:30 AM
# @Author : Wai Mengxin
# @Email : weimengxin2012@hotmail.com
# @File : zhuanli.py
# @Software: PyCharm
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import selenium.webdriver.support.ui as ui
from bs4 import BeautifulSoup
import time
import json
# def get_cookies():
# options = Options()
# # options.add_argument('-headless')
# driver = webdriver.Firefox(executable_path=r'C:\Users\lenovo\Desktop\geckodriver.exe', options=options)
# url = "http://www.pss-system.gov.cn/sipopublicsearch/portal/uiIndex.shtml"
# driver.get(url)
# time.sleep(1)
# driver.find_element_by_id("j_username").send_keys("weimengxin2012")
# driver.find_element_by_id("j_password_show").send_keys("adwahads6136879")
# valid = input("请手动输入验证码:")
# driver.find_element_by_id("j_validation_code").send_keys(valid)
# driver.find_element_by_link_text("登录").click()
# time.sleep(1)
# cookies = driver.get_cookies()
# driver.quit()
# return cookies
#
#
# cookies_list_1 = get_cookies() # 获取cookies信息
def read_cookies(path=r"C:\Users\lenovo\Desktop\data_2017\cookies.json"):
    """Load the saved browser cookies from a JSON file.

    The file must contain a JSON object whose "cookies" key holds the
    cookie list.

    Args:
        path: Location of the JSON file. Defaults to the path this script
            has always used, so existing zero-argument calls are unchanged.

    Returns:
        The value stored under the "cookies" key.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the JSON object has no "cookies" key.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(path, 'r', encoding="utf-8") as load_f:
        cookies_dict = json.load(load_f)
    return cookies_dict["cookies"]
def get_page(html_file):
    """Return the total page count shown on a search-result page.

    Args:
        html_file: HTML source of the result page.

    Returns:
        The page count as an int.

    Raises:
        IndexError: If the page has no <div class="page_top"> element.
        ValueError: If the extracted substring is not an integer.
    """
    soup = BeautifulSoup(html_file, "lxml")
    # The last "page_top" div is the one whose text is parsed.
    pager_text = soup.find_all("div", class_="page_top")[-1].get_text()
    # The number is the slice from index 6 up to the first "页" found at or
    # after index 5 (fixed offsets into the pager text).
    end = pager_text.find("页", 5)
    return int(pager_text[6:end])
def get_data(html_file):
    """Extract one row of fields for every patent listed on a result page.

    Each row is built in this order:
      1. the ``title`` attribute of the item's first <a>;
      2. the text of every <a> in the item's "btn-group left clear" div,
         with "NaN" inserted at index 1 when that slot is not '【公开】';
      3. the text of every <p> in the item's "item-content-body left" div.

    Args:
        html_file: HTML source of the result page.

    Returns:
        A list of rows, each row a list of str.

    Raises:
        IndexError: If the page has no <div class="list-container">.
    """
    soup = BeautifulSoup(html_file, "lxml")
    items = soup.find_all("div", class_="list-container")[0].find_all("li")

    rows = []
    for item in items:
        # A fresh list per item replaces the old shared buffer + copy/clear.
        row = [item.a["title"]]

        links = item.find("div", class_="btn-group left clear").find_all("a")
        row.extend(link.get_text() for link in links)

        # Keep the columns aligned when the '【公开】' link is absent. The
        # length guard covers an item with no links at all, which used to
        # raise IndexError on row[1].
        if len(row) < 2 or row[1] != '【公开】':
            row.insert(1, "NaN")

        paragraphs = item.find("div", class_="item-content-body left").find_all("p")
        row.extend(paragraph.get_text() for paragraph in paragraphs)

        rows.append(row)
    return rows
_SEARCH_URL = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/tableSearch-showTableSearchIndex.shtml"
_OVERLAY_XPATH = "//div[@class='blockUI blockOverlay']"


def _new_firefox_driver():
    """Start Firefox with the resource-saving profile actually attached."""
    options = Options()
    # options.add_argument('-headless')
    profile = webdriver.FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)  # images off
    profile.set_preference('browser.migration.version', 9001)  # needed by some Firefox builds
    profile.set_preference('permissions.default.stylesheet', 2)  # CSS off
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')  # Flash off
    # The profile used to be thrown away because its variable was reused for
    # the driver; hand it to the browser through the options object.
    options.profile = profile
    driver = webdriver.Firefox(executable_path=r'C:\Users\lenovo\Desktop\geckodriver.exe', options=options)
    driver.implicitly_wait(12)
    return driver


def _click_when_visible(driver, xpath, timeout=10):
    """Wait until the element at *xpath* is visible, then click it."""
    ui.WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, xpath)))
    driver.find_element_by_xpath(xpath).click()


def _wait_overlay_gone(driver, timeout, refresh_on_timeout=True):
    """Wait for the blockUI overlay to stop being visible.

    On timeout the page is refreshed (the script's existing recovery step)
    unless *refresh_on_timeout* is false, in which case the timeout is raised.
    """
    from selenium.common.exceptions import TimeoutException

    try:
        ui.WebDriverWait(driver, timeout).until_not(
            EC.visibility_of_element_located((By.XPATH, _OVERLAY_XPATH)))
    except TimeoutException:
        if not refresh_on_timeout:
            raise
        driver.refresh()


def get_result_list(date):
    """Run a table search for one application date and scrape every page.

    Opens Firefox, restores the saved login cookies, toggles the region and
    field filters, submits ``申请日=<date>`` and walks the result pages.

    Args:
        date: Application-date expression appended to "申请日=" (str).

    Returns:
        A list of rows as produced by ``get_data``. If paging fails midway,
        the rows collected so far are returned.

    Raises:
        selenium.common.exceptions.WebDriverException: If the search setup
            (anything before paging starts) fails. The browser is closed
            in every case.
    """
    result = []
    driver2 = _new_firefox_driver()
    try:
        driver2.get(_SEARCH_URL)
        driver2.delete_all_cookies()
        cookies_list = read_cookies()  # saved login cookies
        for c in cookies_list:
            driver2.add_cookie(c)
        driver2.refresh()
        time.sleep(1)

        # Region/type filters: CN invention, CN utility model, CN design,
        # Hong Kong, Macao, Taiwan. The TW wait used to be given the cookie
        # loop variable instead of an XPath.
        for value in ('I', 'U', 'D', 'HK', 'MO', 'TW'):
            _click_when_visible(driver2, "//a[@value='%s']" % value)

        ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.ID, "searchExpDisplay")))
        driver2.find_element_by_id("searchExpDisplay").send_keys("申请日=" + date)
        driver2.find_element_by_xpath("//div[@class='box-content-bottom']/a[3]").click()
        _wait_overlay_gone(driver2, 40, refresh_on_timeout=False)

        driver2.find_element_by_xpath("//a[@class='btn filter_btn']").click()
        time.sleep(2)
        ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@role='alertdialog']")))
        # Display-field toggles: priority number, Locarno design class,
        # agent, agency, C-SETS.
        for field in ('IVDB047', 'IVDB090', 'IVDB102', 'IVDB103', 'IVDB113'):
            driver2.find_element_by_xpath("//input[@value='%s']" % field).click()
        driver2.find_element_by_xpath("//button[@data-id='应用']").click()  # "Apply"

        ui.WebDriverWait(driver2, 60).until(EC.visibility_of_element_located((By.XPATH, "//div[@id='resultList']")))
        _wait_overlay_gone(driver2, 60)

        html = driver2.page_source
        page = get_page(html)
        result.extend(get_data(html))
        t = 1  # number of pages scraped so far
        print("-{申请日:%s}-已完成第%d页检索,共%d页" % (date, t, page))

        try:
            # Strictly "<": once t == page the last page is already scraped,
            # so "next" must not be clicked again.
            while t < page:
                _wait_overlay_gone(driver2, 50)
                driver2.find_element_by_link_text("下一页").click()
                # NOTE(review): the source is read right after the click, as
                # before; confirm the new page has finished loading here.
                result.extend(get_data(driver2.page_source))
                t += 1
                print("-{申请日:%s}-已完成第%d页检索,共%d页" % (date, t, page))
        except Exception as exc:
            # Best effort, as before: keep what was scraped, but report why
            # paging stopped instead of hiding it.
            print("-{申请日:%s}-翻页中断:%r" % (date, exc))
        print("-{申请日:%s}-检索完毕!" % date)
    finally:
        # Always release the browser, including when setup fails.
        driver2.quit()
    return result
# start = time.time()
# intelligence_rights_20170101_20170110 = get_result_list("20170101:20170110")
# end = time.time()
# print("总共耗时:%f" % (end - start))
import multiprocessing as mp
import pss_system
import time
def output_data(all_data, output_file_name, output_dir="C:\\Users\\lenovo\\Desktop\\"):
    """Write the scraped rows to a CSV file and return them as a DataFrame.

    Args:
        all_data: Sequence of rows, each with 14 values matching the column
            list below.
        output_file_name: File name without the ".csv" extension.
        output_dir: Directory to write into. Defaults to the desktop folder
            this script has always used.

    Returns:
        The pandas DataFrame that was written.
    """
    import os

    import pandas as pd

    name = ["专利名", "是否公开", "同族", "引证", "被引", "申请号", "申请日", "公开号", "公开日", "IPC", "申请人", "发明人", "CPC", "权利要求"]
    table = pd.DataFrame(columns=name, data=all_data)
    table.to_csv(os.path.join(output_dir, output_file_name + ".csv"))
    return table
if __name__ == "__main__":
    # One search per application date: 20170111 through 20170131.
    date_list = [str(day) for day in range(20170111, 20170132)]
    # date_list = ["20170111:20170115", "20170116:20170117", "2017018:20170119", "20170120:20170122", "20170123:20170131",
    # "20170201::20170211","20170212:20170215", "20170216:20170220"]
    start = time.time()
    # The context manager shuts the worker pool down once map() has returned;
    # the pool used to be left open. Pool size stays at 1 as before.
    with mp.Pool(1) as pool:
        res_uid = pool.map(pss_system.get_result_list, date_list)
    end = time.time()
    print("总共耗时:%f" % (end - start))
import json
# Scratch persistence of scraped data.
# NOTE(review): `intelligence_rights_20170101_20170110` and `result` are not
# assigned by any live code visible here; they must exist before this runs.
with open(r'C:\Users\lenovo\Desktop\sample.json', 'w') as fileObject:
    for ip in intelligence_rights_20170101_20170110:
        # write() only accepts str; non-string rows (e.g. lists) are
        # serialized as JSON instead of raising TypeError.
        fileObject.write(ip if isinstance(ip, str) else json.dumps(ip))
        fileObject.write('\n')
data_dict = dict()
data_dict["20170101_20170110"] = intelligence_rights_20170101_20170110
# NOTE(review): this dump is overwritten below before it is written anywhere.
jsObj = json.dumps(data_dict)
cookie = dict()
# NOTE(review): read_cookies() reads the key "cookies" from this same file,
# but the key written here is "data" - confirm which one is intended.
cookie["data"] = result
jsObj = json.dumps(cookie)
with open(r"C:\Users\lenovo\Desktop\data_2017\cookies.json", 'w') as fileObject:
    fileObject.write(jsObj)
def read_cookies(path=r"C:\Users\lenovo\Desktop\data_2017\cookies.json"):
    """Load the saved browser cookies from a JSON file.

    The file must contain a JSON object whose "cookies" key holds the
    cookie list.

    Args:
        path: Location of the JSON file. Defaults to the path this script
            has always used, so existing zero-argument calls are unchanged.

    Returns:
        The value stored under the "cookies" key.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the JSON object has no "cookies" key.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(path, 'r', encoding="utf-8") as load_f:
        cookies_dict = json.load(load_f)
    return cookies_dict["cookies"]
# json.load() needs an open file object, not a path string; passing the path
# raised AttributeError because str has no read() method.
with open(r"C:\Users\lenovo\Desktop\jsonFile.json", 'r') as json_file:
    test = json.load(json_file)
import requests
# Direct POST to the search-result endpoint, bypassing the browser.
# url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/tableSearch-showTableSearchIndex.shtml"
url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Connection': 'keep-alive'}
# Raw "name=value; name=value" cookie string copied from a logged-in session.
# It used to be sent as ONE cookie named "cookies_are", so the server never
# received WEE_SID / IS_LOGIN / JSESSIONID; split it into separate cookies.
_raw_cookie = "WEE_SID=w9FKZ8cmfSrotk0yNqM8Ijq_XqJlxoR_dV2TSaziEXkNLhyFg-iW!792269866!-627030802!1543141574438; IS_LOGIN=true; JSESSIONID=w9FKZ8cmfSrotk0yNqM8Ijq_XqJlxoR_dV2TSaziEXkNLhyFg-iW!792269866!-627030802"
cookie = dict(pair.split("=", 1) for pair in _raw_cookie.split("; "))
info = {"searchCondition.searchExp": "(申请日=20170101) AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))",
        "searchCondition.executableSearchExp": "VDB:((APD='20170101' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))",
        "resultPagination.start": 24,
        "resultPagination.limit": 12,
        "searchCondition.literatureSF": "(申请日=20170101) AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))"}
# A timeout stops the script from hanging forever on an unresponsive server.
# Despite its name, `html` holds the decoded JSON body of the response.
html = requests.post(url, data=info, cookies=cookie, headers=head, timeout=30).json()