# -*- coding: utf-8 -*-
# @Time : 11/19/18 8:30 AM
# @Author : Wai Mengxin
# @Email : weimengxin2012@hotmail.com
# @File : zhuanli.py
# @Software: PyCharm
"""Scrape patent search results from pss-system.gov.cn with Selenium and Firefox.

The entry point is :func:`get_result_list`, which runs one table search for an
application date (or date range) and returns one row per patent found.
"""
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.ui as ui

GECKODRIVER_PATH = r'C:\Users\lenovo\Desktop\geckodriver.exe'
COOKIES_PATH = r"C:\Users\lenovo\Desktop\data_2017\cookies.json"
SEARCH_URL = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/tableSearch-showTableSearchIndex.shtml"
# Overlay the site shows while a request is in flight.
OVERLAY_XPATH = "//div[@class='blockUI blockOverlay']"
# Links that select which publications to search, in click order:
# I = China invention application, U = China utility model, D = China design,
# HK = Hong Kong, MO = Macau, TW = Taiwan.
REGION_XPATHS = (
    "//a[@value='I']",
    "//a[@value='U']",
    "//a[@value='D']",
    "//a[@value='HK']",
    "//a[@value='MO']",
    "//a[@value='TW']",
)
# Checkboxes clicked in the display-field filter dialog, in click order:
# priority number, Locarno classification (designs), agent, agency, C-SETS.
FILTER_FIELD_XPATHS = (
    "//input[@value='IVDB047']",
    "//input[@value='IVDB090']",
    "//input[@value='IVDB102']",
    "//input[@value='IVDB103']",
    "//input[@value='IVDB113']",
)

# Disabled reference recipe for obtaining the login cookies (it needs a human
# to type the captcha). The account name and password that used to be
# hard-coded here have been removed; never commit real credentials.
#
# def get_cookies():
#     options = Options()
#     # options.add_argument('-headless')
#     driver = webdriver.Firefox(executable_path=r'C:\Users\lenovo\Desktop\geckodriver.exe', options=options)
#     url = "http://www.pss-system.gov.cn/sipopublicsearch/portal/uiIndex.shtml"
#     driver.get(url)
#     time.sleep(1)
#     driver.find_element_by_id("j_username").send_keys("<username>")
#     driver.find_element_by_id("j_password_show").send_keys("<password>")
#     valid = input("请手动输入验证码:")
#     driver.find_element_by_id("j_validation_code").send_keys(valid)
#     driver.find_element_by_link_text("登录").click()
#     time.sleep(1)
#     cookies = driver.get_cookies()
#     driver.quit()
#     return cookies
#
#
# cookies_list_1 = get_cookies()  # fetch the cookies


def read_cookies():
    """Return the value stored under ``"cookies"`` in the JSON file at COOKIES_PATH.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the JSON document has no ``"cookies"`` key.
    """
    with open(COOKIES_PATH, 'r') as load_f:
        cookies_dict = json.load(load_f)
    return cookies_dict["cookies"]


def get_page(html_file):
    """Return the total number of result pages shown in *html_file*.

    The count is sliced out of the text of the last ``div.page_top`` element:
    everything from index 6 up to the first "页" found at or after index 5.

    Raises:
        IndexError: if the page has no ``div.page_top`` element.
        ValueError: if the sliced text is not an integer.
    """
    obj = BeautifulSoup(html_file, "lxml")
    page = obj.find_all("div", class_="page_top")[-1].get_text()
    # NOTE(review): the fixed offsets assume the pager text layout of the
    # site at the time of writing - confirm if parsing starts to fail.
    last = page.find("页", 5)
    return int(page[6:last])


def get_data(html_file):
    """Extract one row per patent from a result page.

    Each row is a list of strings: the title, the texts of the links in the
    reference button group (with "NaN" inserted at index 1 when that slot is
    not '【公开】'), followed by the texts of the ``<p>`` elements of the
    item body.

    Args:
        html_file: HTML source of a result page.

    Returns:
        list[list[str]]: one row per ``<li>`` of the first ``div.list-container``.
    """
    result = []
    obj = BeautifulSoup(html_file, "lxml")
    data = obj.find_all("div", class_="list-container")[0].find_all("li")
    for i in data:
        row = [i.a["title"]]  # patent title
        refer = i.find("div", class_="btn-group left clear").find_all("a")
        row.extend(k.get_text() for k in refer)  # citation / family links
        # Keep the columns aligned when the "published" marker is missing.
        # The length check also covers an item with no links at all, which
        # used to raise IndexError.
        if len(row) < 2 or row[1] != '【公开】':
            row.insert(1, "NaN")
        info = i.find("div", class_="item-content-body left").find_all("p")
        row.extend(m.get_text() for m in info)  # remaining bibliographic fields
        result.append(row)
    return result


def _build_light_profile():
    """Build a Firefox profile that skips images, stylesheets and Flash."""
    profile = webdriver.FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)
    # Some Firefox versions only need the image preference above.
    profile.set_preference('browser.migration.version', 9001)
    # Others additionally need CSS disabled.
    profile.set_preference('permissions.default.stylesheet', 2)
    # Disable Flash.
    # NOTE(review): the value is the string 'false', kept from the original;
    # confirm whether this preference expects a real boolean.
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
    return profile


def _click_when_visible(driver, xpath, timeout=10):
    """Wait until the element at *xpath* is visible, then click it."""
    ui.WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.XPATH, xpath)))
    driver.find_element_by_xpath(xpath).click()


def _wait_overlay_gone_or_refresh(driver, timeout):
    """Wait for the loading overlay to vanish; refresh the page if the wait fails."""
    try:
        ui.WebDriverWait(driver, timeout).until_not(
            EC.visibility_of_element_located((By.XPATH, OVERLAY_XPATH)))
    except WebDriverException:  # includes TimeoutException
        driver.refresh()


def _run_search(driver, date):
    """Log in with saved cookies, run the search for *date* and apply the field filter."""
    driver.get(SEARCH_URL)
    driver.delete_all_cookies()
    for c in read_cookies():  # restore the logged-in session
        driver.add_cookie(c)
    driver.refresh()
    time.sleep(1)

    # The original waited on the leftover cookie variable `c` before clicking
    # the TW link; every region link now waits on its own XPath.
    for xpath in REGION_XPATHS:
        _click_when_visible(driver, xpath)

    ui.WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, "searchExpDisplay")))
    driver.find_element_by_id("searchExpDisplay").send_keys("申请日=" + date)
    driver.find_element_by_xpath("//div[@class='box-content-bottom']/a[3]").click()
    ui.WebDriverWait(driver, 40).until_not(
        EC.visibility_of_element_located((By.XPATH, OVERLAY_XPATH)))

    driver.find_element_by_xpath("//a[@class='btn filter_btn']").click()
    time.sleep(2)
    ui.WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, "//div[@role='alertdialog']")))
    for xpath in FILTER_FIELD_XPATHS:
        driver.find_element_by_xpath(xpath).click()
    driver.find_element_by_xpath("//button[@data-id='应用']").click()  # "apply" button

    ui.WebDriverWait(driver, 60).until(
        EC.visibility_of_element_located((By.XPATH, "//div[@id='resultList']")))
    _wait_overlay_gone_or_refresh(driver, 60)


def _collect_pages(driver, date):
    """Parse the current result page and every following page; return all rows.

    A failure while paging is reported and ends the collection early; the rows
    gathered so far are still returned (best effort, as in the original).
    """
    html = driver.page_source
    page = get_page(html)
    result = list(get_data(html))
    t = 1
    print("-{申请日:%s}-已完成第%d页检索,共%d页" % (date, t, page))
    try:
        # Page 1 is already parsed, so only page - 1 "next" clicks are needed
        # (the original `t <= page` clicked once more after the last page).
        while t < page:
            _wait_overlay_gone_or_refresh(driver, 50)
            driver.find_element_by_link_text("下一页").click()
            # NOTE(review): the source is read right after the click; the
            # overlay wait only happens at the start of the next iteration.
            html = driver.page_source
            result.extend(get_data(html))
            t += 1
            print("-{申请日:%s}-已完成第%d页检索,共%d页" % (date, t, page))
    except Exception as exc:  # best effort: keep what was collected so far
        print("-{申请日:%s}-第%d页之后检索中断:%r" % (date, t, exc))
    print("-{申请日:%s}-检索完毕!" % date)
    return result


def get_result_list(date, use_light_profile=False):
    """Run one patent search for *date* and return all result rows.

    Args:
        date: application date expression typed after "申请日=", e.g.
            "20170101" or a range such as "20170101:20170110".
        use_light_profile: when True, start Firefox with the profile from
            ``_build_light_profile``. The original code built that profile
            but never passed it to the driver, so the default (False) keeps
            the historical behaviour. NOTE(review): disabling CSS may affect
            the visibility waits - verify before enabling.

    Returns:
        list[list[str]]: rows as produced by :func:`get_data`, for every page
        that could be fetched.

    Raises:
        selenium.common.exceptions.WebDriverException: if the search cannot be
            set up (errors while paging are reported and swallowed instead).
    """
    options = Options()
    # options.add_argument('-headless')
    driver_kwargs = {"executable_path": GECKODRIVER_PATH, "options": options}
    if use_light_profile:
        driver_kwargs["firefox_profile"] = _build_light_profile()
    driver2 = webdriver.Firefox(**driver_kwargs)
    try:
        driver2.implicitly_wait(12)
        _run_search(driver2, date)
        return _collect_pages(driver2, date)
    finally:
        # Always release the browser, including when setup fails.
        driver2.quit()


# Example usage:
# start = time.time()
# intelligence_rights_20170101_20170110 = get_result_list("20170101:20170110")
# end = time.time()
# print("总共耗时:%f" % (end - start))
import multiprocessing as mp
import time

import pss_system


def output_data(all_data, output_file_name, output_dir="C:\\Users\\lenovo\\Desktop\\"):
    """Write the final data to a CSV file and return it as a DataFrame.

    Args:
        all_data: rows to write; each row must match the 14 column names below.
        output_file_name: file name without the ".csv" extension.
        output_dir: directory prefix, concatenated as-is in front of the file
            name, so it must end with a path separator. Defaults to the
            original hard-coded desktop folder.

    Returns:
        pandas.DataFrame: the table that was written.
    """
    import pandas as pd  # local import: only needed when exporting

    name = ["专利名", "是否公开", "同族", "引证", "被引", "申请号", "申请日", "公开号", "公开日", "IPC", "申请人", "发明人", "CPC",
            "权利要求"]
    table = pd.DataFrame(columns=name, data=all_data)
    table.to_csv(output_dir + output_file_name + ".csv")
    return table


if __name__ == "__main__":
    # Run the searches through a process pool, one application date per task.
    date_list = [str(i) for i in range(20170111, 20170132)]
    # Alternative: search by date range instead of day by day.
    # date_list = ["20170111:20170115", "20170116:20170117", "20170118:20170119", "20170120:20170122", "20170123:20170131",
    #              "20170201:20170211", "20170212:20170215", "20170216:20170220"]
    start = time.time()
    # The context manager shuts the pool down once the map has finished
    # (the original never closed it).
    with mp.Pool(1) as pool:
        res_uid = pool.map(pss_system.get_result_list, date_list)
    # NOTE(review): res_uid (one list of rows per date) is never written out;
    # output_data is not called here - confirm whether that is intended.
    end = time.time()
    print("总共耗时:%f" % (end - start))
import json

# NOTE(review): `intelligence_rights_20170101_20170110` and `result` are not
# defined in this file. Presumably they come from an interactive session
# (search rows from get_result_list and the browser cookies, respectively) -
# confirm before running this as a script.

# Dump the search rows, one JSON array per line. Each row is a list, so it
# has to be serialised first; passing it to write() directly raises TypeError.
with open(r'C:\Users\lenovo\Desktop\sample.json', 'w', encoding='utf-8') as fileObject:
    for ip in intelligence_rights_20170101_20170110:
        fileObject.write(json.dumps(ip, ensure_ascii=False))
        fileObject.write('\n')

data_dict = dict()
data_dict["20170101_20170110"] = intelligence_rights_20170101_20170110
# This serialisation is never written anywhere; jsObj is overwritten below.
jsObj = json.dumps(data_dict)

# Save the cookies under the "cookies" key, which is the key read_cookies()
# looks up in this same file (the original stored them under "data").
cookie = dict()
cookie["cookies"] = result
jsObj = json.dumps(cookie)
with open(r"C:\Users\lenovo\Desktop\data_2017\cookies.json", 'w') as fileObject:
    fileObject.write(jsObj)


def read_cookies():
    """Return the value stored under ``"cookies"`` in the saved cookies JSON file.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the JSON document has no ``"cookies"`` key.
    """
    with open(r"C:\Users\lenovo\Desktop\data_2017\cookies.json", 'r') as load_f:
        cookies_dict = json.load(load_f)
    return cookies_dict["cookies"]


# json.load expects an open file object, not a path string.
with open(r"C:\Users\lenovo\Desktop\jsonFile.json", 'r') as load_f:
    test = json.load(load_f)
import requests

# Query the search-result endpoint directly, without a browser.
# url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/tableSearch-showTableSearchIndex.shtml"
url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Connection': 'keep-alive'}

# NOTE(review): hard-coded session cookies copied from a logged-in browser;
# they expire and should not live in source control.
_cookie_header = "WEE_SID=w9FKZ8cmfSrotk0yNqM8Ijq_XqJlxoR_dV2TSaziEXkNLhyFg-iW!792269866!-627030802!1543141574438; IS_LOGIN=true; JSESSIONID=w9FKZ8cmfSrotk0yNqM8Ijq_XqJlxoR_dV2TSaziEXkNLhyFg-iW!792269866!-627030802"
# requests sends one cookie per dict entry. The original passed the whole
# header as the value of a single cookie named "cookies_are", so WEE_SID,
# IS_LOGIN and JSESSIONID never reached the server as cookies.
cookie = dict(pair.split("=", 1) for pair in _cookie_header.split("; "))

info = {"searchCondition.searchExp": "(申请日=20170101) AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))",
        "searchCondition.executableSearchExp": "VDB:((APD='20170101' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))",
        "resultPagination.start": 24,
        "resultPagination.limit": 12,
        "searchCondition.literatureSF": "(申请日=20170101) AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))"}

# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.post(url, data=info, cookies=cookie, headers=head, timeout=30)
# Fail with the HTTP status instead of a confusing JSON decode error.
response.raise_for_status()
html = response.json()