1. # -*- coding: utf-8 -*-
    2. # @Time : 11/19/18 8:30 AM
    3. # @Author : Wai Mengxin
    4. # @Email : weimengxin2012@hotmail.com
    5. # @File : zhuanli.py
    6. # @Software: PyCharm
    7. from selenium import webdriver
    8. from selenium.webdriver.firefox.options import Options
    9. from selenium.webdriver.support.wait import WebDriverWait
    10. from selenium.webdriver.common.by import By
    11. from selenium.webdriver.support import expected_conditions as EC
    12. import selenium.webdriver.support.ui as ui
    13. from bs4 import BeautifulSoup
    14. import time
    15. import json
    16. # def get_cookies():
    17. # options = Options()
    18. # # options.add_argument('-headless')
    19. # driver = webdriver.Firefox(executable_path=r'C:\Users\lenovo\Desktop\geckodriver.exe', options=options)
    20. # url = "http://www.pss-system.gov.cn/sipopublicsearch/portal/uiIndex.shtml"
    21. # driver.get(url)
    22. # time.sleep(1)
    23. # driver.find_element_by_id("j_username").send_keys("weimengxin2012")
    24. # driver.find_element_by_id("j_password_show").send_keys("adwahads6136879")
    25. # valid = input("请手动输入验证码:")
    26. # driver.find_element_by_id("j_validation_code").send_keys(valid)
    27. # driver.find_element_by_link_text("登录").click()
    28. # time.sleep(1)
    29. # cookies = driver.get_cookies()
    30. # driver.quit()
    31. # return cookies
    32. #
    33. #
    34. # cookies_list_1 = get_cookies() # 获取cookies信息
    35. def read_cookies():
    36. with open(r"C:\Users\lenovo\Desktop\data_2017\cookies.json", 'r') as load_f:
    37. cookies_dict = json.load(load_f)
    38. return cookies_dict["cookies"]
    39. def get_page(html_file):
    40. obj = BeautifulSoup(html_file, "lxml")
    41. page = obj.find_all("div", class_="page_top")[-1].get_text()
    42. last = page.find("页", 5)
    43. all_page = page[6:last]
    44. page_num = int(all_page)
    45. # print("总共%d页" % page_num)
    46. return page_num
    47. def get_data(html_file):
    48. res = []
    49. result = []
    50. obj = BeautifulSoup(html_file, "lxml")
    51. data = obj.find_all("div", class_="list-container")[0].find_all("li")
    52. for i in data:
    53. title = i.a["title"]
    54. res.append(title) # 获取专利title
    55. refer = i.find("div", class_="btn-group left clear").find_all("a")
    56. for k in refer:
    57. res.append(k.get_text())
    58. if res[1] != '【公开】':
    59. res.insert(1, "NaN") # 获取专利引证关系
    60. info = i.find("div", class_="item-content-body left").find_all("p")
    61. for m in info:
    62. res.append(m.get_text()) # 获取专利其他各类信息
    63. res_copy = res.copy()
    64. result.append(res_copy)
    65. res.clear()
    66. return result
    67. def get_result_list(date):
    68. result = []
    69. options = Options()
    70. # options.add_argument('-headless')
    71. driver2 = webdriver.FirefoxProfile()
    72. driver2.set_preference('permissions.default.image', 2) # 某些firefox只需要这个
    73. driver2.set_preference('browser.migration.version', 9001) # 部分需要加上禁用css
    74. driver2.set_preference('permissions.default.stylesheet', 2) #禁用flash
    75. driver2.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false') #禁用js
    76. driver2 = webdriver.Firefox(executable_path=r'C:\Users\lenovo\Desktop\geckodriver.exe', options=options)
    77. driver2.implicitly_wait(12)
    78. driver2.get("http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/tableSearch-showTableSearchIndex.shtml")
    79. driver2.delete_all_cookies()
    80. cookies_list = read_cookies() # 读取cookies数据
    81. for c in cookies_list:
    82. driver2.add_cookie(c)
    83. driver2.refresh()
    84. time.sleep(1)
    85. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, "//a[@value='I']")))
    86. driver2.find_element_by_xpath("//a[@value='I']").click() # "中国发明申请"
    87. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, "//a[@value='U']")))
    88. driver2.find_element_by_xpath("//a[@value='U']").click() # "中国实用新型"
    89. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, "//a[@value='D']")))
    90. driver2.find_element_by_xpath("//a[@value='D']").click() # "中国外观设计"
    91. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, "//a[@value='HK']")))
    92. driver2.find_element_by_xpath("//a[@value='HK']").click() # 香港
    93. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, "//a[@value='MO']")))
    94. driver2.find_element_by_xpath("//a[@value='MO']").click() # 澳门
    95. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, c)))
    96. driver2.find_element_by_xpath("//a[@value='TW']").click() # 台湾
    97. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.ID, "searchExpDisplay")))
    98. driver2.find_element_by_id("searchExpDisplay").send_keys("申请日=" + date)
    99. driver2.find_element_by_xpath("//div[@class='box-content-bottom']/a[3]").click()
    100. ui.WebDriverWait(driver2, 40).until_not(EC.visibility_of_element_located((By.XPATH, "//div[@class='blockUI blockOverlay']")))
    101. driver2.find_element_by_xpath("//a[@class='btn filter_btn']").click()
    102. time.sleep(2)
    103. ui.WebDriverWait(driver2, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@role='alertdialog']")))
    104. driver2.find_element_by_xpath("//input[@value='IVDB047']").click() # 优先权号
    105. driver2.find_element_by_xpath("//input[@value='IVDB090']").click() # 外观设计洛迦诺分类号
    106. driver2.find_element_by_xpath("//input[@value='IVDB102']").click() # 代理人
    107. driver2.find_element_by_xpath("//input[@value='IVDB103']").click() # 代理机构
    108. driver2.find_element_by_xpath("//input[@value='IVDB113']").click() # C-SETS
    109. driver2.find_element_by_xpath("//button[@data-id='应用']").click() # 应用
    110. ui.WebDriverWait(driver2, 60).until(EC.visibility_of_element_located((By.XPATH, "//div[@id='resultList']")))
    111. try:
    112. ui.WebDriverWait(driver2, 60).until_not(EC.visibility_of_element_located((By.XPATH, "//div[@class='blockUI blockOverlay']")))
    113. except:
    114. driver2.refresh()
    115. html = driver2.page_source
    116. page = get_page(html)
    117. res = get_data(html)
    118. result.extend(res)
    119. t = 1
    120. print("-{申请日:%s}-已完成第%d页检索,共%d页" % (date, t, page))
    121. while True:
    122. try:
    123. if t <= page:
    124. try:
    125. ui.WebDriverWait(driver2, 50).until_not(EC.visibility_of_element_located((By.XPATH, "//div[@class='blockUI blockOverlay']")))
    126. except:
    127. driver2.refresh()
    128. driver2.find_element_by_link_text("下一页").click()
    129. html = driver2.page_source
    130. res = get_data(html)
    131. result.extend(res)
    132. t += 1
    133. print("-{申请日:%s}-已完成第%d页检索,共%d页" % (date, t, page))
    134. else:
    135. driver2.quit()
    136. break
    137. except:
    138. driver2.quit()
    139. print("-{申请日:%s}-检索完毕!" % date)
    140. break
    141. return result
    142. # start = time.time()
    143. # intelligence_rights_20170101_20170110 = get_result_list("20170101:20170110")
    144. # end = time.time()
    145. # print("总共耗时:%f" % (end - start))
    1. import multiprocessing as mp
    2. import pss_system
    3. import time
    4. def output_data(all_data, output_file_name):
    5. '''该函数将最终数据输出为CSV文件'''
    6. import pandas as pd
    7. name = ["专利名", "是否公开", "同族", "引证", "被引", "申请号", "申请日", "公开号", "公开日", "IPC", "申请人", "发明人", "CPC", "权利要求"]
    8. table = pd.DataFrame(columns=name, data=all_data)
    9. table.to_csv("C:\\Users\\lenovo\\Desktop\\" + output_file_name + ".csv")
    10. return table
    11. if __name__ == "__main__":
    12. # 多进程并发
    13. date_list = []
    14. for i in range(20170111, 20170132):
    15. date_list.append(str(i))
    16. # date_list = ["20170111:20170115", "20170116:20170117", "2017018:20170119", "20170120:20170122", "20170123:20170131",
    17. # "20170201::20170211","20170212:20170215", "20170216:20170220"]
    18. start = time.time()
    19. pool = mp.Pool(1)
    20. res_uid = pool.map(pss_system.get_result_list, date_list)
    21. end = time.time()
    22. print("总共耗时:%f" % (end - start))
    1. import json
    2. fileObject = open(r'C:\Users\lenovo\Desktop\sample.json', 'w')
    3. for ip in intelligence_rights_20170101_20170110:
    4. fileObject.write(ip)
    5. fileObject.write('\n')
    6. fileObject.close()
    7. data_dict = dict()
    8. data_dict["20170101_20170110"] = intelligence_rights_20170101_20170110
    9. jsObj = json.dumps(data_dict)
    10. cookie = dict()
    11. cookie["data"] = result
    12. jsObj = json.dumps(cookie)
    13. fileObject = open(r"C:\Users\lenovo\Desktop\data_2017\cookies.json", 'w')
    14. fileObject.write(jsObj)
    15. fileObject.close()
    16. def read_cookies():
    17. with open(r"C:\Users\lenovo\Desktop\data_2017\cookies.json", 'r') as load_f:
    18. cookies_dict = json.load(load_f)
    19. return cookies_dict["cookies"]
    20. test = json.load(r"C:\Users\lenovo\Desktop\jsonFile.json")
    1. import requests
    2. # url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/tableSearch-showTableSearchIndex.shtml"
    3. url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"
    4. head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    5. 'Connection': 'keep-alive'}
    6. cookie = dict(cookies_are="WEE_SID=w9FKZ8cmfSrotk0yNqM8Ijq_XqJlxoR_dV2TSaziEXkNLhyFg-iW!792269866!-627030802!1543141574438; IS_LOGIN=true; JSESSIONID=w9FKZ8cmfSrotk0yNqM8Ijq_XqJlxoR_dV2TSaziEXkNLhyFg-iW!792269866!-627030802")
    7. info = {"searchCondition.searchExp": "(申请日=20170101) AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))",
    8. "searchCondition.executableSearchExp": "VDB:((APD='20170101' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))",
    9. "resultPagination.start": 24,
    10. "resultPagination.limit": 12,
    11. "searchCondition.literatureSF": "(申请日=20170101) AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))"}
    12. html = requests.post(url, data=info, cookies=cookie, headers=head).json()