1. from selenium import webdriver
    2. from selenium.webdriver.firefox.options import Options
    3. from selenium.webdriver.support.wait import WebDriverWait
    4. from selenium.webdriver.common.by import By
    5. from selenium.webdriver.support import expected_conditions as EC
    6. import selenium.webdriver.support.ui as ui
    7. from bs4 import BeautifulSoup
    8. import time
    9. import json
    10. import random
    11. import requests
    def get_cookies(
        username="weimengxin2018",
        password="adwahads6136879",
        driver_path="/Users/viemaxwei/Downloads/geckodriver 2",
    ):
        """Log in to the PSS patent-search portal in Firefox and return its cookies.

        The login form is filled in automatically, but the captcha must be typed
        by the operator at the console prompt.

        NOTE(review): the default arguments keep the account credentials in the
        source file; prefer passing them in from the caller.

        Args:
            username: Value typed into the ``j_username`` field.
            password: Value typed into the ``j_password_show`` field.
            driver_path: Filesystem path of the geckodriver executable.

        Returns:
            A dict with the single key ``"cookies_are"`` whose value is the
            first three browser cookies rendered as ``name=value`` and joined
            with ``"; "`` in the order third, first, second.

        Raises:
            IndexError: If the browser holds fewer than three cookies after the
                login attempt.
        """
        options = Options()
        # Headless mode stays off: the operator has to read the captcha in the
        # browser window before answering the input() prompt below.
        driver = webdriver.Firefox(executable_path=driver_path, options=options)
        try:
            driver.get("http://www.pss-system.gov.cn/sipopublicsearch/portal/uiIndex.shtml")
            time.sleep(1)
            driver.find_element(By.ID, "j_username").send_keys(username)
            driver.find_element(By.ID, "j_password_show").send_keys(password)
            valid = input("请手动输入验证码:")
            driver.find_element(By.ID, "j_validation_code").send_keys(valid)
            driver.find_element(By.LINK_TEXT, "登录").click()
            # Give the portal time to finish the login and set its cookies.
            time.sleep(3)
            cookie = driver.get_cookies()
        finally:
            # Always shut the browser down, even when a lookup above fails.
            driver.quit()

        if len(cookie) < 3:
            raise IndexError(
                "expected at least 3 cookies after login, got %d" % len(cookie)
            )
        first, second, third = (c["name"] + "=" + c["value"] for c in cookie[:3])
        return {"cookies_are": "; ".join([third, first, second])}
    def requests_login_pss_system(date, pages):
        """Fetch one page of search results for patents filed on ``date``.

        Posts the search form to the PSS ``showSearchResult-startWa`` endpoint
        using the module-level ``cookies_headless`` mapping as cookies, then
        flattens every returned record into a list of field values.

        Args:
            date: Application date substituted into the search expressions
                (the script passes strings such as ``"20170110"``).
            pages: 1-based page number; each page holds 12 records.

        Returns:
            A list with one 18-item list per record, or ``None`` when the
            response reports ``totalCount == -1`` or contains no records.
            Callers use the ``None`` as their signal to stop paging.

        Raises:
            KeyError: If the JSON response lacks one of the expected keys.
        """
        page_size = 12
        url = "http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showSearchResult-startWa.shtml"
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Connection': 'keep-alive',
            'Origin': 'http://www.pss-system.gov.cn',
        }
        # The same human-readable expression is sent under two form keys.
        search_exp = "(申请日='%s') AND 公开国家/地区/组织=(HK OR MO OR TW OR (发明类型=('I' OR 'U' OR 'D') AND 公开国家/地区/组织=(CN)))" % date
        info = {
            "searchCondition.searchExp": search_exp,
            "searchCondition.executableSearchExp": "VDB:((APD='%s' AND (CC='HK' OR CC='MO' OR CC='TW' OR ((DOC_TYPE='I' OR DOC_TYPE='U' OR DOC_TYPE='D') AND CC='CN'))))" % date,
            "resultPagination.start": (pages - 1) * page_size,
            "resultPagination.limit": page_size,
            "searchCondition.literatureSF": search_exp,
        }
        # Keys read from each record's "fieldMap", in output order.
        # NOTE(review): the last slot was named OWNER_STATUS in the original
        # code but read the "AGY" key again; this looks like a copy-paste slip.
        # Confirm the intended key before changing it.
        field_keys = (
            "TIVIEW", "FNUM", "CPNUM", "PNUM", "VID", "APSN", "APD", "PN_BAK",
            "PD",      # publication date
            "IC",      # IPC classification
            "PAVIEW",  # applicant
            "INVIEW",  # inventor
            "AA",      # address
            "AGT",     # agent
            "AGY",     # agency
            "AGY",
        )

        # The context manager closes the session on every exit path,
        # including exceptions and the early returns below.
        with requests.session() as s:
            html = s.post(url, headers=head, data=info, cookies=cookies_headless).json()
            # Randomised pause between requests.
            time.sleep(2 + random.random() * 2.5)

            total = html["searchResultDTO"]["pagination"]["totalCount"]
            if total == -1:
                # NOTE(review): presumably the server rejected the query
                # (e.g. expired session); confirm what -1 means.
                print("page=-1,第%d页" % pages)
                return None

            print("共%d条记录" % total)
            records = html["searchResultDTO"]["searchResultRecord"]
            if not records:
                print("检索完毕!")
                return None

            result = []
            for record in records:
                field_map = record["fieldMap"]
                row = [record["lawStatus"], record["patentType"]]
                row.extend(field_map[key] for key in field_keys)
                print(row)
                result.append(row)
            time.sleep(0.7)
            return result
    def run_fiction(date, start_page, end_page, list_con):
        """Crawl result pages ``start_page`` .. ``end_page - 1`` into ``list_con``.

        Paging stops early when ``requests_login_pss_system`` returns ``None``
        or when a request fails. Rows collected so far stay in ``list_con``,
        and the elapsed time is printed in every case.

        Args:
            date: Application date passed through to the page fetcher.
            start_page: First page number to fetch (inclusive).
            end_page: Page number at which to stop (exclusive).
            list_con: List extended in place with the rows of every page.

        Returns:
            None. Results are delivered through ``list_con``.
        """
        start = time.time()
        try:
            for ind in range(start_page, end_page):
                print("正在检索 %d 页" % ind)
                rows = requests_login_pss_system(date, ind)
                if rows is None:
                    # The fetcher signals "nothing more to read" with None;
                    # stop explicitly instead of relying on extend() raising.
                    print("完成!")
                    break
                list_con.extend(rows)
                if ind % 4 == 0:
                    # Longer randomised pause after every fourth page number.
                    print("Code冷却中。。。")
                    time.sleep(3 + random.random() * 7)
        except Exception as exc:
            # Best effort: keep what was collected, but report the failure
            # instead of announcing success.
            print("检索中断: %r" % (exc,))
        end = time.time()
        # Label the timing with the date actually crawled.
        print("%s用时:%f" % (date, end - start))
    # Accumulator that run_fiction() extends in place with the fetched rows.
    res_10 = list()
    # Module-level name read by requests_login_pss_system(); it must be bound
    # before any page is requested.
    cookies_headless = get_cookies()
    # run_fiction() has no return statement, so `run` is bound to None; the
    # collected rows are in res_10.
    run = run_fiction(
        date="20170110",
        start_page=240,
        end_page=1501,
        list_con=res_10,
    )