Code Implementation

The crawler below scrapes product data from detail.zol.com.cn, one hardware category at a time. get_all_urls() pages through a category's listing pages and collects every product detail URL; MyCrawler then spreads those URLs across a multiprocessing pool, where each worker pulls URLs from a shared Manager queue, parses the product name, price, and spec parameters with BeautifulSoup, and appends the record to a shared Manager list. The categories are split into two groups because ZOL renders the spec section with two different layouts: products_A pages list parameters as <p> elements, while products_B pages (graphics cards and coolers) use a table. Each category's results are written to ../load/<category>.csv, and categories whose CSV already exists are skipped on re-runs. A short sketch of the Pool + Manager pattern follows; the full listing comes after it.
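
Before the full listing, here is a minimal, self-contained sketch of the mp.Pool + Manager().Queue() / Manager().list() pattern the crawler is built on. The worker function and the squaring task are placeholders for illustration only; the real crawler replaces them with the HTTP fetch and HTML parsing shown below, using the bound method self.job as the pool task.

import multiprocessing as mp
import queue


def worker(task_queue, results, worker_id):
    # Drain the shared queue until it is empty; results go into the shared list.
    while True:
        try:
            item = task_queue.get_nowait()
        except queue.Empty:
            break
        results.append((worker_id, item * item))  # stand-in for "scrape one URL"


if __name__ == "__main__":
    manager = mp.Manager()
    task_queue = manager.Queue()  # process-safe work queue
    results = manager.list()      # process-safe result list
    for n in range(20):
        task_queue.put(n)

    pool = mp.Pool(processes=mp.cpu_count())
    for i in range(mp.cpu_count()):
        pool.apply_async(worker, args=(task_queue, results, i))
    pool.close()
    pool.join()
    print(list(results))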

import multiprocessing as mp
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Request headers reused from a browser session (cookie + User-Agent).
headers = {
    'cookie': 'ip_ck=58aI7/31j7QuMDE5ODcyLjE2MaTA5NTU2NTI%3D; z_pro_city=s_provice%3Dzhejiang%26s_city%3Dningbo; userProvinceId=26; userCityId=154; userCountyId=929; userLocationId=12374; realLocationId=12374; userFidLocationId=12374; dwhis=%22s%22%3A%2228%22%2C%22m%22%3A%22125%22%2C%22p%22%3A%221296179%22; lv=1614734211; vn=11; visited_subcateId=6|0|443|529|28; visited_subcateProId=6-0|0-0|443-0|529-0|28-0,1289592|57-0; visited_serachKw=RTX%20%203090%7CRTX%20%203090/robots.txt%7CRTX%203090%7CGT%20210%7C210%7CRTX%203080; listSubcateId=6; Adshow=4; questionnaire_pv=1614729617',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81'
}

base_url = "https://detail.zol.com.cn"
# Used to detect the end-of-list text in the "history-tips" block (see get_all_urls).
pattern = re.compile(u'.+[0-9]+.+')

# Categories whose spec section uses layout "A" (<p> rows).
products_A = {
    "hdd": "https://detail.zol.com.cn/hard_drives/s5994/",  # mechanical hard drives
    "ssd": "https://detail.zol.com.cn/solid_state_drive/s8155/",  # solid-state drives
    "motherboard": "https://detail.zol.com.cn/motherboard/",  # motherboards
    "power": "https://detail.zol.com.cn/power/",  # power supplies
    "memory": "https://detail.zol.com.cn/memory/s5974/zhejiang/",  # memory
    "case": "https://detail.zol.com.cn/case/",  # cases
    "lcd": "https://detail.zol.com.cn/lcd/",  # LCD monitors
}

# Categories whose spec section uses layout "B" (an HTML table).
products_B = {
    "vga": "https://detail.zol.com.cn/vga/",  # graphics cards
    "cooling_product": "https://detail.zol.com.cn/cooling_product/",  # coolers
}


def get_all_urls(base_page: str) -> list:
    """Page through a category's listing pages and collect every product detail URL."""
    url_list = []
    url_pattern = base_page + "{0}.html"
    index = 0
    while True:
        index += 1
        cur_page = url_pattern.format(index)
        response = requests.get(cur_page, headers=headers)
        soup = BeautifulSoup(response.text, features="lxml")
        items = soup.find("div", attrs={"class": "content"}) \
            .find("div", attrs={"class": "pic-mode-box"}) \
            .find("ul").find_all("li")
        for item in items:
            try:
                url_list.append(base_url + item.find("a").get("href"))
            except Exception:
                # Skip list items that carry no product link.
                continue
        # Stop paging once the "history-tips" text matches the end-of-list pattern.
        end_flag = soup.find("div", attrs={"class": "history-tips"}).get_text()
        if pattern.match(end_flag) is not None:
            break
    return url_list


class MyCrawler:
    def __init__(self, url_list: list, type: str):
        self.type = type                       # "A" or "B": which spec-section layout to parse
        self.workerNum = mp.cpu_count()
        self.urlQueue = mp.Manager().Queue()   # shared work queue of product URLs
        self.resultList = mp.Manager().list()  # shared list of scraped records
        for url in url_list:
            self.urlQueue.put(url)

    def job(self, index: int):
        Process_id = 'Process-' + str(index)
        while not self.urlQueue.empty():
            url = self.urlQueue.get()
            try:
                start = time.time()
                map = {"link": url}
                response = requests.get(map["link"], headers=headers)
                soup = BeautifulSoup(response.text, features="lxml")
                map["name"] = soup.find("h1").get_text()
                map["price"] = soup.find("b", attrs={"class": "price-type"}).get_text()
                if self.type == "A":
                    # Layout A: each spec is a <p> whose <span> holds the label and whose
                    # following text node holds the value.
                    section = soup.find("div", attrs={"class": "section"}) \
                        .find("div", attrs={"class": "section-content"})
                    params = section.find_all("p")
                    for param in params:
                        key = param.find("span").get_text()[:-1].replace("\n", " ").replace("\r", " ")
                        value = str(param.contents[1]).replace("\n", " ").replace("\r", " ")
                        map[key] = value
                else:
                    # Layout B: specs are table rows; the first <td> is the label, the second the value.
                    section = soup.find("div", attrs={"class": "section", "id": "proParamSection"}) \
                        .find("div", attrs={"class": "section-content"})
                    params = section.find_all("tr")
                    for param in params:
                        tds = param.find_all("td")
                        key = str(tds[0].get_text()).replace("\n", " ").replace("\r", " ")
                        value = str(tds[1].get_text()).replace("\n", " ").replace("\r", " ")
                        map[key] = value
                print("Time for one product page: {0}".format(time.time() - start))
                print(map)
                self.resultList.append(map)
            except Exception as e:
                print(Process_id, self.urlQueue.qsize(), url, "Error", e)

    def run(self):
        start = time.time()
        N = mp.cpu_count()
        pool = mp.Pool(processes=N)
        for i in range(N):
            pool.apply_async(self.job, args=(i,))
            print('Started process')
        pool.close()
        pool.join()
        end = time.time()
        print('Pool + Queue :', end - start)
        print('Main process Ended!')


if __name__ == "__main__":
    for key, value in products_A.items():
        # Skip categories that have already been crawled.
        file_path = "../load/{0}.csv".format(key)
        is_exist = os.path.exists(file_path)
        if is_exist:
            print("{0} has already been crawled".format(key))
            continue
        # Collect every product URL in this category.
        url_list = get_all_urls(value)
        print("Number of products: {0}".format(len(url_list)))
        crawler = MyCrawler(url_list, "A")
        crawler.run()
        # print("list:\n", crawler.resultList)
        native_list = crawler.resultList[:]  # copy the shared Manager list into a plain list
        df = pd.DataFrame(native_list)
        df.to_csv(file_path)

    for key, value in products_B.items():
        # Skip categories that have already been crawled.
        file_path = "../load/{0}.csv".format(key)
        is_exist = os.path.exists(file_path)
        if is_exist:
            print("{0} has already been crawled".format(key))
            continue
        # Collect every product URL in this category.
        url_list = get_all_urls(value)
        print("Number of products: {0}".format(len(url_list)))
        crawler = MyCrawler(url_list, "B")
        crawler.run()
        # print("list:\n", crawler.resultList)
        native_list = crawler.resultList[:]
        df = pd.DataFrame(native_list)
        df.to_csv(file_path)
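
One practical note: pandas' to_csv() does not create missing directories, so the ../load/ folder must exist before the script runs. The short sketch below creates it up front and then, purely as an illustration of how the per-category CSVs might be consumed afterwards (this part is not in the original script), reads them back into DataFrames.

import os

import pandas as pd

# Make sure the output directory used by the crawler exists.
os.makedirs("../load", exist_ok=True)

# Illustrative only: load the per-category CSVs back for later analysis.
categories = ["hdd", "ssd", "motherboard", "power", "memory", "case", "lcd",
              "vga", "cooling_product"]
frames = {}
for name in categories:
    path = "../load/{0}.csv".format(name)
    if os.path.exists(path):
        # index_col=0 drops the unnamed index column written by df.to_csv(file_path).
        frames[name] = pd.read_csv(path, index_col=0)

for name, df in frames.items():
    print(name, df.shape)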

References