一、简介:

突然多了这样一个需求,有很多行业标准需要查,但是一个一个搜太麻烦了,爬虫呢我这辈子可能都学不会,所以选择走个50步。

二、部分代码:

1.轮子:

  1. import pandas as pd
  2. import numpy as np
  3. import urllib.parse#转义
  4. from openpyxl import load_workbook
  5. from tkinter import *#UI需要
  6. from tkinter import filedialog#选择路径需要
  7. import tkinter.font as tkFont#字体设置
  8. import tkinter.ttk#进度条需要

2.UI:

  1. window = Tk() # 创建主窗口
  2. # window.geometry('+500+300')
  3. #window.attributes('-topmost', 1)
  4. ft = tkFont.Font(family='SourceHanSansCN-Bold', size=8)
  5. window.title('聚合搜索') # 窗口命名为归档工具
  6. window.geometry('720x210') # 设置窗口为720*480
  7. screenWidth = window.winfo_screenwidth() # 获取显示区域的宽度
  8. screenHeight = window.winfo_screenheight() # 获取显示区域的高度
  9. width = 1920 # 设定窗口宽度
  10. height = 1040 # 设定窗口高度
  11. left = (screenWidth - width) / 2
  12. top = (screenHeight - height) / 2
  13. window.resizable(width=False, height=False)
  14. #window.iconbitmap('项目10.ico') # 更改窗口图标
  15. window.attributes('-alpha', 0.9) # 窗口透明度
  16. progressbarOne = tkinter.ttk.Progressbar(window) # 设置进度条
  17. progressbarOne.place(relx=0.16, width=450, height=7, rely=0.5) # 设置进度条长宽高
  18. progressbarTwo = tkinter.ttk.Progressbar(window) # 设置进度条
  19. progressbarTwo.place(relx=0.16, width=450, height=7, rely=0.6) # 设置进度条长宽高
  20. def select_file():
  21. selected_file = filedialog.askopenfilename() # 使用askopenfilename函数一个文件
  22. varF_1.set(selected_file)
  23. varF_1 = StringVar()
  24. LF_1 = Label(window, text='输入:', font=ft).place(relx=0.1, rely=0.15)
  25. EnF_1 = Entry(window, textvariable=varF_1).place(relx=0.16, width=450, rely=0.15)
  26. Button(window, text='选择路径', command=select_file, font=ft).place(relx=0.8, width=80, rely=0.15)
  27. btn0 = Button(window, text='运行', font=ft, command=lambda: [father()]).place(
  28. relx=0.45, rely=0.7)
  29. Label(window, text='version:1.0.0.0', font=ft).place(relx=0.41, rely=0.85)

3.实现,以百度为例:

  1. df = pd.read_excel(varF_1.get())
  2. progressbarTwo['maximum'] = len(list_3) # 进度值最大值
  3. progressbarTwo['value'] = 0 # 进度值初始值
  4. df1 = list(np.array(df['标准代号'].str.split('\n')))#当有合并单元格的时候,以\n切片元素
  5. list_2 = [x for item in df1 for x in item]#去掉list内括号
  6. list_3 = [x.strip() for x in list_2 if x.strip() != '']#去掉''元素
  7. # list_3 = list(set(list_2))
  8. list_3 = list(set(list_3))#去重
  9. list_3.sort(key=None, reverse=False)#排序
  10. listA = []
  11. listB = []
  12. listC = []
  13. listD = []
  14. url = 'https://www.baidu.com/s?tn=51076811_dg&wd='
  15. url2 = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=51076811_dg&wd='
  16. url3 = 'https://m.baidu.com/ssid=21ec6c6478732f17/from=1001192y/s?word='
  17. quick = '=HYPERLINK('#写入excel时转换为超链接
  18. for i in range(len(list_3)):
  19. if not '\u4e00' <= list_3[i][2] <= '\u9fff':#判断第二个字符是否为汉字
  20. a = str(list_3[i]).replace('/', '%2F')#替换'/''%2F'
  21. c = a.replace(' ', '+')#替换' '','
  22. b = c.replace(' ', '+')#替换' '','
  23. d = url + b + '%20-advertisement'#合并链接
  24. f = url3 + b + '+-advertisemen'
  25. e = quick + 'B' + str(i + 2) + ')'#转换为超链接
  26. g = quick + 'D' + str(i + 2) + ')'
  27. listB.append(e)
  28. listA.append(d)
  29. listC.append(f)
  30. listD.append(g)
  31. else:
  32. d = url2 + list_3[i] + '%20-advertisement'
  33. f = url3 + urllib.parse.quote(list_3[i].encode('gb2312'))+ '+-advertisemen'#将中文转义
  34. print(f)
  35. e = quick + 'B' + str(i + 2) + ')'
  36. g = quick + 'D' + str(i + 2) + ')'
  37. listB.append(e)
  38. listA.append(d)
  39. listC.append(f)
  40. listD.append(g)
  41. progressbarTwo['value'] += 1 # 进度条+1
  42. window.update()
  43. df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB,'phone':listC,'phoneurl':listD})
  44. print(df2)
  45. book = load_workbook(varF_1.get())
  46. writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
  47. writer.book = book
  48. df2.to_excel(writer, sheet_name='BaiDu', index=0)
  49. worksheet = book['BaiDu']
  50. worksheet.column_dimensions['A'].width = 25
  51. worksheet.column_dimensions['B'].width = 10
  52. worksheet.column_dimensions['C'].width = 100
  53. worksheet.column_dimensions['D'].width = 10
  54. worksheet.column_dimensions['E'].width = 100
  55. writer.save()
  56. progressbarOne['value'] += 1 # 进度条+1
  57. window.update()

三、常见搜索网站头部:

名称 尾+去广告
百度搜索英文及数字 https://www.baidu.com/s?tn=51076811_dg&wd= %20-advertisement
百度搜索中文(可适用于英文及数字) https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=51076811_dg&wd= ‘%20-advertisement’
百度移动 https://m.baidu.com/ssid=21ec6c6478732f17/from=1001192y/s?word= ‘+-advertisement’
谷歌 https://www.google.com/search?q= ‘%20-advertisement’
必应国内 https://cn.bing.com/search?q= ‘+-advertisement&?FORM=BEHPTB’
必应国际 https://cn.bing.com/search?q= ‘+-advertisement&?FORM=BEHPTB&ensearch=1’
搜狗 https://www.sogou.com/web?query= ‘+-advertisement’

四、全代码:

  1. import pandas as pd
  2. import numpy as np
  3. import urllib.parse#转义
  4. from openpyxl import load_workbook
  5. from tkinter import *#UI需要
  6. from tkinter import filedialog#选择路径需要
  7. import tkinter.font as tkFont#字体设置
  8. import tkinter.ttk#进度条需要
  9. window = Tk() # 创建主窗口
  10. # window.geometry('+500+300')
  11. # window.attributes('-topmost', 1)
  12. ft = tkFont.Font(family='SourceHanSansCN-Bold', size=8)
  13. window.title('聚合搜索') # 窗口命名为归档工具
  14. window.geometry('720x210') # 设置窗口为720*480
  15. screenWidth = window.winfo_screenwidth() # 获取显示区域的宽度
  16. screenHeight = window.winfo_screenheight() # 获取显示区域的高度
  17. width = 1920 # 设定窗口宽度
  18. height = 1040 # 设定窗口高度
  19. left = (screenWidth - width) / 2
  20. top = (screenHeight - height) / 2
  21. window.resizable(width=False, height=False)
  22. # window.iconbitmap('项目10.ico') # 更改窗口图标
  23. window.attributes('-alpha', 0.9) # 窗口透明度
  24. progressbarOne = tkinter.ttk.Progressbar(window) # 设置进度条
  25. progressbarOne.place(relx=0.16, width=450, height=7, rely=0.5) # 设置进度条长宽高
  26. progressbarTwo = tkinter.ttk.Progressbar(window) # 设置进度条
  27. progressbarTwo.place(relx=0.16, width=450, height=7, rely=0.6) # 设置进度条长宽高
  28. def select_file():
  29. selected_file = filedialog.askopenfilename() # 使用askopenfilename函数一个文件
  30. varF_1.set(selected_file)
  31. varF_1 = StringVar()
  32. LF_1 = Label(window, text='输入:', font=ft).place(relx=0.1, rely=0.15)
  33. EnF_1 = Entry(window, textvariable=varF_1).place(relx=0.16, width=450, rely=0.15)
  34. Button(window, text='选择路径', command=select_file, font=ft).place(relx=0.8, width=80, rely=0.15)
  35. btn0 = Button(window, text='运行', font=ft, command=lambda: [father()]).place(
  36. relx=0.45, rely=0.7)
  37. Label(window, text='version:1.0.0.0', font=ft).place(relx=0.41, rely=0.85)
  38. def father():
  39. progressbarOne['maximum'] = 6 # 进度值最大值
  40. progressbarOne['value'] = 0 # 进度值初始值
  41. df = pd.read_excel(varF_1.get())
  42. progressbarTwo['maximum'] = len(list_3) # 进度值最大值
  43. progressbarTwo['value'] = 0 # 进度值初始值
  44. df1 = list(np.array(df['标准代号'].str.split('\n')))#当有合并单元格的时候,以\n切片元素
  45. list_2 = [x for item in df1 for x in item]#去掉list内括号
  46. list_3 = [x.strip() for x in list_2 if x.strip() != '']#去掉''元素
  47. # list_3 = list(set(list_2))
  48. list_3 = list(set(list_3))#去重
  49. list_3.sort(key=None, reverse=False)#排序
  50. listA = []
  51. listB = []
  52. listC = []
  53. listD = []
  54. url = 'https://www.baidu.com/s?tn=51076811_dg&wd='
  55. url2 = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=51076811_dg&wd='
  56. url3 = 'https://m.baidu.com/ssid=21ec6c6478732f17/from=1001192y/s?word='
  57. quick = '=HYPERLINK('#写入excel时转换为超链接
  58. for i in range(len(list_3)):
  59. if not '\u4e00' <= list_3[i][2] <= '\u9fff':#判断第二个字符是否为汉字
  60. a = str(list_3[i]).replace('/', '%2F')#替换'/''%2F'
  61. c = a.replace(' ', '+')#替换' '','
  62. b = c.replace(' ', '+')#替换' '','
  63. d = url + b + '%20-advertisement'#合并链接
  64. f = url3 + b + '+-advertisemen'
  65. e = quick + 'B' + str(i + 2) + ')'#转换为超链接
  66. g = quick + 'D' + str(i + 2) + ')'
  67. listB.append(e)
  68. listA.append(d)
  69. listC.append(f)
  70. listD.append(g)
  71. else:
  72. d = url2 + list_3[i] + '%20-advertisement'
  73. f = url3 + urllib.parse.quote(list_3[i].encode('gb2312'))+ '+-advertisemen'#将中文转义
  74. print(f)
  75. e = quick + 'B' + str(i + 2) + ')'
  76. g = quick + 'D' + str(i + 2) + ')'
  77. listB.append(e)
  78. listA.append(d)
  79. listC.append(f)
  80. listD.append(g)
  81. progressbarTwo['value'] += 1 # 进度条+1
  82. window.update()
  83. df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB,'phone':listC,'phoneurl':listD})
  84. print(df2)
  85. book = load_workbook(varF_1.get())
  86. writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
  87. writer.book = book
  88. df2.to_excel(writer, sheet_name='BaiDu', index=0)
  89. worksheet = book['BaiDu']
  90. worksheet.column_dimensions['A'].width = 25
  91. worksheet.column_dimensions['B'].width = 10
  92. worksheet.column_dimensions['C'].width = 100
  93. worksheet.column_dimensions['D'].width = 10
  94. worksheet.column_dimensions['E'].width = 100
  95. writer.save()
  96. progressbarOne['value'] += 1 # 进度条+1
  97. window.update()
  98. progressbarTwo['maximum'] = len(list_3) # 进度值最大值
  99. progressbarTwo['value'] = 0 # 进度值初始值
  100. df1 = list(np.array(df['标准代号'].str.split('\n')))
  101. list_2 = [x for item in df1 for x in item]
  102. list_3 = [x.strip() for x in list_2 if x.strip() != '']
  103. # list_3 = list(set(list_2))
  104. list_3 = list(set(list_3))
  105. list_3.sort(key=None, reverse=False)
  106. listA = []
  107. listB = []
  108. url = 'https://www.google.com/search?q='
  109. quick = '=HYPERLINK('
  110. for i in range(len(list_3)):
  111. if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
  112. a = str(list_3[i]).replace('/', '%2F')
  113. c = a.replace(' ', '+')
  114. b = c.replace(' ', '+')
  115. d = url + b + '%20-advertisement'
  116. e = quick + 'B' + str(i + 2) + ')'
  117. listB.append(e)
  118. listA.append(d)
  119. else:
  120. d = url + list_3[i] + '%20-advertisement'
  121. e = quick + 'B' + str(i + 2) + ')'
  122. listB.append(e)
  123. listA.append(d)
  124. progressbarTwo['value'] += 1 # 进度条+1
  125. window.update()
  126. df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
  127. print(df2)
  128. book = load_workbook(varF_1.get())
  129. writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
  130. writer.book = book
  131. df2.to_excel(writer, sheet_name='Google', index=0)
  132. worksheet = book['Google']
  133. worksheet.column_dimensions['A'].width = 25
  134. worksheet.column_dimensions['B'].width = 10
  135. worksheet.column_dimensions['C'].width = 100
  136. writer.save()
  137. progressbarOne['value'] += 1 # 进度条+1
  138. window.update()
  139. progressbarTwo['maximum'] = len(list_3) # 进度值最大值
  140. progressbarTwo['value'] = 0 # 进度值初始值
  141. df1 = list(np.array(df['标准代号'].str.split('\n')))
  142. list_2 = [x for item in df1 for x in item]
  143. list_3 = [x.strip() for x in list_2 if x.strip() != '']
  144. # list_3 = list(set(list_2))
  145. list_3 = list(set(list_3))
  146. list_3.sort(key=None, reverse=False)
  147. listA = []
  148. listB = []
  149. url = 'https://cn.bing.com/search?q='
  150. quick = '=HYPERLINK('
  151. for i in range(len(list_3)):
  152. if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
  153. a = str(list_3[i]).replace('/', '%2F')
  154. c = a.replace(' ', '+')
  155. b = c.replace(' ', '+')
  156. d = url + b + '+-advertisement&?FORM=BEHPTB'
  157. e = quick + 'B' + str(i + 2) + ')'
  158. listB.append(e)
  159. listA.append(d)
  160. else:
  161. d = url + list_3[i] + '+-advertisement&?FORM=BEHPTB'
  162. e = quick + 'B' + str(i + 2) + ')'
  163. listB.append(e)
  164. listA.append(d)
  165. progressbarTwo['value'] += 1 # 进度条+1
  166. window.update()
  167. df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
  168. print(df2)
  169. book = load_workbook(varF_1.get())
  170. writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
  171. writer.book = book
  172. df2.to_excel(writer, sheet_name='BingCN', index=0)
  173. worksheet = book['BingCN']
  174. worksheet.column_dimensions['A'].width = 25
  175. worksheet.column_dimensions['B'].width = 10
  176. worksheet.column_dimensions['C'].width = 100
  177. writer.save()
  178. progressbarOne['value'] += 1 # 进度条+1
  179. window.update()
  180. progressbarTwo['maximum'] = len(list_3) # 进度值最大值
  181. progressbarTwo['value'] = 0 # 进度值初始值
  182. df1 = list(np.array(df['标准代号'].str.split('\n')))
  183. list_2 = [x for item in df1 for x in item]
  184. list_3 = [x.strip() for x in list_2 if x.strip() != '']
  185. # list_3 = list(set(list_2))
  186. list_3 = list(set(list_3))
  187. list_3.sort(key=None, reverse=False)
  188. listA = []
  189. listB = []
  190. url = 'https://cn.bing.com/search?q='
  191. quick = '=HYPERLINK('
  192. for i in range(len(list_3)):
  193. if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
  194. a = str(list_3[i]).replace('/', '%2F')
  195. c = a.replace(' ', '+')
  196. b = c.replace(' ', '+')
  197. d = url + b + '+-advertisement&?FORM=BEHPTB&ensearch=1'
  198. e = quick + 'B' + str(i + 2) + ')'
  199. listB.append(e)
  200. listA.append(d)
  201. else:
  202. d = url + list_3[i] + '+-advertisement&?FORM=BEHPTB&ensearch=1'
  203. e = quick + 'B' + str(i + 2) + ')'
  204. listB.append(e)
  205. listA.append(d)
  206. progressbarTwo['value'] += 1 # 进度条+1
  207. window.update()
  208. df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
  209. print(df2)
  210. book = load_workbook(varF_1.get())
  211. writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
  212. writer.book = book
  213. df2.to_excel(writer, sheet_name='Bingnet', index=0)
  214. worksheet = book['Bingnet']
  215. worksheet.column_dimensions['A'].width = 25
  216. worksheet.column_dimensions['B'].width = 10
  217. worksheet.column_dimensions['C'].width = 100
  218. writer.save()
  219. progressbarOne['value'] += 1 # 进度条+1
  220. window.update()
  221. progressbarTwo['maximum'] = len(list_3) # 进度值最大值
  222. progressbarTwo['value'] = 0 # 进度值初始值
  223. df1 = list(np.array(df['标准代号'].str.split('\n')))
  224. list_2 = [x for item in df1 for x in item]
  225. list_3 = [x.strip() for x in list_2 if x.strip() != '']
  226. # list_3 = list(set(list_2))
  227. list_3 = list(set(list_3))
  228. list_3.sort(key=None, reverse=False)
  229. listA = []
  230. listB = []
  231. url = 'https://www.sogou.com/web?query='
  232. quick = '=HYPERLINK('
  233. for i in range(len(list_3)):
  234. if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
  235. a = str(list_3[i]).replace('/', '%2F')
  236. c = a.replace(' ', '+')
  237. b = c.replace(' ', '+')
  238. d = url + b + '%20-advertisement'
  239. e = quick + 'B' + str(i + 2) + ')'
  240. listB.append(e)
  241. listA.append(d)
  242. else:
  243. d = url + list_3[i] + '+-advertisement'
  244. e = quick + 'B' + str(i + 2) + ')'
  245. listB.append(e)
  246. listA.append(d)
  247. progressbarTwo['value'] += 1 # 进度条+1
  248. window.update()
  249. df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
  250. print(df2)
  251. book = load_workbook(varF_1.get())
  252. writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
  253. writer.book = book
  254. df2.to_excel(writer, sheet_name='SoGou', index=0)
  255. worksheet = book['SoGou']
  256. worksheet.column_dimensions['A'].width = 25
  257. worksheet.column_dimensions['B'].width = 10
  258. worksheet.column_dimensions['C'].width = 100
  259. writer.save()
  260. progressbarOne['value'] += 1 # 进度条+1
  261. window.update()
  262. window.mainloop()