一、简介:
突然多了这样一个需求,有很多行业标准需要查,但是一个一个搜太麻烦了,爬虫呢我这辈子可能都学不会,所以选择走个50步。
二、部分代码:
1.轮子:
import pandas as pd
import numpy as np
import urllib.parse#转义
from openpyxl import load_workbook
from tkinter import *#UI需要
from tkinter import filedialog#选择路径需要
import tkinter.font as tkFont#字体设置
import tkinter.ttk#进度条需要
2.UI:
window = Tk() # 创建主窗口
# window.geometry('+500+300')
#window.attributes('-topmost', 1)
ft = tkFont.Font(family='SourceHanSansCN-Bold', size=8)
window.title('聚合搜索') # 窗口命名为归档工具
window.geometry('720x210') # 设置窗口为720*480
screenWidth = window.winfo_screenwidth() # 获取显示区域的宽度
screenHeight = window.winfo_screenheight() # 获取显示区域的高度
width = 1920 # 设定窗口宽度
height = 1040 # 设定窗口高度
left = (screenWidth - width) / 2
top = (screenHeight - height) / 2
window.resizable(width=False, height=False)
#window.iconbitmap('项目10.ico') # 更改窗口图标
window.attributes('-alpha', 0.9) # 窗口透明度
progressbarOne = tkinter.ttk.Progressbar(window) # 设置进度条
progressbarOne.place(relx=0.16, width=450, height=7, rely=0.5) # 设置进度条长宽高
progressbarTwo = tkinter.ttk.Progressbar(window) # 设置进度条
progressbarTwo.place(relx=0.16, width=450, height=7, rely=0.6) # 设置进度条长宽高
def select_file():
selected_file = filedialog.askopenfilename() # 使用askopenfilename函数一个文件
varF_1.set(selected_file)
varF_1 = StringVar()
LF_1 = Label(window, text='输入:', font=ft).place(relx=0.1, rely=0.15)
EnF_1 = Entry(window, textvariable=varF_1).place(relx=0.16, width=450, rely=0.15)
Button(window, text='选择路径', command=select_file, font=ft).place(relx=0.8, width=80, rely=0.15)
btn0 = Button(window, text='运行', font=ft, command=lambda: [father()]).place(
relx=0.45, rely=0.7)
Label(window, text='version:1.0.0.0', font=ft).place(relx=0.41, rely=0.85)
3.实现,以百度为例:
df = pd.read_excel(varF_1.get())
progressbarTwo['maximum'] = len(list_3) # 进度值最大值
progressbarTwo['value'] = 0 # 进度值初始值
df1 = list(np.array(df['标准代号'].str.split('\n')))#当有合并单元格的时候,以\n切片元素
list_2 = [x for item in df1 for x in item]#去掉list内括号
list_3 = [x.strip() for x in list_2 if x.strip() != '']#去掉''元素
# list_3 = list(set(list_2))
list_3 = list(set(list_3))#去重
list_3.sort(key=None, reverse=False)#排序
listA = []
listB = []
listC = []
listD = []
url = 'https://www.baidu.com/s?tn=51076811_dg&wd='
url2 = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=51076811_dg&wd='
url3 = 'https://m.baidu.com/ssid=21ec6c6478732f17/from=1001192y/s?word='
quick = '=HYPERLINK('#写入excel时转换为超链接
for i in range(len(list_3)):
if not '\u4e00' <= list_3[i][2] <= '\u9fff':#判断第二个字符是否为汉字
a = str(list_3[i]).replace('/', '%2F')#替换'/'为'%2F'
c = a.replace(' ', '+')#替换' '为','
b = c.replace(' ', '+')#替换' '为','
d = url + b + '%20-advertisement'#合并链接
f = url3 + b + '+-advertisemen'
e = quick + 'B' + str(i + 2) + ')'#转换为超链接
g = quick + 'D' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
listC.append(f)
listD.append(g)
else:
d = url2 + list_3[i] + '%20-advertisement'
f = url3 + urllib.parse.quote(list_3[i].encode('gb2312'))+ '+-advertisemen'#将中文转义
print(f)
e = quick + 'B' + str(i + 2) + ')'
g = quick + 'D' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
listC.append(f)
listD.append(g)
progressbarTwo['value'] += 1 # 进度条+1
window.update()
df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB,'phone':listC,'phoneurl':listD})
print(df2)
book = load_workbook(varF_1.get())
writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
writer.book = book
df2.to_excel(writer, sheet_name='BaiDu', index=0)
worksheet = book['BaiDu']
worksheet.column_dimensions['A'].width = 25
worksheet.column_dimensions['B'].width = 10
worksheet.column_dimensions['C'].width = 100
worksheet.column_dimensions['D'].width = 10
worksheet.column_dimensions['E'].width = 100
writer.save()
progressbarOne['value'] += 1 # 进度条+1
window.update()
三、常见搜索网站头部:
名称 | 头 | 尾+去广告 |
---|---|---|
百度搜索英文及数字 | ‘https://www.baidu.com/s?tn=51076811_dg&wd=‘ | %20-advertisement |
百度搜索中文(可适用于英文及数字) | ‘https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=51076811_dg&wd=‘ | ‘%20-advertisement’ |
百度移动 | ‘https://m.baidu.com/ssid=21ec6c6478732f17/from=1001192y/s?word=‘ | ‘+-advertisement’ |
谷歌 | ‘https://www.google.com/search?q=‘ | ‘%20-advertisement’ |
必应国内 | ‘https://cn.bing.com/search?q=‘ | ‘+-advertisement&?FORM=BEHPTB’ |
必应国际 | ‘https://cn.bing.com/search?q=‘ | ‘+-advertisement&?FORM=BEHPTB&ensearch=1’ |
搜狗 | ‘https://www.sogou.com/web?query=‘ | ‘+-advertisement’ |
四、全代码:
import pandas as pd
import numpy as np
import urllib.parse#转义
from openpyxl import load_workbook
from tkinter import *#UI需要
from tkinter import filedialog#选择路径需要
import tkinter.font as tkFont#字体设置
import tkinter.ttk#进度条需要
window = Tk() # 创建主窗口
# window.geometry('+500+300')
# window.attributes('-topmost', 1)
ft = tkFont.Font(family='SourceHanSansCN-Bold', size=8)
window.title('聚合搜索') # 窗口命名为归档工具
window.geometry('720x210') # 设置窗口为720*480
screenWidth = window.winfo_screenwidth() # 获取显示区域的宽度
screenHeight = window.winfo_screenheight() # 获取显示区域的高度
width = 1920 # 设定窗口宽度
height = 1040 # 设定窗口高度
left = (screenWidth - width) / 2
top = (screenHeight - height) / 2
window.resizable(width=False, height=False)
# window.iconbitmap('项目10.ico') # 更改窗口图标
window.attributes('-alpha', 0.9) # 窗口透明度
progressbarOne = tkinter.ttk.Progressbar(window) # 设置进度条
progressbarOne.place(relx=0.16, width=450, height=7, rely=0.5) # 设置进度条长宽高
progressbarTwo = tkinter.ttk.Progressbar(window) # 设置进度条
progressbarTwo.place(relx=0.16, width=450, height=7, rely=0.6) # 设置进度条长宽高
def select_file():
selected_file = filedialog.askopenfilename() # 使用askopenfilename函数一个文件
varF_1.set(selected_file)
varF_1 = StringVar()
LF_1 = Label(window, text='输入:', font=ft).place(relx=0.1, rely=0.15)
EnF_1 = Entry(window, textvariable=varF_1).place(relx=0.16, width=450, rely=0.15)
Button(window, text='选择路径', command=select_file, font=ft).place(relx=0.8, width=80, rely=0.15)
btn0 = Button(window, text='运行', font=ft, command=lambda: [father()]).place(
relx=0.45, rely=0.7)
Label(window, text='version:1.0.0.0', font=ft).place(relx=0.41, rely=0.85)
def father():
progressbarOne['maximum'] = 6 # 进度值最大值
progressbarOne['value'] = 0 # 进度值初始值
df = pd.read_excel(varF_1.get())
progressbarTwo['maximum'] = len(list_3) # 进度值最大值
progressbarTwo['value'] = 0 # 进度值初始值
df1 = list(np.array(df['标准代号'].str.split('\n')))#当有合并单元格的时候,以\n切片元素
list_2 = [x for item in df1 for x in item]#去掉list内括号
list_3 = [x.strip() for x in list_2 if x.strip() != '']#去掉''元素
# list_3 = list(set(list_2))
list_3 = list(set(list_3))#去重
list_3.sort(key=None, reverse=False)#排序
listA = []
listB = []
listC = []
listD = []
url = 'https://www.baidu.com/s?tn=51076811_dg&wd='
url2 = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=51076811_dg&wd='
url3 = 'https://m.baidu.com/ssid=21ec6c6478732f17/from=1001192y/s?word='
quick = '=HYPERLINK('#写入excel时转换为超链接
for i in range(len(list_3)):
if not '\u4e00' <= list_3[i][2] <= '\u9fff':#判断第二个字符是否为汉字
a = str(list_3[i]).replace('/', '%2F')#替换'/'为'%2F'
c = a.replace(' ', '+')#替换' '为','
b = c.replace(' ', '+')#替换' '为','
d = url + b + '%20-advertisement'#合并链接
f = url3 + b + '+-advertisemen'
e = quick + 'B' + str(i + 2) + ')'#转换为超链接
g = quick + 'D' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
listC.append(f)
listD.append(g)
else:
d = url2 + list_3[i] + '%20-advertisement'
f = url3 + urllib.parse.quote(list_3[i].encode('gb2312'))+ '+-advertisemen'#将中文转义
print(f)
e = quick + 'B' + str(i + 2) + ')'
g = quick + 'D' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
listC.append(f)
listD.append(g)
progressbarTwo['value'] += 1 # 进度条+1
window.update()
df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB,'phone':listC,'phoneurl':listD})
print(df2)
book = load_workbook(varF_1.get())
writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
writer.book = book
df2.to_excel(writer, sheet_name='BaiDu', index=0)
worksheet = book['BaiDu']
worksheet.column_dimensions['A'].width = 25
worksheet.column_dimensions['B'].width = 10
worksheet.column_dimensions['C'].width = 100
worksheet.column_dimensions['D'].width = 10
worksheet.column_dimensions['E'].width = 100
writer.save()
progressbarOne['value'] += 1 # 进度条+1
window.update()
progressbarTwo['maximum'] = len(list_3) # 进度值最大值
progressbarTwo['value'] = 0 # 进度值初始值
df1 = list(np.array(df['标准代号'].str.split('\n')))
list_2 = [x for item in df1 for x in item]
list_3 = [x.strip() for x in list_2 if x.strip() != '']
# list_3 = list(set(list_2))
list_3 = list(set(list_3))
list_3.sort(key=None, reverse=False)
listA = []
listB = []
url = 'https://www.google.com/search?q='
quick = '=HYPERLINK('
for i in range(len(list_3)):
if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
a = str(list_3[i]).replace('/', '%2F')
c = a.replace(' ', '+')
b = c.replace(' ', '+')
d = url + b + '%20-advertisement'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
else:
d = url + list_3[i] + '%20-advertisement'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
progressbarTwo['value'] += 1 # 进度条+1
window.update()
df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
print(df2)
book = load_workbook(varF_1.get())
writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
writer.book = book
df2.to_excel(writer, sheet_name='Google', index=0)
worksheet = book['Google']
worksheet.column_dimensions['A'].width = 25
worksheet.column_dimensions['B'].width = 10
worksheet.column_dimensions['C'].width = 100
writer.save()
progressbarOne['value'] += 1 # 进度条+1
window.update()
progressbarTwo['maximum'] = len(list_3) # 进度值最大值
progressbarTwo['value'] = 0 # 进度值初始值
df1 = list(np.array(df['标准代号'].str.split('\n')))
list_2 = [x for item in df1 for x in item]
list_3 = [x.strip() for x in list_2 if x.strip() != '']
# list_3 = list(set(list_2))
list_3 = list(set(list_3))
list_3.sort(key=None, reverse=False)
listA = []
listB = []
url = 'https://cn.bing.com/search?q='
quick = '=HYPERLINK('
for i in range(len(list_3)):
if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
a = str(list_3[i]).replace('/', '%2F')
c = a.replace(' ', '+')
b = c.replace(' ', '+')
d = url + b + '+-advertisement&?FORM=BEHPTB'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
else:
d = url + list_3[i] + '+-advertisement&?FORM=BEHPTB'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
progressbarTwo['value'] += 1 # 进度条+1
window.update()
df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
print(df2)
book = load_workbook(varF_1.get())
writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
writer.book = book
df2.to_excel(writer, sheet_name='BingCN', index=0)
worksheet = book['BingCN']
worksheet.column_dimensions['A'].width = 25
worksheet.column_dimensions['B'].width = 10
worksheet.column_dimensions['C'].width = 100
writer.save()
progressbarOne['value'] += 1 # 进度条+1
window.update()
progressbarTwo['maximum'] = len(list_3) # 进度值最大值
progressbarTwo['value'] = 0 # 进度值初始值
df1 = list(np.array(df['标准代号'].str.split('\n')))
list_2 = [x for item in df1 for x in item]
list_3 = [x.strip() for x in list_2 if x.strip() != '']
# list_3 = list(set(list_2))
list_3 = list(set(list_3))
list_3.sort(key=None, reverse=False)
listA = []
listB = []
url = 'https://cn.bing.com/search?q='
quick = '=HYPERLINK('
for i in range(len(list_3)):
if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
a = str(list_3[i]).replace('/', '%2F')
c = a.replace(' ', '+')
b = c.replace(' ', '+')
d = url + b + '+-advertisement&?FORM=BEHPTB&ensearch=1'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
else:
d = url + list_3[i] + '+-advertisement&?FORM=BEHPTB&ensearch=1'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
progressbarTwo['value'] += 1 # 进度条+1
window.update()
df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
print(df2)
book = load_workbook(varF_1.get())
writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
writer.book = book
df2.to_excel(writer, sheet_name='Bingnet', index=0)
worksheet = book['Bingnet']
worksheet.column_dimensions['A'].width = 25
worksheet.column_dimensions['B'].width = 10
worksheet.column_dimensions['C'].width = 100
writer.save()
progressbarOne['value'] += 1 # 进度条+1
window.update()
progressbarTwo['maximum'] = len(list_3) # 进度值最大值
progressbarTwo['value'] = 0 # 进度值初始值
df1 = list(np.array(df['标准代号'].str.split('\n')))
list_2 = [x for item in df1 for x in item]
list_3 = [x.strip() for x in list_2 if x.strip() != '']
# list_3 = list(set(list_2))
list_3 = list(set(list_3))
list_3.sort(key=None, reverse=False)
listA = []
listB = []
url = 'https://www.sogou.com/web?query='
quick = '=HYPERLINK('
for i in range(len(list_3)):
if (u'/u0041' <= list_3[i] <= u'/u005a') or (u'/u0061' <= list_3[i] <= u'/u007a'):
a = str(list_3[i]).replace('/', '%2F')
c = a.replace(' ', '+')
b = c.replace(' ', '+')
d = url + b + '%20-advertisement'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
else:
d = url + list_3[i] + '+-advertisement'
e = quick + 'B' + str(i + 2) + ')'
listB.append(e)
listA.append(d)
progressbarTwo['value'] += 1 # 进度条+1
window.update()
df2 = pd.DataFrame({'方法编号': list_3, '链接': listA, '跳转': listB})
print(df2)
book = load_workbook(varF_1.get())
writer = pd.ExcelWriter(varF_1.get(), engine='openpyxl')
writer.book = book
df2.to_excel(writer, sheet_name='SoGou', index=0)
worksheet = book['SoGou']
worksheet.column_dimensions['A'].width = 25
worksheet.column_dimensions['B'].width = 10
worksheet.column_dimensions['C'].width = 100
writer.save()
progressbarOne['value'] += 1 # 进度条+1
window.update()
window.mainloop()