import urllib.request
import urllib.parse

# Example search URLs with GBK percent-encoded queries (note ie=gbk):
# http://so.1ppt.com/cse/search?s=18142763795818420485&entry=1&ie=gbk&nsid=3&ie=gbk&q=%D0%C2%C4%EA%BC%C6%BB%AEppt
# http://so.1ppt.com/cse/search?s=18142763795818420485&entry=1&ie=gbk&nsid=3&ie=gbk&q=%BF%AA%D1%A7%BC%D2%B3%A4%BB%E1ppt
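# Hypothetical helper (an assumption, not part of the original script): a minimal
# sketch of how GBK percent-encoded queries like the ones above can be built with
# urllib.parse. The parameter names and values are copied from those URLs; the 'q'
# value is the search keyword, e.g. '新年计划ppt' -> %D0%C2%C4%EA%BC%C6%BB%AEppt.
def build_search_url(keyword):
    params = {
        's': '18142763795818420485',
        'entry': '1',
        'ie': 'gbk',
        'nsid': '3',
        'q': keyword,  # encoded as GBK bytes, then percent-encoded
    }
    return 'http://so.1ppt.com/cse/search?' + urllib.parse.urlencode(params, encoding='gbk')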
class BaiduSpider(object):

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
        }
        self.base_url = 'https://www.1ppt.com/moban/ppt_moban_'

    # Send the request and return the decoded response
    def readPage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)  # send the request, get the response
        html = res.read().decode('gbk')    # 1ppt.com pages are GBK-encoded
        return html

    # Write the page to a file
    def writePage(self, filename, html, i):
        with open(filename, 'w', encoding='gbk') as f:
            f.write(html)
        print(f'Crawling page {i}')

    def main(self):
        name = input('Enter the PPT name: ')
        # start page
        begin = int(input('Enter the start page: '))
        # end page
        end = int(input('Enter the end page: '))
        # # Chinese characters break the URL, so `name` would need to be encoded
        # # (a sketch using the search endpoint follows after the class)
        # kw = {'q': name}
        # result = urllib.parse.urlencode(kw)
        for i in range(begin, end + 1):
            url = self.base_url + str(i) + '.html'
            # call the helper methods
            html = self.readPage(url)
            filename = 'page_' + str(i) + '.html'
            self.writePage(filename, html, i)
        print('Crawling finished')
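# A minimal sketch (an assumption, not the author's code) of how the otherwise-unused
# `name` input in main() could be used: GBK-encode it, as the commented-out urlencode
# lines suggest, and query the search endpoint from the top of the file instead of
# paging through the /moban/ template listing.
def searchPage(spider, keyword):
    url = build_search_url(keyword + 'ppt')  # relies on the hypothetical helper above
    return spider.readPage(url)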

if __name__ == '__main__':
    spider = BaiduSpider()  # create an instance
    spider.main()